Merge remote-tracking branch 'upstream/master' into HEAD

Anton Popov 2023-12-14 14:26:58 +00:00
commit 32c4b74067
315 changed files with 7122 additions and 3112 deletions

View File

@ -555,6 +555,27 @@ jobs:
cd "$REPO_COPY/tests/ci"
python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
##############################################################################################
########################### ClickBench #######################################################
##############################################################################################
ClickBenchAMD64:
needs: [BuilderDebRelease]
uses: ./.github/workflows/reusable_test.yml
with:
test_name: ClickBench (amd64)
runner_type: func-tester
run_command: |
cd "$REPO_COPY/tests/ci"
python3 clickbench.py "$CHECK_NAME"
ClickBenchAarch64:
needs: [BuilderDebAarch64]
uses: ./.github/workflows/reusable_test.yml
with:
test_name: ClickBench (aarch64)
runner_type: func-tester-aarch64
run_command: |
cd "$REPO_COPY/tests/ci"
python3 clickbench.py "$CHECK_NAME"
##############################################################################################
######################################### STRESS TESTS #######################################
##############################################################################################
StressTestAsan:

View File

@ -701,6 +701,27 @@ jobs:
cd "$REPO_COPY/tests/ci"
python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
##############################################################################################
########################### ClickBench #######################################################
##############################################################################################
ClickBenchAMD64:
needs: [BuilderDebRelease]
uses: ./.github/workflows/reusable_test.yml
with:
test_name: ClickBench (amd64)
runner_type: func-tester
run_command: |
cd "$REPO_COPY/tests/ci"
python3 clickbench.py "$CHECK_NAME"
ClickBenchAarch64:
needs: [BuilderDebAarch64]
uses: ./.github/workflows/reusable_test.yml
with:
test_name: ClickBench (aarch64)
runner_type: func-tester-aarch64
run_command: |
cd "$REPO_COPY/tests/ci"
python3 clickbench.py "$CHECK_NAME"
##############################################################################################
######################################### STRESS TESTS #######################################
##############################################################################################
StressTestAsan:

View File

@ -73,8 +73,3 @@ if (CMAKE_CROSSCOMPILING)
message (STATUS "Cross-compiling for target: ${CMAKE_CXX_COMPILE_TARGET}")
endif ()
if (USE_MUSL)
# Does not work for unknown reason
set (ENABLE_RUST OFF CACHE INTERNAL "")
endif ()

2
contrib/librdkafka vendored

@ -1 +1 @@
Subproject commit 6f3b483426a8c8ec950e27e446bec175cf8b553f
Subproject commit 2d2aab6f5b79db1cfca15d7bf0dee75d00d82082

View File

@ -125,6 +125,7 @@
"docker/test/server-jepsen",
"docker/test/sqllogic",
"docker/test/sqltest",
"docker/test/clickbench",
"docker/test/stateless"
]
},
@ -145,6 +146,10 @@
"name": "clickhouse/server-jepsen-test",
"dependent": []
},
"docker/test/clickbench": {
"name": "clickhouse/clickbench",
"dependent": []
},
"docker/test/install/deb": {
"name": "clickhouse/install-deb-test",
"dependent": []

View File

@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
ARG VERSION="23.11.1.2711"
ARG VERSION="23.11.2.11"
ARG PACKAGES="clickhouse-keeper"
# user/group precreated explicitly with fixed uid/gid on purpose.

View File

@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
ARG VERSION="23.11.1.2711"
ARG VERSION="23.11.2.11"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
# user/group precreated explicitly with fixed uid/gid on purpose.

View File

@ -30,7 +30,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
ARG VERSION="23.11.1.2711"
ARG VERSION="23.11.2.11"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
# set non-empty deb_location_url url to create a docker image

View File

@ -12,6 +12,7 @@ RUN apt-get update \
ripgrep \
zstd \
locales \
sudo \
--yes --no-install-recommends
# Sanitizer options for services (clickhouse-server)

View File

@ -21,7 +21,7 @@ EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name, "}
# trace_log needs more columns for symbolization
EXTRA_COLUMNS_TRACE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), lines Array(LowCardinality(String)), "
EXTRA_COLUMNS_EXPRESSION_TRACE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> toLowCardinality(demangle(addressToSymbol(x))), trace) AS symbols, arrayMap(x -> toLowCardinality(addressToLine(x)), trace) AS lines"
EXTRA_COLUMNS_EXPRESSION_TRACE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> demangle(addressToSymbol(x)), trace)::Array(LowCardinality(String)) AS symbols, arrayMap(x -> addressToLine(x), trace)::Array(LowCardinality(String)) AS lines"
function __set_connection_args

View File

@ -0,0 +1,10 @@
ARG FROM_TAG=latest
FROM clickhouse/test-base:$FROM_TAG
ENV TZ=Europe/Amsterdam
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
COPY *.sh /
COPY *.sql /
CMD ["/bin/bash", "/run.sh"]

View File

@ -0,0 +1,112 @@
ATTACH TABLE hits UUID 'c449dfbf-ba06-4d13-abec-8396559eb955'
(
WatchID BIGINT NOT NULL,
JavaEnable SMALLINT NOT NULL,
Title TEXT NOT NULL,
GoodEvent SMALLINT NOT NULL,
EventTime TIMESTAMP NOT NULL,
EventDate Date NOT NULL,
CounterID INTEGER NOT NULL,
ClientIP INTEGER NOT NULL,
RegionID INTEGER NOT NULL,
UserID BIGINT NOT NULL,
CounterClass SMALLINT NOT NULL,
OS SMALLINT NOT NULL,
UserAgent SMALLINT NOT NULL,
URL TEXT NOT NULL,
Referer TEXT NOT NULL,
IsRefresh SMALLINT NOT NULL,
RefererCategoryID SMALLINT NOT NULL,
RefererRegionID INTEGER NOT NULL,
URLCategoryID SMALLINT NOT NULL,
URLRegionID INTEGER NOT NULL,
ResolutionWidth SMALLINT NOT NULL,
ResolutionHeight SMALLINT NOT NULL,
ResolutionDepth SMALLINT NOT NULL,
FlashMajor SMALLINT NOT NULL,
FlashMinor SMALLINT NOT NULL,
FlashMinor2 TEXT NOT NULL,
NetMajor SMALLINT NOT NULL,
NetMinor SMALLINT NOT NULL,
UserAgentMajor SMALLINT NOT NULL,
UserAgentMinor VARCHAR(255) NOT NULL,
CookieEnable SMALLINT NOT NULL,
JavascriptEnable SMALLINT NOT NULL,
IsMobile SMALLINT NOT NULL,
MobilePhone SMALLINT NOT NULL,
MobilePhoneModel TEXT NOT NULL,
Params TEXT NOT NULL,
IPNetworkID INTEGER NOT NULL,
TraficSourceID SMALLINT NOT NULL,
SearchEngineID SMALLINT NOT NULL,
SearchPhrase TEXT NOT NULL,
AdvEngineID SMALLINT NOT NULL,
IsArtifical SMALLINT NOT NULL,
WindowClientWidth SMALLINT NOT NULL,
WindowClientHeight SMALLINT NOT NULL,
ClientTimeZone SMALLINT NOT NULL,
ClientEventTime TIMESTAMP NOT NULL,
SilverlightVersion1 SMALLINT NOT NULL,
SilverlightVersion2 SMALLINT NOT NULL,
SilverlightVersion3 INTEGER NOT NULL,
SilverlightVersion4 SMALLINT NOT NULL,
PageCharset TEXT NOT NULL,
CodeVersion INTEGER NOT NULL,
IsLink SMALLINT NOT NULL,
IsDownload SMALLINT NOT NULL,
IsNotBounce SMALLINT NOT NULL,
FUniqID BIGINT NOT NULL,
OriginalURL TEXT NOT NULL,
HID INTEGER NOT NULL,
IsOldCounter SMALLINT NOT NULL,
IsEvent SMALLINT NOT NULL,
IsParameter SMALLINT NOT NULL,
DontCountHits SMALLINT NOT NULL,
WithHash SMALLINT NOT NULL,
HitColor CHAR NOT NULL,
LocalEventTime TIMESTAMP NOT NULL,
Age SMALLINT NOT NULL,
Sex SMALLINT NOT NULL,
Income SMALLINT NOT NULL,
Interests SMALLINT NOT NULL,
Robotness SMALLINT NOT NULL,
RemoteIP INTEGER NOT NULL,
WindowName INTEGER NOT NULL,
OpenerName INTEGER NOT NULL,
HistoryLength SMALLINT NOT NULL,
BrowserLanguage TEXT NOT NULL,
BrowserCountry TEXT NOT NULL,
SocialNetwork TEXT NOT NULL,
SocialAction TEXT NOT NULL,
HTTPError SMALLINT NOT NULL,
SendTiming INTEGER NOT NULL,
DNSTiming INTEGER NOT NULL,
ConnectTiming INTEGER NOT NULL,
ResponseStartTiming INTEGER NOT NULL,
ResponseEndTiming INTEGER NOT NULL,
FetchTiming INTEGER NOT NULL,
SocialSourceNetworkID SMALLINT NOT NULL,
SocialSourcePage TEXT NOT NULL,
ParamPrice BIGINT NOT NULL,
ParamOrderID TEXT NOT NULL,
ParamCurrency TEXT NOT NULL,
ParamCurrencyID SMALLINT NOT NULL,
OpenstatServiceName TEXT NOT NULL,
OpenstatCampaignID TEXT NOT NULL,
OpenstatAdID TEXT NOT NULL,
OpenstatSourceID TEXT NOT NULL,
UTMSource TEXT NOT NULL,
UTMMedium TEXT NOT NULL,
UTMCampaign TEXT NOT NULL,
UTMContent TEXT NOT NULL,
UTMTerm TEXT NOT NULL,
FromTag TEXT NOT NULL,
HasGCLID SMALLINT NOT NULL,
RefererHash BIGINT NOT NULL,
URLHash BIGINT NOT NULL,
CLID INTEGER NOT NULL,
PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID)
)
ENGINE = MergeTree
SETTINGS disk = disk(type = cache, path = '/dev/shm/clickhouse/', max_size = '16G',
disk = disk(type = web, endpoint = 'https://clickhouse-datasets-web.s3.us-east-1.amazonaws.com/'));

View File

@ -0,0 +1,43 @@
SELECT COUNT(*) FROM hits;
SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;
SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;
SELECT AVG(UserID) FROM hits;
SELECT COUNT(DISTINCT UserID) FROM hits;
SELECT COUNT(DISTINCT SearchPhrase) FROM hits;
SELECT MIN(EventDate), MAX(EventDate) FROM hits;
SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;
SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;
SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID FROM hits WHERE UserID = 435090932899640449;
SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';
SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;
SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;
SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;

79
docker/test/clickbench/run.sh Executable file
View File

@ -0,0 +1,79 @@
#!/bin/bash
# Watchdog: kill this script if it runs longer than 20 minutes ($$ is this script's own PID; $! would be the last background job, which does not exist yet)
SCRIPT_PID=$$
(sleep 1200 && kill -9 $SCRIPT_PID) &
# shellcheck disable=SC1091
source /setup_export_logs.sh
# fail on errors, verbose and export all env variables
set -e -x -a
dpkg -i package_folder/clickhouse-common-static_*.deb
dpkg -i package_folder/clickhouse-server_*.deb
dpkg -i package_folder/clickhouse-client_*.deb
# A directory for cache
mkdir /dev/shm/clickhouse
chown clickhouse:clickhouse /dev/shm/clickhouse
# Allow introspection functions, needed for sending the logs
echo "
profiles:
default:
allow_introspection_functions: 1
" > /etc/clickhouse-server/users.d/allow_introspection_functions.yaml
# Enable text_log
echo "
text_log:
" > /etc/clickhouse-server/config.d/text_log.yaml
config_logs_export_cluster /etc/clickhouse-server/config.d/system_logs_export.yaml
clickhouse start
# Wait for the server to start, but not for too long.
for _ in {1..100}
do
clickhouse-client --query "SELECT 1" && break
sleep 1
done
setup_logs_replication
# Load the data
clickhouse-client --time < /create.sql
# Run the queries
set +x
TRIES=3
QUERY_NUM=1
while read -r query; do
echo -n "["
for i in $(seq 1 $TRIES); do
RES=$(clickhouse-client --query_id "q${QUERY_NUM}-${i}" --time --format Null --query "$query" --progress 0 2>&1 ||:)
echo -n "${RES}"
[[ "$i" != "$TRIES" ]] && echo -n ", "
echo "${QUERY_NUM},${i},${RES}" >> /test_output/test_results.tsv
done
echo "],"
QUERY_NUM=$((QUERY_NUM + 1))
done < /queries.sql
set -x
clickhouse-client --query "SELECT total_bytes FROM system.tables WHERE name = 'hits' AND database = 'default'"
clickhouse-client -q "system flush logs" ||:
stop_logs_replication
clickhouse stop
mv /var/log/clickhouse-server/* /test_output/
echo -e "success\tClickBench finished" > /test_output/check_status.tsv

View File

@ -24,6 +24,22 @@ azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log &
config_logs_export_cluster /etc/clickhouse-server/config.d/system_logs_export.yaml
cache_policy=""
if [ $(( $(date +%-d) % 2 )) -eq 1 ]; then
cache_policy="SLRU"
else
cache_policy="LRU"
fi
echo "Using cache policy: $cache_policy"
if [ "$cache_policy" = "SLRU" ]; then
sudo cat /etc/clickhouse-server/config.d/storage_conf.xml \
| sed "s|<cache_policy>LRU</cache_policy>|<cache_policy>SLRU</cache_policy>|" \
> /etc/clickhouse-server/config.d/storage_conf.xml.tmp
mv /etc/clickhouse-server/config.d/storage_conf.xml.tmp /etc/clickhouse-server/config.d/storage_conf.xml
fi
function start()
{
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
@ -135,7 +151,7 @@ function run_tests()
set +e
if [[ -n "$USE_PARALLEL_REPLICAS" ]] && [[ "$USE_PARALLEL_REPLICAS" -eq 1 ]]; then
clickhouse-test --client="clickhouse-client --use_hedged_requests=0 --allow_experimental_parallel_reading_from_replicas=1 --parallel_replicas_for_non_replicated_merge_tree=1 \
clickhouse-test --client="clickhouse-client --allow_experimental_parallel_reading_from_replicas=1 --parallel_replicas_for_non_replicated_merge_tree=1 \
--max_parallel_replicas=100 --cluster_for_parallel_replicas='parallel_replicas'" \
-j 2 --testname --shard --zookeeper --check-zookeeper-session --no-stateless --no-parallel-replicas --hung-check --print-time "${ADDITIONAL_OPTIONS[@]}" \
"$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt

View File

@ -30,7 +30,7 @@ def build_url(base_url, dataset):
return os.path.join(base_url, dataset, "partitions", AVAILABLE_DATASETS[dataset])
def dowload_with_progress(url, path):
def download_with_progress(url, path):
logging.info("Downloading from %s to temp path %s", url, path)
for i in range(RETRIES_COUNT):
try:
@ -110,7 +110,7 @@ if __name__ == "__main__":
temp_archive_path = _get_temp_file_name()
try:
download_url_for_dataset = build_url(args.url_prefix, dataset)
dowload_with_progress(download_url_for_dataset, temp_archive_path)
download_with_progress(download_url_for_dataset, temp_archive_path)
unpack_to_clickhouse_directory(temp_archive_path, args.clickhouse_data_path)
except Exception as ex:
logging.info("Some exception occurred %s", str(ex))

View File

@ -65,9 +65,27 @@ chmod 777 -R /var/lib/clickhouse
clickhouse-client --query "ATTACH DATABASE IF NOT EXISTS datasets ENGINE = Ordinary"
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS test"
stop
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.initial.log
# Randomize cache policies.
cache_policy=""
if [ $(( $(date +%-d) % 2 )) -eq 1 ]; then
cache_policy="SLRU"
else
cache_policy="LRU"
fi
echo "Using cache policy: $cache_policy"
if [ "$cache_policy" = "SLRU" ]; then
sudo cat /etc/clickhouse-server/config.d/storage_conf.xml \
| sed "s|<cache_policy>LRU</cache_policy>|<cache_policy>SLRU</cache_policy>|" \
> /etc/clickhouse-server/config.d/storage_conf.xml.tmp
mv /etc/clickhouse-server/config.d/storage_conf.xml.tmp /etc/clickhouse-server/config.d/storage_conf.xml
fi
start
clickhouse-client --query "SHOW TABLES FROM datasets"
@ -191,6 +209,13 @@ sudo cat /etc/clickhouse-server/config.d/logger_trace.xml \
> /etc/clickhouse-server/config.d/logger_trace.xml.tmp
mv /etc/clickhouse-server/config.d/logger_trace.xml.tmp /etc/clickhouse-server/config.d/logger_trace.xml
if [ "$cache_policy" = "SLRU" ]; then
sudo cat /etc/clickhouse-server/config.d/storage_conf.xml \
| sed "s|<cache_policy>LRU</cache_policy>|<cache_policy>SLRU</cache_policy>|" \
> /etc/clickhouse-server/config.d/storage_conf.xml.tmp
mv /etc/clickhouse-server/config.d/storage_conf.xml.tmp /etc/clickhouse-server/config.d/storage_conf.xml
fi
# Randomize async_load_databases
if [ $(( $(date +%-d) % 2 )) -eq 1 ]; then
sudo echo "<clickhouse><async_load_databases>true</async_load_databases></clickhouse>" \

View File

@ -0,0 +1,22 @@
---
sidebar_position: 1
sidebar_label: 2023
---
# 2023 Changelog
### ClickHouse release v23.11.2.11-stable (6e5411358c8) FIXME as compared to v23.11.1.2711-stable (05bc8ef1e02)
#### Improvement
* Backported in [#57661](https://github.com/ClickHouse/ClickHouse/issues/57661): Handle SIGABRT case when getting PostgreSQL table structure with empty array. [#57618](https://github.com/ClickHouse/ClickHouse/pull/57618) ([Mike Kot (Михаил Кот)](https://github.com/myrrc)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Ignore ON CLUSTER clause in grant/revoke queries for management of replicated access entities. [#57538](https://github.com/ClickHouse/ClickHouse/pull/57538) ([MikhailBurdukov](https://github.com/MikhailBurdukov)).
* Fix SIGSEGV for aggregation of sparse columns with any() RESPECT NULLS [#57710](https://github.com/ClickHouse/ClickHouse/pull/57710) ([Azat Khuzhin](https://github.com/azat)).
* Fix a bug in window functions: revert [#39631](https://github.com/ClickHouse/ClickHouse/issues/39631) [#57766](https://github.com/ClickHouse/ClickHouse/pull/57766) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Pin alpine version of integration tests helper container [#57669](https://github.com/ClickHouse/ClickHouse/pull/57669) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).

View File

@ -72,7 +72,7 @@ You can also add original ClickHouse repo address to your local repository to pu
After successfully running this command you will be able to pull updates from the main ClickHouse repo by running `git pull upstream master`.
:::note
Instructions below assume you are building on Linux. If you are cross-compiling or using building on macOS, please also check for operating system and architecture specific guides, such as building [on macOS for macOS](build-osx.md), [on Linux for macOS](build-cross-osx.md), [on Linux for Linux/RISC-V](build-cross-riscv.md) and so on.
Instructions below assume you are building on Linux. If you are cross-compiling or building on macOS, please also check for operating system and architecture specific guides, such as building [on macOS for macOS](build-osx.md), [on Linux for macOS](build-cross-osx.md), [on Linux for Linux/RISC-V](build-cross-riscv.md) and so on.
:::
## Build System {#build-system}

View File

@ -1,13 +1,16 @@
---
slug: /en/engines/table-engines/special/distributed
sidebar_label: "Distributed"
sidebar_position: 10
sidebar_label: Distributed
slug: /en/engines/table-engines/special/distributed
---
# Distributed Table Engine
Tables with Distributed engine do not store any data of their own, but allow distributed query processing on multiple servers.
Reading is automatically parallelized. During a read, the table indexes on remote servers are used, if there are any.
:::warning
To create a distributed table in the cloud, you can use the [remote and remoteSecure](../../../sql-reference/table-functions/remote) table functions. The `Distributed(...)` syntax cannot be used in ClickHouse Cloud.
:::
Tables with Distributed engine do not store any data of their own, but allow distributed query processing on multiple servers. Reading is automatically parallelized. During a read, the table indexes on remote servers are used, if there are any.
## Creating a Table {#distributed-creating-a-table}
@ -22,6 +25,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
```
### From a Table {#distributed-from-a-table}
When the `Distributed` table is pointing to a table on the current server you can adopt that table's schema:
``` sql
@ -48,7 +52,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] AS [db2.]name2
Specifying the `sharding_key` is necessary for the following:
- For `INSERTs` into a distributed table (as the table engine needs the `sharding_key` to determine how to split the data). However, if `insert_distributed_one_random_shard` setting is enabled, then `INSERTs` do not need the sharding key
- For `INSERTs` into a distributed table (as the table engine needs the `sharding_key` to determine how to split the data). However, if `insert_distributed_one_random_shard` setting is enabled, then `INSERTs` do not need the sharding key.
- For use with `optimize_skip_unused_shards` as the `sharding_key` is necessary to determine what shards should be queried
#### policy_name
@ -122,9 +126,7 @@ SETTINGS
fsync_directories=0;
```
Data will be read from all servers in the `logs` cluster, from the `default.hits` table located on every server in the cluster.
Data is not only read but is partially processed on the remote servers (to the extent that this is possible).
For example, for a query with `GROUP BY`, data will be aggregated on remote servers, and the intermediate states of aggregate functions will be sent to the requestor server. Then data will be further aggregated.
Data will be read from all servers in the `logs` cluster, from the `default.hits` table located on every server in the cluster. Data is not only read but is partially processed on the remote servers (to the extent that this is possible). For example, for a query with `GROUP BY`, data will be aggregated on remote servers, and the intermediate states of aggregate functions will be sent to the requestor server. Then data will be further aggregated.
Instead of the database name, you can use a constant expression that returns a string. For example: `currentDatabase()`.
@ -183,9 +185,7 @@ Clusters are configured in the [server configuration file](../../../operations/c
</remote_servers>
```
Here a cluster is defined with the name `logs` that consists of two shards, each of which contains two replicas.
Shards refer to the servers that contain different parts of the data (in order to read all the data, you must access all the shards).
Replicas are duplicating servers (in order to read all the data, you can access the data on any one of the replicas).
Here a cluster is defined with the name `logs` that consists of two shards, each of which contains two replicas. Shards refer to the servers that contain different parts of the data (in order to read all the data, you must access all the shards). Replicas are duplicating servers (in order to read all the data, you can access the data on any one of the replicas).
Cluster names must not contain dots.
@ -198,9 +198,7 @@ The parameters `host`, `port`, and optionally `user`, `password`, `secure`, `com
- `secure` - Whether to use a secure SSL/TLS connection. Usually also requires specifying the port (the default secure port is `9440`). The server should listen on `<tcp_port_secure>9440</tcp_port_secure>` and be configured with correct certificates.
- `compression` - Use data compression. Default value: `true`.
When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) see the [load_balancing](../../../operations/settings/settings.md#settings-load_balancing) setting.
If the connection with the server is not established, there will be an attempt to connect with a short timeout. If the connection failed, the next replica will be selected, and so on for all the replicas. If the connection attempt failed for all the replicas, the attempt will be repeated the same way, several times.
This works in favour of resiliency, but does not provide complete fault tolerance: a remote server might accept the connection, but might not work, or work poorly.
When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access); see the [load_balancing](../../../operations/settings/settings.md#settings-load_balancing) setting. If the connection with the server is not established, there will be an attempt to connect with a short timeout. If the connection failed, the next replica will be selected, and so on for all the replicas. If the connection attempt failed for all the replicas, the attempt will be repeated the same way, several times. This works in favour of resiliency, but does not provide complete fault tolerance: a remote server might accept the connection, but might not work, or work poorly.
You can specify just one of the shards (in this case, query processing should be called remote, rather than distributed) or up to any number of shards. In each shard, you can specify from one to any number of replicas. You can specify a different number of replicas for each shard.

View File

@ -478,6 +478,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. Default value - `false`.
- [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`.
- [input_format_csv_try_infer_numbers_from_strings](/docs/en/operations/settings/settings-formats.md/#input_format_csv_try_infer_numbers_from_strings) - Try to infer numbers from string fields during schema inference. Default value - `false`.
## CSVWithNames {#csvwithnames}

View File

@ -834,6 +834,27 @@ $$)
└──────────────┴───────────────┘
```
#### CSV settings {#csv-settings}
##### input_format_csv_try_infer_numbers_from_strings
Enabling this setting allows inferring numbers from string values.
This setting is disabled by default.
**Example:**
```sql
SET input_format_csv_try_infer_numbers_from_strings = 1;
DESC format(CSV, '"42","42.42"');
```
```response
┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ c1 │ Nullable(Int64) │ │ │ │ │ │
│ c2 │ Nullable(Float64) │ │ │ │ │ │
└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
### TSV/TSKV {#tsv-tskv}
In TSV/TSKV formats ClickHouse extracts column value from the row according to tabular delimiters and then parses extracted value using
@ -1846,3 +1867,102 @@ DESC format(JSONAsString, '{"x" : 42, "y" : "Hello, World!"}') SETTINGS allow_ex
│ json │ Object('json') │ │ │ │ │ │
└──────┴────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
## Schema inference modes {#schema-inference-modes}
Schema inference from a set of data files can work in two different modes: `default` and `union`.
The mode is controlled by the setting `schema_inference_mode`.
### Default mode {#default-schema-inference-mode}
In default mode, ClickHouse assumes that all files have the same schema and tries to infer the schema by reading files one by one until it succeeds.
Example:
Let's say we have 3 files `data1.jsonl`, `data2.jsonl` and `data3.jsonl` with the following content:
`data1.jsonl`:
```json
{"field1" : 1, "field2" : null}
{"field1" : 2, "field2" : null}
{"field1" : 3, "field2" : null}
```
`data2.jsonl`:
```json
{"field1" : 4, "field2" : "Data4"}
{"field1" : 5, "field2" : "Data5"}
{"field1" : 6, "field2" : "Data5"}
```
`data3.jsonl`:
```json
{"field1" : 7, "field2" : "Data7", "field3" : [1, 2, 3]}
{"field1" : 8, "field2" : "Data8", "field3" : [4, 5, 6]}
{"field1" : 9, "field2" : "Data9", "field3" : [7, 8, 9]}
```
Let's try to use schema inference on these 3 files:
```sql
:) DESCRIBE file('data{1,2,3}.jsonl') SETTINGS schema_inference_mode='default'
```
Result:
```text
┌─name───┬─type─────────────┐
│ field1 │ Nullable(Int64) │
│ field2 │ Nullable(String) │
└────────┴──────────────────┘
```
As we can see, we don't have `field3` from file `data3.jsonl`.
This happens because ClickHouse first tried to infer the schema from file `data1.jsonl` and failed because `field2` contains only nulls,
then it inferred the schema from `data2.jsonl` successfully, so the data from file `data3.jsonl` was never read.
### Union mode {#union-schema-inference-mode}
In union mode, ClickHouse assumes that files can have different schemas, so it infers the schemas of all files and then unions them into a common schema.
Let's say we have 3 files `data1.jsonl`, `data2.jsonl` and `data3.jsonl` with the following content:
`data1.jsonl`:
```json
{"field1" : 1}
{"field1" : 2}
{"field1" : 3}
```
`data2.jsonl`:
```json
{"field2" : "Data4"}
{"field2" : "Data5"}
{"field2" : "Data5"}
```
`data3.jsonl`:
```json
{"field3" : [1, 2, 3]}
{"field3" : [4, 5, 6]}
{"field3" : [7, 8, 9]}
```
Let's try to use schema inference on these 3 files:
```sql
:) DESCRIBE file('data{1,2,3}.jsonl') SETTINGS schema_inference_mode='union'
```
Result:
```text
┌─name───┬─type───────────────────┐
│ field1 │ Nullable(Int64) │
│ field2 │ Nullable(String) │
│ field3 │ Array(Nullable(Int64)) │
└────────┴────────────────────────┘
```
As we can see, we have all fields from all files.
Note:
- As some of the files may not contain some columns from the resulting schema, union mode is supported only for formats that support reading a subset of columns (like JSONEachRow, Parquet, TSVWithNames, etc) and won't work for other formats (like CSV, TSV, JSONCompactEachRow, etc).
- If ClickHouse cannot infer the schema from one of the files, an exception will be thrown.
- If you have a lot of files, reading the schema from all of them can take a lot of time.

View File

@ -472,6 +472,39 @@ The value 0 means that you can delete all tables without any restrictions.
``` xml
<max_table_size_to_drop>0</max_table_size_to_drop>
```
## max\_database\_num\_to\_warn {#max-database-num-to-warn}
If the number of attached databases exceeds the specified value, the ClickHouse server will add warning messages to the `system.warnings` table.
Default value: 1000
**Example**
``` xml
<max_database_num_to_warn>50</max_database_num_to_warn>
```
## max\_table\_num\_to\_warn {#max-table-num-to-warn}
If the number of attached tables exceeds the specified value, the ClickHouse server will add warning messages to the `system.warnings` table.
Default value: 5000
**Example**
``` xml
<max_table_num_to_warn>400</max_table_num_to_warn>
```
## max\_part\_num\_to\_warn {#max-part-num-to-warn}
If the number of active parts exceeds the specified value, the ClickHouse server will add warning messages to the `system.warnings` table.
Default value: 100000
**Example**
``` xml
<max_part_num_to_warn>400</max_part_num_to_warn>
```
## max_temporary_data_on_disk_size

View File

@ -1130,6 +1130,13 @@ Result
a 0 1971-01-01
```
## input_format_csv_try_infer_numbers_from_strings {#input_format_csv_try_infer_numbers_from_strings}
If enabled, during schema inference ClickHouse will try to infer numbers from string fields.
It can be useful if the CSV data contains quoted UInt64 numbers.
Disabled by default.
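A quick sketch of the effect (mirroring the example in the Formats documentation; the sample values are illustrative):
```sql
SET input_format_csv_try_infer_numbers_from_strings = 1;
-- "42" is inferred as Nullable(Int64) and "42.42" as Nullable(Float64)
DESC format(CSV, '"42","42.42"');
```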
## Values format settings {#values-format-settings}
### input_format_values_interpret_expressions {#input_format_values_interpret_expressions}

View File

@ -4349,6 +4349,8 @@ Default value: `1GiB`.
## Schema Inference settings
See [schema inference](../../interfaces/schema-inference.md#schema-inference-modes) documentation for more details.
### schema_inference_use_cache_for_file {#schema_inference_use_cache_for_file}
Enable schemas cache for schema inference in `file` table function.
@ -4390,6 +4392,13 @@ Possible values:
Default value: 2.
### schema_inference_mode {#schema_inference_mode}
The mode of schema inference. Possible values: `default` and `union`.
See [schema inference modes](../../interfaces/schema-inference.md#schema-inference-modes) section for more details.
Default value: `default`.
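For illustration, a minimal usage sketch (the file glob is borrowed from the schema-inference examples):
```sql
-- infer the resulting schema as the union of the schemas of all matched files
DESCRIBE file('data{1,2,3}.jsonl') SETTINGS schema_inference_mode = 'union';
```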
## compatibility {#compatibility}
The `compatibility` setting causes ClickHouse to use the default settings of a previous version of ClickHouse, where the previous version is provided as the setting.

View File

@ -1081,10 +1081,6 @@ Result:
└─────────────────────────────────────────────────────────────┘
```
**See also**
- [arrayFold](#arrayfold)
## arrayReduceInRanges
Applies an aggregate function to array elements in given ranges and returns an array containing the result corresponding to each range. The function will return the same result as multiple `arrayReduce(agg_func, arraySlice(arr1, index, length), ...)`.
@ -1127,56 +1123,6 @@ Result:
└─────────────────────────────┘
```
## arrayFold
Applies a lambda function to one or more equally-sized arrays and collects the result in an accumulator.
**Syntax**
``` sql
arrayFold(lambda_function, arr1, arr2, ..., accumulator)
```
**Example**
Query:
``` sql
SELECT arrayFold( acc,x -> acc + x*2, [1, 2, 3, 4], toInt64(3)) AS res;
```
Result:
``` text
┌─res─┐
│ 23 │
└─────┘
```
**Example with the Fibonacci sequence**
```sql
SELECT arrayFold( acc,x -> (acc.2, acc.2 + acc.1), range(number), (1::Int64, 0::Int64)).1 AS fibonacci
FROM numbers(1,10);
┌─fibonacci─┐
│ 0 │
│ 1 │
│ 1 │
│ 2 │
│ 3 │
│ 5 │
│ 8 │
│ 13 │
│ 21 │
│ 34 │
└───────────┘
```
**See also**
- [arrayReduce](#arrayreduce)
## arrayReverse(arr)
Returns an array of the same size as the original array containing the elements in reverse order.

View File

@ -543,26 +543,52 @@ Like `concatWithSeparator` but assumes that `concatWithSeparator(sep, expr1, exp
A function is called injective if it returns different results for different arguments. In other words: different arguments never produce an identical result.
## substring(s, offset, length)
## substring
Returns a substring with `length` many bytes, starting at the byte at index `offset`. Character indexing starts from 1.
Returns the substring of a string `s` which starts at the specified byte index `offset`. Byte counting starts from 1. If `offset` is 0, an empty string is returned. If `offset` is negative, the substring starts `offset` characters from the end of the string, rather than from the beginning. An optional argument `length` specifies the maximum number of bytes the returned substring may have.
**Syntax**
```sql
substring(s, offset, length)
substring(s, offset[, length])
```
Alias:
- `substr`
- `mid`
**Arguments**
- `s` — The string to calculate a substring from. [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md) or [Enum](../../sql-reference/data-types/enum.md)
- `offset` — The starting position of the substring in `s`. [(U)Int*](../../sql-reference/data-types/int-uint.md).
- `length` — The maximum length of the substring. [(U)Int*](../../sql-reference/data-types/int-uint.md). Optional.
**Returned value**
A substring of `s` with `length` many bytes, starting at index `offset`.
Type: `String`.
**Example**
``` sql
SELECT 'database' AS db, substr(db, 5), substr(db, 5, 1)
```
Result:
```result
┌─db───────┬─substring('database', 5)─┬─substring('database', 5, 1)─┐
│ database │ base │ b │
└──────────┴──────────────────────────┴─────────────────────────────┘
```
## substringUTF8
Like `substring` but for Unicode code points. Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.
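A small sketch of the byte vs. code-point difference (the sample string is illustrative):
```sql
-- 'ä' occupies two bytes in UTF-8 but is a single code point, so
-- substring yields 'Dä' (3 bytes) while substringUTF8 yields 'Dät' (3 code points)
SELECT substring('Däta', 1, 3), substringUTF8('Däta', 1, 3);
```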
## substringIndex(s, delim, count)
## substringIndex
Returns the substring of `s` before `count` occurrences of the delimiter `delim`, as in Spark or MySQL.
@ -593,7 +619,7 @@ Result:
└──────────────────────────────────────────────┘
```
## substringIndexUTF8(s, delim, count)
## substringIndexUTF8
Like `substringIndex` but for Unicode code points. Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.
@ -1225,7 +1251,7 @@ This function also replaces numeric character references with Unicode characters
**Syntax**
``` sql
decodeHTMComponent(x)
decodeHTMLComponent(x)
```
**Arguments**
@ -1242,7 +1268,7 @@ Type: [String](../../sql-reference/data-types/string.md).
``` sql
SELECT decodeHTMLComponent('&apos;CH');
SELECT decodeHMLComponent('I&heartsuit;ClickHouse');
SELECT decodeHTMLComponent('I&heartsuit;ClickHouse');
```
Result:

View File

@ -12,7 +12,7 @@ Compressed files are supported. Compression type is detected by the extension of
**Syntax**
```sql
SELECT <expr_list> INTO OUTFILE file_name [AND STDOUT] [APPEND] [COMPRESSION type [LEVEL level]]
SELECT <expr_list> INTO OUTFILE file_name [AND STDOUT] [APPEND | TRUNCATE] [COMPRESSION type [LEVEL level]]
```
`file_name` and `type` are string literals. Supported compression types are: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`.
@ -26,6 +26,7 @@ SELECT <expr_list> INTO OUTFILE file_name [AND STDOUT] [APPEND] [COMPRESSION typ
- The default [output format](../../../interfaces/formats.md) is `TabSeparated` (like in the command-line client batch mode). Use [FORMAT](format.md) clause to change it.
- If `AND STDOUT` is mentioned in the query then the output that is written to the file is also displayed on standard output. If used with compression, the plaintext is displayed on standard output.
- If `APPEND` is mentioned in the query then the output is appended to an existing file. If compression is used, append cannot be used.
- When writing to a file that already exists, `APPEND` or `TRUNCATE` must be used.
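For instance, a minimal sketch of the new `TRUNCATE` mode (the file name is illustrative):
```sql
-- overwrite out.tsv if it already exists
SELECT 1, 2, 3 INTO OUTFILE 'out.tsv' TRUNCATE;
```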
**Example**

View File

@ -19,6 +19,7 @@ fuzzJSON({ named_collection [option=value [,..]] | json_str[, random_seed] })
- `json_str` (String) - The source string representing structured data in JSON format.
- `random_seed` (UInt64) - Manual random seed for producing stable results.
- `reuse_output` (boolean) - Reuse the output from a fuzzing process as input for the next fuzzer.
- `malform_output` (boolean) - Generate a string that cannot be parsed as a JSON object.
- `max_output_length` (UInt64) - Maximum allowable length of the generated or perturbed JSON string.
- `probability` (Float64) - The probability to fuzz a JSON field (a key-value pair). Must be within [0, 1] range.
- `max_nesting_level` (UInt64) - The maximum allowed depth of nested structures within the JSON data.
@ -84,3 +85,13 @@ SELECT * FROM fuzzJSON('{"id":1}', 1234) LIMIT 3;
{"BRjE":16137826149911306846}
{"XjKE":15076727133550123563}
```
``` sql
SELECT * FROM fuzzJSON(json_nc, json_str='{"name" : "FuzzJSON"}', random_seed=1337, malform_output=true) LIMIT 3;
```
``` text
U"name":"FuzzJSON*"SpByjZKtr2VAyHCO"falseh
{"name"keFuzzJSON, "g6vVO7TCIk":jTt^
{"DBhz":YFuzzJSON5}
```

View File

@ -68,6 +68,7 @@ if (BUILD_STANDALONE_KEEPER)
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/waitServersToFinish.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/ServerType.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTPRequestHandlerFactoryMain.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/KeeperReadinessHandler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/HTTPServer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/ReadHeaders.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/HTTPServerConnection.cpp

View File

@ -14,6 +14,7 @@
#include <Common/assertProcessUserMatchesDataOwner.h>
#include <Common/makeSocketAddress.h>
#include <Server/waitServersToFinish.h>
#include <base/getMemoryAmount.h>
#include <base/scope_guard.h>
#include <base/safeExit.h>
#include <Poco/Net/NetException.h>
@ -32,6 +33,7 @@
#include <Server/HTTP/HTTPServer.h>
#include <Server/TCPServer.h>
#include <Server/HTTPHandlerFactory.h>
#include <Server/KeeperReadinessHandler.h>
#include "Core/Defines.h"
#include "config.h"
@ -289,6 +291,33 @@ try
if (!config().has("keeper_server"))
throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Keeper configuration (<keeper_server> section) not found in config");
auto updateMemorySoftLimitInConfig = [&](Poco::Util::AbstractConfiguration & config)
{
UInt64 memory_soft_limit = 0;
if (config.has("keeper_server.max_memory_usage_soft_limit"))
{
memory_soft_limit = config.getUInt64("keeper_server.max_memory_usage_soft_limit");
}
/// if memory soft limit is not set, we will use default value
if (memory_soft_limit == 0)
{
Float64 ratio = 0.9;
if (config.has("keeper_server.max_memory_usage_soft_limit_ratio"))
ratio = config.getDouble("keeper_server.max_memory_usage_soft_limit_ratio");
size_t physical_server_memory = getMemoryAmount();
if (ratio > 0 && physical_server_memory > 0)
{
memory_soft_limit = static_cast<UInt64>(physical_server_memory * ratio);
config.setUInt64("keeper_server.max_memory_usage_soft_limit", memory_soft_limit);
}
}
LOG_INFO(log, "keeper_server.max_memory_usage_soft_limit is set to {}", formatReadableSizeWithBinarySuffix(memory_soft_limit));
};
updateMemorySoftLimitInConfig(config());
std::string path;
if (config().has("keeper_server.storage_path"))
@ -466,6 +495,29 @@ try
std::make_unique<HTTPServer>(
std::move(my_http_context), createPrometheusMainHandlerFactory(*this, config_getter(), async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params));
});
/// HTTP control endpoints
port_name = "keeper_server.http_control.port";
createServer(listen_host, port_name, listen_try, [&](UInt16 port) mutable
{
auto my_http_context = httpContext();
Poco::Timespan my_keep_alive_timeout(config.getUInt("keep_alive_timeout", 10), 0);
Poco::Net::HTTPServerParams::Ptr my_http_params = new Poco::Net::HTTPServerParams;
my_http_params->setTimeout(my_http_context->getReceiveTimeout());
my_http_params->setKeepAliveTimeout(my_keep_alive_timeout);
Poco::Net::ServerSocket socket;
auto address = socketBindListen(socket, listen_host, port);
socket.setReceiveTimeout(my_http_context->getReceiveTimeout());
socket.setSendTimeout(my_http_context->getSendTimeout());
servers->emplace_back(
listen_host,
port_name,
"HTTP Control: http://" + address.toString(),
std::make_unique<HTTPServer>(
std::move(my_http_context), createKeeperHTTPControlMainHandlerFactory(config_getter(), global_context->getKeeperDispatcher(), "KeeperHTTPControlHandler-factory"), server_pool, socket, http_params)
);
});
}
for (auto & server : *servers)
@ -499,6 +551,8 @@ try
{
updateLevels(*config, logger());
updateMemorySoftLimitInConfig(*config);
if (config->has("keeper_server"))
global_context->updateKeeperConfiguration(*config);
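For reference, a sketch of the configuration block these handlers read, inferred from the `keeper_server.http_control.port` key used above (the port value is an assumption):
```xml
<clickhouse>
    <keeper_server>
        <!-- enables the HTTP control endpoint created in the code above -->
        <http_control>
            <port>9182</port>
        </http_control>
    </keeper_server>
</clickhouse>
```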

View File

@ -92,6 +92,7 @@
#include <Server/ProxyV1HandlerFactory.h>
#include <Server/TLSHandlerFactory.h>
#include <Server/ProtocolServerAdapter.h>
#include <Server/KeeperReadinessHandler.h>
#include <Server/HTTP/HTTPServer.h>
#include <Interpreters/AsynchronousInsertQueue.h>
#include <Core/ServerSettings.h>
@ -1344,6 +1345,9 @@ try
global_context->setMaxTableSizeToDrop(server_settings_.max_table_size_to_drop);
global_context->setMaxPartitionSizeToDrop(server_settings_.max_partition_size_to_drop);
global_context->setMaxTableNumToWarn(server_settings_.max_table_num_to_warn);
global_context->setMaxDatabaseNumToWarn(server_settings_.max_database_num_to_warn);
global_context->setMaxPartNumToWarn(server_settings_.max_part_num_to_warn);
ConcurrencyControl::SlotCount concurrent_threads_soft_limit = ConcurrencyControl::Unlimited;
if (server_settings_.concurrent_threads_soft_limit_num > 0 && server_settings_.concurrent_threads_soft_limit_num < concurrent_threads_soft_limit)
@ -1551,6 +1555,34 @@ try
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.");
#endif
});
/// HTTP control endpoints
port_name = "keeper_server.http_control.port";
createServer(config(), listen_host, port_name, listen_try, /* start_server: */ false,
servers_to_start_before_tables,
[&](UInt16 port) -> ProtocolServerAdapter
{
auto http_context = httpContext();
Poco::Timespan keep_alive_timeout(config().getUInt("keep_alive_timeout", 10), 0);
Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams;
http_params->setTimeout(http_context->getReceiveTimeout());
http_params->setKeepAliveTimeout(keep_alive_timeout);
Poco::Net::ServerSocket socket;
auto address = socketBindListen(config(), socket, listen_host, port);
socket.setReceiveTimeout(http_context->getReceiveTimeout());
socket.setSendTimeout(http_context->getSendTimeout());
return ProtocolServerAdapter(
listen_host,
port_name,
"HTTP Control: http://" + address.toString(),
std::make_unique<HTTPServer>(
std::move(http_context),
createKeeperHTTPControlMainHandlerFactory(
config_getter(),
global_context->getKeeperDispatcher(),
"KeeperHTTPControlHandler-factory"), server_pool, socket, http_params));
});
}
#else
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination.");

View File

@ -14,6 +14,10 @@ macro(configure_rustc)
set(RUST_CFLAGS "${RUST_CFLAGS} --sysroot ${CMAKE_SYSROOT}")
endif()
if (USE_MUSL)
set(RUST_CXXFLAGS "${RUST_CXXFLAGS} -D_LIBCPP_HAS_MUSL_LIBC=1")
endif ()
if(CCACHE_EXECUTABLE MATCHES "/sccache$")
message(STATUS "Using RUSTC_WRAPPER: ${CCACHE_EXECUTABLE}")
set(RUSTCWRAPPER "rustc-wrapper = \"${CCACHE_EXECUTABLE}\"")

View File

@ -183,6 +183,7 @@ enum class AccessType
M(SYSTEM_REPLICATION_QUEUES, "SYSTEM STOP REPLICATION QUEUES, SYSTEM START REPLICATION QUEUES, STOP REPLICATION QUEUES, START REPLICATION QUEUES", TABLE, SYSTEM) \
M(SYSTEM_DROP_REPLICA, "DROP REPLICA", TABLE, SYSTEM) \
M(SYSTEM_SYNC_REPLICA, "SYNC REPLICA", TABLE, SYSTEM) \
M(SYSTEM_REPLICA_READINESS, "SYSTEM REPLICA READY, SYSTEM REPLICA UNREADY", GLOBAL, SYSTEM) \
M(SYSTEM_RESTART_REPLICA, "RESTART REPLICA", TABLE, SYSTEM) \
M(SYSTEM_RESTORE_REPLICA, "RESTORE REPLICA", TABLE, SYSTEM) \
M(SYSTEM_WAIT_LOADING_PARTS, "WAIT LOADING PARTS", TABLE, SYSTEM) \

View File

@ -254,11 +254,20 @@ public:
if (it != merged_maps.end())
{
for (size_t col = 0; col < values_types.size(); ++col)
{
if (!elem.second[col].isNull())
applyVisitor(Visitor(elem.second[col]), it->second[col]);
{
if (it->second[col].isNull())
it->second[col] = elem.second[col];
else
applyVisitor(Visitor(elem.second[col]), it->second[col]);
}
}
}
else
{
merged_maps[elem.first] = elem.second;
}
}
}

View File

@ -184,12 +184,12 @@ BackupCoordinationRemote::BackupCoordinationRemote(
if (my_is_internal)
{
String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host;
auto code = zk->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
if (code == Coordination::Error::ZNODEEXISTS)
zk->handleEphemeralNodeExistenceNoFailureInjection(alive_node_path, "");
else if (code != Coordination::Error::ZOK)
throw zkutil::KeeperException::fromPath(code, alive_node_path);
/// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically.
zk->tryRemove(alive_node_path);
zk->createAncestors(alive_node_path);
zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral);
}
})
{

View File

@ -60,12 +60,6 @@ void BackupCoordinationStageSync::set(const String & current_host, const String
}
else
{
/// Make an ephemeral node so the initiator can track if the current host is still working.
String alive_node_path = zookeeper_path + "/alive|" + current_host;
auto code = zookeeper->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS)
throw zkutil::KeeperException::fromPath(code, alive_node_path);
zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, "");
zookeeper->createIfNotExists(zookeeper_path + "/current|" + current_host + "|" + new_stage, message);
}
@ -106,39 +100,36 @@ Strings BackupCoordinationStageSync::waitFor(const Strings & all_hosts, const St
namespace
{
struct UnreadyHostState
struct UnreadyHost
{
String host;
bool started = false;
bool alive = false;
};
}
struct BackupCoordinationStageSync::State
{
Strings results;
std::map<String, UnreadyHostState> unready_hosts;
std::optional<Strings> results;
std::optional<std::pair<String, Exception>> error;
std::optional<String> host_terminated;
std::optional<String> disconnected_host;
std::optional<UnreadyHost> unready_host;
};
BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState(
const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const
WithRetries::RetriesControlHolder & retries_control_holder,
const Strings & zk_nodes,
const Strings & all_hosts,
const String & stage_to_wait) const
{
auto zookeeper = retries_control_holder.faulty_zookeeper;
auto & retries_ctl = retries_control_holder.retries_ctl;
std::unordered_set<std::string_view> zk_nodes_set{zk_nodes.begin(), zk_nodes.end()};
State state;
if (zk_nodes_set.contains("error"))
{
String errors;
{
auto holder = with_retries.createRetriesControlHolder("readCurrentState");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zookeeper);
errors = zookeeper->get(zookeeper_path + "/error");
});
}
String errors = zookeeper->get(zookeeper_path + "/error");
ReadBufferFromOwnString buf{errors};
String host;
readStringBinary(host, buf);
@ -146,64 +137,50 @@ BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState
return state;
}
std::optional<UnreadyHost> unready_host;
for (const auto & host : all_hosts)
{
if (!zk_nodes_set.contains("current|" + host + "|" + stage_to_wait))
{
UnreadyHostState unready_host_state;
const String started_node_name = "started|" + host;
const String alive_node_name = "alive|" + host;
const String alive_node_path = zookeeper_path + "/" + alive_node_name;
unready_host_state.started = zk_nodes_set.contains(started_node_name);
/// Because we retry everywhere, we can't fully rely on ephemeral nodes anymore.
/// Though we recreate the "alive" node when reconnecting, it might not be enough, and a race condition is possible.
/// All we can do here is retry.
/// In the worst case, if we don't manage to see the alive node for a long time, we will just abort the backup.
unready_host_state.alive = zk_nodes_set.contains(alive_node_name);
if (!unready_host_state.alive)
bool started = zk_nodes_set.contains(started_node_name);
bool alive = zk_nodes_set.contains(alive_node_name);
if (!alive)
{
LOG_TRACE(log, "Seems like host ({}) is dead. Will retry the check to confirm", host);
auto holder = with_retries.createRetriesControlHolder("readCurrentState::checkAliveNode");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zookeeper);
if (zookeeper->existsNoFailureInjection(alive_node_path))
{
unready_host_state.alive = true;
return;
}
// Retry with backoff. We also check whether it is the last retry, because we don't want to rethrow an exception.
if (!holder.retries_ctl.isLastRetry())
holder.retries_ctl.setKeeperError(Coordination::Error::ZNONODE, "There is no alive node for host {}. Will retry", host);
});
/// If the "alive" node doesn't exist, then we don't have a connection to the corresponding host.
/// This node is ephemeral, so it will probably be recreated soon. We use ZooKeeper retries to wait.
/// In the worst case, if we don't manage to see the alive node for a long time, we will just abort the backup.
String message;
if (started)
message = fmt::format("Lost connection to host {}", host);
else
message = fmt::format("No connection to host {} yet", host);
if (!retries_ctl.isLastRetry())
message += ", will retry";
retries_ctl.setUserError(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, message);
state.disconnected_host = host;
return state;
}
LOG_TRACE(log, "Host ({}) appeared to be {}", host, unready_host_state.alive ? "alive" : "dead");
state.unready_hosts.emplace(host, unready_host_state);
if (!unready_host_state.alive && unready_host_state.started && !state.host_terminated)
state.host_terminated = host;
if (!unready_host)
unready_host.emplace(UnreadyHost{.host = host, .started = started});
}
}
if (state.host_terminated || !state.unready_hosts.empty())
return state;
auto holder = with_retries.createRetriesControlHolder("waitImpl::collectStagesToWait");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
if (unready_host)
{
with_retries.renewZooKeeper(zookeeper);
Strings results;
state.unready_host = std::move(unready_host);
return state;
}
for (const auto & host : all_hosts)
results.emplace_back(zookeeper->get(zookeeper_path + "/current|" + host + "|" + stage_to_wait));
state.results = std::move(results);
});
Strings results;
for (const auto & host : all_hosts)
results.emplace_back(zookeeper->get(zookeeper_path + "/current|" + host + "|" + stage_to_wait));
state.results = std::move(results);
return state;
}
@ -229,7 +206,7 @@ Strings BackupCoordinationStageSync::waitImpl(
auto watch = std::make_shared<Poco::Event>();
Strings zk_nodes;
{
auto holder = with_retries.createRetriesControlHolder("waitImpl::getChildren");
auto holder = with_retries.createRetriesControlHolder("waitImpl");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
@ -237,17 +214,23 @@ Strings BackupCoordinationStageSync::waitImpl(
watch->reset();
/// Get zk nodes and subscribe on their changes.
zk_nodes = zookeeper->getChildren(zookeeper_path, nullptr, watch);
/// Read the current state of zk nodes.
state = readCurrentState(holder, zk_nodes, all_hosts, stage_to_wait);
});
}
/// Read and analyze the current state of zk nodes.
state = readCurrentState(zk_nodes, all_hosts, stage_to_wait);
if (state.error || state.host_terminated || state.unready_hosts.empty())
break; /// Error happened or everything is ready.
/// Analyze the current state of zk nodes.
chassert(state.results || state.error || state.disconnected_host || state.unready_host);
/// Log that we will wait
const auto & unready_host = state.unready_hosts.begin()->first;
LOG_INFO(log, "Waiting on ZooKeeper watch for any node to be changed (currently waiting for host {})", unready_host);
if (state.results || state.error || state.disconnected_host)
break; /// Everything is ready or error happened.
/// Log what we are waiting for.
const auto & unready_host = *state.unready_host;
LOG_INFO(log, "Waiting on ZooKeeper watch for any node to be changed (currently waiting for host {}{})",
unready_host.host,
(!unready_host.started ? " which didn't start the operation yet" : ""));
/// Wait until `watch_callback` is called by ZooKeeper meaning that zk nodes have changed.
{
@ -270,23 +253,23 @@ Strings BackupCoordinationStageSync::waitImpl(
state.error->second.rethrow();
/// Another host terminated without errors.
if (state.host_terminated)
throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Host {} suddenly stopped working", *state.host_terminated);
if (state.disconnected_host)
throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "No connection to host {}", *state.disconnected_host);
/// Something's unready, timeout is probably not enough.
if (!state.unready_hosts.empty())
if (state.unready_host)
{
const auto & [unready_host, unready_host_state] = *state.unready_hosts.begin();
const auto & unready_host = *state.unready_host;
throw Exception(
ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
"Waited for host {} too long (> {}){}",
unready_host,
unready_host.host,
to_string(*timeout),
unready_host_state.started ? "" : ": Operation didn't start");
unready_host.started ? "" : ": Operation didn't start");
}
LOG_TRACE(log, "Everything is Ok. All hosts achieved stage {}", stage_to_wait);
return state.results;
return std::move(*state.results);
}
}

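The refactor above turns State into a set of mutually exclusive outcomes: readCurrentState fills exactly one of results, error, disconnected_host or unready_host, which is what the chassert in waitImpl verifies. A standalone sketch of the same shape using plain standard types (an illustrative analogue, not the actual declaration):

#include <optional>
#include <string>
#include <vector>

struct StageSyncState
{
    std::optional<std::vector<std::string>> results;  /// set when every host reached the stage
    std::optional<std::string> error;                 /// some host reported an error
    std::optional<std::string> disconnected_host;     /// no "alive" node for this host
    std::optional<std::string> unready_host;          /// alive, but not at the stage yet

    /// The polling loop can stop as soon as one of the first three holds.
    bool finished() const { return results || error || disconnected_host; }
};
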
View File

@ -29,7 +29,7 @@ private:
void createRootNodes();
struct State;
State readCurrentState(const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
State readCurrentState(WithRetries::RetriesControlHolder & retries_control_holder, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
Strings waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const;

View File

@ -43,12 +43,12 @@ RestoreCoordinationRemote::RestoreCoordinationRemote(
if (my_is_internal)
{
String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host;
auto code = zk->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
if (code == Coordination::Error::ZNODEEXISTS)
zk->handleEphemeralNodeExistenceNoFailureInjection(alive_node_path, "");
else if (code != Coordination::Error::ZOK)
throw zkutil::KeeperException::fromPath(code, alive_node_path);
/// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically.
zk->tryRemove(alive_node_path);
zk->createAncestors(alive_node_path);
zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral);
}
})
{

View File

@ -519,8 +519,9 @@ void ConfigProcessor::doIncludesRecursive(
if (attr_nodes["from_zk"]) /// we have zookeeper subst
{
if (node->hasChildNodes()) /// only allow substitution for nodes with no value
throw Poco::Exception("Element <" + node->nodeName() + "> has value, can't process from_zk substitution");
/// only allow substitution for nodes with no value and without "replace"
if (node->hasChildNodes() && !replace)
throw Poco::Exception("Element <" + node->nodeName() + "> has value and does not have 'replace' attribute, can't process from_zk substitution");
contributing_zk_paths.insert(attr_nodes["from_zk"]->getNodeValue());
@ -544,8 +545,9 @@ void ConfigProcessor::doIncludesRecursive(
if (attr_nodes["from_env"]) /// we have env subst
{
if (node->hasChildNodes()) /// only allow substitution for nodes with no value
throw Poco::Exception("Element <" + node->nodeName() + "> has value, can't process from_env substitution");
/// only allow substitution for nodes with no value and without "replace"
if (node->hasChildNodes() && !replace)
throw Poco::Exception("Element <" + node->nodeName() + "> has value and does not have 'replace' attribute, can't process from_env substitution");
XMLDocumentPtr env_document;
auto get_env_node = [&](const std::string & name) -> const Node *

View File

@ -212,6 +212,8 @@
M(PartsCommitted, "Deprecated. See PartsActive.") \
M(PartsPreActive, "The part is in data_parts, but not used for SELECTs.") \
M(PartsActive, "Active data part, used by current and upcoming SELECTs.") \
M(AttachedDatabase, "Active database, used by current and upcoming SELECTs.") \
M(AttachedTable, "Active table, used by current and upcoming SELECTs.") \
M(PartsOutdated, "Not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes.") \
M(PartsDeleting, "Not active data part with identity refcounter, it is deleting right now by a cleaner.") \
M(PartsDeleteOnDestroy, "Part was moved to another disk and should be deleted in own destructor.") \
@ -258,6 +260,7 @@
#define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M)
#endif
namespace CurrentMetrics
{
#define M(NAME, DOCUMENTATION) extern const Metric NAME = Metric(__COUNTER__);

View File

@ -462,7 +462,8 @@ The server successfully detected this situation and will download merged part fr
M(ReadBufferSeekCancelConnection, "Number of seeks which lead to new connection (s3, http)") \
\
M(SleepFunctionCalls, "Number of times a sleep function (sleep, sleepEachRow) has been called.") \
M(SleepFunctionMicroseconds, "Time spent sleeping due to a sleep function call.") \
M(SleepFunctionMicroseconds, "Time set to sleep in a sleep function (sleep, sleepEachRow).") \
M(SleepFunctionElapsedMicroseconds, "Time spent sleeping in a sleep function (sleep, sleepEachRow).") \
\
M(ThreadPoolReaderPageCacheHit, "Number of times the read inside ThreadPoolReader was done from page cache.") \
M(ThreadPoolReaderPageCacheHitBytes, "Number of bytes read inside ThreadPoolReader when it was done from page cache.") \

View File

@ -99,6 +99,7 @@ struct TestKeeperExistsRequest final : ExistsRequest, TestKeeperRequest
struct TestKeeperGetRequest final : GetRequest, TestKeeperRequest
{
TestKeeperGetRequest() = default;
explicit TestKeeperGetRequest(const GetRequest & base) : GetRequest(base) {}
ResponsePtr createResponse() const override;
std::pair<ResponsePtr, Undo> process(TestKeeper::Container & container, int64_t zxid) const override;
};
@ -118,6 +119,8 @@ struct TestKeeperSetRequest final : SetRequest, TestKeeperRequest
struct TestKeeperListRequest : ListRequest, TestKeeperRequest
{
TestKeeperListRequest() = default;
explicit TestKeeperListRequest(const ListRequest & base) : ListRequest(base) {}
ResponsePtr createResponse() const override;
std::pair<ResponsePtr, Undo> process(TestKeeper::Container & container, int64_t zxid) const override;
};
@ -176,6 +179,14 @@ struct TestKeeperMultiRequest final : MultiRequest, TestKeeperRequest
{
requests.push_back(std::make_shared<TestKeeperCheckRequest>(*concrete_request_check));
}
else if (const auto * concrete_request_get = dynamic_cast<const GetRequest *>(generic_request.get()))
{
requests.push_back(std::make_shared<TestKeeperGetRequest>(*concrete_request_get));
}
else if (const auto * concrete_request_list = dynamic_cast<const ListRequest *>(generic_request.get()))
{
requests.push_back(std::make_shared<TestKeeperListRequest>(*concrete_request_list));
}
else
throw Exception::fromMessage(Error::ZBADARGUMENTS, "Illegal command as part of multi ZooKeeper request");
}

View File

@ -497,6 +497,17 @@ bool ZooKeeper::exists(const std::string & path, Coordination::Stat * stat, cons
return existsWatch(path, stat, callbackForEvent(watch));
}
bool ZooKeeper::anyExists(const std::vector<std::string> & paths)
{
auto exists_multi_response = exists(paths);
for (size_t i = 0; i < exists_multi_response.size(); ++i)
{
if (exists_multi_response[i].error == Coordination::Error::ZOK)
return true;
}
return false;
}
bool ZooKeeper::existsWatch(const std::string & path, Coordination::Stat * stat, Coordination::WatchCallback watch_callback)
{
Coordination::Error code = existsImpl(path, stat, watch_callback);

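A short usage sketch for the new anyExists helper (the paths are hypothetical; batching through the multi-exists call above is what makes this cheaper than probing each path separately):

std::vector<std::string> paths = {"/backup/stage/alive|host1", "/backup/stage/alive|host2"};
if (zookeeper->anyExists(paths))
    LOG_INFO(log, "At least one host is still alive");
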
View File

@ -286,6 +286,8 @@ public:
return exists(paths.begin(), paths.end());
}
bool anyExists(const std::vector<std::string> & paths);
std::string get(const std::string & path, Coordination::Stat * stat = nullptr, const EventPtr & watch = nullptr);
std::string getWatch(const std::string & path, Coordination::Stat * stat, Coordination::WatchCallback watch_callback);
@ -422,8 +424,9 @@ public:
/// Performs several operations in a transaction.
/// Throws on every error.
Coordination::Responses multi(const Coordination::Requests & requests);
/// Throws only if some operation has returned an "unexpected" error
/// - an error that would cause the corresponding try- method to throw.
/// Throws only if some operation has returned an "unexpected" error - an error that would cause
/// the corresponding try- method to throw.
/// On exception, `responses` may or may not be populated.
Coordination::Error tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses);
/// Throws nothing (even session expired errors)
Coordination::Error tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses);
@ -567,8 +570,11 @@ public:
void setZooKeeperLog(std::shared_ptr<DB::ZooKeeperLog> zk_log_);
UInt32 getSessionUptime() const { return static_cast<UInt32>(session_uptime.elapsedSeconds()); }
bool hasReachedDeadline() const { return impl->hasReachedDeadline(); }
uint64_t getSessionTimeoutMS() const { return args.session_timeout_ms; }
void setServerCompletelyStarted();
Int8 getConnectedHostIdx() const;

View File

@ -184,7 +184,7 @@ std::vector<std::pair<String, uint16_t>> parseRemoteDescriptionForExternalDataba
}
else
{
result.emplace_back(std::make_pair(address.substr(0, colon), DB::parseFromString<UInt16>(address.substr(colon + 1))));
result.emplace_back(std::make_pair(address.substr(0, colon), parseFromString<UInt16>(address.substr(colon + 1))));
}
}

View File

@ -1,8 +1,12 @@
#pragma once
#include <base/types.h>
#include <vector>
namespace DB
{
/* Parse a string that generates shards and replicas. Separator - one of two characters '|' or ','
* depending on whether shards or replicas are generated.
* For example:

View File

@ -43,7 +43,6 @@ struct Settings;
M(UInt64, max_requests_batch_bytes_size, 100*1024, "Max size in bytes of batch of requests that can be sent to RAFT", 0) \
M(UInt64, max_flush_batch_size, 1000, "Max size of batch of requests that can be flushed together", 0) \
M(UInt64, max_requests_quick_batch_size, 100, "Max size of batch of requests to try to get before proceeding with RAFT. Keeper will not wait for requests but take only requests that are already in queue" , 0) \
M(UInt64, max_memory_usage_soft_limit, 0, "Soft limit in bytes of keeper memory usage", 0) \
    M(Bool, quorum_reads, false, "Execute read requests as writes through the whole RAFT consensus with similar speed", 0) \
M(Bool, force_sync, true, "Call fsync on each change in RAFT changelog", 0) \
M(Bool, compress_logs, false, "Write compressed coordination logs in ZSTD format", 0) \

View File

@ -59,6 +59,8 @@ void KeeperContext::initialize(const Poco::Util::AbstractConfiguration & config,
}
}
updateKeeperMemorySoftLimit(config);
digest_enabled = config.getBool("keeper_server.digest_enabled", false);
ignore_system_path_on_startup = config.getBool("keeper_server.ignore_system_path_on_startup", false);
@ -375,4 +377,10 @@ void KeeperContext::initializeFeatureFlags(const Poco::Util::AbstractConfigurati
feature_flags.logFlags(&Poco::Logger::get("KeeperContext"));
}
void KeeperContext::updateKeeperMemorySoftLimit(const Poco::Util::AbstractConfiguration & config)
{
if (config.hasProperty("keeper_server.max_memory_usage_soft_limit"))
memory_soft_limit = config.getUInt64("keeper_server.max_memory_usage_soft_limit");
}
}

View File

@ -53,6 +53,9 @@ public:
constexpr KeeperDispatcher * getDispatcher() const { return dispatcher; }
UInt64 getKeeperMemorySoftLimit() const { return memory_soft_limit; }
void updateKeeperMemorySoftLimit(const Poco::Util::AbstractConfiguration & config);
/// set to true when we have preprocessed or committed all the logs
/// that were already present locally during startup
std::atomic<bool> local_logs_preprocessed = false;
@ -92,6 +95,8 @@ private:
KeeperFeatureFlags feature_flags;
KeeperDispatcher * dispatcher{nullptr};
std::atomic<UInt64> memory_soft_limit = 0;
};
using KeeperContextPtr = std::shared_ptr<KeeperContext>;

View File

@ -143,7 +143,7 @@ void KeeperDispatcher::requestThread()
if (shutdown_called)
break;
Int64 mem_soft_limit = configuration_and_settings->coordination_settings->max_memory_usage_soft_limit;
Int64 mem_soft_limit = keeper_context->getKeeperMemorySoftLimit();
if (configuration_and_settings->standalone_keeper && mem_soft_limit > 0 && total_memory_tracker.get() >= mem_soft_limit && checkIfRequestIncreaseMem(request.request))
{
LOG_TRACE(log, "Processing requests refused because of max_memory_usage_soft_limit {}, the total used memory is {}, request type is {}", mem_soft_limit, total_memory_tracker.get(), request.request->getOpNum());
@ -930,6 +930,8 @@ void KeeperDispatcher::updateConfiguration(const Poco::Util::AbstractConfigurati
throw Exception(ErrorCodes::SYSTEM_ERROR, "Cannot push configuration update to queue");
snapshot_s3.updateS3Configuration(config, macros);
keeper_context->updateKeeperMemorySoftLimit(config);
}
void KeeperDispatcher::updateKeeperStatLatency(uint64_t process_time_ms)

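Condensed sketch of the gating condition the dispatcher now applies, restating the `if` in the hunk above with the limit taken from KeeperContext instead of the coordination settings:

/// A request is refused only when a positive soft limit is configured,
/// memory usage has already reached it, and the request would grow memory further.
bool shouldRefuseRequest(Int64 mem_soft_limit, Int64 used_memory, bool request_increases_memory)
{
    return mem_soft_limit > 0 && used_memory >= mem_soft_limit && request_increases_memory;
}
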
View File

@ -208,6 +208,9 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapsh
return;
}
/// To avoid reference to binding
const auto & snapshot_path_ref = snapshot_path;
SCOPE_EXIT(
{
LOG_INFO(log, "Removing lock file");
@ -223,7 +226,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapsh
}
catch (...)
{
LOG_INFO(log, "Failed to delete lock file for {} from S3", snapshot_file_info.path);
LOG_INFO(log, "Failed to delete lock file for {} from S3", snapshot_path_ref);
tryLogCurrentException(__PRETTY_FUNCTION__);
}
});

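The snapshot_path_ref alias above most likely works around the rule that lambdas (such as the one SCOPE_EXIT creates) could not capture structured bindings before C++20; that is an assumption, since the binding itself is outside the hunk. A minimal standalone illustration:

#include <cstdio>
#include <utility>

int main()
{
    auto [path, size] = std::make_pair("snapshot.bin", 42);
    (void)size;
    const auto & path_ref = path;  /// a plain reference, which any lambda may capture
    auto log_on_exit = [&path_ref] { std::printf("cleaning up %s\n", path_ref); };
    log_on_exit();
}
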
View File

@ -35,6 +35,7 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int UNSUPPORTED_METHOD;
}
struct ContextSharedPart : boost::noncopyable
@ -376,4 +377,9 @@ void Context::updateKeeperConfiguration([[maybe_unused]] const Poco::Util::Abstr
shared->keeper_dispatcher->updateConfiguration(getConfigRef(), getMacros());
}
std::shared_ptr<zkutil::ZooKeeper> Context::getZooKeeper() const
{
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Cannot connect to ZooKeeper from Keeper");
}
}

View File

@ -21,6 +21,11 @@
#include <memory>
#include "config.h"
namespace zkutil
{
class ZooKeeper;
using ZooKeeperPtr = std::shared_ptr<ZooKeeper>;
}
namespace DB
{
@ -153,6 +158,8 @@ public:
void initializeKeeperDispatcher(bool start_async) const;
void shutdownKeeperDispatcher() const;
void updateKeeperConfiguration(const Poco::Util::AbstractConfiguration & config);
zkutil::ZooKeeperPtr getZooKeeper() const;
};
}

View File

@ -20,6 +20,11 @@ namespace ErrorCodes
extern const int UNKNOWN_TYPE;
}
ExternalResultDescription::ExternalResultDescription(const Block & sample_block_)
{
init(sample_block_);
}
void ExternalResultDescription::init(const Block & sample_block_)
{
sample_block = sample_block_;

View File

@ -41,6 +41,9 @@ struct ExternalResultDescription
Block sample_block;
std::vector<std::pair<ValueType, bool /* is_nullable */>> types;
ExternalResultDescription() = default;
explicit ExternalResultDescription(const Block & sample_block_);
void init(const Block & sample_block_);
};

View File

@ -36,7 +36,7 @@ void insertDefaultPostgreSQLValue(IColumn & column, const IColumn & sample_colum
void insertPostgreSQLValue(
IColumn & column, std::string_view value,
const ExternalResultDescription::ValueType type, const DataTypePtr data_type,
std::unordered_map<size_t, PostgreSQLArrayInfo> & array_info, size_t idx)
const std::unordered_map<size_t, PostgreSQLArrayInfo> & array_info, size_t idx)
{
switch (type)
{
@ -125,8 +125,8 @@ void insertPostgreSQLValue(
pqxx::array_parser parser{value};
std::pair<pqxx::array_parser::juncture, std::string> parsed = parser.get_next();
size_t dimension = 0, max_dimension = 0, expected_dimensions = array_info[idx].num_dimensions;
const auto parse_value = array_info[idx].pqxx_parser;
size_t dimension = 0, max_dimension = 0, expected_dimensions = array_info.at(idx).num_dimensions;
const auto parse_value = array_info.at(idx).pqxx_parser;
std::vector<Row> dimensions(expected_dimensions + 1);
while (parsed.first != pqxx::array_parser::juncture::done)
@ -138,7 +138,7 @@ void insertPostgreSQLValue(
dimensions[dimension].emplace_back(parse_value(parsed.second));
else if (parsed.first == pqxx::array_parser::juncture::null_value)
dimensions[dimension].emplace_back(array_info[idx].default_value);
dimensions[dimension].emplace_back(array_info.at(idx).default_value);
else if (parsed.first == pqxx::array_parser::juncture::row_end)
{

View File

@ -23,7 +23,7 @@ struct PostgreSQLArrayInfo
void insertPostgreSQLValue(
IColumn & column, std::string_view value,
const ExternalResultDescription::ValueType type, const DataTypePtr data_type,
std::unordered_map<size_t, PostgreSQLArrayInfo> & array_info, size_t idx);
const std::unordered_map<size_t, PostgreSQLArrayInfo> & array_info, size_t idx);
void preparePostgreSQLArrayInfo(
std::unordered_map<size_t, PostgreSQLArrayInfo> & array_info, size_t column_idx, const DataTypePtr data_type);

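The signature change above (array_info becomes a const reference) is what forces the switch from operator[] to at(): operator[] may insert a missing key and therefore has no const overload. A minimal sketch:

#include <cstddef>
#include <unordered_map>

int lookupDimensions(const std::unordered_map<std::size_t, int> & array_info, std::size_t idx)
{
    /// array_info[idx] would not compile on a const map;
    /// at() throws std::out_of_range if the key is absent.
    return array_info.at(idx);
}
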
View File

@ -79,6 +79,9 @@ namespace DB
\
    M(UInt64, max_table_size_to_drop, 50000000000lu, "If the size of a table is greater than this value (in bytes), then the table cannot be dropped with any DROP query.", 0) \
    M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \
    M(UInt64, max_table_num_to_warn, 5000lu, "If the number of tables is greater than this value, the server will create a warning that will be displayed to the user.", 0) \
    M(UInt64, max_database_num_to_warn, 1000lu, "If the number of databases is greater than this value, the server will create a warning that will be displayed to the user.", 0) \
    M(UInt64, max_part_num_to_warn, 100000lu, "If the number of parts is greater than this value, the server will create a warning that will be displayed to the user.", 0) \
M(UInt64, concurrent_threads_soft_limit_num, 0, "Sets how many concurrent thread can be allocated before applying CPU pressure. Zero means unlimited.", 0) \
M(UInt64, concurrent_threads_soft_limit_ratio_to_cores, 0, "Same as concurrent_threads_soft_limit_num, but with ratio to cores.", 0) \
\

View File

@ -928,12 +928,12 @@ class IColumn;
M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \
M(Bool, input_format_parquet_filter_push_down, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata.", 0) \
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_orc_allow_missing_columns, true, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
M(Bool, input_format_orc_filter_push_down, true, "When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata.", 0) \
M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \
M(Bool, input_format_parquet_allow_missing_columns, true, "Allow missing columns while reading Parquet input formats", 0) \
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
M(Bool, input_format_arrow_allow_missing_columns, false, "Allow missing columns while reading Arrow input formats", 0) \
M(Bool, input_format_arrow_allow_missing_columns, true, "Allow missing columns while reading Arrow input formats", 0) \
M(Char, input_format_hive_text_fields_delimiter, '\x01', "Delimiter between fields in Hive Text File", 0) \
M(Char, input_format_hive_text_collection_items_delimiter, '\x02', "Delimiter between collection(array or map) items in Hive Text File", 0) \
M(Char, input_format_hive_text_map_keys_delimiter, '\x03', "Delimiter between a pair of map key/values in Hive Text File", 0) \
@ -942,6 +942,7 @@ class IColumn;
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \
M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
M(Bool, input_format_csv_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference in CSV format", 0) \
M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
@ -961,6 +962,7 @@ class IColumn;
M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \
M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
M(String, schema_inference_hints, "", "The list of column names and types to use in schema inference for formats without column names. The format: 'column_name1 column_type1, column_name2 column_type2, ...'", 0) \
    M(SchemaInferenceMode, schema_inference_mode, "default", "Mode of schema inference. 'default' - assume that all files have the same schema and the schema can be inferred from any file, 'union' - files can have different schemas and the resulting schema should be a union of the schemas of all files", 0) \
M(Bool, schema_inference_make_columns_nullable, true, "If set to true, all inferred types will be Nullable in schema inference for formats without information about nullability.", 0) \
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
M(Bool, input_format_json_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference", 0) \

View File

@ -81,7 +81,10 @@ namespace SettingsChangesHistory
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
{
{"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}}},
{"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."},
{"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"},
{"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"},
{"input_format_arrow_allow_missing_columns", false, true, "Allow missing columns in Arrow files by default"}}},
{"23.9", {{"optimize_group_by_constant_keys", false, true, "Optimize group by constant keys by default"},
{"input_format_json_try_infer_named_tuples_from_objects", false, true, "Try to infer named Tuples from JSON objects by default"},
{"input_format_json_read_numbers_as_strings", false, true, "Allow to read numbers as strings in JSON formats by default"},

View File

@ -196,9 +196,12 @@ IMPLEMENT_SETTING_ENUM(ExternalCommandStderrReaction, ErrorCodes::BAD_ARGUMENTS,
{"log_last", ExternalCommandStderrReaction::LOG_LAST},
{"throw", ExternalCommandStderrReaction::THROW}})
IMPLEMENT_SETTING_ENUM(DateTimeOverflowBehavior, ErrorCodes::BAD_ARGUMENTS,
{{"throw", FormatSettings::DateTimeOverflowBehavior::Throw},
{"ignore", FormatSettings::DateTimeOverflowBehavior::Ignore},
{"saturate", FormatSettings::DateTimeOverflowBehavior::Saturate}})
IMPLEMENT_SETTING_ENUM(SchemaInferenceMode, ErrorCodes::BAD_ARGUMENTS,
{{"default", SchemaInferenceMode::DEFAULT},
{"union", SchemaInferenceMode::UNION}})
IMPLEMENT_SETTING_ENUM(DateTimeOverflowBehavior, ErrorCodes::BAD_ARGUMENTS,
{{"throw", FormatSettings::DateTimeOverflowBehavior::Throw},
{"ignore", FormatSettings::DateTimeOverflowBehavior::Ignore},
{"saturate", FormatSettings::DateTimeOverflowBehavior::Saturate}})
}

View File

@ -133,6 +133,8 @@ enum class DefaultTableEngine
ReplacingMergeTree,
ReplicatedMergeTree,
ReplicatedReplacingMergeTree,
SharedMergeTree,
SharedReplacingMergeTree,
Memory,
};
@ -252,6 +254,14 @@ DECLARE_SETTING_ENUM(S3QueueAction)
DECLARE_SETTING_ENUM(ExternalCommandStderrReaction)
enum class SchemaInferenceMode
{
DEFAULT,
UNION,
};
DECLARE_SETTING_ENUM(SchemaInferenceMode)
DECLARE_SETTING_ENUM_WITH_RENAME(DateTimeOverflowBehavior, FormatSettings::DateTimeOverflowBehavior)
}

View File

@ -440,6 +440,8 @@ template <typename T> inline bool isFloat(const T & data_type) { return WhichDat
template <typename T> inline bool isNativeNumber(const T & data_type) { return WhichDataType(data_type).isNativeNumber(); }
template <typename T> inline bool isNumber(const T & data_type) { return WhichDataType(data_type).isNumber(); }
template <typename T> inline bool isEnum8(const T & data_type) { return WhichDataType(data_type).isEnum8(); }
template <typename T> inline bool isEnum16(const T & data_type) { return WhichDataType(data_type).isEnum16(); }
template <typename T> inline bool isEnum(const T & data_type) { return WhichDataType(data_type).isEnum(); }
template <typename T> inline bool isDate(const T & data_type) { return WhichDataType(data_type).isDate(); }

View File

@ -152,6 +152,9 @@ template <int UNROLL_TIMES>
static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit)
{
size_t offset = data.size();
/// Avoiding a resize call on every iteration improves performance.
data.resize(std::max(data.capacity(), static_cast<size_t>(4096)));
for (size_t i = 0; i < limit; ++i)
{
if (istr.eof())
@ -171,7 +174,8 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnSt
offset += size + 1;
offsets.push_back(offset);
data.resize(offset);
if (unlikely(offset > data.size()))
data.resize_exact(roundUpToPowerOfTwoOrZero(std::max(offset, data.size() * 2)));
if (size)
{
@ -203,6 +207,8 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnSt
data[offset - 1] = 0;
}
data.resize(offset);
}

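The hunk above replaces a resize call per row with an amortized scheme: pre-size to at least 4096 bytes, grow only when the data actually overflows, and trim once at the end. The growth rule, condensed (roundUpToPowerOfTwoOrZero is the same project helper the hunk uses):

#include <algorithm>

/// Next capacity after an overflow: at least `needed`, at least double the
/// current size, rounded up to a power of two so reallocations stay rare.
size_t nextCapacity(size_t needed, size_t current)
{
    return roundUpToPowerOfTwoOrZero(std::max(needed, current * 2));
}
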
View File

@ -1,6 +1,10 @@
#include <Databases/DDLLoadingDependencyVisitor.h>
#include <Databases/DDLDependencyVisitor.h>
#include <Dictionaries/getDictionaryConfigurationFromAST.h>
#include "config.h"
#if USE_LIBPQXX
#include <Storages/PostgreSQL/StorageMaterializedPostgreSQL.h>
#endif
#include <Interpreters/Context.h>
#include <Interpreters/misc.h>
#include <Parsers/ASTCreateQuery.h>
@ -131,6 +135,14 @@ void DDLLoadingDependencyVisitor::visit(const ASTStorage & storage, Data & data)
extractTableNameFromArgument(*storage.engine, data, 3);
else if (storage.engine->name == "Dictionary")
extractTableNameFromArgument(*storage.engine, data, 0);
#if USE_LIBPQXX
else if (storage.engine->name == "MaterializedPostgreSQL")
{
const auto * create_query = data.create_query->as<ASTCreateQuery>();
auto nested_table = toString(create_query->uuid) + StorageMaterializedPostgreSQL::NESTED_TABLE_SUFFIX;
data.dependencies.emplace(QualifiedTableName{ .database = create_query->getDatabase(), .table = nested_table });
}
#endif
}

View File

@ -18,6 +18,13 @@
namespace fs = std::filesystem;
namespace CurrentMetrics
{
extern const Metric AttachedTable;
}
namespace DB
{
@ -173,6 +180,7 @@ void DatabaseLazy::attachTableUnlocked(ContextPtr /* context_ */, const String &
throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Table {}.{} already exists.", backQuote(database_name), backQuote(table_name));
it->second.expiration_iterator = cache_expiration_queue.emplace(cache_expiration_queue.end(), current_time, table_name);
CurrentMetrics::add(CurrentMetrics::AttachedTable, 1);
}
StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & table_name)
@ -188,6 +196,7 @@ StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & ta
if (it->second.expiration_iterator != cache_expiration_queue.end())
cache_expiration_queue.erase(it->second.expiration_iterator);
tables_cache.erase(it);
CurrentMetrics::sub(CurrentMetrics::AttachedTable, 1);
}
return res;
}

View File

@ -139,6 +139,8 @@ void DatabaseOrdinary::loadTableFromMetadata(
assert(name.database == TSA_SUPPRESS_WARNING_FOR_READ(database_name));
const auto & query = ast->as<const ASTCreateQuery &>();
LOG_TRACE(log, "Loading table {}", name.getFullName());
try
{
auto [table_name, table] = createTableFromAST(

View File

@ -9,11 +9,17 @@
#include <Storages/StorageFactory.h>
#include <Common/quoteString.h>
#include <Common/typeid_cast.h>
#include <Common/CurrentMetrics.h>
#include <Common/escapeForFileName.h>
#include <TableFunctions/TableFunctionFactory.h>
#include <Backups/BackupEntriesCollector.h>
#include <Backups/RestorerFromBackup.h>
namespace CurrentMetrics
{
extern const Metric AttachedTable;
}
namespace DB
{
@ -243,6 +249,7 @@ StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_n
res = it->second;
tables.erase(it);
res->is_detached = true;
CurrentMetrics::sub(CurrentMetrics::AttachedTable, 1);
auto table_id = res->getStorageID();
if (table_id.hasUUID())
@ -277,6 +284,7 @@ void DatabaseWithOwnTablesBase::attachTableUnlocked(ContextPtr, const String & n
/// It is important to reset is_detached here since in case of RENAME in
/// non-Atomic database the is_detached is set to true before RENAME.
table->is_detached = false;
CurrentMetrics::add(CurrentMetrics::AttachedTable, 1);
}
void DatabaseWithOwnTablesBase::registerLazyTableUnlocked(const String & table_name, LazyTableCreator table_creator, const String & relative_table_path)

View File

@ -5,8 +5,14 @@
#include <Common/quoteString.h>
#include <Interpreters/DatabaseCatalog.h>
#include <Common/NamePrompter.h>
#include <Common/CurrentMetrics.h>
namespace CurrentMetrics
{
extern const Metric AttachedDatabase;
}
namespace DB
{
@ -29,6 +35,16 @@ StoragePtr IDatabase::getTable(const String & name, ContextPtr context) const
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist. Maybe you meant {}?", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name), backQuoteIfNeed(names[0]));
}
IDatabase::IDatabase(String database_name_) : database_name(std::move(database_name_))
{
CurrentMetrics::add(CurrentMetrics::AttachedDatabase, 1);
}
IDatabase::~IDatabase()
{
CurrentMetrics::sub(CurrentMetrics::AttachedDatabase, 1);
}
std::vector<std::pair<ASTPtr, StoragePtr>> IDatabase::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const
{
/// Cannot backup any table because IDatabase doesn't own any tables.

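With the constructor/destructor pair above, the AttachedDatabase gauge maintains itself for the lifetime of every IDatabase, and the AttachedTable counters in the sibling hunks follow the same attach/detach symmetry. The idea as a generic, dependency-free RAII sketch (illustrative only, not the project's helper):

#include <atomic>

/// Increments a gauge for the lifetime of the guard; the destructor
/// guarantees the decrement on every exit path, including exceptions.
struct MetricGuard
{
    std::atomic<long> & metric;
    explicit MetricGuard(std::atomic<long> & m) : metric(m) { metric.fetch_add(1); }
    ~MetricGuard() { metric.fetch_sub(1); }
    MetricGuard(const MetricGuard &) = delete;
    MetricGuard & operator=(const MetricGuard &) = delete;
};
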
View File

@ -142,7 +142,7 @@ public:
using LazyTables = std::map<String, std::pair<String, LazyTableCreator>>;
IDatabase() = delete;
explicit IDatabase(String database_name_) : database_name(std::move(database_name_)) {}
explicit IDatabase(String database_name_);
/// Get name of database engine.
virtual String getEngineName() const = 0;
@ -429,7 +429,7 @@ public:
/// Creates a table restored from backup.
virtual void createTableRestoredFromBackup(const ASTPtr & create_table_query, ContextMutablePtr context, std::shared_ptr<IRestoreCoordination> restore_coordination, UInt64 timeout_ms);
virtual ~IDatabase() = default;
virtual ~IDatabase();
protected:
virtual ASTPtr getCreateTableQueryImpl(const String & /*name*/, ContextPtr /*context*/, bool throw_on_error) const

View File

@ -25,6 +25,7 @@ namespace ErrorCodes
{
extern const int UNKNOWN_TABLE;
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
}
@ -158,6 +159,15 @@ static DataTypePtr convertPostgreSQLDataType(String & type, Fn<void()> auto && r
return res;
}
/// Check if PostgreSQL relation is empty.
/// postgres_table must be already quoted + schema-qualified.
template <typename T>
bool isTableEmpty(T & tx, const String & postgres_table)
{
auto query = fmt::format("SELECT NOT EXISTS (SELECT * FROM {} LIMIT 1);", postgres_table);
pqxx::result result{tx.exec(query)};
return result[0][0].as<bool>();
}
template<typename T>
PostgreSQLTableStructure::ColumnsInfoPtr readNamesAndTypesList(
@ -186,20 +196,25 @@ PostgreSQLTableStructure::ColumnsInfoPtr readNamesAndTypesList(
}
else
{
std::tuple<std::string, std::string, std::string, uint16_t, std::string, std::string> row;
std::tuple<std::string, std::string, std::string, uint16_t, std::string, std::string, std::string> row;
while (stream >> row)
{
auto data_type = convertPostgreSQLDataType(
const auto column_name = std::get<0>(row);
const auto data_type = convertPostgreSQLDataType(
std::get<1>(row), recheck_array,
use_nulls && (std::get<2>(row) == /* not nullable */"f"),
std::get<3>(row));
columns.push_back(NameAndTypePair(std::get<0>(row), data_type));
columns.push_back(NameAndTypePair(column_name, data_type));
auto attgenerated = std::get<6>(row);
attributes.emplace_back(
PostgreSQLTableStructure::PGAttribute{
.atttypid = parse<int>(std::get<4>(row)),
.atttypmod = parse<int>(std::get<5>(row)),
attributes.emplace(
column_name,
PostgreSQLTableStructure::PGAttribute{
.atttypid = parse<int>(std::get<4>(row)),
.atttypmod = parse<int>(std::get<5>(row)),
.attgenerated = attgenerated.empty() ? char{} : char(attgenerated[0])
});
++i;
@ -213,12 +228,37 @@ PostgreSQLTableStructure::ColumnsInfoPtr readNamesAndTypesList(
{
const auto & name_and_type = columns[i];
/// All rows must contain the same number of dimensions, so limit 1 is ok. If number of dimensions in all rows is not the same -
/// If the relation is empty, then array_ndims returns NULL.
/// ClickHouse cannot support this use case.
if (isTableEmpty(tx, postgres_table))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "PostgreSQL relation containing arrays cannot be empty: {}", postgres_table);
/// All rows must contain the same number of dimensions.
/// 1 is ok. If number of dimensions in all rows is not the same -
/// such arrays are not able to be used as ClickHouse Array at all.
pqxx::result result{tx.exec(fmt::format("SELECT array_ndims({}) FROM {} LIMIT 1", name_and_type.name, postgres_table))};
// array_ndims() may return null for empty array, but we expect 0:
// https://github.com/postgres/postgres/blob/d16a0c1e2e3874cd5adfa9ee968008b6c4b1ae01/src/backend/utils/adt/arrayfuncs.c#L1658
auto dimensions = result[0][0].as<std::optional<int>>().value_or(0);
///
/// For empty arrays, array_ndims([]) will return NULL.
auto postgres_column = doubleQuoteString(name_and_type.name);
pqxx::result result{tx.exec(
fmt::format("SELECT {} IS NULL, array_ndims({}) FROM {} LIMIT 1;", postgres_column, postgres_column, postgres_table))};
/// Nullable(Array) is not supported.
auto is_null_array = result[0][0].as<bool>();
if (is_null_array)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "PostgreSQL array cannot be NULL: {}.{}", postgres_table, postgres_column);
/// Cannot infer dimension of empty arrays.
auto is_empty_array = result[0][1].is_null();
if (is_empty_array)
{
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"PostgreSQL cannot infer dimensions of an empty array: {}.{}",
postgres_table,
postgres_column);
}
int dimensions = result[0][1].as<int>();
/// It is always 1d array if it is in recheck.
DataTypePtr type = assert_cast<const DataTypeArray *>(name_and_type.type.get())->getNestedType();
@ -255,14 +295,19 @@ PostgreSQLTableStructure fetchPostgreSQLTableStructure(
PostgreSQLTableStructure table;
auto where = fmt::format("relname = {}", quoteString(postgres_table));
if (postgres_schema.empty())
where += " AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'public')";
else
where += fmt::format(" AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = {})", quoteString(postgres_schema));
where += postgres_schema.empty()
? " AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'public')"
: fmt::format(" AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = {})", quoteString(postgres_schema));
std::string query = fmt::format(
"SELECT attname AS name, format_type(atttypid, atttypmod) AS type, "
"attnotnull AS not_null, attndims AS dims, atttypid as type_id, atttypmod as type_modifier "
"SELECT attname AS name, " /// column name
"format_type(atttypid, atttypmod) AS type, " /// data type
"attnotnull AS not_null, " /// is nullable
"attndims AS dims, " /// array dimensions
"atttypid as type_id, "
"atttypmod as type_modifier, "
"attgenerated as generated " /// if column has GENERATED
"FROM pg_attribute "
"WHERE attrelid = (SELECT oid FROM pg_class WHERE {}) "
"AND NOT attisdropped AND attnum > 0 "
@ -274,11 +319,44 @@ PostgreSQLTableStructure fetchPostgreSQLTableStructure(
if (!table.physical_columns)
throw Exception(ErrorCodes::UNKNOWN_TABLE, "PostgreSQL table {} does not exist", postgres_table_with_schema);
for (const auto & column : table.physical_columns->columns)
{
table.physical_columns->names.push_back(column.name);
}
bool check_generated = table.physical_columns->attributes.end() != std::find_if(
table.physical_columns->attributes.begin(),
table.physical_columns->attributes.end(),
[](const auto & attr){ return attr.second.attgenerated == 's'; });
if (check_generated)
{
std::string attrdef_query = fmt::format(
"SELECT adnum, pg_get_expr(adbin, adrelid) as generated_expression "
"FROM pg_attrdef "
"WHERE adrelid = (SELECT oid FROM pg_class WHERE {});", where);
pqxx::result result{tx.exec(attrdef_query)};
for (const auto row : result)
{
size_t adnum = row[0].as<int>();
if (!adnum || adnum > table.physical_columns->names.size())
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Received adnum {}, but currently fetched columns list has {} columns",
adnum, table.physical_columns->attributes.size());
}
const auto column_name = table.physical_columns->names[adnum - 1];
table.physical_columns->attributes.at(column_name).attr_def = row[1].as<std::string>();
}
}
if (with_primary_key)
{
/// wiki.postgresql.org/wiki/Retrieve_primary_key_columns
query = fmt::format(
"SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS data_type "
"SELECT a.attname, " /// column name
"format_type(a.atttypid, a.atttypmod) AS data_type " /// data type
"FROM pg_index i "
"JOIN pg_attribute a ON a.attrelid = i.indrelid "
"AND a.attnum = ANY(i.indkey) "

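The pg_attrdef pass above maps each generated-column expression back to a column through its 1-based attribute number (adnum). The bounds-checked mapping, condensed into a standalone sketch:

#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

/// adnum is PostgreSQL's 1-based column number; a value outside the fetched
/// column list indicates an inconsistent catalog read.
const std::string & columnForAdnum(const std::vector<std::string> & names, std::size_t adnum)
{
    if (adnum == 0 || adnum > names.size())
        throw std::out_of_range("adnum out of range of fetched columns");
    return names[adnum - 1];
}
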
View File

@ -16,13 +16,17 @@ struct PostgreSQLTableStructure
{
Int32 atttypid;
Int32 atttypmod;
bool atthasdef;
char attgenerated;
std::string attr_def;
};
using Attributes = std::vector<PGAttribute>;
using Attributes = std::unordered_map<std::string, PGAttribute>;
struct ColumnsInfo
{
NamesAndTypesList columns;
Attributes attributes;
std::vector<std::string> names;
ColumnsInfo(NamesAndTypesList && columns_, Attributes && attributes_) : columns(columns_), attributes(attributes_) {}
};
using ColumnsInfoPtr = std::shared_ptr<ColumnsInfo>;

View File

@ -69,12 +69,6 @@ DictionaryPtr DictionaryFactory::create(
layout_type);
}
DictionaryPtr DictionaryFactory::create(const std::string & name, const ASTCreateQuery & ast, ContextPtr global_context) const
{
auto configuration = getDictionaryConfigurationFromAST(ast, global_context);
return DictionaryFactory::create(name, *configuration, "dictionary", global_context, true);
}
bool DictionaryFactory::isComplex(const std::string & layout_type) const
{
auto it = registered_layouts.find(layout_type);

View File

@ -39,11 +39,6 @@ public:
ContextPtr global_context,
bool created_from_ddl) const;
/// Create dictionary from DDL-query
DictionaryPtr create(const std::string & name,
const ASTCreateQuery & ast,
ContextPtr global_context) const;
using LayoutCreateFunction = std::function<DictionaryPtr(
const std::string & name,
const DictionaryStructure & dict_struct,

View File

@ -540,7 +540,7 @@ bool CachedOnDiskReadBufferFromFile::completeFileSegmentAndGetNext()
return false;
current_file_segment = &file_segments->front();
current_file_segment->use();
current_file_segment->increasePriority();
implementation_buffer = getImplementationBuffer(*current_file_segment);
LOG_TEST(
@ -868,7 +868,7 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep()
else
{
implementation_buffer = getImplementationBuffer(file_segments->front());
file_segments->front().use();
file_segments->front().increasePriority();
}
chassert(!internal_buffer.empty());

View File

@ -6,9 +6,8 @@
#include <Common/filesystemHelpers.h>
#include <Common/NamedCollections/NamedCollections.h>
#include <Disks/DiskFactory.h>
#include <Disks/ObjectStorages/Cached/CachedObjectStorage.h>
#include <Disks/ObjectStorages/DiskObjectStorage.h>
#include <Interpreters/Context.h>
namespace DB
{

View File

@ -303,8 +303,8 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet
/// Try to determine the type of value inside quotes
auto type = tryInferDataTypeForSingleField(data, format_settings);
/// If we couldn't infer any type or it's a number or tuple in quotes, we determine it as a string.
if (!type || isNumber(removeNullable(type)) || isTuple(type))
/// If we couldn't infer any type, or it's a tuple in quotes, or it's a number and csv.try_infer_numbers_from_strings = 0, we determine it as a string.
if (!type || isTuple(type) || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings))
return std::make_shared<DataTypeString>();
return type;

View File

@ -74,6 +74,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter;
format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns;
format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values;
format_settings.csv.try_infer_numbers_from_strings = settings.input_format_csv_try_infer_numbers_from_strings;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
@ -347,7 +348,13 @@ InputFormatPtr FormatFactory::getInput(
if (owned_buf)
format->addBuffer(std::move(owned_buf));
if (!settings.input_format_record_errors_file_path.toString().empty())
format->setErrorsLogger(std::make_shared<ParallelInputFormatErrorsLogger>(context));
{
if (parallel_parsing)
format->setErrorsLogger(std::make_shared<ParallelInputFormatErrorsLogger>(context));
else
format->setErrorsLogger(std::make_shared<InputFormatErrorsLogger>(context));
}
/// It's a kludge, because the context cannot be removed from the Values format.
/// (Not needed in the parallel_parsing case above because VALUES format doesn't support it.)

View File

@ -164,6 +164,7 @@ struct FormatSettings
bool allow_whitespace_or_tab_as_delimiter = false;
bool allow_variable_number_of_columns = false;
bool use_default_on_bad_values = false;
bool try_infer_numbers_from_strings = true;
} csv;
struct HiveText

View File

@ -564,6 +564,15 @@ namespace JSONUtils
skipWhitespaceIfAny(in);
}
bool checkAndSkipColon(ReadBuffer & in)
{
skipWhitespaceIfAny(in);
if (!checkChar(':', in))
return false;
skipWhitespaceIfAny(in);
return true;
}
String readFieldName(ReadBuffer & in)
{
skipWhitespaceIfAny(in);
@ -573,6 +582,12 @@ namespace JSONUtils
return field;
}
bool tryReadFieldName(ReadBuffer & in, String & field)
{
skipWhitespaceIfAny(in);
return tryReadJSONStringInto(field, in) && checkAndSkipColon(in);
}
String readStringField(ReadBuffer & in)
{
skipWhitespaceIfAny(in);
@ -582,6 +597,15 @@ namespace JSONUtils
return value;
}
bool tryReadStringField(ReadBuffer & in, String & value)
{
skipWhitespaceIfAny(in);
if (!tryReadJSONStringInto(value, in))
return false;
skipWhitespaceIfAny(in);
return true;
}
void skipArrayStart(ReadBuffer & in)
{
skipWhitespaceIfAny(in);
@ -628,6 +652,15 @@ namespace JSONUtils
skipWhitespaceIfAny(in);
}
bool checkAndSkipObjectStart(ReadBuffer & in)
{
skipWhitespaceIfAny(in);
if (!checkChar('{', in))
return false;
skipWhitespaceIfAny(in);
return true;
}
bool checkAndSkipObjectEnd(ReadBuffer & in)
{
skipWhitespaceIfAny(in);
@ -644,6 +677,15 @@ namespace JSONUtils
skipWhitespaceIfAny(in);
}
bool checkAndSkipComma(ReadBuffer & in)
{
skipWhitespaceIfAny(in);
if (!checkChar(',', in))
return false;
skipWhitespaceIfAny(in);
return true;
}
std::pair<String, String> readStringFieldNameAndValue(ReadBuffer & in)
{
auto field_name = readFieldName(in);
@ -651,6 +693,11 @@ namespace JSONUtils
return {field_name, field_value};
}
bool tryReadStringFieldNameAndValue(ReadBuffer & in, std::pair<String, String> & field_and_value)
{
return tryReadFieldName(in, field_and_value.first) && tryReadStringField(in, field_and_value.second);
}
NameAndTypePair readObjectWithNameAndType(ReadBuffer & in)
{
skipObjectStart(in);
@ -673,6 +720,44 @@ namespace JSONUtils
return name_and_type;
}
bool tryReadObjectWithNameAndType(ReadBuffer & in, NameAndTypePair & name_and_type)
{
if (!checkAndSkipObjectStart(in))
return false;
std::pair<String, String> first_field_and_value;
if (!tryReadStringFieldNameAndValue(in, first_field_and_value))
return false;
if (!checkAndSkipComma(in))
return false;
std::pair<String, String> second_field_and_value;
if (!tryReadStringFieldNameAndValue(in, second_field_and_value))
return false;
if (first_field_and_value.first == "name" && second_field_and_value.first == "type")
{
auto type = DataTypeFactory::instance().tryGet(second_field_and_value.second);
if (!type)
return false;
name_and_type = {first_field_and_value.second, type};
}
else if (second_field_and_value.first == "name" && first_field_and_value.first == "type")
{
auto type = DataTypeFactory::instance().tryGet(first_field_and_value.second);
if (!type)
return false;
name_and_type = {second_field_and_value.second, type};
}
else
{
return false;
}
return checkAndSkipObjectEnd(in);
}
NamesAndTypesList readMetadata(ReadBuffer & in)
{
auto field_name = readFieldName(in);
@ -693,6 +778,37 @@ namespace JSONUtils
return names_and_types;
}
bool tryReadMetadata(ReadBuffer & in, NamesAndTypesList & names_and_types)
{
String field_name;
if (!tryReadFieldName(in, field_name) || field_name != "meta")
return false;
if (!checkAndSkipArrayStart(in))
return false;
bool first = true;
while (!checkAndSkipArrayEnd(in))
{
if (!first)
{
if (!checkAndSkipComma(in))
return false;
}
else
{
first = false;
}
NameAndTypePair name_and_type;
if (!tryReadObjectWithNameAndType(in, name_and_type))
return false;
names_and_types.push_back(name_and_type);
}
return !names_and_types.empty();
}
void validateMetadataByHeader(const NamesAndTypesList & names_and_types_from_metadata, const Block & header)
{
for (const auto & [name, type] : names_and_types_from_metadata)

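All the tryRead*/checkAndSkip* additions above follow one convention: skip whitespace, attempt to consume a token, and report failure through the return value instead of an exception, so metadata detection can probe input without exception-driven control flow. The same shape over a raw cursor (a hypothetical helper, not part of JSONUtils):

#include <cctype>

/// Non-throwing "expect this character" primitive, mirroring checkAndSkipColon
/// and checkAndSkipComma above: false means the caller should try another parse.
bool checkAndSkip(const char *& pos, const char * end, char expected)
{
    while (pos != end && std::isspace(static_cast<unsigned char>(*pos)))
        ++pos;
    if (pos == end || *pos != expected)
        return false;
    ++pos;
    return true;
}
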
View File

@ -112,6 +112,7 @@ namespace JSONUtils
void skipColon(ReadBuffer & in);
void skipComma(ReadBuffer & in);
bool checkAndSkipComma(ReadBuffer & in);
String readFieldName(ReadBuffer & in);
@ -122,9 +123,11 @@ namespace JSONUtils
void skipObjectStart(ReadBuffer & in);
void skipObjectEnd(ReadBuffer & in);
bool checkAndSkipObjectStart(ReadBuffer & in);
bool checkAndSkipObjectEnd(ReadBuffer & in);
NamesAndTypesList readMetadata(ReadBuffer & in);
bool tryReadMetadata(ReadBuffer & in, NamesAndTypesList & names_and_types);
NamesAndTypesList readMetadataAndValidateHeader(ReadBuffer & in, const Block & header);
void validateMetadataByHeader(const NamesAndTypesList & names_and_types_from_metadata, const Block & header);

View File

@ -10,6 +10,13 @@
namespace DB
{
/// Workaround for a bug in clang with the three-way comparison operator
/// https://github.com/llvm/llvm-project/issues/55919
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
#endif
/** Mark is the position in the compressed file. The compressed file consists of adjacent compressed blocks.
* Mark is a tuple - the offset in the file to the start of the compressed block, the offset in the decompressed block to the start of the data.
*/
@ -18,12 +25,7 @@ struct MarkInCompressedFile
size_t offset_in_compressed_file;
size_t offset_in_decompressed_block;
bool operator==(const MarkInCompressedFile & rhs) const
{
return std::tie(offset_in_compressed_file, offset_in_decompressed_block)
== std::tie(rhs.offset_in_compressed_file, rhs.offset_in_decompressed_block);
}
bool operator!=(const MarkInCompressedFile & rhs) const { return !(*this == rhs); }
auto operator<=>(const MarkInCompressedFile &) const = default;
auto asTuple() const { return std::make_tuple(offset_in_compressed_file, offset_in_decompressed_block); }
@ -39,6 +41,10 @@ struct MarkInCompressedFile
}
};
#ifdef __clang__
#pragma clang diagnostic pop
#endif
/**
* In-memory representation of an array of marks.
*

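The defaulted three-way comparison above synthesizes ==, !=, <, <=, > and >= member-wise, replacing the hand-written std::tie comparisons that were deleted. A self-contained illustration of what the single defaulted operator provides:

#include <compare>
#include <cstddef>

struct Mark
{
    std::size_t offset_in_compressed_file;
    std::size_t offset_in_decompressed_block;
    auto operator<=>(const Mark &) const = default;  /// a defaulted == is generated too
};

static_assert(Mark{1, 2} == Mark{1, 2});
static_assert(Mark{1, 2} < Mark{1, 3});  /// lexicographic: first member compares first
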
View File

@ -1,12 +1,9 @@
#include <DataTypes/DataTypeMap.h>
#include <Formats/ReadSchemaUtils.h>
#include <Interpreters/Context.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Storages/IStorage.h>
#include <Common/assert_cast.h>
#include <IO/WithFileName.h>
#include <IO/WithFileSize.h>
#include <IO/EmptyReadBuffer.h>
namespace DB
{
@ -17,6 +14,7 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
extern const int ONLY_NULLS_WHILE_READING_SCHEMA;
extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
extern const int TYPE_MISMATCH;
}
static std::optional<NamesAndTypesList> getOrderedColumnsList(const NamesAndTypesList & columns_list, const Names & columns_order_hint)
@ -55,6 +53,17 @@ ColumnsDescription readSchemaFromFormat(
try
{
NamesAndTypesList names_and_types;
SchemaInferenceMode mode = context->getSettingsRef().schema_inference_mode;
if (mode == SchemaInferenceMode::UNION && !FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings))
{
String additional_message;
/// Better exception message for WithNames(AndTypes) formats.
if (format_name.ends_with("WithNames") || format_name.ends_with("WithNamesAndTypes"))
additional_message = " (formats -WithNames(AndTypes) support reading a subset of columns only when setting input_format_with_names_use_header is enabled)";
throw Exception(ErrorCodes::BAD_ARGUMENTS, "UNION schema inference mode is not supported for format {}, because it doesn't support reading a subset of columns{}", format_name, additional_message);
}
if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name))
{
auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings);
@ -71,6 +80,11 @@ try
}
else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name))
{
if (mode == SchemaInferenceMode::UNION)
retry = false;
std::vector<std::pair<NamesAndTypesList, String>> schemas_for_union_mode;
std::optional<ColumnsDescription> cached_columns;
std::string exception_messages;
SchemaReaderPtr schema_reader;
size_t max_rows_to_read = format_settings ? format_settings->max_rows_to_read_for_schema_inference
@ -84,7 +98,15 @@ try
try
{
read_buffer_iterator.setPreviousReadBuffer(std::move(buf));
buf = read_buffer_iterator.next();
std::tie(buf, cached_columns) = read_buffer_iterator.next();
if (cached_columns)
{
if (mode == SchemaInferenceMode::DEFAULT)
return *cached_columns;
schemas_for_union_mode.emplace_back(cached_columns->getAll(), read_buffer_iterator.getLastFileName());
continue;
}
if (!buf)
break;
@ -136,12 +158,19 @@ try
auto num_rows = schema_reader->readNumberOrRows();
if (num_rows)
read_buffer_iterator.setNumRowsToLastFile(*num_rows);
break;
/// In DEFAULT mode, we finish as soon as the schema is inferred successfully from any file.
if (mode == SchemaInferenceMode::DEFAULT)
break;
if (!names_and_types.empty())
read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types));
schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName());
}
catch (...)
{
auto exception_message = getCurrentExceptionMessage(false);
if (schema_reader)
if (schema_reader && mode == SchemaInferenceMode::DEFAULT)
{
size_t rows_read = schema_reader->getNumRowsRead();
assert(rows_read <= max_rows_to_read);
@ -190,8 +219,58 @@ try
}
}
if (auto cached_columns = read_buffer_iterator.getCachedColumns())
return *cached_columns;
/// If we got all schemas from the cache, schema_reader can be uninitialized.
/// But we still need some stateless methods of ISchemaReader,
/// so let's initialize it with an empty buffer.
EmptyReadBuffer empty;
if (!schema_reader)
schema_reader = FormatFactory::instance().getSchemaReader(format_name, empty, context, format_settings);
if (mode == SchemaInferenceMode::UNION)
{
Names names_order; /// Try to preserve the original column order.
std::unordered_map<String, DataTypePtr> names_to_types;
for (const auto & [schema, file_name] : schemas_for_union_mode)
{
for (const auto & [name, type] : schema)
{
auto it = names_to_types.find(name);
if (it == names_to_types.end())
{
names_order.push_back(name);
names_to_types[name] = type;
}
else
{
/// We already have a column with this name.
/// Check if the types are the same.
if (!type->equals(*it->second))
{
/// If types are not the same, try to transform them according
/// to the format to find common type.
auto new_type_copy = type;
schema_reader->transformTypesFromDifferentFilesIfNeeded(it->second, new_type_copy);
/// If types are not the same after transform, we cannot do anything, throw an exception.
if (!it->second->equals(*new_type_copy))
throw Exception(
ErrorCodes::TYPE_MISMATCH,
"Automatically inferred type {} for column '{}'{} differs from type inferred from previous files: {}",
type->getName(),
name,
file_name.empty() ? "" : " in file " + file_name,
it->second->getName());
}
}
}
}
names_and_types.clear();
for (const auto & name : names_order)
names_and_types.emplace_back(name, names_to_types[name]);
}
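The union step above preserves first-seen column order and rejects columns whose types cannot be reconciled. A self-contained sketch of that bookkeeping, with plain strings standing in for data types and the format-specific type transform omitted:

#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using Schema = std::vector<std::pair<std::string, std::string>>; // (name, type name)

// Order-preserving union of per-file schemas: the first occurrence fixes the
// column position; a repeated name must carry the same type (the real code
// first tries a format-specific transform before giving up).
Schema unionSchemas(const std::vector<Schema> & schemas_per_file)
{
    std::vector<std::string> names_order;
    std::unordered_map<std::string, std::string> names_to_types;

    for (const auto & schema : schemas_per_file)
    {
        for (const auto & [name, type] : schema)
        {
            auto [it, inserted] = names_to_types.emplace(name, type);
            if (inserted)
                names_order.push_back(name);
            else if (it->second != type)
                throw std::runtime_error("Type mismatch for column '" + name + "'");
        }
    }

    Schema result;
    for (const auto & name : names_order)
        result.emplace_back(name, names_to_types[name]);
    return result;
}

int main()
{
    Schema file1 = {{"id", "Int64"}, {"name", "String"}};
    Schema file2 = {{"id", "Int64"}, {"added", "Date"}};
    for (const auto & [name, type] : unionSchemas({file1, file2}))
        std::cout << name << ' ' << type << '\n'; // id, name, added
}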
if (names_and_types.empty())
throw Exception(
@ -206,7 +285,7 @@ try
/// It will allow to execute simple data loading with query
/// "INSERT INTO table SELECT * FROM ..."
const auto & insertion_table = context->getInsertionTable();
if (!schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty())
if (schema_reader && !schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty())
{
auto storage = DatabaseCatalog::instance().getTable(insertion_table, context);
auto metadata = storage->getInMemoryMetadataPtr();
@ -226,13 +305,15 @@ try
names_and_types.erase(
std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }),
names_and_types.end());
return ColumnsDescription(names_and_types);
auto columns = ColumnsDescription(names_and_types);
if (mode == SchemaInferenceMode::DEFAULT)
read_buffer_iterator.setResultingSchema(columns);
return columns;
}
catch (Exception & e)
{
if (!buf)
throw;
auto file_name = getFileNameFromReadBuffer(*buf);
auto file_name = read_buffer_iterator.getLastFileName();
if (!file_name.empty())
e.addMessage(fmt::format("(in file/uri {})", file_name));
throw;
@ -256,9 +337,9 @@ SchemaCache::Key getKeyForSchemaCache(
return getKeysForSchemaCache({source}, format, format_settings, context).front();
}
static SchemaCache::Key makeSchemaCacheKey(const String & source, const String & format, const String & additional_format_info)
static SchemaCache::Key makeSchemaCacheKey(const String & source, const String & format, const String & additional_format_info, const String & schema_inference_mode)
{
return SchemaCache::Key{source, format, additional_format_info};
return SchemaCache::Key{source, format, additional_format_info, schema_inference_mode};
}
SchemaCache::Keys getKeysForSchemaCache(
@ -270,13 +351,14 @@ SchemaCache::Keys getKeysForSchemaCache(
/// For example, for Protobuf format additional information is the path to the schema
/// and message name.
String additional_format_info = FormatFactory::instance().getAdditionalInfoForSchemaCache(format, context, format_settings);
String schema_inference_mode(magic_enum::enum_name(context->getSettingsRef().schema_inference_mode.value));
SchemaCache::Keys cache_keys;
cache_keys.reserve(sources.size());
std::transform(
sources.begin(),
sources.end(),
std::back_inserter(cache_keys),
[&](const auto & source) { return makeSchemaCacheKey(source, format, additional_format_info); });
[&](const auto & source) { return makeSchemaCacheKey(source, format, additional_format_info, schema_inference_mode); });
return cache_keys;
}
View File
@ -13,11 +13,23 @@ struct IReadBufferIterator
virtual void setPreviousReadBuffer(std::unique_ptr<ReadBuffer> /* buffer */) {}
virtual std::unique_ptr<ReadBuffer> next() = 0;
virtual std::optional<ColumnsDescription> getCachedColumns() { return std::nullopt; }
/// Return the read buffer of the next file or a cached schema.
/// In DEFAULT schema inference mode the cached schema can be from any file.
/// In UNION mode the cached schema can only be from the current file.
/// When there are no more files to process, return the pair (nullptr, std::nullopt).
virtual std::pair<std::unique_ptr<ReadBuffer>, std::optional<ColumnsDescription>> next() = 0;
virtual void setNumRowsToLastFile(size_t /*num_rows*/) {}
/// Set the schema inferred from the last file. Used in UNION mode to cache the
/// schema per file.
virtual void setSchemaToLastFile(const ColumnsDescription & /*columns*/) {}
/// Set the resulting inferred schema. Used in DEFAULT mode to cache the schema
/// for all files.
virtual void setResultingSchema(const ColumnsDescription & /*columns*/) {}
/// Get last processed file name for better exception messages.
virtual String getLastFileName() const { return ""; }
};
struct SingleReadBufferIterator : public IReadBufferIterator
@ -27,12 +39,12 @@ public:
{
}
std::unique_ptr<ReadBuffer> next() override
std::pair<std::unique_ptr<ReadBuffer>, std::optional<ColumnsDescription>> next() override
{
if (done)
return nullptr;
return {nullptr, {}};
done = true;
return std::move(buf);
return {std::move(buf), {}};
}
private:
@ -45,11 +57,18 @@ private:
/// use it and won't create a read buffer.
/// For formats that have a schema reader from the data,
/// read buffer will be created by the provided iterator and
/// the schema will be extracted from the data. If schema reader
/// couldn't determine the schema we will try the next read buffer
/// from the provided iterator if it makes sense. If the format doesn't
/// have any schema reader or we couldn't determine the schema,
/// an exception will be thrown.
/// the schema will be extracted from the data. If the format doesn't
/// have a schema reader, an exception will be thrown.
/// Schema reading can be performed in 2 modes, depending on the setting schema_inference_mode:
/// 1) Default mode. In this mode ClickHouse assumes that all files have the same schema
/// and tries to infer the schema by reading files one by one until it succeeds.
/// If schema reader couldn't determine the schema for some file, ClickHouse will try the next
/// file (next read buffer from the provided iterator) if it makes sense. If ClickHouse couldn't determine
/// the resulting schema, an exception will be thrown.
/// 2) Union mode. In this mode ClickHouse assumes that files can have different schemas,
/// so it infers the schemas of all files and then unions them into a common schema. In this mode
/// all read buffers from the provided iterator will be used. If ClickHouse couldn't determine
/// the schema for some file, an exception will be thrown.
ColumnsDescription readSchemaFromFormat(
const String & format_name,
const std::optional<FormatSettings> & format_settings,
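A compact sketch of the two-mode control flow described above; tryInferSchema is a hypothetical stand-in for a per-file schema reader, and the union step is reduced to a placeholder (C++20 for std::string::ends_with):

#include <iostream>
#include <optional>
#include <string>
#include <vector>

using Schema = std::string; // stand-in for ColumnsDescription
enum class Mode { Default, Union };

// Hypothetical per-file inference; nullopt means "couldn't determine".
std::optional<Schema> tryInferSchema(const std::string & file)
{
    if (file.ends_with(".bad"))
        return std::nullopt;
    return "schema(" + file + ")";
}

std::optional<Schema> readSchema(const std::vector<std::string> & files, Mode mode)
{
    std::vector<Schema> per_file;
    for (const auto & file : files)
    {
        auto schema = tryInferSchema(file);
        if (!schema)
        {
            if (mode == Mode::Union)
                return std::nullopt; // UNION needs a schema from every file
            continue;                // DEFAULT: just try the next file
        }
        if (mode == Mode::Default)
            return schema;           // DEFAULT: first success wins
        per_file.push_back(*schema);
    }
    if (!per_file.empty())
        return per_file.front();     // the real code unions all per-file schemas here
    return std::nullopt;
}

int main()
{
    std::cout << readSchema({"a.bad", "b.csv"}, Mode::Default).value_or("-") << '\n'; // schema(b.csv)
    std::cout << readSchema({"a.bad", "b.csv"}, Mode::Union).value_or("-") << '\n';   // -
}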
View File
@ -547,6 +547,54 @@ namespace
}
}
void mergeNamedTuples(DataTypes & data_types, TypeIndexesSet & type_indexes, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
if (!type_indexes.contains(TypeIndex::Tuple))
return;
/// Collect all names and their types from all named tuples.
std::unordered_map<String, DataTypes> names_to_types;
/// Try to preserve the original order of element names.
Names element_names;
for (auto & type : data_types)
{
const auto * tuple_type = typeid_cast<const DataTypeTuple *>(type.get());
if (tuple_type && tuple_type->haveExplicitNames())
{
const auto & elements = tuple_type->getElements();
const auto & names = tuple_type->getElementNames();
for (size_t i = 0; i != elements.size(); ++i)
{
if (!names_to_types.contains(names[i]))
element_names.push_back(names[i]);
names_to_types[names[i]].push_back(elements[i]);
}
}
}
/// Try to find a common type for all tuple elements with the same name.
DataTypes element_types;
element_types.reserve(names_to_types.size());
for (const auto & name : element_names)
{
auto & types = names_to_types[name];
transformInferredTypesIfNeededImpl<true>(types, settings, json_info);
/// If some element has different types in different tuples, we can't do anything
if (!checkIfTypesAreEqual(types))
return;
element_types.push_back(types.front());
}
DataTypePtr result_tuple = std::make_shared<DataTypeTuple>(element_types, element_names);
for (auto & type : data_types)
{
const auto * tuple_type = typeid_cast<const DataTypeTuple *>(type.get());
if (tuple_type && tuple_type->haveExplicitNames())
type = result_tuple;
}
}
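A toy model of the merge above, assuming a two-rung common-type rule (the hypothetical commonType below, not the real format-aware transform); incompatible elements make the function bail out and leave the originals untouched, just like mergeNamedTuples:

#include <iostream>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using Tuple = std::vector<std::pair<std::string, std::string>>; // (element name, type name)

// Toy common-type rule: equal types unify; Int64 and Float64 unify to Float64.
std::optional<std::string> commonType(const std::string & a, const std::string & b)
{
    if (a == b)
        return a;
    if ((a == "Int64" && b == "Float64") || (a == "Float64" && b == "Int64"))
        return std::string("Float64");
    return std::nullopt;
}

// Merge all named tuples into one shape, or return nullopt to leave them as-is.
std::optional<Tuple> mergeNamedTuples(const std::vector<Tuple> & tuples)
{
    std::vector<std::string> names; // first-seen order
    std::unordered_map<std::string, std::string> types;
    for (const auto & tuple : tuples)
    {
        for (const auto & [name, type] : tuple)
        {
            auto [it, inserted] = types.emplace(name, type);
            if (inserted)
                names.push_back(name);
            else if (auto merged = commonType(it->second, type))
                it->second = *merged;
            else
                return std::nullopt; // incompatible: change nothing
        }
    }
    Tuple result;
    for (const auto & name : names)
        result.emplace_back(name, types[name]);
    return result;
}

int main()
{
    Tuple t1 = {{"a", "Int64"}};
    Tuple t2 = {{"a", "Float64"}, {"b", "String"}};
    if (auto merged = mergeNamedTuples({t1, t2}))
        for (const auto & [n, t] : *merged)
            std::cout << n << ' ' << t << '\n'; // a Float64, b String
}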
template <bool is_json>
void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
@ -604,6 +652,9 @@ namespace
if (settings.json.read_objects_as_strings)
transformMapsAndStringsToStrings(data_types, type_indexes);
if (json_info && json_info->allow_merging_named_tuples)
mergeNamedTuples(data_types, type_indexes, settings, json_info);
};
transformTypesRecursively(types, transform_simple_types, transform_complex_types);
@ -1180,6 +1231,13 @@ void transformInferredJSONTypesIfNeeded(
second = std::move(types[1]);
}
void transformInferredJSONTypesFromDifferentFilesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings)
{
JSONInferenceInfo json_info;
json_info.allow_merging_named_tuples = true;
transformInferredJSONTypesIfNeeded(first, second, settings, &json_info);
}
void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info, bool remain_nothing_types = false)
{
if (!data_type)
@ -1247,11 +1305,22 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F
return;
}
/// First, try to transform nested types without final transformations to see if there is a common type.
auto nested_types_copy = nested_types;
transformInferredTypesIfNeededImpl<true>(nested_types_copy, settings, json_info);
if (checkIfTypesAreEqual(nested_types_copy))
{
data_type = std::make_shared<DataTypeArray>(nested_types_copy.back());
transformFinalInferredJSONTypeIfNeededImpl(data_type, settings, json_info);
return;
}
/// Apply final transformation to nested types, and then try to find common type.
for (auto & nested_type : nested_types)
/// Don't change Nothing to String in nested types here, because we are not sure yet if it's an Array or an actual Tuple.
transformFinalInferredJSONTypeIfNeededImpl(nested_type, settings, json_info, /*remain_nothing_types=*/ true);
auto nested_types_copy = nested_types;
nested_types_copy = nested_types;
transformInferredTypesIfNeededImpl<true>(nested_types_copy, settings, json_info);
if (checkIfTypesAreEqual(nested_types_copy))
{
@ -1381,7 +1450,6 @@ DataTypePtr makeNullableRecursively(DataTypePtr type)
return std::make_shared<DataTypeTuple>(std::move(nested_types), tuple_type->getElementNames());
return std::make_shared<DataTypeTuple>(std::move(nested_types));
}
if (which.isMap())
View File
@ -14,6 +14,11 @@ struct JSONInferenceInfo
std::unordered_set<const IDataType *> numbers_parsed_from_json_strings;
/// Indicates if currently we are inferring type for Map/Object key.
bool is_object_key = false;
/// When we transform types for the same column from different files,
/// we cannot use DataTypeJSONPaths for inferring named tuples from JSON objects,
/// because DataTypeJSONPaths was already finalized to a named tuple. In this case
/// we can only merge named tuples from different files together.
bool allow_merging_named_tuples = false;
};
/// Try to determine datatype of the value in buffer/string. If the type cannot be inferred, return nullptr.
@ -64,9 +69,7 @@ void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, c
/// from strings in json_info during inference and use it here, so we will know that Array(Int64) contains
/// integer inferred from a string.
/// Example 2:
/// When we have maps with different value types, we convert all types to JSON object type.
/// For example, if we have Map(String, UInt64) (like `{"a" : 123}`) and Map(String, String) (like `{"b" : 'abc'}`)
/// we will convert both types to Object('JSON').
/// We merge DataTypeJSONPaths types into a single DataTypeJSONPaths type with the union of all JSON paths.
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Make final transform for types inferred in JSON format. It does 3 types of transformation:
@ -78,6 +81,11 @@ void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & secon
/// 3) Converts all Nothing types to String types if input_format_json_infer_incomplete_types_as_strings is enabled.
void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Transform types for the same column inferred from different files.
/// Does the same as transformInferredJSONTypesIfNeeded, but also merges named Tuples together,
/// because DataTypeJSONPaths types were finalized when we finished inference for a file.
void transformInferredJSONTypesFromDifferentFilesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings);
/// Make type Nullable recursively:
/// - Type -> Nullable(type)
/// - Array(Type) -> Array(Nullable(Type))
View File
@ -1483,6 +1483,17 @@ public:
return getReturnTypeImplStatic(new_arguments, context);
}
/// Special case - one or both arguments are IPv6
if (isIPv6(arguments[0]) || isIPv6(arguments[1]))
{
DataTypes new_arguments {
isIPv6(arguments[0]) ? std::make_shared<DataTypeUInt128>() : arguments[0],
isIPv6(arguments[1]) ? std::make_shared<DataTypeUInt128>() : arguments[1],
};
return getReturnTypeImplStatic(new_arguments, context);
}
if constexpr (is_plus || is_minus)
{
@ -2181,6 +2192,25 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A
return executeImpl2(new_arguments, result_type, input_rows_count, right_nullmap);
}
/// Special case - one or both arguments are IPv6
if (isIPv6(arguments[0].type) || isIPv6(arguments[1].type))
{
ColumnsWithTypeAndName new_arguments {
{
isIPv6(arguments[0].type) ? castColumn(arguments[0], std::make_shared<DataTypeUInt128>()) : arguments[0].column,
isIPv6(arguments[0].type) ? std::make_shared<DataTypeUInt128>() : arguments[0].type,
arguments[0].name,
},
{
isIPv6(arguments[1].type) ? castColumn(arguments[1], std::make_shared<DataTypeUInt128>()) : arguments[1].column,
isIPv6(arguments[1].type) ? std::make_shared<DataTypeUInt128>() : arguments[1].type,
arguments[1].name
}
};
return executeImpl2(new_arguments, result_type, input_rows_count, right_nullmap);
}
const auto * const left_generic = left_argument.type.get();
const auto * const right_generic = right_argument.type.get();
ColumnPtr res;
View File
@ -221,6 +221,18 @@ struct ConvertImpl
continue;
}
if constexpr (std::is_same_v<FromDataType, DataTypeIPv6> && std::is_same_v<ToDataType, DataTypeUInt128>)
{
static_assert(
std::is_same_v<DataTypeUInt128::FieldType, DataTypeUUID::FieldType::UnderlyingType>,
"UInt128 and IPv6 types must be same");
vec_to[i].items[1] = std::byteswap(vec_from[i].toUnderType().items[0]);
vec_to[i].items[0] = std::byteswap(vec_from[i].toUnderType().items[1]);
continue;
}
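The two assignments above reverse all 16 bytes of the value: the 64-bit limbs are swapped and each limb is byte-swapped, mapping a big-endian IPv6 address onto a native-endian 128-bit integer. A standalone sketch with a hypothetical two-limb UInt128 (C++23 for std::byteswap):

#include <bit>      // std::byteswap (C++23)
#include <cassert>
#include <cstdint>

// Hypothetical 128-bit value stored as two 64-bit limbs, items[0] = low half.
struct UInt128
{
    uint64_t items[2];
    bool operator==(const UInt128 &) const = default;
};

// Reverse all 16 bytes: swap the limbs and byte-swap each limb, as the
// conversion above does for IPv6 -> UInt128.
UInt128 reverseBytes(UInt128 v)
{
    return {std::byteswap(v.items[1]), std::byteswap(v.items[0])};
}

int main()
{
    UInt128 v{0x0011223344556677ULL, 0x8899aabbccddeeffULL};
    UInt128 r = reverseBytes(v);
    assert(r.items[0] == 0xffeeddccbbaa9988ULL);
    assert(r.items[1] == 0x7766554433221100ULL);
    assert(reverseBytes(reverseBytes(v)) == v); // involution
}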
if constexpr (std::is_same_v<FromDataType, DataTypeUUID> != std::is_same_v<ToDataType, DataTypeUUID>)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
View File
@ -11,6 +11,8 @@
#include <Common/typeid_cast.h>
#include <Common/UTF8Helpers.h>
#include <DataTypes/EnumValues.h>
#include "IArraySource.h"
#include "IValueSource.h"
#include "Slices.h"
@ -56,8 +58,8 @@ struct NumericArraySource : public ArraySourceImpl<NumericArraySource<T>>
}
explicit NumericArraySource(const ColumnArray & arr)
: column(typeid_cast<const ColVecType &>(arr.getData()))
, elements(typeid_cast<const ColVecType &>(arr.getData()).getData()), offsets(arr.getOffsets())
: column(typeid_cast<const ColVecType &>(arr.getData()))
, elements(typeid_cast<const ColVecType &>(arr.getData()).getData()), offsets(arr.getOffsets())
{
}
@ -154,17 +156,22 @@ struct ConstSource : public Base
size_t row_num = 0;
explicit ConstSource(const ColumnConst & col_)
: Base(static_cast<const typename Base::Column &>(col_.getDataColumn())), total_rows(col_.size())
: Base(static_cast<const typename Base::Column &>(col_.getDataColumn()))
, total_rows(col_.size())
{
}
template <typename ColumnType>
ConstSource(const ColumnType & col_, size_t total_rows_) : Base(col_), total_rows(total_rows_)
ConstSource(const ColumnType & col_, size_t total_rows_)
: Base(col_)
, total_rows(total_rows_)
{
}
template <typename ColumnType>
ConstSource(const ColumnType & col_, const NullMap & null_map_, size_t total_rows_) : Base(col_, null_map_), total_rows(total_rows_)
ConstSource(const ColumnType & col_, const NullMap & null_map_, size_t total_rows_)
: Base(col_, null_map_)
, total_rows(total_rows_)
{
}
@ -240,7 +247,8 @@ struct StringSource
ColumnString::Offset prev_offset = 0;
explicit StringSource(const ColumnString & col)
: elements(col.getChars()), offsets(col.getOffsets())
: elements(col.getChars())
, offsets(col.getOffsets())
{
}
@ -313,6 +321,96 @@ struct StringSource
}
};
/// Treats Enum values as Strings, modeled after StringSource
template <typename EnumDataType>
struct EnumSource
{
using Column = typename EnumDataType::ColumnType;
using Slice = NumericArraySlice<UInt8>;
using SinkType = StringSink;
const typename Column::Container & data;
const EnumDataType & data_type;
size_t row_num = 0;
EnumSource(const Column & col, const EnumDataType & data_type_)
: data(col.getData())
, data_type(data_type_)
{
}
void next()
{
++row_num;
}
bool isEnd() const
{
return row_num == data.size();
}
size_t rowNum() const
{
return row_num;
}
size_t getSizeForReserve() const
{
return data.size();
}
size_t getElementSize() const
{
std::string_view name = data_type.getNameForValue(data[row_num]).toView();
return name.size();
}
size_t getColumnSize() const
{
return data.size();
}
Slice getWhole() const
{
std::string_view name = data_type.getNameForValue(data[row_num]).toView();
return {reinterpret_cast<const UInt8 *>(name.data()), name.size()};
}
Slice getSliceFromLeft(size_t offset) const
{
std::string_view name = data_type.getNameForValue(data[row_num]).toView();
if (offset >= name.size())
return {reinterpret_cast<const UInt8 *>(name.data()), 0};
return {reinterpret_cast<const UInt8 *>(name.data()) + offset, name.size() - offset};
}
Slice getSliceFromLeft(size_t offset, size_t length) const
{
std::string_view name = data_type.getNameForValue(data[row_num]).toView();
if (offset >= name.size())
return {reinterpret_cast<const UInt8 *>(name.data()), 0};
return {reinterpret_cast<const UInt8 *>(name.data()) + offset, std::min(length, name.size() - offset)};
}
Slice getSliceFromRight(size_t offset) const
{
std::string_view name = data_type.getNameForValue(data[row_num]).toView();
if (offset > name.size())
return {reinterpret_cast<const UInt8 *>(name.data()), name.size()};
return {reinterpret_cast<const UInt8 *>(name.data()) + name.size() - offset, offset};
}
Slice getSliceFromRight(size_t offset, size_t length) const
{
std::string_view name = data_type.getNameForValue(data[row_num]).toView();
if (offset > name.size())
return {reinterpret_cast<const UInt8 *>(name.data()), length + name.size() > offset ? std::min(name.size(), length + name.size() - offset) : 0};
return {reinterpret_cast<const UInt8 *>(name.data()) + name.size() - offset, std::min(length, offset)};
}
};
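Since the getters above all clamp out-of-range offsets, a plain string_view sketch captures their slice arithmetic; sliceFromLeft/sliceFromRight below are simplified stand-ins, not the real methods:

#include <algorithm>
#include <cassert>
#include <string_view>

// Same clamping rules as the slice getters above, on a plain string_view.
std::string_view sliceFromLeft(std::string_view s, size_t offset, size_t length)
{
    if (offset >= s.size())
        return {s.data(), 0};
    return {s.data() + offset, std::min(length, s.size() - offset)};
}

std::string_view sliceFromRight(std::string_view s, size_t offset)
{
    if (offset > s.size())
        return s; // clamp: the whole value
    return {s.data() + s.size() - offset, offset};
}

int main()
{
    std::string_view name = "green"; // e.g. the name of an Enum8('green' = 1) value
    assert(sliceFromLeft(name, 1, 3) == "ree");
    assert(sliceFromLeft(name, 10, 3).empty());
    assert(sliceFromRight(name, 2) == "en");
    assert(sliceFromRight(name, 99) == "green");
}

With this source wired into the substring dispatch further down, substring on Enum8/Enum16 columns operates on the value names directly, without a cast to String.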
/// Differs from StringSource by having 'offset' and 'length' in code points instead of bytes in getSlice* methods.
/** NOTE: The behaviour of substring and substringUTF8 is inconsistent when negative offset is greater than string size:
@ -419,7 +517,7 @@ struct FixedStringSource
size_t column_size = 0;
explicit FixedStringSource(const ColumnFixedString & col)
: string_size(col.getN())
: string_size(col.getN())
{
const auto & chars = col.getChars();
pos = chars.data();
@ -553,7 +651,8 @@ struct GenericArraySource : public ArraySourceImpl<GenericArraySource>
}
explicit GenericArraySource(const ColumnArray & arr)
: elements(arr.getData()), offsets(arr.getOffsets())
: elements(arr.getData())
, offsets(arr.getOffsets())
{
}
@ -813,7 +912,10 @@ struct NullableValueSource : public ValueSource
const NullMap & null_map;
template <typename Column>
explicit NullableValueSource(const Column & col, const NullMap & null_map_) : ValueSource(col), null_map(null_map_) {}
NullableValueSource(const Column & col, const NullMap & null_map_)
: ValueSource(col)
, null_map(null_map_)
{}
void accept(ValueSourceVisitor & visitor) override { visitor.visit(*this); }
View File
@ -1,236 +0,0 @@
#include "FunctionArrayMapped.h"
#include <Functions/FunctionFactory.h>
#include <Common/Exception.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int SIZES_OF_ARRAYS_DONT_MATCH;
extern const int TYPE_MISMATCH;
}
/**
* arrayFold(x1,...,xn,accum -> expression, array1,...,arrayn, accum_initial) - folds the expression over the elements of the array (or set of arrays), starting from accum_initial.
*/
class ArrayFold : public IFunction
{
public:
static constexpr auto name = "arrayFold";
static FunctionPtr create(ContextPtr) { return std::make_shared<ArrayFold>(); }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
void getLambdaArgumentTypes(DataTypes & arguments) const override
{
if (arguments.size() < 3)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires as arguments a lambda function, at least one array and an accumulator", getName());
DataTypes accumulator_and_array_types(arguments.size() - 1);
accumulator_and_array_types[0] = arguments.back();
for (size_t i = 1; i < accumulator_and_array_types.size(); ++i)
{
const auto * array_type = checkAndGetDataType<DataTypeArray>(&*arguments[i]);
if (!array_type)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument {} of function {} must be of type Array, found {} instead", i + 1, getName(), arguments[i]->getName());
accumulator_and_array_types[i] = recursiveRemoveLowCardinality(array_type->getNestedType());
}
const auto * lambda_function_type = checkAndGetDataType<DataTypeFunction>(arguments[0].get());
if (!lambda_function_type || lambda_function_type->getArgumentTypes().size() != accumulator_and_array_types.size())
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument of function {} must be a lambda function with {} arguments, found {} instead.",
getName(), accumulator_and_array_types.size(), arguments[0]->getName());
arguments[0] = std::make_shared<DataTypeFunction>(accumulator_and_array_types);
}
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (arguments.size() < 3)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires as arguments a lambda function, at least one array and an accumulator", getName());
const auto * lambda_function_type = checkAndGetDataType<DataTypeFunction>(arguments[0].type.get());
if (!lambda_function_type)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be a function", getName());
auto accumulator_type = arguments.back().type;
auto lambda_type = lambda_function_type->getReturnType();
if (!accumulator_type->equals(*lambda_type))
throw Exception(ErrorCodes::TYPE_MISMATCH,
"Return type of lambda function must be the same as the accumulator type, inferred return type of lambda: {}, inferred type of accumulator: {}",
lambda_type->getName(), accumulator_type->getName());
return accumulator_type;
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
const auto & lambda_function_with_type_and_name = arguments[0];
if (!lambda_function_with_type_and_name.column)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be a function", getName());
const auto * lambda_function = typeid_cast<const ColumnFunction *>(lambda_function_with_type_and_name.column.get());
if (!lambda_function)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be a function", getName());
ColumnPtr offsets_column;
ColumnPtr column_first_array_ptr;
const ColumnArray * column_first_array = nullptr;
ColumnsWithTypeAndName arrays;
arrays.reserve(arguments.size() - 1);
/// Validate input types and get input array columns in convenient form
for (size_t i = 1; i < arguments.size() - 1; ++i)
{
const auto & array_with_type_and_name = arguments[i];
ColumnPtr column_array_ptr = array_with_type_and_name.column;
const auto * column_array = checkAndGetColumn<ColumnArray>(column_array_ptr.get());
if (!column_array)
{
const ColumnConst * column_const_array = checkAndGetColumnConst<ColumnArray>(column_array_ptr.get());
if (!column_const_array)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Expected array column, found {}", column_array_ptr->getName());
column_array_ptr = recursiveRemoveLowCardinality(column_const_array->convertToFullColumn());
column_array = checkAndGetColumn<ColumnArray>(column_array_ptr.get());
}
const DataTypePtr & array_type_ptr = array_with_type_and_name.type;
const auto * array_type = checkAndGetDataType<DataTypeArray>(array_type_ptr.get());
if (!array_type)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Expected array type, found {}", array_type_ptr->getName());
if (!offsets_column)
offsets_column = column_array->getOffsetsPtr();
else
{
/// The first condition is an optimization: do not compare the data if the pointers are equal.
if (column_array->getOffsetsPtr() != offsets_column
&& column_array->getOffsets() != typeid_cast<const ColumnArray::ColumnOffsets &>(*offsets_column).getData())
throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, "Arrays passed to {} must have equal size", getName());
}
if (i == 1)
{
column_first_array_ptr = column_array_ptr;
column_first_array = column_array;
}
arrays.emplace_back(ColumnWithTypeAndName(column_array->getDataPtr(),
recursiveRemoveLowCardinality(array_type->getNestedType()),
array_with_type_and_name.name));
}
ssize_t rows_count = input_rows_count;
ssize_t data_row_count = arrays[0].column->size();
size_t array_count = arrays.size();
if (rows_count == 0)
return arguments.back().column->convertToFullColumnIfConst()->cloneEmpty();
ColumnPtr current_column = arguments.back().column->convertToFullColumnIfConst();
MutableColumnPtr result_data = arguments.back().column->convertToFullColumnIfConst()->cloneEmpty();
size_t max_array_size = 0;
const auto & offsets = column_first_array->getOffsets();
IColumn::Selector selector(data_row_count);
size_t cur_ind = 0;
ssize_t cur_arr = 0;
/// Skip to the first non-empty array.
if (data_row_count)
while (offsets[cur_arr] == 0)
++cur_arr;
/// selector[i] is the index that the i-th data element has in the array it belongs to
for (ssize_t i = 0; i < data_row_count; ++i)
{
selector[i] = cur_ind;
cur_ind++;
if (cur_ind > max_array_size)
max_array_size = cur_ind;
while (cur_arr < rows_count && cur_ind >= offsets[cur_arr] - offsets[cur_arr - 1])
{
++cur_arr;
cur_ind = 0;
}
}
std::vector<MutableColumns> data_arrays;
data_arrays.resize(array_count);
/// Split each data column into columns that contain only the elements at the N-th index of each array
if (max_array_size > 0)
for (size_t i = 0; i < array_count; ++i)
data_arrays[i] = arrays[i].column->scatter(max_array_size, selector);
size_t prev_size = rows_count;
IColumn::Permutation inverse_permutation(rows_count);
size_t inverse_permutation_count = 0;
/// After each iteration, current_column contains the accumulator values after applying the array elements up to the current index.
/// At each iteration, only the rows of current_column whose arrays still have unapplied elements are kept.
/// Discarded rows contain finished calculations: they are appended to the result_data column, and as we insert them we record their original row numbers in the inverse_permutation vector.
for (size_t ind = 0; ind < max_array_size; ++ind)
{
IColumn::Selector prev_selector(prev_size);
size_t prev_ind = 0;
for (ssize_t irow = 0; irow < rows_count; ++irow)
{
if (offsets[irow] - offsets[irow - 1] > ind)
prev_selector[prev_ind++] = 1;
else if (offsets[irow] - offsets[irow - 1] == ind)
{
inverse_permutation[inverse_permutation_count++] = irow;
prev_selector[prev_ind++] = 0;
}
}
auto prev = current_column->scatter(2, prev_selector);
result_data->insertRangeFrom(*(prev[0]), 0, prev[0]->size());
auto res_lambda = lambda_function->cloneResized(prev[1]->size());
auto * res_lambda_ptr = typeid_cast<ColumnFunction *>(res_lambda.get());
res_lambda_ptr->appendArguments(std::vector({ColumnWithTypeAndName(std::move(prev[1]), arguments.back().type, arguments.back().name)}));
for (size_t i = 0; i < array_count; i++)
res_lambda_ptr->appendArguments(std::vector({ColumnWithTypeAndName(std::move(data_arrays[i][ind]), arrays[i].type, arrays[i].name)}));
current_column = IColumn::mutate(res_lambda_ptr->reduce().column);
prev_size = current_column->size();
}
result_data->insertRangeFrom(*current_column, 0, current_column->size());
for (ssize_t irow = 0; irow < rows_count; ++irow)
if (offsets[irow] - offsets[irow - 1] == max_array_size)
inverse_permutation[inverse_permutation_count++] = irow;
/// result_data now contains a result for every row, and inverse_permutation maps each of them to its row index in the input.
/// Now we need to invert inverse_permutation and apply it to result_data to get the rows in the right order.
IColumn::Permutation perm(rows_count);
for (ssize_t i = 0; i < rows_count; i++)
perm[inverse_permutation[i]] = i;
return result_data->permute(perm, 0);
}
private:
String getName() const override
{
return name;
}
};
REGISTER_FUNCTION(ArrayFold)
{
factory.registerFunction<ArrayFold>(FunctionDocumentation{.description=R"(
Function arrayFold(x1,...,xn,accum -> expression, array1,...,arrayn, accum_initial) applies a lambda function to a number of equally-sized arrays
and collects the result in an accumulator.
)", .examples{{"sum", "SELECT arrayFold(x,acc -> acc+x, [1,2,3,4], toInt64(1));", "11"}}, .categories{"Array"}});
}
}
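This file is removed by the commit; for reference, the per-row semantics of the columnar implementation above amount to a left fold, sketched here (arrayFoldOneRow is a hypothetical helper):

#include <cassert>
#include <vector>

// Row-level semantics of arrayFold(lambda, arr, acc_initial): a left fold.
// The columnar implementation above computes this for all rows at once,
// batching the lambda call per element index instead of per row.
template <typename T, typename Acc, typename F>
Acc arrayFoldOneRow(F f, const std::vector<T> & arr, Acc acc)
{
    for (const auto & x : arr)
        acc = f(x, acc);
    return acc;
}

int main()
{
    // Mirrors the documentation example: arrayFold(x,acc -> acc+x, [1,2,3,4], toInt64(1)) = 11
    auto sum = arrayFoldOneRow([](long x, long acc) { return acc + x; },
                               std::vector<long>{1, 2, 3, 4}, 1L);
    assert(sum == 11);
}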
View File
@ -10,12 +10,14 @@
#include <base/sleep.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <Interpreters/ProcessList.h>
namespace ProfileEvents
{
extern const Event SleepFunctionCalls;
extern const Event SleepFunctionMicroseconds;
extern const Event SleepFunctionElapsedMicroseconds;
}
namespace DB
@ -43,15 +45,20 @@ class FunctionSleep : public IFunction
{
private:
UInt64 max_microseconds;
QueryStatusPtr query_status;
public:
static constexpr auto name = variant == FunctionSleepVariant::PerBlock ? "sleep" : "sleepEachRow";
static FunctionPtr create(ContextPtr context)
{
return std::make_shared<FunctionSleep<variant>>(context->getSettingsRef().function_sleep_max_microseconds_per_block);
return std::make_shared<FunctionSleep<variant>>(
context->getSettingsRef().function_sleep_max_microseconds_per_block,
context->getProcessListElementSafe());
}
FunctionSleep(UInt64 max_microseconds_)
FunctionSleep(UInt64 max_microseconds_, QueryStatusPtr query_status_)
: max_microseconds(std::min(max_microseconds_, static_cast<UInt64>(std::numeric_limits<UInt32>::max())))
, query_status(query_status_)
{
}
@ -128,9 +135,23 @@ public:
"The maximum sleep time is {} microseconds. Requested: {} microseconds per block (of size {})",
max_microseconds, microseconds, size);
sleepForMicroseconds(microseconds);
UInt64 elapsed = 0;
while (elapsed < microseconds)
{
UInt64 sleep_time = microseconds - elapsed;
if (query_status)
sleep_time = std::min(sleep_time, /* 1 second */ static_cast<UInt64>(1000000));
sleepForMicroseconds(sleep_time);
elapsed += sleep_time;
if (query_status && !query_status->checkTimeLimit())
break;
}
ProfileEvents::increment(ProfileEvents::SleepFunctionCalls, count);
ProfileEvents::increment(ProfileEvents::SleepFunctionMicroseconds, microseconds);
ProfileEvents::increment(ProfileEvents::SleepFunctionElapsedMicroseconds, elapsed);
}
}
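The loop above sleeps in chunks of at most one second so that a cancelled query stops sleeping promptly instead of blocking for the whole requested duration. A standalone sketch, with an atomic flag as a stand-in for query_status->checkTimeLimit():

#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>

// Sleep for `microseconds` in total, but wake at least once per second to
// check a cancellation flag. Returns the microseconds actually slept.
uint64_t interruptibleSleep(uint64_t microseconds, const std::atomic<bool> & cancelled)
{
    uint64_t elapsed = 0;
    while (elapsed < microseconds)
    {
        uint64_t chunk = std::min<uint64_t>(microseconds - elapsed, 1'000'000);
        std::this_thread::sleep_for(std::chrono::microseconds(chunk));
        elapsed += chunk;
        if (cancelled.load())
            break;
    }
    return elapsed;
}

int main()
{
    std::atomic<bool> cancelled{false};
    interruptibleSleep(50'000, cancelled); // 50 ms, a single chunk
}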
View File
@ -1,15 +1,16 @@
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/GatherUtils/Algorithms.h>
#include <Functions/GatherUtils/GatherUtils.h>
#include <Functions/GatherUtils/Sources.h>
#include <Functions/GatherUtils/Sinks.h>
#include <Functions/GatherUtils/Slices.h>
#include <Functions/GatherUtils/Algorithms.h>
#include <Functions/GatherUtils/Sources.h>
#include <Functions/IFunction.h>
#include <IO/WriteHelpers.h>
@ -20,50 +21,50 @@ using namespace GatherUtils;
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ZERO_ARRAY_OR_TUPLE_INDEX;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ZERO_ARRAY_OR_TUPLE_INDEX;
}
namespace
{
/// If 'is_utf8' - measure offset and length in code points instead of bytes.
/// UTF8 variant is not available for FixedString arguments.
template <bool is_utf8>
class FunctionSubstring : public IFunction
{
public:
static constexpr auto name = is_utf8 ? "substringUTF8" : "substring";
static FunctionPtr create(ContextPtr)
{
return std::make_shared<FunctionSubstring>();
}
String getName() const override
{
return name;
}
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSubstring>(); }
String getName() const override { return name; }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
bool useDefaultImplementationForConstants() const override { return true; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
size_t number_of_arguments = arguments.size();
const size_t number_of_arguments = arguments.size();
if (number_of_arguments < 2 || number_of_arguments > 3)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Number of arguments for function {} doesn't match: "
"passed {}, should be 2 or 3", getName(), number_of_arguments);
if ((is_utf8 && !isString(arguments[0])) || !isStringOrFixedString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
arguments[0]->getName(), getName());
if constexpr (is_utf8)
{
/// UTF8 variant is not available for FixedString and Enum arguments.
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}",
arguments[0]->getName(), getName());
}
else
{
if (!isStringOrFixedString(arguments[0]) && !isEnum(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}",
arguments[0]->getName(), getName());
}
if (!isNativeNumber(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of second argument of function {}",
@ -77,44 +78,40 @@ public:
}
template <typename Source>
ColumnPtr executeForSource(const ColumnPtr & column_start, const ColumnPtr & column_length,
const ColumnConst * column_start_const, const ColumnConst * column_length_const,
Int64 start_value, Int64 length_value, Source && source,
size_t input_rows_count) const
ColumnPtr executeForSource(const ColumnPtr & column_offset, const ColumnPtr & column_length,
bool column_offset_const, bool column_length_const,
Int64 offset, Int64 length,
Source && source, size_t input_rows_count) const
{
auto col_res = ColumnString::create();
if (!column_length)
{
if (column_start_const)
if (column_offset_const)
{
if (start_value > 0)
sliceFromLeftConstantOffsetUnbounded(
source, StringSink(*col_res, input_rows_count), static_cast<size_t>(start_value - 1));
else if (start_value < 0)
sliceFromRightConstantOffsetUnbounded(
source, StringSink(*col_res, input_rows_count), -static_cast<size_t>(start_value));
if (offset > 0)
sliceFromLeftConstantOffsetUnbounded(source, StringSink(*col_res, input_rows_count), static_cast<size_t>(offset - 1));
else if (offset < 0)
sliceFromRightConstantOffsetUnbounded(source, StringSink(*col_res, input_rows_count), -static_cast<size_t>(offset));
else
throw Exception(ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX, "Indices in strings are 1-based");
}
else
sliceDynamicOffsetUnbounded(source, StringSink(*col_res, input_rows_count), *column_start);
sliceDynamicOffsetUnbounded(source, StringSink(*col_res, input_rows_count), *column_offset);
}
else
{
if (column_start_const && column_length_const)
if (column_offset_const && column_length_const)
{
if (start_value > 0)
sliceFromLeftConstantOffsetBounded(
source, StringSink(*col_res, input_rows_count), static_cast<size_t>(start_value - 1), length_value);
else if (start_value < 0)
sliceFromRightConstantOffsetBounded(
source, StringSink(*col_res, input_rows_count), -static_cast<size_t>(start_value), length_value);
if (offset > 0)
sliceFromLeftConstantOffsetBounded(source, StringSink(*col_res, input_rows_count), static_cast<size_t>(offset - 1), length);
else if (offset < 0)
sliceFromRightConstantOffsetBounded(source, StringSink(*col_res, input_rows_count), -static_cast<size_t>(offset), length);
else
throw Exception(ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX, "Indices in strings are 1-based");
}
else
sliceDynamicOffsetBounded(source, StringSink(*col_res, input_rows_count), *column_start, *column_length);
sliceDynamicOffsetBounded(source, StringSink(*col_res, input_rows_count), *column_offset, *column_length);
}
return col_res;
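A string_view sketch of the 1-based index rules this dispatch implements (positive offsets slice from the left, negative from the right, zero throws); out-of-range offsets are simply clamped here, whereas the real sources differ at some edges, as noted elsewhere for negative offsets larger than the string:

#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <string_view>

// substring(s, offset[, length]) index rules: 1-based, a negative offset
// counts from the right, offset 0 is invalid. Length is clamped to what remains.
std::string_view substringLike(std::string_view s, long offset,
                               size_t length = std::string_view::npos)
{
    size_t start = 0;
    if (offset > 0)
        start = std::min<size_t>(static_cast<size_t>(offset) - 1, s.size());
    else if (offset < 0)
        start = s.size() - std::min<size_t>(static_cast<size_t>(-offset), s.size());
    else
        throw std::invalid_argument("Indices in strings are 1-based");
    return s.substr(start, length);
}

int main()
{
    assert(substringLike("clickhouse", 6) == "house");
    assert(substringLike("clickhouse", -5) == "house");
    assert(substringLike("clickhouse", 1, 5) == "click");
}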
@ -122,58 +119,60 @@ public:
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
size_t number_of_arguments = arguments.size();
const size_t number_of_arguments = arguments.size();
ColumnPtr column_string = arguments[0].column;
ColumnPtr column_start = arguments[1].column;
ColumnPtr column_offset = arguments[1].column;
ColumnPtr column_length;
if (number_of_arguments == 3)
column_length = arguments[2].column;
const ColumnConst * column_start_const = checkAndGetColumn<ColumnConst>(column_start.get());
const ColumnConst * column_offset_const = checkAndGetColumn<ColumnConst>(column_offset.get());
const ColumnConst * column_length_const = nullptr;
if (number_of_arguments == 3)
column_length_const = checkAndGetColumn<ColumnConst>(column_length.get());
Int64 start_value = 0;
Int64 length_value = 0;
Int64 offset = 0;
Int64 length = 0;
if (column_start_const)
start_value = column_start_const->getInt(0);
if (column_offset_const)
offset = column_offset_const->getInt(0);
if (column_length_const)
length_value = column_length_const->getInt(0);
length = column_length_const->getInt(0);
if constexpr (is_utf8)
{
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column_string.get()))
return executeForSource(column_start, column_length, column_start_const, column_length_const, start_value,
length_value, UTF8StringSource(*col), input_rows_count);
else if (const ColumnConst * col_const = checkAndGetColumnConst<ColumnString>(column_string.get()))
return executeForSource(column_start, column_length, column_start_const, column_length_const, start_value,
length_value, ConstSource<UTF8StringSource>(*col_const), input_rows_count);
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
arguments[0].column->getName(), getName());
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, UTF8StringSource(*col), input_rows_count);
if (const ColumnConst * col_const = checkAndGetColumnConst<ColumnString>(column_string.get()))
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource<UTF8StringSource>(*col_const), input_rows_count);
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", arguments[0].column->getName(), getName());
}
else
{
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column_string.get()))
return executeForSource(column_start, column_length, column_start_const, column_length_const, start_value,
length_value, StringSource(*col), input_rows_count);
else if (const ColumnFixedString * col_fixed = checkAndGetColumn<ColumnFixedString>(column_string.get()))
return executeForSource(column_start, column_length, column_start_const, column_length_const, start_value,
length_value, FixedStringSource(*col_fixed), input_rows_count);
else if (const ColumnConst * col_const = checkAndGetColumnConst<ColumnString>(column_string.get()))
return executeForSource(column_start, column_length, column_start_const, column_length_const, start_value,
length_value, ConstSource<StringSource>(*col_const), input_rows_count);
else if (const ColumnConst * col_const_fixed = checkAndGetColumnConst<ColumnFixedString>(column_string.get()))
return executeForSource(column_start, column_length, column_start_const, column_length_const, start_value,
length_value, ConstSource<FixedStringSource>(*col_const_fixed), input_rows_count);
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
arguments[0].column->getName(), getName());
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, StringSource(*col), input_rows_count);
if (const ColumnFixedString * col_fixed = checkAndGetColumn<ColumnFixedString>(column_string.get()))
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, FixedStringSource(*col_fixed), input_rows_count);
if (const ColumnConst * col_const = checkAndGetColumnConst<ColumnString>(column_string.get()))
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource<StringSource>(*col_const), input_rows_count);
if (const ColumnConst * col_const_fixed = checkAndGetColumnConst<ColumnFixedString>(column_string.get()))
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource<FixedStringSource>(*col_const_fixed), input_rows_count);
if (isEnum(arguments[0].type))
{
if (const typename DataTypeEnum8::ColumnType * col_enum8 = checkAndGetColumn<typename DataTypeEnum8::ColumnType>(column_string.get()))
{
const auto * type_enum8 = assert_cast<const DataTypeEnum8 *>(arguments[0].type.get());
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, EnumSource<DataTypeEnum8>(*col_enum8, *type_enum8), input_rows_count);
}
if (const typename DataTypeEnum16::ColumnType * col_enum16 = checkAndGetColumn<typename DataTypeEnum16::ColumnType>(column_string.get()))
{
const auto * type_enum16 = assert_cast<const DataTypeEnum16 *>(arguments[0].type.get());
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, EnumSource<DataTypeEnum16>(*col_enum16, *type_enum16), input_rows_count);
}
}
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", arguments[0].column->getName(), getName());
}
}
};
@ -183,8 +182,8 @@ public:
REGISTER_FUNCTION(Substring)
{
factory.registerFunction<FunctionSubstring<false>>({}, FunctionFactory::CaseInsensitive);
factory.registerAlias("substr", "substring", FunctionFactory::CaseInsensitive);
factory.registerAlias("mid", "substring", FunctionFactory::CaseInsensitive); /// from MySQL dialect
factory.registerAlias("substr", "substring", FunctionFactory::CaseInsensitive); // MySQL alias
factory.registerAlias("mid", "substring", FunctionFactory::CaseInsensitive); /// MySQL alias
factory.registerFunction<FunctionSubstring<true>>({}, FunctionFactory::CaseSensitive);
}
View File
@ -1,7 +1,7 @@
#include <base/arithmeticOverflow.h>
#include <Common/DateLUTImpl.h>
#include <Columns/ColumnsDateTime.h>
#include <Columns/ColumnsNumber.h>
#include <Common/DateLUTImpl.h>
#include <Common/IntervalKind.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeDateTime.h>
@ -11,6 +11,7 @@
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
#include <IO/WriteHelpers.h>
#include <base/arithmeticOverflow.h>
namespace DB
@ -24,9 +25,6 @@ namespace ErrorCodes
}
namespace
{
class FunctionToStartOfInterval : public IFunction
{
public:
@ -34,86 +32,90 @@ public:
static constexpr auto name = "toStartOfInterval";
String getName() const override { return name; }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
bool hasInformationAboutMonotonicity() const override { return true; }
Monotonicity getMonotonicityForRange(const IDataType &, const Field &, const Field &) const override
{
return { .is_monotonic = true, .is_always_monotonic = true };
}
Monotonicity getMonotonicityForRange(const IDataType &, const Field &, const Field &) const override { return { .is_monotonic = true, .is_always_monotonic = true }; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
bool first_argument_is_date = false;
bool value_is_date = false;
auto check_first_argument = [&]
{
if (!isDate(arguments[0].type) && !isDateTime(arguments[0].type) && !isDateTime64(arguments[0].type))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}. "
"Should be a date or a date with time", arguments[0].type->getName(), getName());
first_argument_is_date = isDate(arguments[0].type);
const DataTypePtr & type_arg1 = arguments[0].type;
if (!isDate(type_arg1) && !isDateTime(type_arg1) && !isDateTime64(type_arg1))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of 1st argument of function {}, expected a Date, DateTime or DateTime64",
type_arg1->getName(), getName());
value_is_date = isDate(type_arg1);
};
const DataTypeInterval * interval_type = nullptr;
bool result_type_is_date = false;
bool result_type_is_datetime = false;
bool result_type_is_datetime_64 = false;
auto check_interval_argument = [&]
enum class ResultType
{
interval_type = checkAndGetDataType<DataTypeInterval>(arguments[1].type.get());
Date,
DateTime,
DateTime64
};
ResultType result_type;
auto check_second_argument = [&]
{
const DataTypePtr & type_arg2 = arguments[1].type;
interval_type = checkAndGetDataType<DataTypeInterval>(type_arg2.get());
if (!interval_type)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}. "
"Should be an interval of time", arguments[1].type->getName(), getName());
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of 2nd argument of function {}, expected a time interval",
type_arg2->getName(), getName());
switch (interval_type->getKind()) // NOLINT(bugprone-switch-missing-default-case)
{
case IntervalKind::Nanosecond:
case IntervalKind::Microsecond:
case IntervalKind::Millisecond:
result_type_is_datetime_64 = true;
result_type = ResultType::DateTime64;
break;
case IntervalKind::Second:
case IntervalKind::Minute:
case IntervalKind::Hour:
case IntervalKind::Day:
result_type_is_datetime = true;
case IntervalKind::Day: /// unclear why Day yields DateTime rather than Date; kept as-is for compatibility
result_type = ResultType::DateTime;
break;
case IntervalKind::Week:
case IntervalKind::Month:
case IntervalKind::Quarter:
case IntervalKind::Year:
result_type_is_date = true;
result_type = ResultType::Date;
break;
}
};
auto check_timezone_argument = [&]
auto check_third_argument = [&]
{
if (!isString(arguments[2].type))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}. "
"This argument is optional and must be a constant string with timezone name",
arguments[2].type->getName(), getName());
if (first_argument_is_date && result_type_is_date)
const DataTypePtr & type_arg3 = arguments[2].type;
if (!isString(type_arg3))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"The timezone argument of function {} with interval type {} is allowed only when the 1st argument "
"has the type DateTime or DateTime64",
getName(), interval_type->getKind().toString());
"Illegal type {} of 3rd argument of function {}, expected a constant timezone string",
type_arg3->getName(), getName());
if (value_is_date && result_type == ResultType::Date) /// unclear why this is && instead of ||; kept as-is for compatibility
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"The timezone argument of function {} with interval type {} is allowed only when the 1st argument has type DateTime or DateTimt64",
getName(), interval_type->getKind().toString());
};
if (arguments.size() == 2)
{
check_first_argument();
check_interval_argument();
check_second_argument();
}
else if (arguments.size() == 3)
{
check_first_argument();
check_interval_argument();
check_timezone_argument();
check_second_argument();
check_third_argument();
}
else
{
@ -122,25 +124,27 @@ public:
getName(), arguments.size());
}
if (result_type_is_date)
return std::make_shared<DataTypeDate>();
else if (result_type_is_datetime)
return std::make_shared<DataTypeDateTime>(extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false));
else if (result_type_is_datetime_64)
switch (result_type)
{
auto scale = 0;
case ResultType::Date:
return std::make_shared<DataTypeDate>();
case ResultType::DateTime:
return std::make_shared<DataTypeDateTime>(extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false));
case ResultType::DateTime64:
{
UInt32 scale = 0;
if (interval_type->getKind() == IntervalKind::Nanosecond)
scale = 9;
else if (interval_type->getKind() == IntervalKind::Microsecond)
scale = 6;
else if (interval_type->getKind() == IntervalKind::Millisecond)
scale = 3;
if (interval_type->getKind() == IntervalKind::Nanosecond)
scale = 9;
else if (interval_type->getKind() == IntervalKind::Microsecond)
scale = 6;
else if (interval_type->getKind() == IntervalKind::Millisecond)
scale = 3;
return std::make_shared<DataTypeDateTime64>(scale, extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false));
return std::make_shared<DataTypeDateTime64>(scale, extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false));
}
}
UNREACHABLE();
std::unreachable();
}
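A compact sketch of the mapping this function encodes: each interval kind determines the result type, and sub-second kinds additionally fix the DateTime64 scale (nanosecond 9, microsecond 6, millisecond 3):

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>

enum class IntervalKind { Nanosecond, Microsecond, Millisecond, Second, Minute,
                          Hour, Day, Week, Month, Quarter, Year };

// Result type name and DateTime64 scale (digits of sub-second precision)
// for toStartOfInterval, mirroring the switches above.
std::pair<std::string, uint32_t> resultTypeFor(IntervalKind kind)
{
    switch (kind)
    {
        case IntervalKind::Nanosecond:  return {"DateTime64", 9};
        case IntervalKind::Microsecond: return {"DateTime64", 6};
        case IntervalKind::Millisecond: return {"DateTime64", 3};
        case IntervalKind::Second:
        case IntervalKind::Minute:
        case IntervalKind::Hour:
        case IntervalKind::Day:         return {"DateTime", 0};
        default:                        return {"Date", 0}; // Week/Month/Quarter/Year
    }
}

int main()
{
    assert(resultTypeFor(IntervalKind::Millisecond).second == 3);
    assert(resultTypeFor(IntervalKind::Day).first == "DateTime");
    assert(resultTypeFor(IntervalKind::Year).first == "Date");
}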
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /* input_rows_count */) const override
@ -154,110 +158,106 @@ public:
private:
ColumnPtr dispatchForTimeColumn(
const ColumnWithTypeAndName & time_column, const ColumnWithTypeAndName & interval_column, const DataTypePtr & result_type, const DateLUTImpl & time_zone) const
const ColumnWithTypeAndName & time_column, const ColumnWithTypeAndName & interval_column,
const DataTypePtr & result_type, const DateLUTImpl & time_zone) const
{
const auto & from_datatype = *time_column.type.get();
const auto & time_column_type = *time_column.type.get();
const auto & time_column_col = *time_column.column.get();
if (isDateTime64(from_datatype))
if (isDateTime64(time_column_type))
{
const auto * time_column_vec = checkAndGetColumn<ColumnDateTime64>(time_column.column.get());
auto scale = assert_cast<const DataTypeDateTime64 &>(from_datatype).getScale();
+ const auto * time_column_vec = checkAndGetColumn<ColumnDateTime64>(time_column_col);
+ auto scale = assert_cast<const DataTypeDateTime64 &>(time_column_type).getScale();
if (time_column_vec)
- return dispatchForIntervalColumn(assert_cast<const DataTypeDateTime64 &>(from_datatype), *time_column_vec, interval_column, result_type, time_zone, scale);
+ return dispatchForIntervalColumn(assert_cast<const DataTypeDateTime64 &>(time_column_type), *time_column_vec, interval_column, result_type, time_zone, scale);
}
- if (isDateTime(from_datatype))
+ else if (isDateTime(time_column_type))
{
- const auto * time_column_vec = checkAndGetColumn<ColumnDateTime>(time_column.column.get());
+ const auto * time_column_vec = checkAndGetColumn<ColumnDateTime>(time_column_col);
if (time_column_vec)
- return dispatchForIntervalColumn(assert_cast<const DataTypeDateTime &>(from_datatype), *time_column_vec, interval_column, result_type, time_zone);
+ return dispatchForIntervalColumn(assert_cast<const DataTypeDateTime &>(time_column_type), *time_column_vec, interval_column, result_type, time_zone);
}
- if (isDate(from_datatype))
+ else if (isDate(time_column_type))
{
- const auto * time_column_vec = checkAndGetColumn<ColumnDate>(time_column.column.get());
+ const auto * time_column_vec = checkAndGetColumn<ColumnDate>(time_column_col);
if (time_column_vec)
- return dispatchForIntervalColumn(assert_cast<const DataTypeDate &>(from_datatype), *time_column_vec, interval_column, result_type, time_zone);
+ return dispatchForIntervalColumn(assert_cast<const DataTypeDate &>(time_column_type), *time_column_vec, interval_column, result_type, time_zone);
}
- if (isDate32(from_datatype))
- {
- const auto * time_column_vec = checkAndGetColumn<ColumnDate32>(time_column.column.get());
- if (time_column_vec)
- return dispatchForIntervalColumn(assert_cast<const DataTypeDate32 &>(from_datatype), *time_column_vec, interval_column, result_type, time_zone);
- }
- throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal column for first argument of function {}. Must contain dates or dates with time", getName());
+ throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal column for 1st argument of function {}, expected a Date, DateTime or DateTime64", getName());
}
- template <typename TimeColumnType, typename TimeDataType>
+ template <typename TimeDataType, typename TimeColumnType>
ColumnPtr dispatchForIntervalColumn(
const TimeDataType & time_data_type, const TimeColumnType & time_column, const ColumnWithTypeAndName & interval_column,
- const DataTypePtr & result_type, const DateLUTImpl & time_zone, const UInt16 scale = 1) const
+ const DataTypePtr & result_type, const DateLUTImpl & time_zone, UInt16 scale = 1) const
{
const auto * interval_type = checkAndGetDataType<DataTypeInterval>(interval_column.type.get());
if (!interval_type)
- throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column for second argument of function {}, must be an interval of time.", getName());
+ throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column for 2nd argument of function {}, must be a time interval", getName());
const auto * interval_column_const_int64 = checkAndGetColumnConst<ColumnInt64>(interval_column.column.get());
if (!interval_column_const_int64)
- throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column for second argument of function {}, must be a const interval of time.", getName());
+ throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column for 2nd argument of function {}, must be a const time interval", getName());
- Int64 num_units = interval_column_const_int64->getValue<Int64>();
+ const Int64 num_units = interval_column_const_int64->getValue<Int64>();
if (num_units <= 0)
- throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Value for second argument of function {} must be positive.", getName());
+ throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Value for 2nd argument of function {} must be positive", getName());
switch (interval_type->getKind()) // NOLINT(bugprone-switch-missing-default-case)
{
case IntervalKind::Nanosecond:
- return execute<TimeDataType, DataTypeDateTime64, IntervalKind::Nanosecond>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDateTime64, IntervalKind::Nanosecond>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Microsecond:
- return execute<TimeDataType, DataTypeDateTime64, IntervalKind::Microsecond>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDateTime64, IntervalKind::Microsecond>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Millisecond:
- return execute<TimeDataType, DataTypeDateTime64, IntervalKind::Millisecond>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDateTime64, IntervalKind::Millisecond>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Second:
- return execute<TimeDataType, DataTypeDateTime, IntervalKind::Second>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDateTime, IntervalKind::Second>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Minute:
- return execute<TimeDataType, DataTypeDateTime, IntervalKind::Minute>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDateTime, IntervalKind::Minute>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Hour:
- return execute<TimeDataType, DataTypeDateTime, IntervalKind::Hour>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDateTime, IntervalKind::Hour>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Day:
- return execute<TimeDataType, DataTypeDateTime, IntervalKind::Day>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDateTime, IntervalKind::Day>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Week:
- return execute<TimeDataType, DataTypeDate, IntervalKind::Week>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDate, IntervalKind::Week>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Month:
- return execute<TimeDataType, DataTypeDate, IntervalKind::Month>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDate, IntervalKind::Month>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Quarter:
- return execute<TimeDataType, DataTypeDate, IntervalKind::Quarter>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDate, IntervalKind::Quarter>(time_data_type, time_column, num_units, result_type, time_zone, scale);
case IntervalKind::Year:
- return execute<TimeDataType, DataTypeDate, IntervalKind::Year>(time_data_type, time_column, num_units, result_type, time_zone, scale);
+ return execute<TimeDataType, TimeColumnType, DataTypeDate, IntervalKind::Year>(time_data_type, time_column, num_units, result_type, time_zone, scale);
}
- UNREACHABLE();
+ std::unreachable();
}
- template <typename TimeDataType, typename ToDataType, IntervalKind::Kind unit, typename ColumnType>
- ColumnPtr execute(const TimeDataType &, const ColumnType & time_column_type, Int64 num_units, const DataTypePtr & result_type, const DateLUTImpl & time_zone, const UInt16 scale) const
+ template <typename TimeDataType, typename TimeColumnType, typename ResultDataType, IntervalKind::Kind unit>
+ ColumnPtr execute(
+ const TimeDataType &, const TimeColumnType & time_column_type, Int64 num_units,
+ const DataTypePtr & result_type, const DateLUTImpl & time_zone, UInt16 scale) const
{
- using ToColumnType = typename ToDataType::ColumnType;
- using ToFieldType = typename ToDataType::FieldType;
+ using ResultColumnType = typename ResultDataType::ColumnType;
+ using ResultFieldType = typename ResultDataType::FieldType;
const auto & time_data = time_column_type.getData();
size_t size = time_data.size();
auto result_col = result_type->createColumn();
- auto *col_to = assert_cast<ToColumnType *>(result_col.get());
+ auto * col_to = assert_cast<ResultColumnType *>(result_col.get());
auto & result_data = col_to->getData();
result_data.resize(size);
Int64 scale_multiplier = DecimalUtils::scaleMultiplier<DateTime64>(scale);
for (size_t i = 0; i != size; ++i)
- result_data[i] = static_cast<ToFieldType>(ToStartOfInterval<unit>::execute(time_data[i], num_units, time_zone, scale_multiplier));
+ result_data[i] = static_cast<ResultFieldType>(ToStartOfInterval<unit>::execute(time_data[i], num_units, time_zone, scale_multiplier));
return result_col;
}
};
}
REGISTER_FUNCTION(ToStartOfInterval)
{
factory.registerFunction<FunctionToStartOfInterval>();
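Reviewer note: the hunk above is a mostly mechanical refactoring. The template parameters are reordered and renamed (ToDataType becomes ResultDataType, the time column type is now passed explicitly), the independent if blocks become an else-if chain, num_units becomes const, and the Date32 branch appears to be dropped, consistent with the new error message listing only Date, DateTime and DateTime64. The underlying pattern is a runtime IntervalKind selecting a compile-time template instantiation. A minimal, self-contained sketch of that pattern follows; the names and plain int64_t timestamps are hypothetical stand-ins, not the actual ClickHouse column machinery:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

enum class IntervalKind { Second, Minute, Hour, Day };

/// Floor a Unix timestamp (in seconds) to the start of its enclosing interval.
/// The double-modulo keeps the result correct for timestamps before 1970.
template <IntervalKind unit>
int64_t toStartOfInterval(int64_t t, int64_t num_units)
{
    constexpr int64_t seconds_per_unit
        = unit == IntervalKind::Second ? 1
        : unit == IntervalKind::Minute ? 60
        : unit == IntervalKind::Hour ? 3600
        : 86400;
    const int64_t step = seconds_per_unit * num_units;
    return t - (((t % step) + step) % step);
}

/// Runtime interval kind -> compile-time instantiation, mirroring the switch
/// in dispatchForIntervalColumn above.
std::vector<int64_t> dispatchForInterval(const std::vector<int64_t> & data, IntervalKind kind, int64_t num_units)
{
    if (num_units <= 0)
        throw std::invalid_argument("interval must be positive");

    std::vector<int64_t> result(data.size());
    auto fill = [&](auto round)
    {
        for (std::size_t i = 0; i != data.size(); ++i)
            result[i] = round(data[i]);
    };
    switch (kind)
    {
        case IntervalKind::Second: fill([&](int64_t t) { return toStartOfInterval<IntervalKind::Second>(t, num_units); }); break;
        case IntervalKind::Minute: fill([&](int64_t t) { return toStartOfInterval<IntervalKind::Minute>(t, num_units); }); break;
        case IntervalKind::Hour:   fill([&](int64_t t) { return toStartOfInterval<IntervalKind::Hour>(t, num_units); }); break;
        case IntervalKind::Day:    fill([&](int64_t t) { return toStartOfInterval<IntervalKind::Day>(t, num_units); }); break;
    }
    return result;
}

int main()
{
    // 10:07:34 UTC floored to its 15-minute bucket starts at 10:00:00.
    assert(dispatchForInterval({36454}, IntervalKind::Minute, 15).front() == 36000);
}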

View File

@ -515,7 +515,9 @@ Aws::S3::Model::GetObjectResult ReadBufferFromS3::sendRequest(size_t attempt, si
// We do not know in advance how many bytes we are going to consume, to avoid blocking estimated it from below
constexpr ResourceCost estimated_cost = 1;
ResourceGuard rlock(read_settings.resource_link, estimated_cost);
Aws::S3::Model::GetObjectOutcome outcome = client_ptr->GetObject(req);
rlock.unlock();
if (outcome.IsSuccess())
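The visible context shows the pattern being adjusted here: a ResourceGuard is taken just before the GetObject call with a cost estimated from below (the real size is not known in advance), and rlock.unlock() releases it as soon as the request returns, so the resource link is not held while the outcome is inspected and the body consumed. A toy sketch of that explicit-early-release pattern, using std::mutex as a stand-in for the real resource link (which is an IO-scheduler construct, not a mutex):

#include <mutex>

/// Guard with an explicit early unlock() and an RAII fallback in the destructor,
/// so the resource is released even on the error path.
struct Guard
{
    std::mutex & m;
    bool locked = true;

    explicit Guard(std::mutex & mutex) : m(mutex) { m.lock(); }
    void unlock()
    {
        if (locked)
        {
            m.unlock();
            locked = false;
        }
    }
    ~Guard() { unlock(); }
};

int sendRequest(std::mutex & resource)
{
    Guard rlock(resource);
    const int outcome = 42;  // stand-in for client_ptr->GetObject(req)
    rlock.unlock();          // release before the caller consumes the outcome
    return outcome;
}

int main()
{
    std::mutex resource;
    return sendRequest(resource) == 42 ? 0 : 1;
}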

View File

@ -1591,7 +1591,7 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim
if (skip_spaces)
skipWhitespaceIfAny(buf);
- if (checkString(row_between_delimiter, buf))
+ if (buf.eof() || checkString(row_between_delimiter, buf))
break;
}
}
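The one-line change lets skipToNextRowOrEof stop at end of input as well as at the row delimiter, so input whose last row is not followed by row_between_delimiter no longer trips up the skip loop. A toy model of the fixed loop over an in-memory buffer; the helper name and signature are hypothetical, not the ClickHouse PeekableReadBuffer API:

#include <cassert>
#include <cstddef>
#include <string_view>

/// Advance until the row delimiter is found or the buffer is exhausted;
/// the eof check is what the one-line fix above adds.
std::size_t skipToNextRow(std::string_view buf, std::size_t pos, std::string_view delimiter)
{
    while (true)
    {
        if (pos >= buf.size() || buf.compare(pos, delimiter.size(), delimiter) == 0)
            break;  // end of input now terminates the scan, just like a delimiter
        ++pos;
    }
    return pos;
}

int main()
{
    assert(skipToNextRow("a,b\nc,d", 2, "\n") == 3);  // delimiter found
    assert(skipToNextRow("a,b", 1, "\n") == 3);       // end of input reached instead
}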

View File

@ -13,9 +13,9 @@
namespace DB::S3
{
std::shared_ptr<Aws::Http::HttpClient>
- PocoHTTPClientFactory::CreateHttpClient(const Aws::Client::ClientConfiguration & clientConfiguration) const
+ PocoHTTPClientFactory::CreateHttpClient(const Aws::Client::ClientConfiguration & client_configuration) const
{
- return std::make_shared<PocoHTTPClient>(static_cast<const PocoHTTPClientConfiguration &>(clientConfiguration));
+ return std::make_shared<PocoHTTPClient>(static_cast<const PocoHTTPClientConfiguration &>(client_configuration));
}
std::shared_ptr<Aws::Http::HttpRequest> PocoHTTPClientFactory::CreateHttpRequest(

View File

@ -15,7 +15,7 @@ class PocoHTTPClientFactory : public Aws::Http::HttpClientFactory
public:
~PocoHTTPClientFactory() override = default;
[[nodiscard]] std::shared_ptr<Aws::Http::HttpClient>
- CreateHttpClient(const Aws::Client::ClientConfiguration & clientConfiguration) const override;
+ CreateHttpClient(const Aws::Client::ClientConfiguration & client_configuration) const override;
[[nodiscard]] std::shared_ptr<Aws::Http::HttpRequest>
CreateHttpRequest(const Aws::String & uri, Aws::Http::HttpMethod method, const Aws::IOStreamFactory & streamFactory) const override;
[[nodiscard]] std::shared_ptr<Aws::Http::HttpRequest>

View File

@ -655,6 +655,7 @@ namespace
void performCopy()
{
LOG_TEST(log, "Copy object {} to {} using native copy", src_key, dest_key);
if (!supports_multipart_copy || size <= upload_settings.max_single_operation_copy_size)
performSingleOperationCopy();
else

View File

@ -16,6 +16,7 @@
#include <Common/Throttler_fwd.h>
#include <IO/S3/URI.h>
#include <IO/S3/Credentials.h>
#include <aws/core/Aws.h>
#include <aws/s3/S3Errors.h>

Some files were not shown because too many files have changed in this diff