Merge branch 'master' into fix-27179

This commit is contained in:
Nikolai Kochetov 2021-08-09 11:59:22 +03:00 committed by GitHub
commit 372293b85d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
290 changed files with 4686 additions and 1827 deletions

3
.gitmodules vendored
View File

@ -243,3 +243,6 @@
[submodule "contrib/s2geometry"]
path = contrib/s2geometry
url = https://github.com/ClickHouse-Extras/s2geometry.git
[submodule "contrib/bzip2"]
path = contrib/bzip2
url = https://github.com/ClickHouse-Extras/bzip2.git

View File

@ -543,6 +543,7 @@ include (cmake/find/nuraft.cmake)
include (cmake/find/yaml-cpp.cmake)
include (cmake/find/s2geometry.cmake)
include (cmake/find/nlp.cmake)
include (cmake/find/bzip2.cmake)
if(NOT USE_INTERNAL_PARQUET_LIBRARY)
set (ENABLE_ORC OFF CACHE INTERNAL "")

View File

@ -1,57 +0,0 @@
#pragma once
#include <new>
#include "defines.h"
#if USE_JEMALLOC
# include <jemalloc/jemalloc.h>
#endif
#if !USE_JEMALLOC || JEMALLOC_VERSION_MAJOR < 4
# include <cstdlib>
#endif
namespace Memory
{
inline ALWAYS_INLINE void * newImpl(std::size_t size)
{
auto * ptr = malloc(size);
if (likely(ptr != nullptr))
return ptr;
/// @note no std::get_new_handler logic implemented
throw std::bad_alloc{};
}
inline ALWAYS_INLINE void * newNoExept(std::size_t size) noexcept
{
return malloc(size);
}
inline ALWAYS_INLINE void deleteImpl(void * ptr) noexcept
{
free(ptr);
}
#if USE_JEMALLOC && JEMALLOC_VERSION_MAJOR >= 4
inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size) noexcept
{
if (unlikely(ptr == nullptr))
return;
sdallocx(ptr, size, 0);
}
#else
inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unused]]) noexcept
{
free(ptr);
}
#endif
}

19
cmake/find/bzip2.cmake Normal file
View File

@ -0,0 +1,19 @@
option(ENABLE_BZIP2 "Enable bzip2 compression support" ${ENABLE_LIBRARIES})
if (NOT ENABLE_BZIP2)
message (STATUS "bzip2 compression disabled")
return()
endif()
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/bzip2/bzlib.h")
message (WARNING "submodule contrib/bzip2 is missing. to fix try run: \n git submodule update --init --recursive")
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal bzip2 library")
set (USE_NLP 0)
return()
endif ()
set (USE_BZIP2 1)
set (BZIP2_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/bzip2")
set (BZIP2_LIBRARY bzip2)
message (STATUS "Using bzip2=${USE_BZIP2}: ${BZIP2_INCLUDE_DIR} : ${BZIP2_LIBRARY}")

View File

@ -334,6 +334,10 @@ if (USE_NLP)
add_subdirectory(lemmagen-c-cmake)
endif()
if (USE_BZIP2)
add_subdirectory(bzip2-cmake)
endif()
if (USE_SQLITE)
add_subdirectory(sqlite-cmake)
endif()

2
contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit 0ce9490093021c63564cca159571a8b27772ad48
Subproject commit 7ecb16844af6a9c283ad432d85ecc2e7d1544676

1
contrib/bzip2 vendored Submodule

@ -0,0 +1 @@
Subproject commit bf905ea2251191ff9911ae7ec0cfc35d41f9f7f6

View File

@ -0,0 +1,23 @@
set(BZIP2_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/bzip2")
set(BZIP2_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/bzip2")
set(SRCS
"${BZIP2_SOURCE_DIR}/blocksort.c"
"${BZIP2_SOURCE_DIR}/huffman.c"
"${BZIP2_SOURCE_DIR}/crctable.c"
"${BZIP2_SOURCE_DIR}/randtable.c"
"${BZIP2_SOURCE_DIR}/compress.c"
"${BZIP2_SOURCE_DIR}/decompress.c"
"${BZIP2_SOURCE_DIR}/bzlib.c"
)
# From bzip2/CMakeLists.txt
set(BZ_VERSION "1.0.7")
configure_file (
"${BZIP2_SOURCE_DIR}/bz_version.h.in"
"${BZIP2_BINARY_DIR}/bz_version.h"
)
add_library(bzip2 ${SRCS})
target_include_directories(bzip2 PUBLIC "${BZIP2_SOURCE_DIR}" "${BZIP2_BINARY_DIR}")

View File

@ -24,3 +24,19 @@ add_library(roaring ${SRCS})
target_include_directories(roaring PRIVATE "${LIBRARY_DIR}/include/roaring")
target_include_directories(roaring SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/include")
target_include_directories(roaring SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/cpp")
# We redirect malloc/free family of functions to different functions that will track memory in ClickHouse.
# It will make this library depend on linking to 'clickhouse_common_io' library that is not done explicitly via 'target_link_libraries'.
# And we check that all libraries dependencies are satisfied and all symbols are resolved if we do build with shared libraries.
# That's why we enable it only in static build.
# Also note that we exploit implicit function declarations.
if (USE_STATIC_LIBRARIES)
target_compile_definitions(roaring PRIVATE
-Dmalloc=clickhouse_malloc
-Dcalloc=clickhouse_calloc
-Drealloc=clickhouse_realloc
-Dreallocarray=clickhouse_reallocarray
-Dfree=clickhouse_free
-Dposix_memalign=clickhouse_posix_memalign)
endif ()

2
contrib/zlib-ng vendored

@ -1 +1 @@
Subproject commit db232d30b4c72fd58e6d7eae2d12cebf9c3d90db
Subproject commit 6a5e93b9007782115f7f7e5235dedc81c4f1facb

View File

@ -279,6 +279,7 @@ function run_tests
00926_multimatch
00929_multi_match_edit_distance
01681_hyperscan_debug_assertion
02004_max_hyperscan_regex_length
01176_mysql_client_interactive # requires mysql client
01031_mutations_interpreter_and_context
@ -312,6 +313,7 @@ function run_tests
01798_uniq_theta_sketch
01799_long_uniq_theta_sketch
01890_stem # depends on libstemmer_c
02003_compress_bz2 # depends on bzip2
collate
collation
_orc_

View File

@ -28,7 +28,7 @@ RUN apt-get update --yes \
ENV PKG_VERSION="pvs-studio-latest"
RUN set -x \
&& export PUBKEY_HASHSUM="486a0694c7f92e96190bbfac01c3b5ac2cb7823981db510a28f744c99eabbbf17a7bcee53ca42dc6d84d4323c2742761" \
&& export PUBKEY_HASHSUM="686e5eb8b3c543a5c54442c39ec876b6c2d912fe8a729099e600017ae53c877dda3368fe38ed7a66024fe26df6b5892a" \
&& wget -nv https://files.viva64.com/etc/pubkey.txt -O /tmp/pubkey.txt \
&& echo "${PUBKEY_HASHSUM} /tmp/pubkey.txt" | sha384sum -c \
&& apt-key add /tmp/pubkey.txt \

View File

@ -3,45 +3,52 @@ toc_priority: 30
toc_title: MaterializedPostgreSQL
---
# MaterializedPostgreSQL {#materialize-postgresql}
# [experimental] MaterializedPostgreSQL {#materialize-postgresql}
Creates ClickHouse database with an initial data dump of PostgreSQL database tables and starts replication process, i.e. executes background job to apply new changes as they happen on PostgreSQL database tables in the remote PostgreSQL database.
ClickHouse server works as PostgreSQL replica. It reads WAL and performs DML queries. DDL is not replicated, but can be handled (described below).
## Creating a Database {#creating-a-database}
``` sql
CREATE DATABASE test_database
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password'
SELECT * FROM test_database.postgres_table;
CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster]
ENGINE = MaterializedPostgreSQL('host:port', ['database' | database], 'user', 'password') [SETTINGS ...]
```
**Engine Parameters**
- `host:port` — PostgreSQL server endpoint.
- `database` — PostgreSQL database name.
- `user` — PostgreSQL user.
- `password` — User password.
## Settings {#settings}
1. `materialized_postgresql_max_block_size` - Number of rows collected before flushing data into table. Default: `65536`.
- [materialized_postgresql_max_block_size](../../operations/settings/settings.md#materialized-postgresql-max-block-size)
2. `materialized_postgresql_tables_list` - List of tables for MaterializedPostgreSQL database engine. Default: `whole database`.
- [materialized_postgresql_tables_list](../../operations/settings/settings.md#materialized-postgresql-tables-list)
3. `materialized_postgresql_allow_automatic_update` - Allow to reload table in the background, when schema changes are detected. Default: `0` (`false`).
- [materialized_postgresql_allow_automatic_update](../../operations/settings/settings.md#materialized-postgresql-allow-automatic-update)
``` sql
CREATE DATABASE test_database
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password'
CREATE DATABASE database1
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password')
SETTINGS materialized_postgresql_max_block_size = 65536,
materialized_postgresql_tables_list = 'table1,table2,table3';
SELECT * FROM test_database.table1;
SELECT * FROM database1.table1;
```
## Requirements {#requirements}
- Setting `wal_level`to `logical` and `max_replication_slots` to at least `2` in the postgresql config file.
1. The [wal_level](https://www.postgresql.org/docs/current/runtime-config-wal.html) setting must have a value `logical` and `max_replication_slots` parameter must have a value at least `2` in the PostgreSQL config file.
- Each replicated table must have one of the following **replica identity**:
2. Each replicated table must have one of the following [replica identity](https://www.postgresql.org/docs/10/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY):
1. **default** (primary key)
- primary key (by default)
2. **index**
- index
``` bash
postgres# CREATE TABLE postgres_table (a Integer NOT NULL, b Integer, c Integer NOT NULL, d Integer, e Integer NOT NULL);
@ -49,9 +56,8 @@ postgres# CREATE unique INDEX postgres_table_index on postgres_table(a, c, e);
postgres# ALTER TABLE postgres_table REPLICA IDENTITY USING INDEX postgres_table_index;
```
Primary key is always checked first. If it is absent, then index, defined as replica identity index, is checked.
If index is used as replica identity, there has to be only one such index in a table.
The primary key is always checked first. If it is absent, then the index, defined as replica identity index, is checked.
If the index is used as a replica identity, there has to be only one such index in a table.
You can check what type is used for a specific table with the following command:
``` bash
@ -65,7 +71,14 @@ FROM pg_class
WHERE oid = 'postgres_table'::regclass;
```
!!! warning "Warning"
Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used.
## Warning {#warning}
## Example of Use {#example-of-use}
1. **TOAST** values convertion is not supported. Default value for the data type will be used.
``` sql
CREATE DATABASE postgresql_db
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password');
SELECT * FROM postgresql_db.postgres_table;
```

View File

@ -5,42 +5,52 @@ toc_title: MaterializedPostgreSQL
# MaterializedPostgreSQL {#materialize-postgresql}
Creates ClickHouse table with an initial data dump of PostgreSQL table and starts replication process, i.e. executes background job to apply new changes as they happen on PostgreSQL table in the remote PostgreSQL database.
If more than one table is required, it is highly recommended to use the [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md) database engine instead of the table engine and use the [materialized_postgresql_tables_list](../../../operations/settings/settings.md#materialized-postgresql-tables-list) setting, which specifies the tables to be replicated. It will be much better in terms of CPU, fewer connections and fewer replication slots inside the remote PostgreSQL database.
## Creating a Table {#creating-a-table}
``` sql
CREATE TABLE test.postgresql_replica (key UInt64, value UInt64)
CREATE TABLE postgresql_db.postgresql_replica (key UInt64, value UInt64)
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_replica', 'postgres_user', 'postgres_password')
PRIMARY KEY key;
```
**Engine Parameters**
- `host:port` — PostgreSQL server address.
- `database` — Remote database name.
- `table` — Remote table name.
- `user` — PostgreSQL user.
- `password` — User password.
## Requirements {#requirements}
- Setting `wal_level`to `logical` and `max_replication_slots` to at least `2` in the postgresql config file.
1. The [wal_level](https://www.postgresql.org/docs/current/runtime-config-wal.html) setting must have a value `logical` and `max_replication_slots` parameter must have a value at least `2` in the PostgreSQL config file.
- A table with engine `MaterializedPostgreSQL` must have a primary key - the same as a replica identity index (default: primary key) of a postgres table (See [details on replica identity index](../../database-engines/materialized-postgresql.md#requirements)).
2. A table with `MaterializedPostgreSQL` engine must have a primary key — the same as a replica identity index (by default: primary key) of a PostgreSQL table (see [details on replica identity index](../../../engines/database-engines/materialized-postgresql.md#requirements)).
- Only database `Atomic` is allowed.
3. Only database [Atomic](https://en.wikipedia.org/wiki/Atomicity_(database_systems)) is allowed.
## Virtual columns {#virtual-columns}
## Virtual columns {#creating-a-table}
- `_version` — Transaction counter. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `_version` (`UInt64`)
- `_sign` — Deletion mark. Type: [Int8](../../../sql-reference/data-types/int-uint.md). Possible values:
- `1` — Row is not deleted,
- `-1` — Row is deleted.
- `_sign` (`Int8`)
These columns do not need to be added, when table is created. They are always accessible in `SELECT` query.
These columns do not need to be added when a table is created. They are always accessible in `SELECT` query.
`_version` column equals `LSN` position in `WAL`, so it might be used to check how up-to-date replication is.
``` sql
CREATE TABLE test.postgresql_replica (key UInt64, value UInt64)
CREATE TABLE postgresql_db.postgresql_replica (key UInt64, value UInt64)
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_replica', 'postgres_user', 'postgres_password')
PRIMARY KEY key;
SELECT key, value, _version FROM test.postgresql_replica;
SELECT key, value, _version FROM postgresql_db.postgresql_replica;
```
## Warning {#warning}
1. **TOAST** values convertion is not supported. Default value for the data type will be used.
!!! warning "Warning"
Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used.

View File

@ -14,7 +14,9 @@ The list of documented datasets:
- [Anonymized Yandex.Metrica Dataset](../../getting-started/example-datasets/metrica.md)
- [Recipes](../../getting-started/example-datasets/recipes.md)
- [OnTime](../../getting-started/example-datasets/ontime.md)
- [OpenSky](../../getting-started/example-datasets/opensky.md)
- [New York Taxi Data](../../getting-started/example-datasets/nyc-taxi.md)
- [UK Property Price Paid](../../getting-started/example-datasets/uk-price-paid.md)
- [Star Schema Benchmark](../../getting-started/example-datasets/star-schema.md)
- [WikiStat](../../getting-started/example-datasets/wikistat.md)
- [Terabyte of Click Logs from Criteo](../../getting-started/example-datasets/criteo.md)

View File

@ -0,0 +1,384 @@
---
toc_priority: 20
toc_title: OpenSky
---
# Crowdsourced air traffic data from The OpenSky Network 2020
"The data in this dataset is derived and cleaned from the full OpenSky dataset to illustrate the development of air traffic during the COVID-19 pandemic. It spans all flights seen by the network's more than 2500 members since 1 January 2019. More data will be periodically included in the dataset until the end of the COVID-19 pandemic".
Source: https://zenodo.org/record/5092942#.YRBCyTpRXYd
Martin Strohmeier, Xavier Olive, Jannis Lübbe, Matthias Schäfer, and Vincent Lenders
"Crowdsourced air traffic data from the OpenSky Network 20192020"
Earth System Science Data 13(2), 2021
https://doi.org/10.5194/essd-13-357-2021
## Download the Dataset
```
wget -O- https://zenodo.org/record/5092942 | grep -oP 'https://zenodo.org/record/5092942/files/flightlist_\d+_\d+\.csv\.gz' | xargs wget
```
Download will take about 2 minutes with good internet connection. There are 30 files with total size of 4.3 GB.
## Create the Table
```
CREATE TABLE opensky
(
callsign String,
number String,
icao24 String,
registration String,
typecode String,
origin String,
destination String,
firstseen DateTime,
lastseen DateTime,
day DateTime,
latitude_1 Float64,
longitude_1 Float64,
altitude_1 Float64,
latitude_2 Float64,
longitude_2 Float64,
altitude_2 Float64
) ENGINE = MergeTree ORDER BY (origin, destination, callsign);
```
## Import Data
Upload data into ClickHouse in parallel:
```
ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c '
gzip -c -d "{}" | clickhouse-client --date_time_input_format best_effort --query "INSERT INTO opensky FORMAT CSVWithNames"'
```
Here we pass the list of files (`ls -1 flightlist_*.csv.gz`) to `xargs` for parallel processing.
`xargs -P100` specifies to use up to 100 parallel workers but as we only have 30 files, the number of workers will be only 30.
For every file, `xargs` will run a script with `bash -c`. The script has substitution in form of `{}` and the `xargs` command will substitute the filename to it (we have asked it for xargs with `-I{}`).
The script will decompress the file (`gzip -c -d "{}"`) to standard output (`-c` parameter) and the output is redirected to `clickhouse-client`.
Finally, `clickhouse-client` will do insertion. It will read input data in `CSVWithNames` format. We also asked to parse DateTime fields with extended parser (`--date_time_input_format best_effort`) to recognize ISO-8601 format with timezone offsets.
Parallel upload takes 24 seconds.
If you don't like parallel upload, here is sequential variant:
```
for file in flightlist_*.csv.gz; do gzip -c -d "$file" | clickhouse-client --date_time_input_format best_effort --query "INSERT INTO opensky FORMAT CSVWithNames"; done
```
## Validate the Data
```
SELECT count() FROM opensky
66010819
```
The size of dataset in ClickHouse is just 2.64 GiB:
```
SELECT formatReadableSize(total_bytes) FROM system.tables WHERE name = 'opensky'
2.64 GiB
```
## Run Some Queries
Total distance travelled is 68 billion kilometers:
```
SELECT formatReadableQuantity(sum(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2)) / 1000) FROM opensky
┌─formatReadableQuantity(divide(sum(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2)), 1000))─┐
│ 68.72 billion │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
Average flight distance is around 1000 km.
```
SELECT avg(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2)) FROM opensky
┌─avg(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2))─┐
│ 1041090.6465708319 │
└────────────────────────────────────────────────────────────────────┘
```
### Most busy origin airports and the average distance seen:
```
SELECT
origin,
count(),
round(avg(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2))) AS distance,
bar(distance, 0, 10000000, 100) AS bar
FROM opensky
WHERE origin != ''
GROUP BY origin
ORDER BY count() DESC
LIMIT 100
Query id: f9010ea5-97d0-45a3-a5bd-9657906cd105
┌─origin─┬─count()─┬─distance─┬─bar────────────────────────────────────┐
1. │ KORD │ 745007 │ 1546108 │ ███████████████▍ │
2. │ KDFW │ 696702 │ 1358721 │ █████████████▌ │
3. │ KATL │ 667286 │ 1169661 │ ███████████▋ │
4. │ KDEN │ 582709 │ 1287742 │ ████████████▊ │
5. │ KLAX │ 581952 │ 2628393 │ ██████████████████████████▎ │
6. │ KLAS │ 447789 │ 1336967 │ █████████████▎ │
7. │ KPHX │ 428558 │ 1345635 │ █████████████▍ │
8. │ KSEA │ 412592 │ 1757317 │ █████████████████▌ │
9. │ KCLT │ 404612 │ 880355 │ ████████▋ │
10. │ VIDP │ 363074 │ 1445052 │ ██████████████▍ │
11. │ EDDF │ 362643 │ 2263960 │ ██████████████████████▋ │
12. │ KSFO │ 361869 │ 2445732 │ ████████████████████████▍ │
13. │ KJFK │ 349232 │ 2996550 │ █████████████████████████████▊ │
14. │ KMSP │ 346010 │ 1287328 │ ████████████▋ │
15. │ LFPG │ 344748 │ 2206203 │ ██████████████████████ │
16. │ EGLL │ 341370 │ 3216593 │ ████████████████████████████████▏ │
17. │ EHAM │ 340272 │ 2116425 │ █████████████████████▏ │
18. │ KEWR │ 337696 │ 1826545 │ ██████████████████▎ │
19. │ KPHL │ 320762 │ 1291761 │ ████████████▊ │
20. │ OMDB │ 308855 │ 2855706 │ ████████████████████████████▌ │
21. │ UUEE │ 307098 │ 1555122 │ ███████████████▌ │
22. │ KBOS │ 304416 │ 1621675 │ ████████████████▏ │
23. │ LEMD │ 291787 │ 1695097 │ ████████████████▊ │
24. │ YSSY │ 272979 │ 1875298 │ ██████████████████▋ │
25. │ KMIA │ 265121 │ 1923542 │ ███████████████████▏ │
26. │ ZGSZ │ 263497 │ 745086 │ ███████▍ │
27. │ EDDM │ 256691 │ 1361453 │ █████████████▌ │
28. │ WMKK │ 254264 │ 1626688 │ ████████████████▎ │
29. │ CYYZ │ 251192 │ 2175026 │ █████████████████████▋ │
30. │ KLGA │ 248699 │ 1106935 │ ███████████ │
31. │ VHHH │ 248473 │ 3457658 │ ██████████████████████████████████▌ │
32. │ RJTT │ 243477 │ 1272744 │ ████████████▋ │
33. │ KBWI │ 241440 │ 1187060 │ ███████████▋ │
34. │ KIAD │ 239558 │ 1683485 │ ████████████████▋ │
35. │ KIAH │ 234202 │ 1538335 │ ███████████████▍ │
36. │ KFLL │ 223447 │ 1464410 │ ██████████████▋ │
37. │ KDAL │ 212055 │ 1082339 │ ██████████▋ │
38. │ KDCA │ 207883 │ 1013359 │ ██████████▏ │
39. │ LIRF │ 207047 │ 1427965 │ ██████████████▎ │
40. │ PANC │ 206007 │ 2525359 │ █████████████████████████▎ │
41. │ LTFJ │ 205415 │ 860470 │ ████████▌ │
42. │ KDTW │ 204020 │ 1106716 │ ███████████ │
43. │ VABB │ 201679 │ 1300865 │ █████████████ │
44. │ OTHH │ 200797 │ 3759544 │ █████████████████████████████████████▌ │
45. │ KMDW │ 200796 │ 1232551 │ ████████████▎ │
46. │ KSAN │ 198003 │ 1495195 │ ██████████████▊ │
47. │ KPDX │ 197760 │ 1269230 │ ████████████▋ │
48. │ SBGR │ 197624 │ 2041697 │ ████████████████████▍ │
49. │ VOBL │ 189011 │ 1040180 │ ██████████▍ │
50. │ LEBL │ 188956 │ 1283190 │ ████████████▋ │
51. │ YBBN │ 188011 │ 1253405 │ ████████████▌ │
52. │ LSZH │ 187934 │ 1572029 │ ███████████████▋ │
53. │ YMML │ 187643 │ 1870076 │ ██████████████████▋ │
54. │ RCTP │ 184466 │ 2773976 │ ███████████████████████████▋ │
55. │ KSNA │ 180045 │ 778484 │ ███████▋ │
56. │ EGKK │ 176420 │ 1694770 │ ████████████████▊ │
57. │ LOWW │ 176191 │ 1274833 │ ████████████▋ │
58. │ UUDD │ 176099 │ 1368226 │ █████████████▋ │
59. │ RKSI │ 173466 │ 3079026 │ ██████████████████████████████▋ │
60. │ EKCH │ 172128 │ 1229895 │ ████████████▎ │
61. │ KOAK │ 171119 │ 1114447 │ ███████████▏ │
62. │ RPLL │ 170122 │ 1440735 │ ██████████████▍ │
63. │ KRDU │ 167001 │ 830521 │ ████████▎ │
64. │ KAUS │ 164524 │ 1256198 │ ████████████▌ │
65. │ KBNA │ 163242 │ 1022726 │ ██████████▏ │
66. │ KSDF │ 162655 │ 1380867 │ █████████████▋ │
67. │ ENGM │ 160732 │ 910108 │ █████████ │
68. │ LIMC │ 160696 │ 1564620 │ ███████████████▋ │
69. │ KSJC │ 159278 │ 1081125 │ ██████████▋ │
70. │ KSTL │ 157984 │ 1026699 │ ██████████▎ │
71. │ UUWW │ 156811 │ 1261155 │ ████████████▌ │
72. │ KIND │ 153929 │ 987944 │ █████████▊ │
73. │ ESSA │ 153390 │ 1203439 │ ████████████ │
74. │ KMCO │ 153351 │ 1508657 │ ███████████████ │
75. │ KDVT │ 152895 │ 74048 │ ▋ │
76. │ VTBS │ 152645 │ 2255591 │ ██████████████████████▌ │
77. │ CYVR │ 149574 │ 2027413 │ ████████████████████▎ │
78. │ EIDW │ 148723 │ 1503985 │ ███████████████ │
79. │ LFPO │ 143277 │ 1152964 │ ███████████▌ │
80. │ EGSS │ 140830 │ 1348183 │ █████████████▍ │
81. │ KAPA │ 140776 │ 420441 │ ████▏ │
82. │ KHOU │ 138985 │ 1068806 │ ██████████▋ │
83. │ KTPA │ 138033 │ 1338223 │ █████████████▍ │
84. │ KFFZ │ 137333 │ 55397 │ ▌ │
85. │ NZAA │ 136092 │ 1581264 │ ███████████████▋ │
86. │ YPPH │ 133916 │ 1271550 │ ████████████▋ │
87. │ RJBB │ 133522 │ 1805623 │ ██████████████████ │
88. │ EDDL │ 133018 │ 1265919 │ ████████████▋ │
89. │ ULLI │ 130501 │ 1197108 │ ███████████▊ │
90. │ KIWA │ 127195 │ 250876 │ ██▌ │
91. │ KTEB │ 126969 │ 1189414 │ ███████████▊ │
92. │ VOMM │ 125616 │ 1127757 │ ███████████▎ │
93. │ LSGG │ 123998 │ 1049101 │ ██████████▍ │
94. │ LPPT │ 122733 │ 1779187 │ █████████████████▋ │
95. │ WSSS │ 120493 │ 3264122 │ ████████████████████████████████▋ │
96. │ EBBR │ 118539 │ 1579939 │ ███████████████▋ │
97. │ VTBD │ 118107 │ 661627 │ ██████▌ │
98. │ KVNY │ 116326 │ 692960 │ ██████▊ │
99. │ EDDT │ 115122 │ 941740 │ █████████▍ │
100. │ EFHK │ 114860 │ 1629143 │ ████████████████▎ │
└────────┴─────────┴──────────┴────────────────────────────────────────┘
100 rows in set. Elapsed: 0.186 sec. Processed 48.31 million rows, 2.17 GB (259.27 million rows/s., 11.67 GB/s.)
```
### Number of flights from three major Moscow airports, weekly:
```
SELECT
toMonday(day) AS k,
count() AS c,
bar(c, 0, 10000, 100) AS bar
FROM opensky
WHERE origin IN ('UUEE', 'UUDD', 'UUWW')
GROUP BY k
ORDER BY k ASC
Query id: 1b446157-9519-4cc4-a1cb-178dfcc15a8e
┌──────────k─┬────c─┬─bar──────────────────────────────────────────────────────────────────────────┐
1. │ 2018-12-31 │ 5248 │ ████████████████████████████████████████████████████▍ │
2. │ 2019-01-07 │ 6302 │ ███████████████████████████████████████████████████████████████ │
3. │ 2019-01-14 │ 5701 │ █████████████████████████████████████████████████████████ │
4. │ 2019-01-21 │ 5638 │ ████████████████████████████████████████████████████████▍ │
5. │ 2019-01-28 │ 5731 │ █████████████████████████████████████████████████████████▎ │
6. │ 2019-02-04 │ 5683 │ ████████████████████████████████████████████████████████▋ │
7. │ 2019-02-11 │ 5759 │ █████████████████████████████████████████████████████████▌ │
8. │ 2019-02-18 │ 5736 │ █████████████████████████████████████████████████████████▎ │
9. │ 2019-02-25 │ 5873 │ ██████████████████████████████████████████████████████████▋ │
10. │ 2019-03-04 │ 5965 │ ███████████████████████████████████████████████████████████▋ │
11. │ 2019-03-11 │ 5900 │ ███████████████████████████████████████████████████████████ │
12. │ 2019-03-18 │ 5823 │ ██████████████████████████████████████████████████████████▏ │
13. │ 2019-03-25 │ 5899 │ ██████████████████████████████████████████████████████████▊ │
14. │ 2019-04-01 │ 6043 │ ████████████████████████████████████████████████████████████▍ │
15. │ 2019-04-08 │ 6098 │ ████████████████████████████████████████████████████████████▊ │
16. │ 2019-04-15 │ 6196 │ █████████████████████████████████████████████████████████████▊ │
17. │ 2019-04-22 │ 6486 │ ████████████████████████████████████████████████████████████████▋ │
18. │ 2019-04-29 │ 6682 │ ██████████████████████████████████████████████████████████████████▋ │
19. │ 2019-05-06 │ 6739 │ ███████████████████████████████████████████████████████████████████▍ │
20. │ 2019-05-13 │ 6600 │ ██████████████████████████████████████████████████████████████████ │
21. │ 2019-05-20 │ 6575 │ █████████████████████████████████████████████████████████████████▋ │
22. │ 2019-05-27 │ 6786 │ ███████████████████████████████████████████████████████████████████▋ │
23. │ 2019-06-03 │ 6872 │ ████████████████████████████████████████████████████████████████████▋ │
24. │ 2019-06-10 │ 7045 │ ██████████████████████████████████████████████████████████████████████▍ │
25. │ 2019-06-17 │ 7045 │ ██████████████████████████████████████████████████████████████████████▍ │
26. │ 2019-06-24 │ 6852 │ ████████████████████████████████████████████████████████████████████▌ │
27. │ 2019-07-01 │ 7248 │ ████████████████████████████████████████████████████████████████████████▍ │
28. │ 2019-07-08 │ 7284 │ ████████████████████████████████████████████████████████████████████████▋ │
29. │ 2019-07-15 │ 7142 │ ███████████████████████████████████████████████████████████████████████▍ │
30. │ 2019-07-22 │ 7108 │ ███████████████████████████████████████████████████████████████████████ │
31. │ 2019-07-29 │ 7251 │ ████████████████████████████████████████████████████████████████████████▌ │
32. │ 2019-08-05 │ 7403 │ ██████████████████████████████████████████████████████████████████████████ │
33. │ 2019-08-12 │ 7457 │ ██████████████████████████████████████████████████████████████████████████▌ │
34. │ 2019-08-19 │ 7502 │ ███████████████████████████████████████████████████████████████████████████ │
35. │ 2019-08-26 │ 7540 │ ███████████████████████████████████████████████████████████████████████████▍ │
36. │ 2019-09-02 │ 7237 │ ████████████████████████████████████████████████████████████████████████▎ │
37. │ 2019-09-09 │ 7328 │ █████████████████████████████████████████████████████████████████████████▎ │
38. │ 2019-09-16 │ 5566 │ ███████████████████████████████████████████████████████▋ │
39. │ 2019-09-23 │ 7049 │ ██████████████████████████████████████████████████████████████████████▍ │
40. │ 2019-09-30 │ 6880 │ ████████████████████████████████████████████████████████████████████▋ │
41. │ 2019-10-07 │ 6518 │ █████████████████████████████████████████████████████████████████▏ │
42. │ 2019-10-14 │ 6688 │ ██████████████████████████████████████████████████████████████████▊ │
43. │ 2019-10-21 │ 6667 │ ██████████████████████████████████████████████████████████████████▋ │
44. │ 2019-10-28 │ 6303 │ ███████████████████████████████████████████████████████████████ │
45. │ 2019-11-04 │ 6298 │ ██████████████████████████████████████████████████████████████▊ │
46. │ 2019-11-11 │ 6137 │ █████████████████████████████████████████████████████████████▎ │
47. │ 2019-11-18 │ 6051 │ ████████████████████████████████████████████████████████████▌ │
48. │ 2019-11-25 │ 5820 │ ██████████████████████████████████████████████████████████▏ │
49. │ 2019-12-02 │ 5942 │ ███████████████████████████████████████████████████████████▍ │
50. │ 2019-12-09 │ 4891 │ ████████████████████████████████████████████████▊ │
51. │ 2019-12-16 │ 5682 │ ████████████████████████████████████████████████████████▋ │
52. │ 2019-12-23 │ 6111 │ █████████████████████████████████████████████████████████████ │
53. │ 2019-12-30 │ 5870 │ ██████████████████████████████████████████████████████████▋ │
54. │ 2020-01-06 │ 5953 │ ███████████████████████████████████████████████████████████▌ │
55. │ 2020-01-13 │ 5698 │ ████████████████████████████████████████████████████████▊ │
56. │ 2020-01-20 │ 5339 │ █████████████████████████████████████████████████████▍ │
57. │ 2020-01-27 │ 5566 │ ███████████████████████████████████████████████████████▋ │
58. │ 2020-02-03 │ 5801 │ ██████████████████████████████████████████████████████████ │
59. │ 2020-02-10 │ 5692 │ ████████████████████████████████████████████████████████▊ │
60. │ 2020-02-17 │ 5912 │ ███████████████████████████████████████████████████████████ │
61. │ 2020-02-24 │ 6031 │ ████████████████████████████████████████████████████████████▎ │
62. │ 2020-03-02 │ 6105 │ █████████████████████████████████████████████████████████████ │
63. │ 2020-03-09 │ 5823 │ ██████████████████████████████████████████████████████████▏ │
64. │ 2020-03-16 │ 4659 │ ██████████████████████████████████████████████▌ │
65. │ 2020-03-23 │ 3720 │ █████████████████████████████████████▏ │
66. │ 2020-03-30 │ 1720 │ █████████████████▏ │
67. │ 2020-04-06 │ 849 │ ████████▍ │
68. │ 2020-04-13 │ 710 │ ███████ │
69. │ 2020-04-20 │ 725 │ ███████▏ │
70. │ 2020-04-27 │ 920 │ █████████▏ │
71. │ 2020-05-04 │ 859 │ ████████▌ │
72. │ 2020-05-11 │ 1047 │ ██████████▍ │
73. │ 2020-05-18 │ 1135 │ ███████████▎ │
74. │ 2020-05-25 │ 1266 │ ████████████▋ │
75. │ 2020-06-01 │ 1793 │ █████████████████▊ │
76. │ 2020-06-08 │ 1979 │ ███████████████████▋ │
77. │ 2020-06-15 │ 2297 │ ██████████████████████▊ │
78. │ 2020-06-22 │ 2788 │ ███████████████████████████▊ │
79. │ 2020-06-29 │ 3389 │ █████████████████████████████████▊ │
80. │ 2020-07-06 │ 3545 │ ███████████████████████████████████▍ │
81. │ 2020-07-13 │ 3569 │ ███████████████████████████████████▋ │
82. │ 2020-07-20 │ 3784 │ █████████████████████████████████████▋ │
83. │ 2020-07-27 │ 3960 │ ███████████████████████████████████████▌ │
84. │ 2020-08-03 │ 4323 │ ███████████████████████████████████████████▏ │
85. │ 2020-08-10 │ 4581 │ █████████████████████████████████████████████▋ │
86. │ 2020-08-17 │ 4791 │ ███████████████████████████████████████████████▊ │
87. │ 2020-08-24 │ 4928 │ █████████████████████████████████████████████████▎ │
88. │ 2020-08-31 │ 4687 │ ██████████████████████████████████████████████▋ │
89. │ 2020-09-07 │ 4643 │ ██████████████████████████████████████████████▍ │
90. │ 2020-09-14 │ 4594 │ █████████████████████████████████████████████▊ │
91. │ 2020-09-21 │ 4478 │ ████████████████████████████████████████████▋ │
92. │ 2020-09-28 │ 4382 │ ███████████████████████████████████████████▋ │
93. │ 2020-10-05 │ 4261 │ ██████████████████████████████████████████▌ │
94. │ 2020-10-12 │ 4243 │ ██████████████████████████████████████████▍ │
95. │ 2020-10-19 │ 3941 │ ███████████████████████████████████████▍ │
96. │ 2020-10-26 │ 3616 │ ████████████████████████████████████▏ │
97. │ 2020-11-02 │ 3586 │ ███████████████████████████████████▋ │
98. │ 2020-11-09 │ 3403 │ ██████████████████████████████████ │
99. │ 2020-11-16 │ 3336 │ █████████████████████████████████▎ │
100. │ 2020-11-23 │ 3230 │ ████████████████████████████████▎ │
101. │ 2020-11-30 │ 3183 │ ███████████████████████████████▋ │
102. │ 2020-12-07 │ 3285 │ ████████████████████████████████▋ │
103. │ 2020-12-14 │ 3367 │ █████████████████████████████████▋ │
104. │ 2020-12-21 │ 3748 │ █████████████████████████████████████▍ │
105. │ 2020-12-28 │ 3986 │ ███████████████████████████████████████▋ │
106. │ 2021-01-04 │ 3906 │ ███████████████████████████████████████ │
107. │ 2021-01-11 │ 3425 │ ██████████████████████████████████▎ │
108. │ 2021-01-18 │ 3144 │ ███████████████████████████████▍ │
109. │ 2021-01-25 │ 3115 │ ███████████████████████████████▏ │
110. │ 2021-02-01 │ 3285 │ ████████████████████████████████▋ │
111. │ 2021-02-08 │ 3321 │ █████████████████████████████████▏ │
112. │ 2021-02-15 │ 3475 │ ██████████████████████████████████▋ │
113. │ 2021-02-22 │ 3549 │ ███████████████████████████████████▍ │
114. │ 2021-03-01 │ 3755 │ █████████████████████████████████████▌ │
115. │ 2021-03-08 │ 3080 │ ██████████████████████████████▋ │
116. │ 2021-03-15 │ 3789 │ █████████████████████████████████████▊ │
117. │ 2021-03-22 │ 3804 │ ██████████████████████████████████████ │
118. │ 2021-03-29 │ 4238 │ ██████████████████████████████████████████▍ │
119. │ 2021-04-05 │ 4307 │ ███████████████████████████████████████████ │
120. │ 2021-04-12 │ 4225 │ ██████████████████████████████████████████▎ │
121. │ 2021-04-19 │ 4391 │ ███████████████████████████████████████████▊ │
122. │ 2021-04-26 │ 4868 │ ████████████████████████████████████████████████▋ │
123. │ 2021-05-03 │ 4977 │ █████████████████████████████████████████████████▋ │
124. │ 2021-05-10 │ 5164 │ ███████████████████████████████████████████████████▋ │
125. │ 2021-05-17 │ 4986 │ █████████████████████████████████████████████████▋ │
126. │ 2021-05-24 │ 5024 │ ██████████████████████████████████████████████████▏ │
127. │ 2021-05-31 │ 4824 │ ████████████████████████████████████████████████▏ │
128. │ 2021-06-07 │ 5652 │ ████████████████████████████████████████████████████████▌ │
129. │ 2021-06-14 │ 5613 │ ████████████████████████████████████████████████████████▏ │
130. │ 2021-06-21 │ 6061 │ ████████████████████████████████████████████████████████████▌ │
131. │ 2021-06-28 │ 2554 │ █████████████████████████▌ │
└────────────┴──────┴──────────────────────────────────────────────────────────────────────────────┘
131 rows in set. Elapsed: 0.014 sec. Processed 655.36 thousand rows, 11.14 MB (47.56 million rows/s., 808.48 MB/s.)
```
### Test it in Playground
The data is uploaded to ClickHouse Playground, [example](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUCiAgICBvcmlnaW4sCiAgICBjb3VudCgpLAogICAgcm91bmQoYXZnKGdlb0Rpc3RhbmNlKGxvbmdpdHVkZV8xLCBsYXRpdHVkZV8xLCBsb25naXR1ZGVfMiwgbGF0aXR1ZGVfMikpKSBBUyBkaXN0YW5jZSwKICAgIGJhcihkaXN0YW5jZSwgMCwgMTAwMDAwMDAsIDEwMCkgQVMgYmFyCkZST00gb3BlbnNreQpXSEVSRSBvcmlnaW4gIT0gJycKR1JPVVAgQlkgb3JpZ2luCk9SREVSIEJZIGNvdW50KCkgREVTQwpMSU1JVCAxMDA=).

View File

@ -0,0 +1,325 @@
---
toc_priority: 20
toc_title: UK Property Price Paid
---
# UK Property Price Paid
The dataset contains data about prices paid for real-estate property in England and Wales. The data is available since year 1995.
The size of the dataset in uncompressed form is about 4 GiB and it will take about 226 MiB in ClickHouse.
Source: https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads
Description of the fields: https://www.gov.uk/guidance/about-the-price-paid-data
Contains HM Land Registry data © Crown copyright and database right 2021. This data is licensed under the Open Government Licence v3.0.
## Download the Dataset
```
wget http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv
```
Download will take about 2 minutes with good internet connection.
## Create the Table
```
CREATE TABLE uk_price_paid
(
price UInt32,
date Date,
postcode1 LowCardinality(String),
postcode2 LowCardinality(String),
type Enum8('terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4, 'other' = 0),
is_new UInt8,
duration Enum8('freehold' = 1, 'leasehold' = 2, 'unknown' = 0),
addr1 String,
addr2 String,
street LowCardinality(String),
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String),
category UInt8
) ENGINE = MergeTree ORDER BY (postcode1, postcode2, addr1, addr2);
```
## Preprocess and Import Data
We will use `clickhouse-local` tool for data preprocessing and `clickhouse-client` to upload it.
In this example, we define the structure of source data from the CSV file and specify a query to preprocess the data with `clickhouse-local`.
The preprocessing is:
- splitting the postcode to two different columns `postcode1` and `postcode2` that is better for storage and queries;
- coverting the `time` field to date as it only contains 00:00 time;
- ignoring the `uuid` field because we don't need it for analysis;
- transforming `type` and `duration` to more readable Enum fields with function `transform`;
- transforming `is_new` and `category` fields from single-character string (`Y`/`N` and `A`/`B`) to UInt8 field with 0 and 1.
Preprocessed data is piped directly to `clickhouse-client` to be inserted into ClickHouse table in streaming fashion.
```
clickhouse-local --input-format CSV --structure '
uuid String,
price UInt32,
time DateTime,
postcode String,
a String,
b String,
c String,
addr1 String,
addr2 String,
street String,
locality String,
town String,
district String,
county String,
d String,
e String
' --query "
WITH splitByChar(' ', postcode) AS p
SELECT
price,
toDate(time) AS date,
p[1] AS postcode1,
p[2] AS postcode2,
transform(a, ['T', 'S', 'D', 'F', 'O'], ['terraced', 'semi-detached', 'detached', 'flat', 'other']) AS type,
b = 'Y' AS is_new,
transform(c, ['F', 'L', 'U'], ['freehold', 'leasehold', 'unknown']) AS duration,
addr1,
addr2,
street,
locality,
town,
district,
county,
d = 'B' AS category
FROM table" --date_time_input_format best_effort < pp-complete.csv | clickhouse-client --query "INSERT INTO uk_price_paid FORMAT TSV"
```
It will take about 40 seconds.
## Validate the Data
```
SELECT count() FROM uk_price_paid
26248711
```
The size of dataset in ClickHouse is just 226 MiB:
```
SELECT formatReadableSize(total_bytes) FROM system.tables WHERE name = 'uk_price_paid'
226.40 MiB
```
## Run Some Queries
### Average price per year:
```
SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 1000000, 80) FROM uk_price_paid GROUP BY year ORDER BY year
┌─year─┬──price─┬─bar(round(avg(price)), 0, 1000000, 80)─┐
│ 1995 │ 67932 │ █████▍ │
│ 1996 │ 71505 │ █████▋ │
│ 1997 │ 78532 │ ██████▎ │
│ 1998 │ 85435 │ ██████▋ │
│ 1999 │ 96036 │ ███████▋ │
│ 2000 │ 107478 │ ████████▌ │
│ 2001 │ 118886 │ █████████▌ │
│ 2002 │ 137940 │ ███████████ │
│ 2003 │ 155888 │ ████████████▍ │
│ 2004 │ 178885 │ ██████████████▎ │
│ 2005 │ 189350 │ ███████████████▏ │
│ 2006 │ 203528 │ ████████████████▎ │
│ 2007 │ 219377 │ █████████████████▌ │
│ 2008 │ 217056 │ █████████████████▎ │
│ 2009 │ 213419 │ █████████████████ │
│ 2010 │ 236110 │ ██████████████████▊ │
│ 2011 │ 232804 │ ██████████████████▌ │
│ 2012 │ 238366 │ ███████████████████ │
│ 2013 │ 256931 │ ████████████████████▌ │
│ 2014 │ 279917 │ ██████████████████████▍ │
│ 2015 │ 297264 │ ███████████████████████▋ │
│ 2016 │ 313197 │ █████████████████████████ │
│ 2017 │ 346070 │ ███████████████████████████▋ │
│ 2018 │ 350117 │ ████████████████████████████ │
│ 2019 │ 351010 │ ████████████████████████████ │
│ 2020 │ 368974 │ █████████████████████████████▌ │
│ 2021 │ 384351 │ ██████████████████████████████▋ │
└──────┴────────┴────────────────────────────────────────┘
27 rows in set. Elapsed: 0.027 sec. Processed 26.25 million rows, 157.49 MB (955.96 million rows/s., 5.74 GB/s.)
```
### Average price per year in London:
```
SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 2000000, 100) FROM uk_price_paid WHERE town = 'LONDON' GROUP BY year ORDER BY year
┌─year─┬───price─┬─bar(round(avg(price)), 0, 2000000, 100)───────────────┐
│ 1995 │ 109112 │ █████▍ │
│ 1996 │ 118667 │ █████▊ │
│ 1997 │ 136518 │ ██████▋ │
│ 1998 │ 152983 │ ███████▋ │
│ 1999 │ 180633 │ █████████ │
│ 2000 │ 215830 │ ██████████▋ │
│ 2001 │ 232996 │ ███████████▋ │
│ 2002 │ 263672 │ █████████████▏ │
│ 2003 │ 278394 │ █████████████▊ │
│ 2004 │ 304665 │ ███████████████▏ │
│ 2005 │ 322875 │ ████████████████▏ │
│ 2006 │ 356192 │ █████████████████▋ │
│ 2007 │ 404055 │ ████████████████████▏ │
│ 2008 │ 420741 │ █████████████████████ │
│ 2009 │ 427754 │ █████████████████████▍ │
│ 2010 │ 480306 │ ████████████████████████ │
│ 2011 │ 496274 │ ████████████████████████▋ │
│ 2012 │ 519441 │ █████████████████████████▊ │
│ 2013 │ 616209 │ ██████████████████████████████▋ │
│ 2014 │ 724144 │ ████████████████████████████████████▏ │
│ 2015 │ 792112 │ ███████████████████████████████████████▌ │
│ 2016 │ 843568 │ ██████████████████████████████████████████▏ │
│ 2017 │ 982566 │ █████████████████████████████████████████████████▏ │
│ 2018 │ 1016845 │ ██████████████████████████████████████████████████▋ │
│ 2019 │ 1043277 │ ████████████████████████████████████████████████████▏ │
│ 2020 │ 1003963 │ ██████████████████████████████████████████████████▏ │
│ 2021 │ 940794 │ ███████████████████████████████████████████████ │
└──────┴─────────┴───────────────────────────────────────────────────────┘
27 rows in set. Elapsed: 0.024 sec. Processed 26.25 million rows, 76.88 MB (1.08 billion rows/s., 3.15 GB/s.)
```
Something happened in 2013. I don't have a clue. Maybe you have a clue what happened in 2020?
### The most expensive neighborhoods:
```
SELECT
town,
district,
count() AS c,
round(avg(price)) AS price,
bar(price, 0, 5000000, 100)
FROM uk_price_paid
WHERE date >= '2020-01-01'
GROUP BY
town,
district
HAVING c >= 100
ORDER BY price DESC
LIMIT 100
Query id: df8c0a98-4713-4f0e-9690-5f73b52f7206
┌─town─────────────────┬─district───────────────┬────c─┬───price─┬─bar(round(avg(price)), 0, 5000000, 100)────────────────────────────┐
│ LONDON │ CITY OF WESTMINSTER │ 3372 │ 3305225 │ ██████████████████████████████████████████████████████████████████ │
│ LONDON │ CITY OF LONDON │ 257 │ 3294478 │ █████████████████████████████████████████████████████████████████▊ │
│ LONDON │ KENSINGTON AND CHELSEA │ 2367 │ 2342422 │ ██████████████████████████████████████████████▋ │
│ LEATHERHEAD │ ELMBRIDGE │ 108 │ 1927143 │ ██████████████████████████████████████▌ │
│ VIRGINIA WATER │ RUNNYMEDE │ 142 │ 1868819 │ █████████████████████████████████████▍ │
│ LONDON │ CAMDEN │ 2815 │ 1736788 │ ██████████████████████████████████▋ │
│ THORNTON HEATH │ CROYDON │ 521 │ 1733051 │ ██████████████████████████████████▋ │
│ WINDLESHAM │ SURREY HEATH │ 103 │ 1717255 │ ██████████████████████████████████▎ │
│ BARNET │ ENFIELD │ 115 │ 1503458 │ ██████████████████████████████ │
│ OXFORD │ SOUTH OXFORDSHIRE │ 298 │ 1275200 │ █████████████████████████▌ │
│ LONDON │ ISLINGTON │ 2458 │ 1274308 │ █████████████████████████▍ │
│ COBHAM │ ELMBRIDGE │ 364 │ 1260005 │ █████████████████████████▏ │
│ LONDON │ HOUNSLOW │ 618 │ 1215682 │ ████████████████████████▎ │
│ ASCOT │ WINDSOR AND MAIDENHEAD │ 379 │ 1215146 │ ████████████████████████▎ │
│ LONDON │ RICHMOND UPON THAMES │ 654 │ 1207551 │ ████████████████████████▏ │
│ BEACONSFIELD │ BUCKINGHAMSHIRE │ 307 │ 1186220 │ ███████████████████████▋ │
│ RICHMOND │ RICHMOND UPON THAMES │ 805 │ 1100420 │ ██████████████████████ │
│ LONDON │ HAMMERSMITH AND FULHAM │ 2888 │ 1062959 │ █████████████████████▎ │
│ WEYBRIDGE │ ELMBRIDGE │ 607 │ 1027161 │ ████████████████████▌ │
│ RADLETT │ HERTSMERE │ 265 │ 1015896 │ ████████████████████▎ │
│ SALCOMBE │ SOUTH HAMS │ 124 │ 1014393 │ ████████████████████▎ │
│ BURFORD │ WEST OXFORDSHIRE │ 102 │ 993100 │ ███████████████████▋ │
│ ESHER │ ELMBRIDGE │ 454 │ 969770 │ ███████████████████▍ │
│ HINDHEAD │ WAVERLEY │ 128 │ 967786 │ ███████████████████▎ │
│ BROCKENHURST │ NEW FOREST │ 121 │ 967046 │ ███████████████████▎ │
│ LEATHERHEAD │ GUILDFORD │ 191 │ 964489 │ ███████████████████▎ │
│ GERRARDS CROSS │ BUCKINGHAMSHIRE │ 376 │ 958555 │ ███████████████████▏ │
│ EAST MOLESEY │ ELMBRIDGE │ 181 │ 943457 │ ██████████████████▋ │
│ OLNEY │ MILTON KEYNES │ 220 │ 942892 │ ██████████████████▋ │
│ CHALFONT ST GILES │ BUCKINGHAMSHIRE │ 135 │ 926950 │ ██████████████████▌ │
│ HENLEY-ON-THAMES │ SOUTH OXFORDSHIRE │ 509 │ 905732 │ ██████████████████ │
│ KINGSTON UPON THAMES │ KINGSTON UPON THAMES │ 889 │ 899689 │ █████████████████▊ │
│ BELVEDERE │ BEXLEY │ 313 │ 895336 │ █████████████████▊ │
│ CRANBROOK │ TUNBRIDGE WELLS │ 404 │ 888190 │ █████████████████▋ │
│ LONDON │ EALING │ 2460 │ 865893 │ █████████████████▎ │
│ MAIDENHEAD │ BUCKINGHAMSHIRE │ 114 │ 863814 │ █████████████████▎ │
│ LONDON │ MERTON │ 1958 │ 857192 │ █████████████████▏ │
│ GUILDFORD │ WAVERLEY │ 131 │ 854447 │ █████████████████ │
│ LONDON │ HACKNEY │ 3088 │ 846571 │ ████████████████▊ │
│ LYMM │ WARRINGTON │ 285 │ 839920 │ ████████████████▋ │
│ HARPENDEN │ ST ALBANS │ 606 │ 836994 │ ████████████████▋ │
│ LONDON │ WANDSWORTH │ 6113 │ 832292 │ ████████████████▋ │
│ LONDON │ SOUTHWARK │ 3612 │ 831319 │ ████████████████▋ │
│ BERKHAMSTED │ DACORUM │ 502 │ 830356 │ ████████████████▌ │
│ KINGS LANGLEY │ DACORUM │ 137 │ 821358 │ ████████████████▍ │
│ TONBRIDGE │ TUNBRIDGE WELLS │ 339 │ 806736 │ ████████████████▏ │
│ EPSOM │ REIGATE AND BANSTEAD │ 157 │ 805903 │ ████████████████ │
│ WOKING │ GUILDFORD │ 161 │ 803283 │ ████████████████ │
│ STOCKBRIDGE │ TEST VALLEY │ 168 │ 801973 │ ████████████████ │
│ TEDDINGTON │ RICHMOND UPON THAMES │ 539 │ 798591 │ ███████████████▊ │
│ OXFORD │ VALE OF WHITE HORSE │ 329 │ 792907 │ ███████████████▋ │
│ LONDON │ BARNET │ 3624 │ 789583 │ ███████████████▋ │
│ TWICKENHAM │ RICHMOND UPON THAMES │ 1090 │ 787760 │ ███████████████▋ │
│ LUTON │ CENTRAL BEDFORDSHIRE │ 196 │ 786051 │ ███████████████▋ │
│ TONBRIDGE │ MAIDSTONE │ 277 │ 785746 │ ███████████████▋ │
│ TOWCESTER │ WEST NORTHAMPTONSHIRE │ 186 │ 783532 │ ███████████████▋ │
│ LONDON │ LAMBETH │ 4832 │ 783422 │ ███████████████▋ │
│ LUTTERWORTH │ HARBOROUGH │ 515 │ 781775 │ ███████████████▋ │
│ WOODSTOCK │ WEST OXFORDSHIRE │ 135 │ 777499 │ ███████████████▌ │
│ ALRESFORD │ WINCHESTER │ 196 │ 775577 │ ███████████████▌ │
│ LONDON │ NEWHAM │ 2942 │ 768551 │ ███████████████▎ │
│ ALDERLEY EDGE │ CHESHIRE EAST │ 168 │ 768280 │ ███████████████▎ │
│ MARLOW │ BUCKINGHAMSHIRE │ 301 │ 762784 │ ███████████████▎ │
│ BILLINGSHURST │ CHICHESTER │ 134 │ 760920 │ ███████████████▏ │
│ LONDON │ TOWER HAMLETS │ 4183 │ 759635 │ ███████████████▏ │
│ MIDHURST │ CHICHESTER │ 245 │ 759101 │ ███████████████▏ │
│ THAMES DITTON │ ELMBRIDGE │ 227 │ 753347 │ ███████████████ │
│ POTTERS BAR │ WELWYN HATFIELD │ 163 │ 752926 │ ███████████████ │
│ REIGATE │ REIGATE AND BANSTEAD │ 555 │ 740961 │ ██████████████▋ │
│ TADWORTH │ REIGATE AND BANSTEAD │ 477 │ 738997 │ ██████████████▋ │
│ SEVENOAKS │ SEVENOAKS │ 1074 │ 734658 │ ██████████████▋ │
│ PETWORTH │ CHICHESTER │ 138 │ 732432 │ ██████████████▋ │
│ BOURNE END │ BUCKINGHAMSHIRE │ 127 │ 730742 │ ██████████████▌ │
│ PURLEY │ CROYDON │ 540 │ 727721 │ ██████████████▌ │
│ OXTED │ TANDRIDGE │ 320 │ 726078 │ ██████████████▌ │
│ LONDON │ HARINGEY │ 2988 │ 724573 │ ██████████████▍ │
│ BANSTEAD │ REIGATE AND BANSTEAD │ 373 │ 713834 │ ██████████████▎ │
│ PINNER │ HARROW │ 480 │ 712166 │ ██████████████▏ │
│ MALMESBURY │ WILTSHIRE │ 293 │ 707747 │ ██████████████▏ │
│ RICKMANSWORTH │ THREE RIVERS │ 732 │ 705400 │ ██████████████ │
│ SLOUGH │ BUCKINGHAMSHIRE │ 359 │ 705002 │ ██████████████ │
│ GREAT MISSENDEN │ BUCKINGHAMSHIRE │ 214 │ 704904 │ ██████████████ │
│ READING │ SOUTH OXFORDSHIRE │ 295 │ 701697 │ ██████████████ │
│ HYTHE │ FOLKESTONE AND HYTHE │ 457 │ 700334 │ ██████████████ │
│ WELWYN │ WELWYN HATFIELD │ 217 │ 699649 │ █████████████▊ │
│ CHIGWELL │ EPPING FOREST │ 242 │ 697869 │ █████████████▊ │
│ BARNET │ BARNET │ 906 │ 695680 │ █████████████▊ │
│ HASLEMERE │ CHICHESTER │ 120 │ 694028 │ █████████████▊ │
│ LEATHERHEAD │ MOLE VALLEY │ 748 │ 692026 │ █████████████▋ │
│ LONDON │ BRENT │ 1945 │ 690799 │ █████████████▋ │
│ HASLEMERE │ WAVERLEY │ 258 │ 690765 │ █████████████▋ │
│ NORTHWOOD │ HILLINGDON │ 252 │ 690753 │ █████████████▋ │
│ WALTON-ON-THAMES │ ELMBRIDGE │ 871 │ 689431 │ █████████████▋ │
│ INGATESTONE │ BRENTWOOD │ 150 │ 688345 │ █████████████▋ │
│ OXFORD │ OXFORD │ 1761 │ 686114 │ █████████████▋ │
│ CHISLEHURST │ BROMLEY │ 410 │ 682892 │ █████████████▋ │
│ KINGS LANGLEY │ THREE RIVERS │ 109 │ 682320 │ █████████████▋ │
│ ASHTEAD │ MOLE VALLEY │ 280 │ 680483 │ █████████████▌ │
│ WOKING │ SURREY HEATH │ 269 │ 679035 │ █████████████▌ │
│ ASCOT │ BRACKNELL FOREST │ 160 │ 678632 │ █████████████▌ │
└──────────────────────┴────────────────────────┴──────┴─────────┴────────────────────────────────────────────────────────────────────┘
100 rows in set. Elapsed: 0.039 sec. Processed 26.25 million rows, 278.03 MB (674.32 million rows/s., 7.14 GB/s.)
```
### Test it in Playground
The data is uploaded to ClickHouse Playground, [example](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIHRvd24sIGRpc3RyaWN0LCBjb3VudCgpIEFTIGMsIHJvdW5kKGF2ZyhwcmljZSkpIEFTIHByaWNlLCBiYXIocHJpY2UsIDAsIDUwMDAwMDAsIDEwMCkgRlJPTSB1a19wcmljZV9wYWlkIFdIRVJFIGRhdGUgPj0gJzIwMjAtMDEtMDEnIEdST1VQIEJZIHRvd24sIGRpc3RyaWN0IEhBVklORyBjID49IDEwMCBPUkRFUiBCWSBwcmljZSBERVNDIExJTUlUIDEwMA==).

View File

@ -3345,3 +3345,30 @@ Possible values:
- 1 — The `LowCardinality` type is converted to the `DICTIONARY` type.
Default value: `0`.
## materialized_postgresql_max_block_size {#materialized-postgresql-max-block-size}
Sets the number of rows collected in memory before flushing data into PostgreSQL database table.
Possible values:
- Positive integer.
Default value: `65536`.
## materialized_postgresql_tables_list {#materialized-postgresql-tables-list}
Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine.
Default value: empty list — means whole PostgreSQL database will be replicated.
## materialized_postgresql_allow_automatic_update {#materialized-postgresql-allow-automatic-update}
Allow reloading table in the background, when schema changes are detected. DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them.
Possible values:
- 0 — The table is not automatically updated in the background, when schema changes are detected.
- 1 — The table is automatically updated in the background, when schema changes are detected.
Default value: `0`.

View File

@ -41,6 +41,13 @@ SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673)
└───────────────────────────────────────────────────────────────────┘
```
## geoDistance
Similar to `greatCircleDistance` but calculates the distance on WGS-84 ellipsoid instead of sphere. This is more precise approximation of the Earth Geoid.
The performance is the same as for `greatCircleDistance` (no performance drawback). It is recommended to use `geoDistance` to calculate the distances on Earth.
Technical note: for close enough points we calculate the distance using planar approximation with the metric on the tangent plane at the midpoint of the coordinates.
## greatCircleAngle {#greatcircleangle}
Calculates the central angle between two points on the Earths surface using [the great-circle formula](https://en.wikipedia.org/wiki/Great-circle_distance).

View File

@ -11,7 +11,7 @@ Arranges `key:value` pairs into [Map(key, value)](../../sql-reference/data-types
**Syntax**
``` sql
```sql
map(key1, value1[, key2, value2, ...])
```
@ -30,7 +30,7 @@ Type: [Map(key, value)](../../sql-reference/data-types/map.md).
Query:
``` sql
```sql
SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
```
@ -46,7 +46,7 @@ Result:
Query:
``` sql
```sql
CREATE TABLE table_map (a Map(String, UInt64)) ENGINE = MergeTree() ORDER BY a;
INSERT INTO table_map SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
SELECT a['key2'] FROM table_map;
@ -54,7 +54,7 @@ SELECT a['key2'] FROM table_map;
Result:
``` text
```text
┌─arrayElement(a, 'key2')─┐
│ 0 │
│ 2 │
@ -72,7 +72,7 @@ Collect all the keys and sum corresponding values.
**Syntax**
``` sql
```sql
mapAdd(arg1, arg2 [, ...])
```
@ -88,13 +88,13 @@ Arguments are [maps](../../sql-reference/data-types/map.md) or [tuples](../../sq
Query with a tuple map:
``` sql
```sql
SELECT mapAdd(([toUInt8(1), 2], [1, 1]), ([toUInt8(1), 2], [1, 1])) as res, toTypeName(res) as type;
```
Result:
``` text
```text
┌─res───────────┬─type───────────────────────────────┐
│ ([1,2],[2,2]) │ Tuple(Array(UInt8), Array(UInt64)) │
└───────────────┴────────────────────────────────────┘
@ -102,7 +102,16 @@ Result:
Query with `Map` type:
``` sql
```sql
SELECT mapAdd(map(1,1), map(1,1));
```
Result:
```text
┌─mapAdd(map(1, 1), map(1, 1))─┐
│ {1:2} │
└──────────────────────────────┘
```
## mapSubtract {#function-mapsubtract}
@ -111,21 +120,21 @@ Collect all the keys and subtract corresponding values.
**Syntax**
``` sql
```sql
mapSubtract(Tuple(Array, Array), Tuple(Array, Array) [, ...])
```
**Arguments**
Arguments are [tuples](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array), where items in the first array represent keys, and the second array contains values for the each key. All key arrays should have same type, and all value arrays should contain items which are promote to the one type ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) or [Float64](../../sql-reference/data-types/float.md#float32-float64)). The common promoted type is used as a type for the result array.
Arguments are [maps](../../sql-reference/data-types/map.md) or [tuples](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array), where items in the first array represent keys, and the second array contains values for the each key. All key arrays should have same type, and all value arrays should contain items which are promote to the one type ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) or [Float64](../../sql-reference/data-types/float.md#float32-float64)). The common promoted type is used as a type for the result array.
**Returned value**
- Returns one [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2), where the first array contains the sorted keys and the second array contains values.
- Depending on the arguments returns one [map](../../sql-reference/data-types/map.md) or [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2), where the first array contains the sorted keys and the second array contains values.
**Example**
Query:
Query with a tuple map:
```sql
SELECT mapSubtract(([toUInt8(1), 2], [toInt32(1), 1]), ([toUInt8(1), 2], [toInt32(2), 1])) as res, toTypeName(res) as type;
@ -139,32 +148,54 @@ Result:
└────────────────┴───────────────────────────────────┘
```
Query with `Map` type:
```sql
SELECT mapSubtract(map(1,1), map(1,1));
```
Result:
```text
┌─mapSubtract(map(1, 1), map(1, 1))─┐
│ {1:0} │
└───────────────────────────────────┘
```
## mapPopulateSeries {#function-mappopulateseries}
Fills missing keys in the maps (key and value array pair), where keys are integers. Also, it supports specifying the max key, which is used to extend the keys array.
Arguments are [maps](../../sql-reference/data-types/map.md) or two [arrays](../../sql-reference/data-types/array.md#data-type-array), where the first array represent keys, and the second array contains values for the each key.
For array arguments the number of elements in `keys` and `values` must be the same for each row.
**Syntax**
``` sql
```sql
mapPopulateSeries(keys, values[, max])
mapPopulateSeries(map[, max])
```
Generates a map, where keys are a series of numbers, from minimum to maximum keys (or `max` argument if it specified) taken from `keys` array with a step size of one, and corresponding values taken from `values` array. If the value is not specified for the key, then it uses the default value in the resulting map. For repeated keys, only the first value (in order of appearing) gets associated with the key.
The number of elements in `keys` and `values` must be the same for each row.
Generates a map (a tuple with two arrays or a value of `Map` type, depending on the arguments), where keys are a series of numbers, from minimum to maximum keys (or `max` argument if it specified) taken from the map with a step size of one, and corresponding values. If the value is not specified for the key, then it uses the default value in the resulting map. For repeated keys, only the first value (in order of appearing) gets associated with the key.
**Arguments**
Mapped arrays:
- `keys` — Array of keys. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)).
- `values` — Array of values. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)).
or
- `map` — Map with integer keys. [Map](../../sql-reference/data-types/map.md).
**Returned value**
- Returns a [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array): keys in sorted order, and values the corresponding keys.
- Depending on the arguments returns a [map](../../sql-reference/data-types/map.md) or a [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array): keys in sorted order, and values the corresponding keys.
**Example**
Query:
Query with mapped arrays:
```sql
select mapPopulateSeries([1,2,4], [11,22,44], 5) as res, toTypeName(res) as type;
@ -178,13 +209,27 @@ Result:
└──────────────────────────────┴───────────────────────────────────┘
```
Query with `Map` type:
```sql
SELECT mapPopulateSeries(map(1, 10, 5, 20), 6);
```
Result:
```text
┌─mapPopulateSeries(map(1, 10, 5, 20), 6)─┐
│ {1:10,2:0,3:0,4:0,5:20,6:0} │
└─────────────────────────────────────────┘
```
## mapContains {#mapcontains}
Determines whether the `map` contains the `key` parameter.
**Syntax**
``` sql
```sql
mapContains(map, key)
```

View File

@ -0,0 +1,23 @@
---
toc_priority: 49
toc_title: PROJECTION
---
# Manipulating Projections {#manipulations-with-projections}
The following operations are available:
- `ALTER TABLE [db].name ADD PROJECTION name AS SELECT <COLUMN LIST EXPR> [GROUP BY] [ORDER BY]` - Adds projection description to tables metadata.
- `ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk.
- `ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` - The query rebuilds the projection `name` in the partition `partition_name`. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description.
The commands ADD, DROP and CLEAR are lightweight in a sense that they only change metadata or remove files.
Also, they are replicated, syncing projections metadata via ZooKeeper.
!!! note "Note"
Projection manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants).

View File

@ -43,7 +43,7 @@ ClickHouse не работает и не собирается на 32-битны
git clone --recursive git@github.com:ClickHouse/ClickHouse.git
cd ClickHouse
Замените *yandex* на имя вашего аккаунта на GitHub.
Замените первое вхождение слова `ClickHouse` в команде для git на имя вашего аккаунта на GitHub.
Эта команда создаст директорию ClickHouse, содержащую рабочую копию проекта.
@ -92,7 +92,6 @@ ClickHouse не работает и не собирается на 32-битны
# Две последние команды могут быть объединены вместе:
git submodule update --init
The next commands would help you to reset all submodules to the initial state (!WARING! - any changes inside will be deleted):
Следующие команды помогут сбросить все сабмодули в изначальное состояние (!ВНИМАНИЕ! - все изменения в сабмодулях будут утеряны):
# Synchronizes submodules' remote URL with .gitmodules
@ -140,7 +139,7 @@ ClickHouse использует для сборки некоторое коли
Впрочем, наша среда continuous integration проверяет около десятка вариантов сборки, включая gcc, но сборка с помощью gcc непригодна для использования в продакшене.
On Ubuntu/Debian you can use the automatic installation script (check [official webpage](https://apt.llvm.org/))
На Ubuntu и Debian вы можете использовать скрипт для автоматической установки (см. [официальный сайт](https://apt.llvm.org/))
```bash
sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"
@ -163,7 +162,7 @@ sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"
export CC=clang CXX=clang++
cmake ..
Переменная CC отвечает за компилятор C (сокращение от слов C Compiler), переменная CXX отвечает за выбор компилятора C++ (символ X - это как плюс, но положенный набок, ради того, чтобы превратить его в букву).
Переменная CC отвечает за компилятор C (сокращение от слов C Compiler), переменная CXX отвечает за выбор компилятора C++ (символ X - это как плюс, но положенный набок, ради того, чтобы превратить его в букву). При получении ошибки типа `Could not find compiler set in environment variable CC: clang` необходимо указать в значениях для переменных CC и CXX явную версию компилятора, например, `clang-12` и `clang++-12`.
Для более быстрой сборки, можно использовать debug вариант - сборку без оптимизаций. Для этого, укажите параметр `-D CMAKE_BUILD_TYPE=Debug`:
@ -195,6 +194,14 @@ sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"
В процессе сборки могут появится сообщения `libprotobuf WARNING` про protobuf файлы в библиотеке libhdfs2. Это не имеет значения.
В случае получения ошибок вида `error: variable 'y' set but not used [-Werror,-Wunused-but-set-variable]` ножно попробовать использовать другую версию компилятора сlang. Например, на момент написания данного текста описанная выше команда по установке clang для Ubuntu 20.04 по-умолчанию устанавливает clang-13, с которым возникает эта ошибка. Для решения проблемы можно установить clang-12 с помощью команд:
```bash
wget https://apt.llvm.org/llvm.sh
chmod +x llvm.sh
sudo ./llvm.sh 12
```
И далее использовать именно его, указав соответствующую версию при установке переменных окружения CC и CXX перед вызовом cmake.
При успешной сборке, вы получите готовый исполняемый файл `ClickHouse/build/programs/clickhouse`:
ls -l programs/clickhouse

View File

@ -0,0 +1,84 @@
---
toc_priority: 30
toc_title: MaterializedPostgreSQL
---
# [экспериментальный] MaterializedPostgreSQL {#materialize-postgresql}
Создает базу данных ClickHouse с исходным дампом данных таблиц PostgreSQL и запускает процесс репликации, т.е. выполняется применение новых изменений в фоне, как эти изменения происходят в таблице PostgreSQL в удаленной базе данных PostgreSQL.
Сервер ClickHouse работает как реплика PostgreSQL. Он читает WAL и выполняет DML запросы. Данные, полученные в результате DDL запросов, не реплицируются, но сами запросы могут быть обработаны (описано ниже).
## Создание базы данных {#creating-a-database}
``` sql
CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster]
ENGINE = MaterializedPostgreSQL('host:port', ['database' | database], 'user', 'password') [SETTINGS ...]
```
**Параметры движка**
- `host:port` — адрес сервера PostgreSQL.
- `database` — имя базы данных на удалённом сервере.
- `user` — пользователь PostgreSQL.
- `password` — пароль пользователя.
## Настройки {#settings}
- [materialized_postgresql_max_block_size](../../operations/settings/settings.md#materialized-postgresql-max-block-size)
- [materialized_postgresql_tables_list](../../operations/settings/settings.md#materialized-postgresql-tables-list)
- [materialized_postgresql_allow_automatic_update](../../operations/settings/settings.md#materialized-postgresql-allow-automatic-update)
``` sql
CREATE DATABASE database1
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password')
SETTINGS materialized_postgresql_max_block_size = 65536,
materialized_postgresql_tables_list = 'table1,table2,table3';
SELECT * FROM database1.table1;
```
## Требования {#requirements}
1. Настройка [wal_level](https://postgrespro.ru/docs/postgrespro/10/runtime-config-wal) должна иметь значение `logical`, параметр `max_replication_slots` должен быть равен по меньшей мере `2` в конфигурационном файле в PostgreSQL.
2. Каждая реплицируемая таблица должна иметь один из следующих [репликационных идентификаторов](https://postgrespro.ru/docs/postgresql/10/sql-altertable#SQL-CREATETABLE-REPLICA-IDENTITY):
- первичный ключ (по умолчанию)
- индекс
``` bash
postgres# CREATE TABLE postgres_table (a Integer NOT NULL, b Integer, c Integer NOT NULL, d Integer, e Integer NOT NULL);
postgres# CREATE unique INDEX postgres_table_index on postgres_table(a, c, e);
postgres# ALTER TABLE postgres_table REPLICA IDENTITY USING INDEX postgres_table_index;
```
Первичный ключ всегда проверяется первым. Если он отсутствует, то проверяется индекс, определенный как replica identity index (репликационный идентификатор).
Если индекс используется в качестве репликационного идентификатора, то в таблице должен быть только один такой индекс.
Вы можете проверить, какой тип используется для указанной таблицы, выполнив следующую команду:
``` bash
postgres# SELECT CASE relreplident
WHEN 'd' THEN 'default'
WHEN 'n' THEN 'nothing'
WHEN 'f' THEN 'full'
WHEN 'i' THEN 'index'
END AS replica_identity
FROM pg_class
WHERE oid = 'postgres_table'::regclass;
```
!!! warning "Предупреждение"
Репликация **TOAST**-значений не поддерживается. Для типа данных будет использоваться значение по умолчанию.
## Пример использования {#example-of-use}
``` sql
CREATE DATABASE postgresql_db
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password');
SELECT * FROM postgresql_db.postgres_table;
```

View File

@ -0,0 +1,55 @@
---
toc_priority: 12
toc_title: MaterializedPostgreSQL
---
# MaterializedPostgreSQL {#materialize-postgresql}
Создает таблицу ClickHouse с исходным дампом данных таблицы PostgreSQL и запускает процесс репликации, т.е. выполняется применение новых изменений в фоне, как эти изменения происходят в таблице PostgreSQL в удаленной базе данных PostgreSQL.
Если требуется более одной таблицы, вместо движка таблиц рекомендуется использовать движок баз данных [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md) и с помощью настройки [materialized_postgresql_tables_list](../../../operations/settings/settings.md#materialized-postgresql-tables-list) указывать таблицы, которые нужно реплицировать. Это будет намного лучше с точки зрения нагрузки на процессор, уменьшит количество подключений и количество слотов репликации внутри удаленной базы данных PostgreSQL.
## Создание таблицы {#creating-a-table}
``` sql
CREATE TABLE postgresql_db.postgresql_replica (key UInt64, value UInt64)
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_replica', 'postgres_user', 'postgres_password')
PRIMARY KEY key;
```
**Параметры движка**
- `host:port` — адрес сервера PostgreSQL.
- `database` — имя базы данных на удалённом сервере.
- `table` — имя таблицы на удалённом сервере.
- `user` — пользователь PostgreSQL.
- `password` — пароль пользователя.
## Требования {#requirements}
1. Настройка [wal_level](https://postgrespro.ru/docs/postgrespro/10/runtime-config-wal) должна иметь значение `logical`, параметр `max_replication_slots` должен быть равен по меньшей мере `2` в конфигурационном файле в PostgreSQL.
2. Таблица, созданная с помощью движка `MaterializedPostgreSQL`, должна иметь первичный ключ — такой же, как replica identity index (по умолчанию: первичный ключ) таблицы PostgreSQL (смотрите [replica identity index](../../../engines/database-engines/materialized-postgresql.md#requirements)).
3. Допускается только база данных [Atomic](https://en.wikipedia.org/wiki/Atomicity_(database_systems)).
## Виртуальные столбцы {#virtual-columns}
- `_version` — счетчик транзакций. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `_sign` — метка удаления. Тип: [Int8](../../../sql-reference/data-types/int-uint.md). Возможные значения:
- `1` — строка не удалена,
- `-1` — строка удалена.
Эти столбцы не нужно добавлять при создании таблицы. Они всегда доступны в `SELECT` запросе.
Столбец `_version` равен позиции `LSN` в `WAL`, поэтому его можно использовать для проверки актуальности репликации.
``` sql
CREATE TABLE postgresql_db.postgresql_replica (key UInt64, value UInt64)
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_replica', 'postgres_user', 'postgres_password')
PRIMARY KEY key;
SELECT key, value, _version FROM postgresql_db.postgresql_replica;
```
!!! warning "Предупреждение"
Репликация **TOAST**-значений не поддерживается. Для типа данных будет использоваться значение по умолчанию.

View File

@ -3195,3 +3195,30 @@ SETTINGS index_granularity = 8192 │
- 1 — тип `LowCardinality` конвертируется в тип `DICTIONARY`.
Значение по умолчанию: `0`.
## materialized_postgresql_max_block_size {#materialized-postgresql-max-block-size}
Задает максимальное количество строк, собранных в памяти перед вставкой данных в таблицу базы данных PostgreSQL.
Возможные значения:
- Положительное целое число.
Значение по умолчанию: `65536`.
## materialized_postgresql_tables_list {#materialized-postgresql-tables-list}
Задает список таблиц базы данных PostgreSQL, разделенных запятыми, которые будут реплицироваться с помощью движка базы данных [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md).
Значение по умолчанию: пустой список — база данных PostgreSQL будет полностью реплицирована.
## materialized_postgresql_allow_automatic_update {#materialized-postgresql-allow-automatic-update}
Позволяет автоматически обновить таблицу в фоновом режиме при обнаружении изменений схемы. DDL-запросы на стороне сервера PostgreSQL не реплицируются с помощью движка ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md), поскольку это запрещено протоколом логической репликации PostgreSQL, но факт DDL-измененений обнаруживается транзакционно. После обнаружения DDL по умолчанию прекращается репликация этих таблиц. Однако, если эта настройка включена, то вместо остановки репликации, таблицы будут перезагружены в фоновом режиме с помощью снимка базы данных без потери информации, и репликация для них будет продолжена.
Возможные значения:
- 0 — таблица не обновляется автоматически в фоновом режиме при обнаружении изменений схемы.
- 1 — таблица обновляется автоматически в фоновом режиме при обнаружении изменений схемы.
Значение по умолчанию: `0`.

View File

@ -2092,7 +2092,7 @@ SELECT tcpPort();
Возвращает список [профилей настроек](../../operations/access-rights.md#settings-profiles-management) для текущего пользователя.
Для изменения текущего профиля настроек может быть использована команда [SET PROFILE](../../sql-reference/statements/set.md#set-statement#query-set). Если команда `SET PROFILE` не применялась, функция возвращает профили, указанные при определении текущего пользователя (см. [CREATE USER](../../sql-reference/statements/create/user.md#create-user-statement)).
Для изменения текущего профиля настроек может быть использована команда SET PROFILE. Если команда `SET PROFILE` не применялась, функция возвращает профили, указанные при определении текущего пользователя (см. [CREATE USER](../../sql-reference/statements/create/user.md#create-user-statement)).
**Синтаксис**

View File

@ -183,7 +183,7 @@ CREATE TABLE codec_example
dt Date CODEC(ZSTD),
ts DateTime CODEC(LZ4HC),
float_value Float32 CODEC(NONE),
double_value Float64 CODEC(LZ4HC(9))
double_value Float64 CODEC(LZ4HC(9)),
value Float32 CODEC(Delta, ZSTD)
)
ENGINE = <Engine>

View File

@ -44,7 +44,7 @@ private:
void toLarge()
{
rb = std::make_unique<RoaringBitmap>();
rb = std::make_shared<RoaringBitmap>();
for (const auto & x : small)
rb->add(static_cast<Value>(x.getValue()));
small.clear();
@ -114,7 +114,7 @@ public:
readVarUInt(size, in);
std::unique_ptr<char[]> buf(new char[size]);
in.readStrict(buf.get(), size);
rb = std::make_unique<RoaringBitmap>(RoaringBitmap::read(buf.get()));
rb = std::make_shared<RoaringBitmap>(RoaringBitmap::read(buf.get()));
}
}
@ -141,7 +141,7 @@ public:
*/
std::shared_ptr<RoaringBitmap> getNewRoaringBitmapFromSmall() const
{
std::shared_ptr<RoaringBitmap> ret = std::make_unique<RoaringBitmap>();
std::shared_ptr<RoaringBitmap> ret = std::make_shared<RoaringBitmap>();
for (const auto & x : small)
ret->add(static_cast<Value>(x.getValue()));
return ret;

View File

@ -158,6 +158,8 @@ else()
target_link_libraries (clickhouse_new_delete PRIVATE clickhouse_common_io jemalloc)
endif()
target_link_libraries (clickhouse_common_io PRIVATE jemalloc)
add_subdirectory(Common/ZooKeeper)
add_subdirectory(Common/Config)
@ -479,6 +481,11 @@ if (USE_NLP)
dbms_target_link_libraries (PUBLIC lemmagen)
endif()
if (USE_BZIP2)
target_link_libraries (clickhouse_common_io PRIVATE ${BZIP2_LIBRARY})
target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${BZIP2_INCLUDE_DIR})
endif()
include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake")
if (ENABLE_TESTS AND USE_GTEST)

View File

@ -3,6 +3,7 @@
#include <Common/CurrentMemoryTracker.h>
namespace
{
@ -36,6 +37,7 @@ namespace
if (current_thread)
{
current_thread->untracked_memory += size;
if (current_thread->untracked_memory > current_thread->untracked_memory_limit)
{
/// Zero untracked before track. If tracker throws out-of-limit we would be able to alloc up to untracked_memory_limit bytes
@ -54,6 +56,12 @@ namespace
}
}
void check()
{
if (auto * memory_tracker = getMemoryTracker())
memory_tracker->allocImpl(0, true);
}
void alloc(Int64 size)
{
bool throw_if_memory_exceeded = true;

View File

@ -9,4 +9,5 @@ namespace CurrentMemoryTracker
void allocNoThrow(Int64 size);
void realloc(Int64 old_size, Int64 new_size);
void free(Int64 size);
void check();
}

View File

@ -561,6 +561,8 @@
M(591, SQLITE_ENGINE_ERROR) \
M(592, DATA_ENCRYPTION_ERROR) \
M(593, ZERO_COPY_REPLICATION_ERROR) \
M(594, BZIP2_STREAM_DECODER_FAILED) \
M(595, BZIP2_STREAM_ENCODER_FAILED) \
\
M(998, POSTGRESQL_CONNECTION_FAILURE) \
M(999, KEEPER_EXCEPTION) \

View File

@ -192,6 +192,9 @@ template <typename Thread>
ThreadPoolImpl<Thread>::~ThreadPoolImpl()
{
finalize();
/// wait() hadn't been called, log exception at least.
if (first_exception)
DB::tryLogException(first_exception, __PRETTY_FUNCTION__);
}
template <typename Thread>
@ -270,11 +273,21 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
}
catch (...)
{
ALLOW_ALLOCATIONS_IN_SCOPE;
/// job should be reset before decrementing scheduled_jobs to
/// ensure that the Job destroyed before wait() returns.
job = {};
{
/// In case thread pool will not be terminated on exception
/// (this is the case for GlobalThreadPool),
/// than first_exception may be overwritten and got lost,
/// and this usually is an error, since this will finish the thread,
/// and for this the caller may not be ready.
if (!shutdown_on_exception)
DB::tryLogException(std::current_exception(), __PRETTY_FUNCTION__);
std::unique_lock lock(mutex);
if (!first_exception)
first_exception = std::current_exception(); // NOLINT

View File

@ -0,0 +1,55 @@
#include <Common/memory.h>
#include <cstdlib>
/** These functions can be substituted instead of regular ones when memory tracking is needed.
*/
extern "C" void * clickhouse_malloc(size_t size)
{
void * res = malloc(size);
if (res)
Memory::trackMemory(size);
return res;
}
extern "C" void * clickhouse_calloc(size_t number_of_members, size_t size)
{
void * res = calloc(number_of_members, size);
if (res)
Memory::trackMemory(number_of_members * size);
return res;
}
extern "C" void * clickhouse_realloc(void * ptr, size_t size)
{
if (ptr)
Memory::untrackMemory(ptr);
void * res = realloc(ptr, size);
if (res)
Memory::trackMemory(size);
return res;
}
extern "C" void * clickhouse_reallocarray(void * ptr, size_t number_of_members, size_t size)
{
size_t real_size = 0;
if (__builtin_mul_overflow(number_of_members, size, &real_size))
return nullptr;
return clickhouse_realloc(ptr, real_size);
}
extern "C" void clickhouse_free(void * ptr)
{
Memory::untrackMemory(ptr);
free(ptr);
}
extern "C" int clickhouse_posix_memalign(void ** memptr, size_t alignment, size_t size)
{
int res = posix_memalign(memptr, alignment, size);
if (res == 0)
Memory::trackMemory(size);
return res;
}

View File

@ -19,3 +19,4 @@
#cmakedefine01 USE_DATASKETCHES
#cmakedefine01 USE_YAML_CPP
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY
#cmakedefine01 USE_BZIP2

25
src/Common/memory.cpp Normal file
View File

@ -0,0 +1,25 @@
#if defined(OS_DARWIN) && defined(BUNDLED_STATIC_JEMALLOC)
extern "C"
{
extern void zone_register();
}
struct InitializeJemallocZoneAllocatorForOSX
{
InitializeJemallocZoneAllocatorForOSX()
{
/// In case of OSX jemalloc register itself as a default zone allocator.
///
/// But when you link statically then zone_register() will not be called,
/// and even will be optimized out:
///
/// It is ok to call it twice (i.e. in case of shared libraries)
/// Since zone_register() is a no-op if the default zone is already replaced with something.
///
/// https://github.com/jemalloc/jemalloc/issues/708
zone_register();
}
} initializeJemallocZoneAllocatorForOSX;
#endif

108
src/Common/memory.h Normal file
View File

@ -0,0 +1,108 @@
#pragma once
#include <new>
#include <common/defines.h>
#include <Common/CurrentMemoryTracker.h>
#if USE_JEMALLOC
# include <jemalloc/jemalloc.h>
#endif
#if !USE_JEMALLOC || JEMALLOC_VERSION_MAJOR < 4
# include <cstdlib>
#endif
namespace Memory
{
inline ALWAYS_INLINE void * newImpl(std::size_t size)
{
auto * ptr = malloc(size);
if (likely(ptr != nullptr))
return ptr;
/// @note no std::get_new_handler logic implemented
throw std::bad_alloc{};
}
inline ALWAYS_INLINE void * newNoExept(std::size_t size) noexcept
{
return malloc(size);
}
inline ALWAYS_INLINE void deleteImpl(void * ptr) noexcept
{
free(ptr);
}
#if USE_JEMALLOC && JEMALLOC_VERSION_MAJOR >= 4
inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size) noexcept
{
if (unlikely(ptr == nullptr))
return;
sdallocx(ptr, size, 0);
}
#else
inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unused]]) noexcept
{
free(ptr);
}
#endif
#if defined(OS_LINUX)
# include <malloc.h>
#elif defined(OS_DARWIN)
# include <malloc/malloc.h>
#endif
inline ALWAYS_INLINE size_t getActualAllocationSize(size_t size)
{
size_t actual_size = size;
#if USE_JEMALLOC && JEMALLOC_VERSION_MAJOR >= 5
/// The nallocx() function allocates no memory, but it performs the same size computation as the mallocx() function
/// @note je_mallocx() != je_malloc(). It's expected they don't differ much in allocation logic.
if (likely(size != 0))
actual_size = nallocx(size, 0);
#endif
return actual_size;
}
inline ALWAYS_INLINE void trackMemory(std::size_t size)
{
std::size_t actual_size = getActualAllocationSize(size);
CurrentMemoryTracker::allocNoThrow(actual_size);
}
inline ALWAYS_INLINE void untrackMemory(void * ptr [[maybe_unused]], std::size_t size [[maybe_unused]] = 0) noexcept
{
try
{
#if USE_JEMALLOC && JEMALLOC_VERSION_MAJOR >= 5
/// @note It's also possible to use je_malloc_usable_size() here.
if (likely(ptr != nullptr))
CurrentMemoryTracker::free(sallocx(ptr, 0));
#else
if (size)
CurrentMemoryTracker::free(size);
# if defined(_GNU_SOURCE)
/// It's innaccurate resource free for sanitizers. malloc_usable_size() result is greater or equal to allocated size.
else
CurrentMemoryTracker::free(malloc_usable_size(ptr));
# endif
#endif
}
catch (...)
{}
}
}

View File

@ -1,117 +1,34 @@
#include <common/memory.h>
#include <Common/CurrentMemoryTracker.h>
#include <iostream>
#include <Common/memory.h>
#include <new>
#if defined(OS_LINUX)
# include <malloc.h>
#elif defined(OS_DARWIN)
# include <malloc/malloc.h>
#endif
#if defined(OS_DARWIN) && defined(BUNDLED_STATIC_JEMALLOC)
extern "C"
{
extern void zone_register();
}
struct InitializeJemallocZoneAllocatorForOSX
{
InitializeJemallocZoneAllocatorForOSX()
{
/// In case of OSX jemalloc register itself as a default zone allocator.
///
/// But when you link statically then zone_register() will not be called,
/// and even will be optimized out:
///
/// It is ok to call it twice (i.e. in case of shared libraries)
/// Since zone_register() is a no-op if the default zone is already replaced with something.
///
/// https://github.com/jemalloc/jemalloc/issues/708
zone_register();
}
} initializeJemallocZoneAllocatorForOSX;
#endif
/// Replace default new/delete with memory tracking versions.
/// @sa https://en.cppreference.com/w/cpp/memory/new/operator_new
/// https://en.cppreference.com/w/cpp/memory/new/operator_delete
namespace Memory
{
inline ALWAYS_INLINE size_t getActualAllocationSize(size_t size)
{
size_t actual_size = size;
#if USE_JEMALLOC && JEMALLOC_VERSION_MAJOR >= 5
/// The nallocx() function allocates no memory, but it performs the same size computation as the mallocx() function
/// @note je_mallocx() != je_malloc(). It's expected they don't differ much in allocation logic.
if (likely(size != 0))
actual_size = nallocx(size, 0);
#endif
return actual_size;
}
inline ALWAYS_INLINE void trackMemory(std::size_t size)
{
std::size_t actual_size = getActualAllocationSize(size);
CurrentMemoryTracker::allocNoThrow(actual_size);
}
inline ALWAYS_INLINE void untrackMemory(void * ptr [[maybe_unused]], std::size_t size [[maybe_unused]] = 0) noexcept
{
try
{
#if USE_JEMALLOC && JEMALLOC_VERSION_MAJOR >= 5
/// @note It's also possible to use je_malloc_usable_size() here.
if (likely(ptr != nullptr))
CurrentMemoryTracker::free(sallocx(ptr, 0));
#else
if (size)
CurrentMemoryTracker::free(size);
# if defined(_GNU_SOURCE)
/// It's innaccurate resource free for sanitizers. malloc_usable_size() result is greater or equal to allocated size.
else
CurrentMemoryTracker::free(malloc_usable_size(ptr));
# endif
#endif
}
catch (...)
{}
}
}
/// new
void * operator new(std::size_t size)
{
Memory::trackMemory(size);
return Memory::newImpl(size);
}
void * operator new[](std::size_t size)
{
Memory::trackMemory(size);
return Memory::newImpl(size);
}
void * operator new(std::size_t size, const std::nothrow_t &) noexcept
{
Memory::trackMemory(size);
return Memory::newNoExept(size);
}
void * operator new[](std::size_t size, const std::nothrow_t &) noexcept
{
Memory::trackMemory(size);
return Memory::newNoExept(size);
}

View File

@ -123,7 +123,7 @@ class IColumn;
\
M(UInt64, parallel_distributed_insert_select, 0, "Process distributed INSERT SELECT query in the same cluster on local tables on every shard, if 1 SELECT is executed on each shard, if 2 SELECT and INSERT is executed on each shard", 0) \
M(UInt64, distributed_group_by_no_merge, 0, "If 1, Do not merge aggregation states from different servers for distributed queries (shards will process query up to the Complete stage, initiator just proxies the data from the shards). If 2 the initiator will apply ORDER BY and LIMIT stages (it is not in case when shard process query up to the Complete stage)", 0) \
M(UInt64, distributed_push_down_limit, 0, "If 1, LIMIT will be applied on each shard separatelly. Usually you don't need to use it, since this will be done automatically if it is possible, i.e. for simple query SELECT FROM LIMIT.", 0) \
M(UInt64, distributed_push_down_limit, 1, "If 1, LIMIT will be applied on each shard separatelly. Usually you don't need to use it, since this will be done automatically if it is possible, i.e. for simple query SELECT FROM LIMIT.", 0) \
M(Bool, optimize_distributed_group_by_sharding_key, false, "Optimize GROUP BY sharding_key queries (by avoiding costly aggregation on the initiator server).", 0) \
M(UInt64, optimize_skip_unused_shards_limit, 1000, "Limit for number of sharding key values, turns off optimize_skip_unused_shards if the limit is reached", 0) \
M(Bool, optimize_skip_unused_shards, false, "Assumes that data is distributed by sharding_key. Optimization to skip unused shards if SELECT query filters by sharding_key.", 0) \
@ -169,6 +169,7 @@ class IColumn;
M(Int64, os_thread_priority, 0, "If non zero - set corresponding 'nice' value for query processing threads. Can be used to adjust query priority for OS scheduler.", 0) \
\
M(Bool, log_queries, 1, "Log requests and write the log to the system table.", 0) \
M(Bool, log_formatted_queries, 0, "Log formatted queries and write the log to the system table.", 0) \
M(LogQueriesType, log_queries_min_type, QueryLogElementType::QUERY_START, "Minimal type in query_log to log, possible values (from low to high): QUERY_START, QUERY_FINISH, EXCEPTION_BEFORE_START, EXCEPTION_WHILE_PROCESSING.", 0) \
M(Milliseconds, log_queries_min_query_duration_ms, 0, "Minimal time for the query to run, to get to the query_log/query_thread_log.", 0) \
M(UInt64, log_queries_cut_to_length, 100000, "If query length is greater than specified threshold (in bytes), then cut query when writing to query log. Also limit length of printed query in ordinary text log.", 0) \
@ -377,6 +378,8 @@ class IColumn;
M(Bool, external_table_functions_use_nulls, true, "If it is set to true, external table functions will implicitly use Nullable type if needed. Otherwise NULLs will be substituted with default values. Currently supported only by 'mysql', 'postgresql' and 'odbc' table functions.", 0) \
\
M(Bool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.", 0) \
M(UInt64, max_hyperscan_regexp_length, 0, "Max length of regexp than can be used in hyperscan multi-match functions. Zero means unlimited.", 0) \
M(UInt64, max_hyperscan_regexp_total_length, 0, "Max total length of all regexps than can be used in hyperscan multi-match functions (per every function). Zero means unlimited.", 0) \
M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \
M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \
\
@ -498,6 +501,7 @@ class IColumn;
M(Bool, enable_debug_queries, false, "Obsolete setting, does nothing.", 0) \
M(Bool, allow_experimental_database_atomic, true, "Obsolete setting, does nothing.", 0) \
M(Bool, allow_experimental_bigint_types, true, "Obsolete setting, does nothing.", 0) \
M(Bool, allow_experimental_window_functions, true, "Obsolete setting, does nothing.", 0) \
M(HandleKafkaErrorMode, handle_kafka_error_mode, HandleKafkaErrorMode::DEFAULT, "Obsolete setting, does nothing.", 0) \
M(Bool, database_replicated_ddl_output, true, "Obsolete setting, does nothing.", 0) \
/** The section above is for obsolete settings. Do not add anything there. */

View File

@ -3,6 +3,7 @@
#include <Common/ProfileEvents.h>
#include <Common/CurrentThread.h>
#include <IO/WriteHelpers.h>
#include <Common/Stopwatch.h>
#include <common/sleep.h>
namespace ProfileEvents
@ -104,14 +105,18 @@ static bool handleOverflowMode(OverflowMode mode, const String & message, int co
}
}
bool ExecutionSpeedLimits::checkTimeLimit(UInt64 elapsed_ns, OverflowMode overflow_mode) const
bool ExecutionSpeedLimits::checkTimeLimit(const Stopwatch & stopwatch, OverflowMode overflow_mode) const
{
if (max_execution_time != 0
&& elapsed_ns > static_cast<UInt64>(max_execution_time.totalMicroseconds()) * 1000)
return handleOverflowMode(overflow_mode,
if (max_execution_time != 0)
{
auto elapsed_ns = stopwatch.elapsed();
if (elapsed_ns > static_cast<UInt64>(max_execution_time.totalMicroseconds()) * 1000)
return handleOverflowMode(overflow_mode,
"Timeout exceeded: elapsed " + toString(static_cast<double>(elapsed_ns) / 1000000000ULL)
+ " seconds, maximum: " + toString(max_execution_time.totalMicroseconds() / 1000000.0),
ErrorCodes::TIMEOUT_EXCEEDED);
}
return true;
}

View File

@ -3,6 +3,7 @@
#include <Poco/Timespan.h>
#include <common/types.h>
#include <DataStreams/SizeLimits.h>
#include <Common/Stopwatch.h>
namespace DB
{
@ -25,7 +26,7 @@ public:
/// Pause execution in case if speed limits were exceeded.
void throttle(size_t read_rows, size_t read_bytes, size_t total_rows_to_read, UInt64 total_elapsed_microseconds) const;
bool checkTimeLimit(UInt64 elapsed_ns, OverflowMode overflow_mode) const;
bool checkTimeLimit(const Stopwatch & stopwatch, OverflowMode overflow_mode) const;
};
}

View File

@ -201,7 +201,7 @@ void IBlockInputStream::updateExtremes(Block & block)
bool IBlockInputStream::checkTimeLimit() const
{
return limits.speed_limits.checkTimeLimit(info.total_stopwatch.elapsed(), limits.timeout_overflow_mode);
return limits.speed_limits.checkTimeLimit(info.total_stopwatch, limits.timeout_overflow_mode);
}

View File

@ -19,6 +19,7 @@ public:
virtual ~ProxyConfiguration() = default;
/// Returns proxy configuration on each HTTP request.
virtual Aws::Client::ClientConfigurationPerRequest getConfiguration(const Aws::Http::HttpRequest & request) = 0;
virtual void errorReport(const Aws::Client::ClientConfigurationPerRequest & config) = 0;
};
}

View File

@ -20,6 +20,7 @@ class ProxyListConfiguration : public ProxyConfiguration
public:
explicit ProxyListConfiguration(std::vector<Poco::URI> proxies_);
Aws::Client::ClientConfigurationPerRequest getConfiguration(const Aws::Http::HttpRequest & request) override;
void errorReport(const Aws::Client::ClientConfigurationPerRequest &) override {}
private:
/// List of configured proxies.

View File

@ -16,8 +16,10 @@ namespace DB::ErrorCodes
namespace DB::S3
{
ProxyResolverConfiguration::ProxyResolverConfiguration(const Poco::URI & endpoint_, String proxy_scheme_, unsigned proxy_port_)
: endpoint(endpoint_), proxy_scheme(std::move(proxy_scheme_)), proxy_port(proxy_port_)
ProxyResolverConfiguration::ProxyResolverConfiguration(const Poco::URI & endpoint_, String proxy_scheme_
, unsigned proxy_port_, unsigned cache_ttl_)
: endpoint(endpoint_), proxy_scheme(std::move(proxy_scheme_)), proxy_port(proxy_port_), cache_ttl(cache_ttl_)
{
}
@ -25,16 +27,25 @@ Aws::Client::ClientConfigurationPerRequest ProxyResolverConfiguration::getConfig
{
LOG_DEBUG(&Poco::Logger::get("AWSClient"), "Obtain proxy using resolver: {}", endpoint.toString());
std::unique_lock lock(cache_mutex);
std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
if (cache_ttl.count() && cache_valid && now <= cache_timestamp + cache_ttl && now >= cache_timestamp)
{
LOG_DEBUG(&Poco::Logger::get("AWSClient"), "Use cached proxy: {}://{}:{}", Aws::Http::SchemeMapper::ToString(cached_config.proxyScheme), cached_config.proxyHost, cached_config.proxyPort);
return cached_config;
}
/// 1 second is enough for now.
/// TODO: Make timeouts configurable.
ConnectionTimeouts timeouts(
Poco::Timespan(1000000), /// Connection timeout.
Poco::Timespan(1000000), /// Send timeout.
Poco::Timespan(1000000) /// Receive timeout.
Poco::Timespan(1000000) /// Receive timeout.
);
auto session = makeHTTPSession(endpoint, timeouts);
Aws::Client::ClientConfigurationPerRequest cfg;
try
{
/// It should be just empty GET request.
@ -53,20 +64,41 @@ Aws::Client::ClientConfigurationPerRequest ProxyResolverConfiguration::getConfig
LOG_DEBUG(&Poco::Logger::get("AWSClient"), "Use proxy: {}://{}:{}", proxy_scheme, proxy_host, proxy_port);
cfg.proxyScheme = Aws::Http::SchemeMapper::FromString(proxy_scheme.c_str());
cfg.proxyHost = proxy_host;
cfg.proxyPort = proxy_port;
cached_config.proxyScheme = Aws::Http::SchemeMapper::FromString(proxy_scheme.c_str());
cached_config.proxyHost = proxy_host;
cached_config.proxyPort = proxy_port;
cache_timestamp = std::chrono::system_clock::now();
cache_valid = true;
return cfg;
return cached_config;
}
catch (...)
{
tryLogCurrentException("AWSClient", "Failed to obtain proxy");
/// Don't use proxy if it can't be obtained.
Aws::Client::ClientConfigurationPerRequest cfg;
return cfg;
}
}
void ProxyResolverConfiguration::errorReport(const Aws::Client::ClientConfigurationPerRequest & config)
{
if (config.proxyHost.empty())
return;
std::unique_lock lock(cache_mutex);
if (!cache_ttl.count() || !cache_valid)
return;
if (cached_config.proxyScheme != config.proxyScheme || cached_config.proxyHost != config.proxyHost
|| cached_config.proxyPort != config.proxyPort)
return;
/// Invalidate cached proxy when got error with this proxy
cache_valid = false;
}
}
#endif

View File

@ -8,6 +8,8 @@
#include "ProxyConfiguration.h"
#include <mutex>
namespace DB::S3
{
/**
@ -18,8 +20,9 @@ namespace DB::S3
class ProxyResolverConfiguration : public ProxyConfiguration
{
public:
ProxyResolverConfiguration(const Poco::URI & endpoint_, String proxy_scheme_, unsigned proxy_port_);
ProxyResolverConfiguration(const Poco::URI & endpoint_, String proxy_scheme_, unsigned proxy_port_, unsigned cache_ttl_);
Aws::Client::ClientConfigurationPerRequest getConfiguration(const Aws::Http::HttpRequest & request) override;
void errorReport(const Aws::Client::ClientConfigurationPerRequest & config) override;
private:
/// Endpoint to obtain a proxy host.
@ -28,6 +31,12 @@ private:
const String proxy_scheme;
/// Port for obtained proxy.
const unsigned proxy_port;
std::mutex cache_mutex;
bool cache_valid = false;
std::chrono::time_point<std::chrono::system_clock> cache_timestamp;
const std::chrono::seconds cache_ttl{0};
Aws::Client::ClientConfigurationPerRequest cached_config;
};
}

View File

@ -56,11 +56,12 @@ std::shared_ptr<S3::ProxyResolverConfiguration> getProxyResolverConfiguration(
if (proxy_scheme != "http" && proxy_scheme != "https")
throw Exception("Only HTTP/HTTPS schemas allowed in proxy resolver config: " + proxy_scheme, ErrorCodes::BAD_ARGUMENTS);
auto proxy_port = proxy_resolver_config.getUInt(prefix + ".proxy_port");
auto cache_ttl = proxy_resolver_config.getUInt(prefix + ".proxy_cache_time", 10);
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Configured proxy resolver: {}, Scheme: {}, Port: {}",
endpoint.toString(), proxy_scheme, proxy_port);
return std::make_shared<S3::ProxyResolverConfiguration>(endpoint, proxy_scheme, proxy_port);
return std::make_shared<S3::ProxyResolverConfiguration>(endpoint, proxy_scheme, proxy_port, cache_ttl);
}
std::shared_ptr<S3::ProxyListConfiguration> getProxyListConfiguration(
@ -128,8 +129,12 @@ getClient(const Poco::Util::AbstractConfiguration & config, const String & confi
auto proxy_config = getProxyConfiguration(config_prefix, config);
if (proxy_config)
{
client_configuration.perRequestConfiguration
= [proxy_config](const auto & request) { return proxy_config->getConfiguration(request); };
client_configuration.error_report
= [proxy_config](const auto & request_config) { proxy_config->errorReport(request_config); };
}
client_configuration.retryStrategy
= std::make_shared<Aws::Client::DefaultRetryStrategy>(config.getUInt(config_prefix + ".retry_attempts", 10));

View File

@ -10,6 +10,7 @@
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/hyperscanRegexpChecker.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <common/StringRef.h>
@ -40,7 +41,13 @@ public:
throw Exception(
"Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED);
return std::make_shared<FunctionsMultiStringFuzzySearch>();
return std::make_shared<FunctionsMultiStringFuzzySearch>(
context->getSettingsRef().max_hyperscan_regexp_length, context->getSettingsRef().max_hyperscan_regexp_total_length);
}
FunctionsMultiStringFuzzySearch(size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_)
: max_hyperscan_regexp_length(max_hyperscan_regexp_length_), max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_)
{
}
String getName() const override { return name; }
@ -113,6 +120,9 @@ public:
for (const auto & el : src_arr)
refs.emplace_back(el.get<String>());
if (Impl::is_using_hyperscan)
checkRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
auto col_res = ColumnVector<ResultType>::create();
auto col_offsets = ColumnArray::ColumnOffsets::create();
@ -131,6 +141,10 @@ public:
else
return col_res;
}
private:
size_t max_hyperscan_regexp_length;
size_t max_hyperscan_regexp_total_length;
};
}

View File

@ -10,6 +10,7 @@
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/hyperscanRegexpChecker.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <common/StringRef.h>
@ -53,7 +54,13 @@ public:
throw Exception(
"Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED);
return std::make_shared<FunctionsMultiStringSearch>();
return std::make_shared<FunctionsMultiStringSearch>(
context->getSettingsRef().max_hyperscan_regexp_length, context->getSettingsRef().max_hyperscan_regexp_total_length);
}
FunctionsMultiStringSearch(size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_)
: max_hyperscan_regexp_length(max_hyperscan_regexp_length_), max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_)
{
}
String getName() const override { return name; }
@ -105,6 +112,9 @@ public:
for (const auto & el : src_arr)
refs.emplace_back(el.get<String>());
if (Impl::is_using_hyperscan)
checkRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
auto col_res = ColumnVector<ResultType>::create();
auto col_offsets = ColumnArray::ColumnOffsets::create();
@ -122,6 +132,10 @@ public:
else
return col_res;
}
private:
size_t max_hyperscan_regexp_length;
size_t max_hyperscan_regexp_total_length;
};
}

View File

@ -42,6 +42,8 @@ struct MultiSearchFirstIndexImpl
}
++iteration;
}
if (iteration == 0)
std::fill(res.begin(), res.end(), 0);
}
};

View File

@ -51,6 +51,8 @@ struct MultiSearchFirstPositionImpl
}
++iteration;
}
if (iteration == 0)
std::fill(res.begin(), res.end(), 0);
}
};

View File

@ -41,6 +41,8 @@ struct MultiSearchImpl
}
++iteration;
}
if (iteration == 0)
std::fill(res.begin(), res.end(), 0);
}
};

View File

@ -1,4 +1,5 @@
#include <Columns/ColumnArray.h>
#include <Columns/ColumnMap.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypeArray.h>
@ -7,6 +8,7 @@
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include "Core/ColumnWithTypeAndName.h"
#include "DataTypes/DataTypeMap.h"
#include "DataTypes/IDataType.h"
namespace DB
@ -32,85 +34,211 @@ private:
bool isVariadic() const override { return true; }
bool useDefaultImplementationForConstants() const override { return true; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
void checkTypes(const DataTypePtr & key_type, const DataTypePtr max_key_type) const
{
WhichDataType which_key(key_type);
if (!(which_key.isInt() || which_key.isUInt()))
{
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Keys for {} function should be of integer type (signed or unsigned)", getName());
}
if (max_key_type)
{
WhichDataType which_max_key(max_key_type);
if (which_max_key.isNullable())
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Max key argument in arguments of function " + getName() + " can not be Nullable");
if (key_type->getTypeId() != max_key_type->getTypeId())
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Max key type in {} should be same as keys type", getName());
}
}
DataTypePtr getReturnTypeForTuple(const DataTypes & arguments) const
{
if (arguments.size() < 2)
throw Exception{getName() + " accepts at least two arrays for key and value", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH};
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} accepts at least two arrays for key and value", getName());
if (arguments.size() > 3)
throw Exception{"too many arguments in " + getName() + " call", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH};
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Too many arguments in {} call", getName());
const DataTypeArray * key_array_type = checkAndGetDataType<DataTypeArray>(arguments[0].get());
const DataTypeArray * val_array_type = checkAndGetDataType<DataTypeArray>(arguments[1].get());
if (!key_array_type || !val_array_type)
throw Exception{getName() + " accepts two arrays for key and value", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {} accepts two arrays for key and value", getName());
DataTypePtr keys_type = key_array_type->getNestedType();
WhichDataType which_key(keys_type);
if (!(which_key.isNativeInt() || which_key.isNativeUInt()))
{
throw Exception(
"Keys for " + getName() + " should be of native integer type (signed or unsigned)", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
const auto & key_type = key_array_type->getNestedType();
if (arguments.size() == 3)
{
DataTypePtr max_key_type = arguments[2];
WhichDataType which_max_key(max_key_type);
if (which_max_key.isNullable())
throw Exception(
"Max key argument in arguments of function " + getName() + " can not be Nullable",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (keys_type->getTypeId() != max_key_type->getTypeId())
throw Exception("Max key type in " + getName() + " should be same as keys type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
this->checkTypes(key_type, arguments[2]);
else
this->checkTypes(key_type, nullptr);
return std::make_shared<DataTypeTuple>(DataTypes{arguments[0], arguments[1]});
}
template <typename KeyType, typename ValType>
ColumnPtr execute2(ColumnPtr key_column, ColumnPtr val_column, ColumnPtr max_key_column, const DataTypeTuple & res_type) const
DataTypePtr getReturnTypeForMap(const DataTypes & arguments) const
{
MutableColumnPtr res_tuple = res_type.createColumn();
const auto * map = assert_cast<const DataTypeMap *>(arguments[0].get());
if (arguments.size() == 1)
this->checkTypes(map->getKeyType(), nullptr);
else if (arguments.size() == 2)
this->checkTypes(map->getKeyType(), arguments[1]);
else
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Too many arguments in {} call", getName());
auto * to_tuple = assert_cast<ColumnTuple *>(res_tuple.get());
auto & to_keys_arr = assert_cast<ColumnArray &>(to_tuple->getColumn(0));
auto & to_keys_data = to_keys_arr.getData();
auto & to_keys_offsets = to_keys_arr.getOffsets();
return std::make_shared<DataTypeMap>(map->getKeyType(), map->getValueType());
}
auto & to_vals_arr = assert_cast<ColumnArray &>(to_tuple->getColumn(1));
auto & to_values_data = to_vals_arr.getData();
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.empty())
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, getName() + " accepts at least one map or two arrays");
bool max_key_is_const = false, key_is_const = false, val_is_const = false;
if (arguments[0]->getTypeId() == TypeIndex::Array)
return getReturnTypeForTuple(arguments);
else if (arguments[0]->getTypeId() == TypeIndex::Map)
return getReturnTypeForMap(arguments);
else
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Function {} only accepts one map or arrays, but got {}",
getName(),
arguments[0]->getName());
}
const auto * keys_array = checkAndGetColumn<ColumnArray>(key_column.get());
if (!keys_array)
// Struct holds input and output columns references,
// Both arrays and maps have similar columns to work with but extracted differently
template <typename KeyType, typename ValType>
struct ColumnsInOut
{
// inputs
const PaddedPODArray<KeyType> & in_keys_data;
const PaddedPODArray<ValType> & in_vals_data;
const IColumn::Offsets & in_key_offsets;
const IColumn::Offsets & in_val_offsets;
size_t row_count;
bool key_is_const;
bool val_is_const;
// outputs
PaddedPODArray<KeyType> & out_keys_data;
PaddedPODArray<ValType> & out_vals_data;
IColumn::Offsets & out_keys_offsets;
// with map argument this field will not be used
IColumn::Offsets * out_vals_offsets;
};
template <typename KeyType, typename ValType>
ColumnsInOut<KeyType, ValType> getInOutDataFromArrays(MutableColumnPtr & res_column, ColumnPtr * arg_columns) const
{
auto * out_tuple = assert_cast<ColumnTuple *>(res_column.get());
auto & out_keys_array = assert_cast<ColumnArray &>(out_tuple->getColumn(0));
auto & out_vals_array = assert_cast<ColumnArray &>(out_tuple->getColumn(1));
const auto * key_column = arg_columns[0].get();
const auto * in_keys_array = checkAndGetColumn<ColumnArray>(key_column);
bool key_is_const = false, val_is_const = false;
if (!in_keys_array)
{
const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>(key_column.get());
const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>(key_column);
if (!const_array)
throw Exception("Expected array column, found " + key_column->getName(), ErrorCodes::ILLEGAL_COLUMN);
throw Exception(
ErrorCodes::ILLEGAL_COLUMN, "Expected array column in function {}, found {}", getName(), key_column->getName());
keys_array = checkAndGetColumn<ColumnArray>(const_array->getDataColumnPtr().get());
in_keys_array = checkAndGetColumn<ColumnArray>(const_array->getDataColumnPtr().get());
key_is_const = true;
}
const auto * values_array = checkAndGetColumn<ColumnArray>(val_column.get());
if (!values_array)
const auto * val_column = arg_columns[1].get();
const auto * in_values_array = checkAndGetColumn<ColumnArray>(val_column);
if (!in_values_array)
{
const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>(val_column.get());
const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>(val_column);
if (!const_array)
throw Exception("Expected array column, found " + val_column->getName(), ErrorCodes::ILLEGAL_COLUMN);
throw Exception(
ErrorCodes::ILLEGAL_COLUMN, "Expected array column in function {}, found {}", getName(), val_column->getName());
values_array = checkAndGetColumn<ColumnArray>(const_array->getDataColumnPtr().get());
in_values_array = checkAndGetColumn<ColumnArray>(const_array->getDataColumnPtr().get());
val_is_const = true;
}
if (!keys_array || !values_array)
if (!in_keys_array || !in_values_array)
/* something went wrong */
throw Exception{"Illegal columns in arguments of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns in arguments of function " + getName());
const auto & in_keys_data = assert_cast<const ColumnVector<KeyType> &>(in_keys_array->getData()).getData();
const auto & in_values_data = assert_cast<const ColumnVector<ValType> &>(in_values_array->getData()).getData();
const auto & in_keys_offsets = in_keys_array->getOffsets();
const auto & in_vals_offsets = in_values_array->getOffsets();
auto & out_keys_data = assert_cast<ColumnVector<KeyType> &>(out_keys_array.getData()).getData();
auto & out_vals_data = assert_cast<ColumnVector<ValType> &>(out_vals_array.getData()).getData();
auto & out_keys_offsets = out_keys_array.getOffsets();
size_t row_count = key_is_const ? in_values_array->size() : in_keys_array->size();
IColumn::Offsets * out_vals_offsets = &out_vals_array.getOffsets();
return {
in_keys_data,
in_values_data,
in_keys_offsets,
in_vals_offsets,
row_count,
key_is_const,
val_is_const,
out_keys_data,
out_vals_data,
out_keys_offsets,
out_vals_offsets};
}
template <typename KeyType, typename ValType>
ColumnsInOut<KeyType, ValType> getInOutDataFromMap(MutableColumnPtr & res_column, ColumnPtr * arg_columns) const
{
const auto * in_map = assert_cast<const ColumnMap *>(arg_columns[0].get());
const auto & in_nested_array = in_map->getNestedColumn();
const auto & in_nested_tuple = in_map->getNestedData();
const auto & in_keys_data = assert_cast<const ColumnVector<KeyType> &>(in_nested_tuple.getColumn(0)).getData();
const auto & in_vals_data = assert_cast<const ColumnVector<ValType> &>(in_nested_tuple.getColumn(1)).getData();
const auto & in_keys_offsets = in_nested_array.getOffsets();
auto * out_map = assert_cast<ColumnMap *>(res_column.get());
auto & out_nested_array = out_map->getNestedColumn();
auto & out_nested_tuple = out_map->getNestedData();
auto & out_keys_data = assert_cast<ColumnVector<KeyType> &>(out_nested_tuple.getColumn(0)).getData();
auto & out_vals_data = assert_cast<ColumnVector<ValType> &>(out_nested_tuple.getColumn(1)).getData();
auto & out_keys_offsets = out_nested_array.getOffsets();
return {
in_keys_data,
in_vals_data,
in_keys_offsets,
in_keys_offsets,
in_nested_array.size(),
false,
false,
out_keys_data,
out_vals_data,
out_keys_offsets,
nullptr};
}
template <typename KeyType, typename ValType>
ColumnPtr execute2(ColumnPtr * arg_columns, ColumnPtr max_key_column, const DataTypePtr & res_type) const
{
MutableColumnPtr res_column = res_type->createColumn();
bool max_key_is_const = false;
auto columns = res_column->getDataType() == TypeIndex::Tuple ? getInOutDataFromArrays<KeyType, ValType>(res_column, arg_columns)
: getInOutDataFromMap<KeyType, ValType>(res_column, arg_columns);
KeyType max_key_const{0};
@ -121,49 +249,43 @@ private:
max_key_is_const = true;
}
auto & keys_data = assert_cast<const ColumnVector<KeyType> &>(keys_array->getData()).getData();
auto & values_data = assert_cast<const ColumnVector<ValType> &>(values_array->getData()).getData();
// Original offsets
const IColumn::Offsets & key_offsets = keys_array->getOffsets();
const IColumn::Offsets & val_offsets = values_array->getOffsets();
IColumn::Offset offset{0};
size_t row_count = key_is_const ? values_array->size() : keys_array->size();
std::map<KeyType, ValType> res_map;
//Iterate through two arrays and fill result values.
for (size_t row = 0; row < row_count; ++row)
for (size_t row = 0; row < columns.row_count; ++row)
{
size_t key_offset = 0, val_offset = 0, array_size = key_offsets[0], val_array_size = val_offsets[0];
size_t key_offset = 0, val_offset = 0, items_count = columns.in_key_offsets[0], val_array_size = columns.in_val_offsets[0];
res_map.clear();
if (!key_is_const)
if (!columns.key_is_const)
{
key_offset = row > 0 ? key_offsets[row - 1] : 0;
array_size = key_offsets[row] - key_offset;
key_offset = row > 0 ? columns.in_key_offsets[row - 1] : 0;
items_count = columns.in_key_offsets[row] - key_offset;
}
if (!val_is_const)
if (!columns.val_is_const)
{
val_offset = row > 0 ? val_offsets[row - 1] : 0;
val_array_size = val_offsets[row] - val_offset;
val_offset = row > 0 ? columns.in_val_offsets[row - 1] : 0;
val_array_size = columns.in_val_offsets[row] - val_offset;
}
if (array_size != val_array_size)
throw Exception("Key and value array should have same amount of elements", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (items_count != val_array_size)
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Key and value array should have same amount of elements in function {}",
getName());
if (array_size == 0)
if (items_count == 0)
{
to_keys_offsets.push_back(offset);
columns.out_keys_offsets.push_back(offset);
continue;
}
for (size_t i = 0; i < array_size; ++i)
for (size_t i = 0; i < items_count; ++i)
{
res_map.insert({keys_data[key_offset + i], values_data[val_offset + i]});
res_map.insert({columns.in_keys_data[key_offset + i], columns.in_vals_data[val_offset + i]});
}
auto min_key = res_map.begin()->first;
@ -184,7 +306,7 @@ private:
/* no need to add anything, max key is less that first key */
if (max_key < min_key)
{
to_keys_offsets.push_back(offset);
columns.out_keys_offsets.push_back(offset);
continue;
}
}
@ -197,16 +319,16 @@ private:
KeyType key;
for (key = min_key;; ++key)
{
to_keys_data.insert(key);
columns.out_keys_data.push_back(key);
auto it = res_map.find(key);
if (it != res_map.end())
{
to_values_data.insert(it->second);
columns.out_vals_data.push_back(it->second);
}
else
{
to_values_data.insertDefault();
columns.out_vals_data.push_back(0);
}
++offset;
@ -214,80 +336,112 @@ private:
break;
}
to_keys_offsets.push_back(offset);
columns.out_keys_offsets.push_back(offset);
}
to_vals_arr.getOffsets().insert(to_keys_offsets.begin(), to_keys_offsets.end());
return res_tuple;
if (columns.out_vals_offsets)
columns.out_vals_offsets->insert(columns.out_keys_offsets.begin(), columns.out_keys_offsets.end());
return res_column;
}
template <typename KeyType>
ColumnPtr execute1(ColumnPtr key_column, ColumnPtr val_column, ColumnPtr max_key_column, const DataTypeTuple & res_type) const
ColumnPtr execute1(ColumnPtr * arg_columns, ColumnPtr max_key_column, const DataTypePtr & res_type, const DataTypePtr & val_type) const
{
const auto & val_type = (assert_cast<const DataTypeArray *>(res_type.getElements()[1].get()))->getNestedType();
switch (val_type->getTypeId())
{
case TypeIndex::Int8:
return execute2<KeyType, Int8>(key_column, val_column, max_key_column, res_type);
return execute2<KeyType, Int8>(arg_columns, max_key_column, res_type);
case TypeIndex::Int16:
return execute2<KeyType, Int16>(key_column, val_column, max_key_column, res_type);
return execute2<KeyType, Int16>(arg_columns, max_key_column, res_type);
case TypeIndex::Int32:
return execute2<KeyType, Int32>(key_column, val_column, max_key_column, res_type);
return execute2<KeyType, Int32>(arg_columns, max_key_column, res_type);
case TypeIndex::Int64:
return execute2<KeyType, Int64>(key_column, val_column, max_key_column, res_type);
return execute2<KeyType, Int64>(arg_columns, max_key_column, res_type);
case TypeIndex::Int128:
return execute2<KeyType, Int128>(arg_columns, max_key_column, res_type);
case TypeIndex::Int256:
return execute2<KeyType, Int256>(arg_columns, max_key_column, res_type);
case TypeIndex::UInt8:
return execute2<KeyType, UInt8>(key_column, val_column, max_key_column, res_type);
return execute2<KeyType, UInt8>(arg_columns, max_key_column, res_type);
case TypeIndex::UInt16:
return execute2<KeyType, UInt16>(key_column, val_column, max_key_column, res_type);
return execute2<KeyType, UInt16>(arg_columns, max_key_column, res_type);
case TypeIndex::UInt32:
return execute2<KeyType, UInt32>(key_column, val_column, max_key_column, res_type);
return execute2<KeyType, UInt32>(arg_columns, max_key_column, res_type);
case TypeIndex::UInt64:
return execute2<KeyType, UInt64>(key_column, val_column, max_key_column, res_type);
return execute2<KeyType, UInt64>(arg_columns, max_key_column, res_type);
case TypeIndex::UInt128:
return execute2<KeyType, UInt128>(arg_columns, max_key_column, res_type);
case TypeIndex::UInt256:
return execute2<KeyType, UInt256>(arg_columns, max_key_column, res_type);
default:
throw Exception{"Illegal columns in arguments of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns in arguments of function " + getName());
}
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
{
auto col1 = arguments[0];
auto col2 = arguments[1];
const auto * k = assert_cast<const DataTypeArray *>(col1.type.get());
const auto * v = assert_cast<const DataTypeArray *>(col2.type.get());
/* determine output type */
const DataTypeTuple & res_type = DataTypeTuple(
DataTypes{std::make_shared<DataTypeArray>(k->getNestedType()), std::make_shared<DataTypeArray>(v->getNestedType())});
DataTypePtr res_type, key_type, val_type;
ColumnPtr max_key_column = nullptr;
ColumnPtr arg_columns[] = {arguments[0].column, nullptr};
if (arguments.size() == 3)
if (arguments[0].type->getTypeId() == TypeIndex::Array)
{
/* max key provided */
max_key_column = arguments[2].column;
key_type = assert_cast<const DataTypeArray *>(arguments[0].type.get())->getNestedType();
val_type = assert_cast<const DataTypeArray *>(arguments[1].type.get())->getNestedType();
res_type = getReturnTypeImpl(DataTypes{arguments[0].type, arguments[1].type});
arg_columns[1] = arguments[1].column;
if (arguments.size() == 3)
{
/* max key provided */
max_key_column = arguments[2].column;
}
}
else
{
assert(arguments[0].type->getTypeId() == TypeIndex::Map);
const auto * map_type = assert_cast<const DataTypeMap *>(arguments[0].type.get());
res_type = getReturnTypeImpl(DataTypes{arguments[0].type});
key_type = map_type->getKeyType();
val_type = map_type->getValueType();
if (arguments.size() == 2)
{
/* max key provided */
max_key_column = arguments[1].column;
}
}
switch (k->getNestedType()->getTypeId())
switch (key_type->getTypeId())
{
case TypeIndex::Int8:
return execute1<Int8>(col1.column, col2.column, max_key_column, res_type);
return execute1<Int8>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::Int16:
return execute1<Int16>(col1.column, col2.column, max_key_column, res_type);
return execute1<Int16>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::Int32:
return execute1<Int32>(col1.column, col2.column, max_key_column, res_type);
return execute1<Int32>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::Int64:
return execute1<Int64>(col1.column, col2.column, max_key_column, res_type);
return execute1<Int64>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::Int128:
return execute1<Int128>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::Int256:
return execute1<Int256>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::UInt8:
return execute1<UInt8>(col1.column, col2.column, max_key_column, res_type);
return execute1<UInt8>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::UInt16:
return execute1<UInt16>(col1.column, col2.column, max_key_column, res_type);
return execute1<UInt16>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::UInt32:
return execute1<UInt32>(col1.column, col2.column, max_key_column, res_type);
return execute1<UInt32>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::UInt64:
return execute1<UInt64>(col1.column, col2.column, max_key_column, res_type);
return execute1<UInt64>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::UInt128:
return execute1<UInt128>(arg_columns, max_key_column, res_type, val_type);
case TypeIndex::UInt256:
return execute1<UInt256>(arg_columns, max_key_column, res_type, val_type);
default:
throw Exception{"Illegal columns in arguments of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns in arguments of function " + getName());
}
}
};
@ -296,5 +450,4 @@ void registerFunctionMapPopulateSeries(FunctionFactory & factory)
{
factory.registerFunction<FunctionMapPopulateSeries>();
}
}

View File

@ -0,0 +1,29 @@
#include <Functions/hyperscanRegexpChecker.h>
#include <Common/Exception.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
void checkRegexp(const std::vector<StringRef> & refs, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length)
{
if (max_hyperscan_regexp_length > 0 || max_hyperscan_regexp_total_length > 0)
{
size_t total_regexp_length = 0;
for (const auto & pattern : refs)
{
if (max_hyperscan_regexp_length > 0 && pattern.size > max_hyperscan_regexp_length)
throw Exception("Regexp length too large", ErrorCodes::BAD_ARGUMENTS);
total_regexp_length += pattern.size;
}
if (max_hyperscan_regexp_total_length > 0 && total_regexp_length > max_hyperscan_regexp_total_length)
throw Exception("Total regexp lengths too large", ErrorCodes::BAD_ARGUMENTS);
}
}
}

View File

@ -0,0 +1,10 @@
#pragma once
#include <common/StringRef.h>
namespace DB
{
void checkRegexp(const std::vector<StringRef> & refs, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length);
}

View File

@ -0,0 +1,97 @@
#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
#endif
#if USE_BZIP2
# include <IO/Bzip2ReadBuffer.h>
# include <bzlib.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BZIP2_STREAM_DECODER_FAILED;
}
class Bzip2ReadBuffer::Bzip2StateWrapper
{
public:
Bzip2StateWrapper()
{
memset(&stream, 0, sizeof(stream));
int ret = BZ2_bzDecompressInit(&stream, 0, 0);
if (ret != BZ_OK)
throw Exception(
ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
"bzip2 stream encoder init failed: error code: {}",
ret);
}
~Bzip2StateWrapper()
{
BZ2_bzDecompressEnd(&stream);
}
bz_stream stream;
};
Bzip2ReadBuffer::Bzip2ReadBuffer(std::unique_ptr<ReadBuffer> in_, size_t buf_size, char *existing_memory, size_t alignment)
: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment)
, in(std::move(in_))
, bz(std::make_unique<Bzip2StateWrapper>())
, eof(false)
{
}
Bzip2ReadBuffer::~Bzip2ReadBuffer() = default;
bool Bzip2ReadBuffer::nextImpl()
{
if (eof)
return false;
if (!bz->stream.avail_in)
{
in->nextIfAtEnd();
bz->stream.avail_in = in->buffer().end() - in->position();
bz->stream.next_in = in->position();
}
bz->stream.avail_out = internal_buffer.size();
bz->stream.next_out = internal_buffer.begin();
int ret = BZ2_bzDecompress(&bz->stream);
in->position() = in->buffer().end() - bz->stream.avail_in;
working_buffer.resize(internal_buffer.size() - bz->stream.avail_out);
if (ret == BZ_STREAM_END)
{
if (in->eof())
{
eof = true;
return !working_buffer.empty();
}
else
{
throw Exception(
ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
"bzip2 decoder finished, but input stream has not exceeded: error code: {}", ret);
}
}
if (ret != BZ_OK)
throw Exception(
ErrorCodes::BZIP2_STREAM_DECODER_FAILED,
"bzip2 stream decoder failed: error code: {}",
ret);
return true;
}
}
#endif

33
src/IO/Bzip2ReadBuffer.h Normal file
View File

@ -0,0 +1,33 @@
#pragma once
#include <IO/ReadBuffer.h>
#include <IO/BufferWithOwnMemory.h>
namespace DB
{
class Bzip2ReadBuffer : public BufferWithOwnMemory<ReadBuffer>
{
public:
Bzip2ReadBuffer(
std::unique_ptr<ReadBuffer> in_,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
char * existing_memory = nullptr,
size_t alignment = 0);
~Bzip2ReadBuffer() override;
private:
bool nextImpl() override;
std::unique_ptr<ReadBuffer> in;
class Bzip2StateWrapper;
std::unique_ptr<Bzip2StateWrapper> bz;
bool eof;
};
}

138
src/IO/Bzip2WriteBuffer.cpp Normal file
View File

@ -0,0 +1,138 @@
#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
#endif
#if USE_BROTLI
# include <IO/Bzip2WriteBuffer.h>
# include <bzlib.h>
#include <Common/MemoryTracker.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BZIP2_STREAM_ENCODER_FAILED;
}
class Bzip2WriteBuffer::Bzip2StateWrapper
{
public:
explicit Bzip2StateWrapper(int compression_level)
{
memset(&stream, 0, sizeof(stream));
int ret = BZ2_bzCompressInit(&stream, compression_level, 0, 0);
if (ret != BZ_OK)
throw Exception(
ErrorCodes::BZIP2_STREAM_ENCODER_FAILED,
"bzip2 stream encoder init failed: error code: {}",
ret);
}
~Bzip2StateWrapper()
{
BZ2_bzCompressEnd(&stream);
}
bz_stream stream;
};
Bzip2WriteBuffer::Bzip2WriteBuffer(std::unique_ptr<WriteBuffer> out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment)
: BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment)
, bz(std::make_unique<Bzip2StateWrapper>(compression_level))
, out(std::move(out_))
{
}
Bzip2WriteBuffer::~Bzip2WriteBuffer()
{
/// FIXME move final flush into the caller
MemoryTracker::LockExceptionInThread lock(VariableContext::Global);
finish();
}
void Bzip2WriteBuffer::nextImpl()
{
if (!offset())
{
return;
}
bz->stream.next_in = working_buffer.begin();
bz->stream.avail_in = offset();
try
{
do
{
out->nextIfAtEnd();
bz->stream.next_out = out->position();
bz->stream.avail_out = out->buffer().end() - out->position();
int ret = BZ2_bzCompress(&bz->stream, BZ_RUN);
out->position() = out->buffer().end() - bz->stream.avail_out;
if (ret != BZ_RUN_OK)
throw Exception(
ErrorCodes::BZIP2_STREAM_ENCODER_FAILED,
"bzip2 stream encoder failed: error code: {}",
ret);
}
while (bz->stream.avail_in > 0);
}
catch (...)
{
/// Do not try to write next time after exception.
out->position() = out->buffer().begin();
throw;
}
}
void Bzip2WriteBuffer::finish()
{
if (finished)
return;
try
{
finishImpl();
out->finalize();
finished = true;
}
catch (...)
{
/// Do not try to flush next time after exception.
out->position() = out->buffer().begin();
finished = true;
throw;
}
}
void Bzip2WriteBuffer::finishImpl()
{
next();
out->nextIfAtEnd();
bz->stream.next_out = out->position();
bz->stream.avail_out = out->buffer().end() - out->position();
int ret = BZ2_bzCompress(&bz->stream, BZ_FINISH);
out->position() = out->buffer().end() - bz->stream.avail_out;
if (ret != BZ_STREAM_END && ret != BZ_FINISH_OK)
throw Exception(
ErrorCodes::BZIP2_STREAM_ENCODER_FAILED,
"bzip2 stream encoder failed: error code: {}",
ret);
}
}
#endif

37
src/IO/Bzip2WriteBuffer.h Normal file
View File

@ -0,0 +1,37 @@
#pragma once
#include <IO/WriteBuffer.h>
#include <IO/BufferWithOwnMemory.h>
namespace DB
{
class Bzip2WriteBuffer : public BufferWithOwnMemory<WriteBuffer>
{
public:
Bzip2WriteBuffer(
std::unique_ptr<WriteBuffer> out_,
int compression_level,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
char * existing_memory = nullptr,
size_t alignment = 0);
~Bzip2WriteBuffer() override;
void finalize() override { finish(); }
private:
void nextImpl() override;
void finish();
void finishImpl();
class Bzip2StateWrapper;
std::unique_ptr<Bzip2StateWrapper> bz;
std::unique_ptr<WriteBuffer> out;
bool finished = false;
};
}

View File

@ -10,6 +10,8 @@
#include <IO/ZlibInflatingReadBuffer.h>
#include <IO/ZstdDeflatingWriteBuffer.h>
#include <IO/ZstdInflatingReadBuffer.h>
#include <IO/Bzip2ReadBuffer.h>
#include <IO/Bzip2WriteBuffer.h>
#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
@ -40,6 +42,8 @@ std::string toContentEncodingName(CompressionMethod method)
return "xz";
case CompressionMethod::Zstd:
return "zstd";
case CompressionMethod::Bzip2:
return "bz2";
case CompressionMethod::None:
return "";
}
@ -69,11 +73,13 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s
return CompressionMethod::Xz;
if (method_str == "zstd" || method_str == "zst")
return CompressionMethod::Zstd;
if (method_str == "bz2")
return CompressionMethod::Bzip2;
if (hint.empty() || hint == "auto" || hint == "none")
return CompressionMethod::None;
throw Exception(
"Unknown compression method " + hint + ". Only 'auto', 'none', 'gzip', 'deflate', 'br', 'xz', 'zstd' are supported as compression methods",
"Unknown compression method " + hint + ". Only 'auto', 'none', 'gzip', 'deflate', 'br', 'xz', 'zstd', 'bz2' are supported as compression methods",
ErrorCodes::NOT_IMPLEMENTED);
}
@ -91,7 +97,10 @@ std::unique_ptr<ReadBuffer> wrapReadBufferWithCompressionMethod(
return std::make_unique<LZMAInflatingReadBuffer>(std::move(nested), buf_size, existing_memory, alignment);
if (method == CompressionMethod::Zstd)
return std::make_unique<ZstdInflatingReadBuffer>(std::move(nested), buf_size, existing_memory, alignment);
#if USE_BZIP2
if (method == CompressionMethod::Bzip2)
return std::make_unique<Bzip2ReadBuffer>(std::move(nested), buf_size, existing_memory, alignment);
#endif
if (method == CompressionMethod::None)
return nested;
@ -114,7 +123,10 @@ std::unique_ptr<WriteBuffer> wrapWriteBufferWithCompressionMethod(
if (method == CompressionMethod::Zstd)
return std::make_unique<ZstdDeflatingWriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment);
#if USE_BZIP2
if (method == CompressionMethod::Bzip2)
return std::make_unique<Bzip2WriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment);
#endif
if (method == CompressionMethod::None)
return nested;

View File

@ -31,7 +31,8 @@ enum class CompressionMethod
/// Zstd compressor
/// This option corresponds to HTTP Content-Encoding: zstd
Zstd,
Brotli
Brotli,
Bzip2
};
/// How the compression method is named in HTTP.

View File

@ -89,6 +89,7 @@ void PocoHTTPClientConfiguration::updateSchemeAndRegion()
PocoHTTPClient::PocoHTTPClient(const PocoHTTPClientConfiguration & clientConfiguration)
: per_request_configuration(clientConfiguration.perRequestConfiguration)
, error_report(clientConfiguration.error_report)
, timeouts(ConnectionTimeouts(
Poco::Timespan(clientConfiguration.connectTimeoutMs * 1000), /// connection timeout.
Poco::Timespan(clientConfiguration.requestTimeoutMs * 1000), /// send timeout.
@ -296,6 +297,8 @@ void PocoHTTPClient::makeRequestInternal(
else if (status_code >= 300)
{
ProfileEvents::increment(select_metric(S3MetricType::Errors));
if (status_code >= 500 && error_report)
error_report(request_configuration);
}
response->SetResponseBody(response_body_stream, session);

View File

@ -37,6 +37,8 @@ struct PocoHTTPClientConfiguration : public Aws::Client::ClientConfiguration
void updateSchemeAndRegion();
std::function<void(const Aws::Client::ClientConfigurationPerRequest &)> error_report;
private:
PocoHTTPClientConfiguration(const String & force_region_, const RemoteHostFilter & remote_host_filter_, unsigned int s3_max_redirects_);
@ -95,6 +97,7 @@ private:
Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const;
std::function<Aws::Client::ClientConfigurationPerRequest(const Aws::Http::HttpRequest &)> per_request_configuration;
std::function<void(const Aws::Client::ClientConfigurationPerRequest &)> error_report;
ConnectionTimeouts timeouts;
const RemoteHostFilter & remote_host_filter;
unsigned int s3_max_redirects;

View File

@ -23,6 +23,8 @@ SRCS(
AIOContextPool.cpp
BrotliReadBuffer.cpp
BrotliWriteBuffer.cpp
Bzip2ReadBuffer.cpp
Bzip2WriteBuffer.cpp
CascadeWriteBuffer.cpp
CompressionMethod.cpp
DoubleConverter.cpp

View File

@ -348,7 +348,7 @@ SetPtr makeExplicitSet(
const ASTPtr & left_arg = args.children.at(0);
const ASTPtr & right_arg = args.children.at(1);
auto column_name = left_arg->getColumnName(context->getSettingsRef());
auto column_name = left_arg->getColumnName();
const auto & dag_node = actions.findInIndex(column_name);
const DataTypePtr & left_arg_type = dag_node.result_type;
@ -641,7 +641,7 @@ std::optional<NameAndTypePair> ActionsMatcher::getNameAndTypeFromAST(const ASTPt
{
// If the argument is a literal, we generated a unique column name for it.
// Use it instead of a generic display name.
auto child_column_name = ast->getColumnName(data.getContext()->getSettingsRef());
auto child_column_name = ast->getColumnName();
const auto * as_literal = ast->as<ASTLiteral>();
if (as_literal)
{
@ -698,7 +698,7 @@ ASTs ActionsMatcher::doUntuple(const ASTFunction * function, ActionsMatcher::Dat
auto func = makeASTFunction("tupleElement", tuple_ast, literal);
auto function_builder = FunctionFactory::instance().get(func->name, data.getContext());
data.addFunction(function_builder, {tuple_name_type->name, literal->getColumnName(data.getContext()->getSettingsRef())}, func->getColumnName(data.getContext()->getSettingsRef()));
data.addFunction(function_builder, {tuple_name_type->name, literal->getColumnName()}, func->getColumnName());
columns.push_back(std::move(func));
}
@ -762,7 +762,7 @@ void ActionsMatcher::visit(const ASTIdentifier & identifier, const ASTPtr &, Dat
void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & data)
{
auto column_name = ast->getColumnName(data.getContext()->getSettingsRef());
auto column_name = ast->getColumnName();
if (data.hasColumn(column_name))
return;
@ -778,7 +778,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
ASTPtr arg = node.arguments->children.at(0);
visit(arg, data);
if (!data.only_consts)
data.addArrayJoin(arg->getColumnName(data.getContext()->getSettingsRef()), column_name);
data.addArrayJoin(arg->getColumnName(), column_name);
return;
}
@ -800,7 +800,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
/// We are in the part of the tree that we are not going to compute. You just need to define types.
/// Do not subquery and create sets. We replace "in*" function to "in*IgnoreSet".
auto argument_name = node.arguments->children.at(0)->getColumnName(data.getContext()->getSettingsRef());
auto argument_name = node.arguments->children.at(0)->getColumnName();
data.addFunction(
FunctionFactory::instance().get(node.name + "IgnoreSet", data.getContext()),
@ -929,7 +929,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
if (!prepared_set->empty())
column.name = data.getUniqueName("__set");
else
column.name = child->getColumnName(data.getContext()->getSettingsRef());
column.name = child->getColumnName();
if (!data.hasColumn(column.name))
{
@ -1008,7 +1008,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
visit(lambda->arguments->children.at(1), data);
auto lambda_dag = data.actions_stack.popLevel();
String result_name = lambda->arguments->children.at(1)->getColumnName(data.getContext()->getSettingsRef());
String result_name = lambda->arguments->children.at(1)->getColumnName();
lambda_dag->removeUnusedActions(Names(1, result_name));
auto lambda_actions = std::make_shared<ExpressionActions>(
@ -1023,7 +1023,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
if (findColumn(required_arg, lambda_arguments) == lambda_arguments.end())
captured.push_back(required_arg);
/// We can not name `getColumnName(data.getContext()->getSettingsRef())`,
/// We can not name `getColumnName()`,
/// because it does not uniquely define the expression (the types of arguments can be different).
String lambda_name = data.getUniqueName("__lambda");
@ -1053,7 +1053,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
if (arguments_present)
{
/// Calculate column name here again, because AST may be changed here (in case of untuple).
data.addFunction(function_builder, argument_names, ast->getColumnName(data.getContext()->getSettingsRef()));
data.addFunction(function_builder, argument_names, ast->getColumnName());
}
}
@ -1067,7 +1067,7 @@ void ActionsMatcher::visit(const ASTLiteral & literal, const ASTPtr & /* ast */,
// AST here? Anyway, do not modify the column name if it is set already.
if (literal.unique_column_name.empty())
{
const auto default_name = literal.getColumnName(data.getContext()->getSettingsRef());
const auto default_name = literal.getColumnName();
const auto & index = data.actions_stack.getLastActionsIndex();
const auto * existing_column = index.tryGetNode(default_name);
@ -1147,7 +1147,7 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su
}
/// We get the stream of blocks for the subquery. Create Set and put it in place of the subquery.
String set_id = right_in_operand->getColumnName(data.getContext()->getSettingsRef());
String set_id = right_in_operand->getColumnName();
SubqueryForSet & subquery_for_set = data.subqueries_for_sets[set_id];
@ -1183,7 +1183,7 @@ SetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool no_su
{
const auto & last_actions = data.actions_stack.getLastActions();
const auto & index = data.actions_stack.getLastActionsIndex();
if (index.contains(left_in_operand->getColumnName(data.getContext()->getSettingsRef())))
if (index.contains(left_in_operand->getColumnName()))
/// An explicit enumeration of values in parentheses.
return makeExplicitSet(&node, last_actions, false, data.getContext(), data.set_size_limit, data.prepared_sets);
else

View File

@ -781,15 +781,62 @@ void NO_INLINE Aggregator::executeImplBatch(
}
template <bool use_compiled_functions>
void NO_INLINE Aggregator::executeWithoutKeyImpl(
AggregatedDataWithoutKey & res,
size_t rows,
AggregateFunctionInstruction * aggregate_instructions,
Arena * arena)
{
/// Adding values
for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst)
#if USE_EMBEDDED_COMPILER
if constexpr (use_compiled_functions)
{
std::vector<ColumnData> columns_data;
for (size_t i = 0; i < aggregate_functions.size(); ++i)
{
if (!is_aggregate_function_compiled[i])
continue;
AggregateFunctionInstruction * inst = aggregate_instructions + i;
size_t arguments_size = inst->that->getArgumentTypes().size();
for (size_t argument_index = 0; argument_index < arguments_size; ++argument_index)
{
columns_data.emplace_back(getColumnData(inst->batch_arguments[argument_index]));
}
}
auto add_into_aggregate_states_function_single_place = compiled_aggregate_functions_holder->compiled_aggregate_functions.add_into_aggregate_states_function_single_place;
add_into_aggregate_states_function_single_place(rows, columns_data.data(), res);
#if defined(MEMORY_SANITIZER)
/// We compile only functions that do not allocate some data in Arena. Only store necessary state in AggregateData place.
for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size(); ++aggregate_function_index)
{
if (!is_aggregate_function_compiled[aggregate_function_index])
continue;
auto aggregate_data_with_offset = res + offsets_of_aggregate_states[aggregate_function_index];
auto data_size = params.aggregates[aggregate_function_index].function->sizeOfData();
__msan_unpoison(aggregate_data_with_offset, data_size);
}
#endif
}
#endif
/// Adding values
for (size_t i = 0; i < aggregate_functions.size(); ++i)
{
AggregateFunctionInstruction * inst = aggregate_instructions + i;
#if USE_EMBEDDED_COMPILER
if constexpr (use_compiled_functions)
if (is_aggregate_function_compiled[i])
continue;
#endif
if (inst->offsets)
inst->batch_that->addBatchSinglePlace(
inst->offsets[static_cast<ssize_t>(rows - 1)], res + inst->state_offset, inst->batch_arguments, arena);
@ -930,7 +977,16 @@ bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedData
/// For the case when there are no keys (all aggregate into one row).
if (result.type == AggregatedDataVariants::Type::without_key)
{
executeWithoutKeyImpl(result.without_key, num_rows, aggregate_functions_instructions.data(), result.aggregates_pool);
#if USE_EMBEDDED_COMPILER
if (compiled_aggregate_functions_holder)
{
executeWithoutKeyImpl<true>(result.without_key, num_rows, aggregate_functions_instructions.data(), result.aggregates_pool);
}
else
#endif
{
executeWithoutKeyImpl<false>(result.without_key, num_rows, aggregate_functions_instructions.data(), result.aggregates_pool);
}
}
else
{
@ -1193,6 +1249,9 @@ bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const
}
}
/// Some aggregate functions cannot throw exceptions on allocations (e.g. from C malloc)
/// but still tracks memory. Check it here.
CurrentMemoryTracker::check();
return true;
}

View File

@ -1121,7 +1121,7 @@ private:
AggregateDataPtr overflow_row) const;
/// Specialization for a particular value no_more_keys.
template <bool no_more_keys, bool use_compiled_expressions, typename Method>
template <bool no_more_keys, bool use_compiled_functions, typename Method>
void executeImplBatch(
Method & method,
typename Method::State & state,
@ -1131,7 +1131,8 @@ private:
AggregateDataPtr overflow_row) const;
/// For case when there are no keys (all aggregate into one row).
static void executeWithoutKeyImpl(
template <bool use_compiled_functions>
void executeWithoutKeyImpl(
AggregatedDataWithoutKey & res,
size_t rows,
AggregateFunctionInstruction * aggregate_instructions,

View File

@ -779,43 +779,60 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
uint64_t kb = 0;
readText(kb, *meminfo);
if (kb)
if (!kb)
{
skipWhitespaceIfAny(*meminfo, true);
assertString("kB", *meminfo);
skipToNextLineOrEOF(*meminfo);
continue;
}
uint64_t bytes = kb * 1024;
skipWhitespaceIfAny(*meminfo, true);
if (name == "MemTotal:")
{
new_values["OSMemoryTotal"] = bytes;
}
else if (name == "MemFree:")
{
/// We cannot simply name this metric "Free", because it confuses users.
/// See https://www.linuxatemyram.com/
/// For convenience we also provide OSMemoryFreePlusCached, that should be somewhat similar to OSMemoryAvailable.
/**
* Not all entries in /proc/meminfo contain the kB suffix, e.g.
* HugePages_Total: 0
* HugePages_Free: 0
* We simply skip such entries as they're not needed
*/
if (*meminfo->position() == '\n')
{
skipToNextLineOrEOF(*meminfo);
continue;
}
free_plus_cached_bytes += bytes;
new_values["OSMemoryFreeWithoutCached"] = bytes;
}
else if (name == "MemAvailable:")
{
new_values["OSMemoryAvailable"] = bytes;
}
else if (name == "Buffers:")
{
new_values["OSMemoryBuffers"] = bytes;
}
else if (name == "Cached:")
{
free_plus_cached_bytes += bytes;
new_values["OSMemoryCached"] = bytes;
}
else if (name == "SwapCached:")
{
new_values["OSMemorySwapCached"] = bytes;
}
assertString("kB", *meminfo);
uint64_t bytes = kb * 1024;
if (name == "MemTotal:")
{
new_values["OSMemoryTotal"] = bytes;
}
else if (name == "MemFree:")
{
/// We cannot simply name this metric "Free", because it confuses users.
/// See https://www.linuxatemyram.com/
/// For convenience we also provide OSMemoryFreePlusCached, that should be somewhat similar to OSMemoryAvailable.
free_plus_cached_bytes += bytes;
new_values["OSMemoryFreeWithoutCached"] = bytes;
}
else if (name == "MemAvailable:")
{
new_values["OSMemoryAvailable"] = bytes;
}
else if (name == "Buffers:")
{
new_values["OSMemoryBuffers"] = bytes;
}
else if (name == "Cached:")
{
free_plus_cached_bytes += bytes;
new_values["OSMemoryCached"] = bytes;
}
else if (name == "SwapCached:")
{
new_values["OSMemorySwapCached"] = bytes;
}
skipToNextLineOrEOF(*meminfo);
@ -896,9 +913,9 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
if (block_devices_rescan_delay.elapsedSeconds() >= 300)
openBlockDevices();
for (auto & [name, device] : block_devs)
try
{
try
for (auto & [name, device] : block_devs)
{
device->rewind();
@ -947,20 +964,20 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti
new_values["BlockQueueTimePerOp_" + name] = delta_values.time_in_queue * time_multiplier / delta_values.in_flight_ios;
}
}
}
catch (...)
{
/// Try to reopen block devices in case of error
/// (i.e. ENOENT means that some disk had been replaced, and it may apperas with a new name)
try
{
openBlockDevices();
}
catch (...)
{
/// Try to reopen block devices in case of error
/// (i.e. ENOENT means that some disk had been replaced, and it may apperas with a new name)
try
{
openBlockDevices();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
tryLogCurrentException(__PRETTY_FUNCTION__);
}
tryLogCurrentException(__PRETTY_FUNCTION__);
}
if (net_dev)

View File

@ -1731,7 +1731,7 @@ BackgroundSchedulePool & Context::getMessageBrokerSchedulePool() const
if (!shared->message_broker_schedule_pool)
shared->message_broker_schedule_pool.emplace(
settings.background_message_broker_schedule_pool_size,
CurrentMetrics::BackgroundDistributedSchedulePoolTask,
CurrentMetrics::BackgroundMessageBrokerSchedulePoolTask,
"BgMBSchPool");
return *shared->message_broker_schedule_pool;
}

View File

@ -243,7 +243,7 @@ void ExpressionAnalyzer::analyzeAggregation()
ssize_t size = group_asts.size();
getRootActionsNoMakeSet(group_asts[i], true, temp_actions, false);
const auto & column_name = group_asts[i]->getColumnName(getContext()->getSettingsRef());
const auto & column_name = group_asts[i]->getColumnName();
const auto * node = temp_actions->tryFindInIndex(column_name);
if (!node)
throw Exception("Unknown identifier (in GROUP BY): " + column_name, ErrorCodes::UNKNOWN_IDENTIFIER);
@ -408,7 +408,7 @@ void SelectQueryExpressionAnalyzer::makeSetsForIndex(const ASTPtr & node)
auto temp_actions = std::make_shared<ActionsDAG>(columns_after_join);
getRootActions(left_in_operand, true, temp_actions);
if (temp_actions->tryFindInIndex(left_in_operand->getColumnName(getContext()->getSettingsRef())))
if (temp_actions->tryFindInIndex(left_in_operand->getColumnName()))
makeExplicitSet(func, *temp_actions, true, getContext(), settings.size_limits_for_set, prepared_sets);
}
}
@ -456,7 +456,7 @@ bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions)
if (node->arguments)
getRootActionsNoMakeSet(node->arguments, true, actions);
aggregate.column_name = node->getColumnName(getContext()->getSettingsRef());
aggregate.column_name = node->getColumnName();
const ASTs & arguments = node->arguments ? node->arguments->children : ASTs();
aggregate.argument_names.resize(arguments.size());
@ -464,7 +464,7 @@ bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions)
for (size_t i = 0; i < arguments.size(); ++i)
{
const std::string & name = arguments[i]->getColumnName(getContext()->getSettingsRef());
const std::string & name = arguments[i]->getColumnName();
const auto * dag_node = actions->tryFindInIndex(name);
if (!dag_node)
{
@ -645,7 +645,7 @@ void ExpressionAnalyzer::makeWindowDescriptions(ActionsDAGPtr actions)
WindowFunctionDescription window_function;
window_function.function_node = function_node;
window_function.column_name
= window_function.function_node->getColumnName(getContext()->getSettingsRef());
= window_function.function_node->getColumnName();
window_function.function_parameters
= window_function.function_node->parameters
? getAggregateFunctionParametersArray(
@ -664,7 +664,7 @@ void ExpressionAnalyzer::makeWindowDescriptions(ActionsDAGPtr actions)
window_function.argument_names.resize(arguments.size());
for (size_t i = 0; i < arguments.size(); ++i)
{
const std::string & name = arguments[i]->getColumnName(getContext()->getSettingsRef());
const std::string & name = arguments[i]->getColumnName();
const auto * node = actions->tryFindInIndex(name);
if (!node)
@ -961,7 +961,7 @@ ActionsDAGPtr SelectQueryExpressionAnalyzer::appendPrewhere(
auto & step = chain.lastStep(sourceColumns());
getRootActions(select_query->prewhere(), only_types, step.actions());
String prewhere_column_name = select_query->prewhere()->getColumnName(getContext()->getSettingsRef());
String prewhere_column_name = select_query->prewhere()->getColumnName();
step.addRequiredOutput(prewhere_column_name);
const auto & node = step.actions()->findInIndex(prewhere_column_name);
@ -1047,7 +1047,7 @@ bool SelectQueryExpressionAnalyzer::appendWhere(ExpressionActionsChain & chain,
getRootActions(select_query->where(), only_types, step.actions());
auto where_column_name = select_query->where()->getColumnName(getContext()->getSettingsRef());
auto where_column_name = select_query->where()->getColumnName();
step.addRequiredOutput(where_column_name);
const auto & node = step.actions()->findInIndex(where_column_name);
@ -1072,7 +1072,7 @@ bool SelectQueryExpressionAnalyzer::appendGroupBy(ExpressionActionsChain & chain
ASTs asts = select_query->groupBy()->children;
for (const auto & ast : asts)
{
step.addRequiredOutput(ast->getColumnName(getContext()->getSettingsRef()));
step.addRequiredOutput(ast->getColumnName());
getRootActions(ast, only_types, step.actions());
}
@ -1100,7 +1100,7 @@ void SelectQueryExpressionAnalyzer::appendAggregateFunctionsArguments(Expression
for (const auto & name : desc.argument_names)
step.addRequiredOutput(name);
/// Collect aggregates removing duplicates by node.getColumnName(getContext()->getSettingsRef())
/// Collect aggregates removing duplicates by node.getColumnName()
/// It's not clear why we recollect aggregates (for query parts) while we're able to use previously collected ones (for entire query)
/// @note The original recollection logic didn't remove duplicates.
GetAggregatesVisitor::Data data;
@ -1155,7 +1155,7 @@ void SelectQueryExpressionAnalyzer::appendWindowFunctionsArguments(
// (2b) Required function argument columns.
for (const auto & a : f.function_node->arguments->children)
{
step.addRequiredOutput(a->getColumnName(getContext()->getSettingsRef()));
step.addRequiredOutput(a->getColumnName());
}
}
@ -1177,7 +1177,7 @@ bool SelectQueryExpressionAnalyzer::appendHaving(ExpressionActionsChain & chain,
ExpressionActionsChain::Step & step = chain.lastStep(aggregated_columns);
getRootActionsForHaving(select_query->having(), only_types, step.actions());
step.addRequiredOutput(select_query->having()->getColumnName(getContext()->getSettingsRef()));
step.addRequiredOutput(select_query->having()->getColumnName());
return true;
}
@ -1201,7 +1201,7 @@ void SelectQueryExpressionAnalyzer::appendSelect(ExpressionActionsChain & chain,
continue;
}
step.addRequiredOutput(child->getColumnName(getContext()->getSettingsRef()));
step.addRequiredOutput(child->getColumnName());
}
}
@ -1229,7 +1229,7 @@ ActionsDAGPtr SelectQueryExpressionAnalyzer::appendOrderBy(ExpressionActionsChai
if (!ast || ast->children.empty())
throw Exception("Bad order expression AST", ErrorCodes::UNKNOWN_TYPE_OF_AST_NODE);
ASTPtr order_expression = ast->children.at(0);
step.addRequiredOutput(order_expression->getColumnName(getContext()->getSettingsRef()));
step.addRequiredOutput(order_expression->getColumnName());
if (ast->with_fill)
with_fill = true;
@ -1279,7 +1279,7 @@ bool SelectQueryExpressionAnalyzer::appendLimitBy(ExpressionActionsChain & chain
for (const auto & child : select_query->limitBy()->children)
{
auto child_name = child->getColumnName(getContext()->getSettingsRef());
auto child_name = child->getColumnName();
if (!aggregated_names.count(child_name))
step.addRequiredOutput(std::move(child_name));
}
@ -1295,15 +1295,13 @@ ActionsDAGPtr SelectQueryExpressionAnalyzer::appendProjectResult(ExpressionActio
NamesWithAliases result_columns;
const auto & settings = getContext()->getSettingsRef();
ASTs asts = select_query->select()->children;
for (const auto & ast : asts)
{
String result_name = ast->getAliasOrColumnName(settings);
String result_name = ast->getAliasOrColumnName();
if (required_result_columns.empty() || required_result_columns.count(result_name))
{
std::string source_name = ast->getColumnName(settings);
std::string source_name = ast->getColumnName();
/*
* For temporary columns created by ExpressionAnalyzer for literals,
@ -1345,7 +1343,7 @@ void ExpressionAnalyzer::appendExpression(ExpressionActionsChain & chain, const
{
ExpressionActionsChain::Step & step = chain.lastStep(sourceColumns());
getRootActions(expr, only_types, step.actions());
step.addRequiredOutput(expr->getColumnName(getContext()->getSettingsRef()));
step.addRequiredOutput(expr->getColumnName());
}
@ -1362,13 +1360,12 @@ ActionsDAGPtr ExpressionAnalyzer::getActionsDAG(bool add_aliases, bool project_r
else
asts = ASTs(1, query);
const auto & settings = getContext()->getSettingsRef();
for (const auto & ast : asts)
{
std::string name = ast->getColumnName(settings);
std::string name = ast->getColumnName();
std::string alias;
if (add_aliases)
alias = ast->getAliasOrColumnName(settings);
alias = ast->getAliasOrColumnName();
else
alias = name;
result_columns.emplace_back(name, alias);
@ -1497,7 +1494,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
if (auto actions = query_analyzer.appendPrewhere(chain, !first_stage, additional_required_columns_after_prewhere))
{
prewhere_info = std::make_shared<PrewhereInfo>(actions, query.prewhere()->getColumnName(settings));
prewhere_info = std::make_shared<PrewhereInfo>(actions, query.prewhere()->getColumnName());
if (allowEarlyConstantFolding(*prewhere_info->prewhere_actions, settings))
{
@ -1507,7 +1504,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
ExpressionActions(
prewhere_info->prewhere_actions,
ExpressionActionsSettings::fromSettings(context->getSettingsRef())).execute(before_prewhere_sample);
auto & column_elem = before_prewhere_sample.getByName(query.prewhere()->getColumnName(settings));
auto & column_elem = before_prewhere_sample.getByName(query.prewhere()->getColumnName());
/// If the filter column is a constant, record it.
if (column_elem.column)
prewhere_constant_filter_description = ConstantFilterDescription(*column_elem.column);
@ -1542,7 +1539,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
ExpressionActions(
before_where,
ExpressionActionsSettings::fromSettings(context->getSettingsRef())).execute(before_where_sample);
auto & column_elem = before_where_sample.getByName(query.where()->getColumnName(settings));
auto & column_elem = before_where_sample.getByName(query.where()->getColumnName());
/// If the filter column is a constant, record it.
if (column_elem.column)
where_constant_filter_description = ConstantFilterDescription(*column_elem.column);
@ -1633,7 +1630,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
const auto * select_query = query_analyzer.getSelectQuery();
for (const auto & child : select_query->select()->children)
{
step.addRequiredOutput(child->getColumnName(settings));
step.addRequiredOutput(child->getColumnName());
}
}
@ -1689,8 +1686,7 @@ void ExpressionAnalysisResult::finalize(const ExpressionActionsChain & chain, si
if (hasWhere())
{
const auto & settings = chain.getContext()->getSettingsRef();
where_column_name = query.where()->getColumnName(settings);
where_column_name = query.where()->getColumnName();
remove_where_filter = chain.steps.at(where_step_num)->required_output.find(where_column_name)->second;
}
}

View File

@ -1516,12 +1516,16 @@ private:
if (!rows_added)
return {};
correctLowcardAndNullability(columns_right);
Block res = result_sample_block.cloneEmpty();
addLeftColumns(res, rows_added);
addRightColumns(res, columns_right);
copySameKeys(res);
correctLowcardAndNullability(res);
#ifndef NDEBUG
assertBlocksHaveEqualStructure(res, result_sample_block, getName());
#endif
return res;
}

View File

@ -141,7 +141,7 @@ String InterpreterSelectQuery::generateFilterActions(ActionsDAGPtr & actions, co
SelectQueryExpressionAnalyzer analyzer(query_ast, syntax_result, context, metadata_snapshot);
actions = analyzer.simpleSelectActions();
auto column_name = expr_list->children.at(0)->getColumnName(context->getSettingsRef());
auto column_name = expr_list->children.at(0)->getColumnName();
actions->removeUnusedActions(NameSet{column_name});
actions->projectInput(false);
@ -782,7 +782,7 @@ static SortDescription getSortDescription(const ASTSelectQuery & query, ContextP
order_descr.reserve(query.orderBy()->children.size());
for (const auto & elem : query.orderBy()->children)
{
String name = elem->children.front()->getColumnName(context->getSettingsRef());
String name = elem->children.front()->getColumnName();
const auto & order_by_elem = elem->as<ASTOrderByElement &>();
std::shared_ptr<Collator> collator;
@ -801,14 +801,14 @@ static SortDescription getSortDescription(const ASTSelectQuery & query, ContextP
return order_descr;
}
static SortDescription getSortDescriptionFromGroupBy(const ASTSelectQuery & query, ContextPtr context)
static SortDescription getSortDescriptionFromGroupBy(const ASTSelectQuery & query)
{
SortDescription order_descr;
order_descr.reserve(query.groupBy()->children.size());
for (const auto & elem : query.groupBy()->children)
{
String name = elem->getColumnName(context->getSettingsRef());
String name = elem->getColumnName();
order_descr.emplace_back(name, 1, 1);
}
@ -1327,24 +1327,29 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
}
bool apply_limit = options.to_stage != QueryProcessingStage::WithMergeableStateAfterAggregation;
bool apply_prelimit = apply_limit &&
query.limitLength() && !query.limit_with_ties &&
!hasWithTotalsInAnySubqueryInFromClause(query) &&
!query.arrayJoinExpressionList() &&
!query.distinct &&
!expressions.hasLimitBy() &&
!settings.extremes &&
!has_withfill;
bool apply_offset = options.to_stage != QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit;
bool has_prelimit = false;
if (apply_limit &&
query.limitLength() && !query.limit_with_ties && !hasWithTotalsInAnySubqueryInFromClause(query) &&
!query.arrayJoinExpressionList() && !query.distinct && !expressions.hasLimitBy() && !settings.extremes &&
!has_withfill)
bool limit_applied = false;
if (apply_prelimit)
{
executePreLimit(query_plan, /* do_not_skip_offset= */!apply_offset);
has_prelimit = true;
limit_applied = true;
}
/** If there was more than one stream,
* then DISTINCT needs to be performed once again after merging all streams.
*/
if (query.distinct)
if (!from_aggregation_stage && query.distinct)
executeDistinct(query_plan, false, expressions.selected_columns, false);
if (expressions.hasLimitBy())
if (!from_aggregation_stage && expressions.hasLimitBy())
{
executeExpression(query_plan, expressions.before_limit_by, "Before LIMIT BY");
executeLimitBy(query_plan);
@ -1354,10 +1359,10 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
/// If we have 'WITH TIES', we need execute limit before projection,
/// because in that case columns from 'ORDER BY' are used.
if (query.limit_with_ties)
if (query.limit_with_ties && apply_offset)
{
executeLimit(query_plan);
has_prelimit = true;
limit_applied = true;
}
/// Projection not be done on the shards, since then initiator will not find column in blocks.
@ -1372,7 +1377,12 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
executeExtremes(query_plan);
/// Limit is no longer needed if there is prelimit.
if (apply_limit && !has_prelimit)
///
/// NOTE: that LIMIT cannot be applied if OFFSET should not be applied,
/// since LIMIT will apply OFFSET too.
/// This is the case for various optimizations for distributed queries,
/// and when LIMIT cannot be applied it will be applied on the initiator anyway.
if (apply_limit && !limit_applied && apply_offset)
executeLimit(query_plan);
if (apply_offset)
@ -1918,13 +1928,13 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc
{
query_info.projection->order_optimizer = std::make_shared<ReadInOrderOptimizer>(
query_info.projection->group_by_elements_actions,
getSortDescriptionFromGroupBy(query, context),
getSortDescriptionFromGroupBy(query),
query_info.syntax_analyzer_result);
}
else
{
query_info.order_optimizer = std::make_shared<ReadInOrderOptimizer>(
analysis_result.group_by_elements_actions, getSortDescriptionFromGroupBy(query, context), query_info.syntax_analyzer_result);
analysis_result.group_by_elements_actions, getSortDescriptionFromGroupBy(query), query_info.syntax_analyzer_result);
}
}
@ -2005,7 +2015,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc
void InterpreterSelectQuery::executeWhere(QueryPlan & query_plan, const ActionsDAGPtr & expression, bool remove_filter)
{
auto where_step = std::make_unique<FilterStep>(
query_plan.getCurrentDataStream(), expression, getSelectQuery().where()->getColumnName(context->getSettingsRef()), remove_filter);
query_plan.getCurrentDataStream(), expression, getSelectQuery().where()->getColumnName(), remove_filter);
where_step->setStepDescription("WHERE");
query_plan.addStep(std::move(where_step));
@ -2054,7 +2064,7 @@ void InterpreterSelectQuery::executeAggregation(QueryPlan & query_plan, const Ac
SortDescription group_by_sort_description;
if (group_by_info && settings.optimize_aggregation_in_order)
group_by_sort_description = getSortDescriptionFromGroupBy(getSelectQuery(), context);
group_by_sort_description = getSortDescriptionFromGroupBy(getSelectQuery());
else
group_by_info = nullptr;
@ -2102,7 +2112,7 @@ void InterpreterSelectQuery::executeMergeAggregated(QueryPlan & query_plan, bool
void InterpreterSelectQuery::executeHaving(QueryPlan & query_plan, const ActionsDAGPtr & expression)
{
auto having_step
= std::make_unique<FilterStep>(query_plan.getCurrentDataStream(), expression, getSelectQuery().having()->getColumnName(context->getSettingsRef()), false);
= std::make_unique<FilterStep>(query_plan.getCurrentDataStream(), expression, getSelectQuery().having()->getColumnName(), false);
having_step->setStepDescription("HAVING");
query_plan.addStep(std::move(having_step));
@ -2118,7 +2128,7 @@ void InterpreterSelectQuery::executeTotalsAndHaving(
query_plan.getCurrentDataStream(),
overflow_row,
expression,
has_having ? getSelectQuery().having()->getColumnName(context->getSettingsRef()) : "",
has_having ? getSelectQuery().having()->getColumnName() : "",
settings.totals_mode,
settings.totals_auto_threshold,
final);
@ -2429,7 +2439,10 @@ void InterpreterSelectQuery::executePreLimit(QueryPlan & query_plan, bool do_not
}
auto limit = std::make_unique<LimitStep>(query_plan.getCurrentDataStream(), limit_length, limit_offset);
limit->setStepDescription("preliminary LIMIT");
if (do_not_skip_offset)
limit->setStepDescription("preliminary LIMIT (with OFFSET)");
else
limit->setStepDescription("preliminary LIMIT (without OFFSET)");
query_plan.addStep(std::move(limit));
}
}
@ -2443,7 +2456,7 @@ void InterpreterSelectQuery::executeLimitBy(QueryPlan & query_plan)
Names columns;
for (const auto & elem : query.limitBy()->children)
columns.emplace_back(elem->getColumnName(context->getSettingsRef()));
columns.emplace_back(elem->getColumnName());
UInt64 length = getLimitUIntValue(query.limitByLength(), context, "LIMIT");
UInt64 offset = (query.limitByOffset() ? getLimitUIntValue(query.limitByOffset(), context, "OFFSET") : 0);

View File

@ -436,6 +436,133 @@ static void compileAddIntoAggregateStatesFunctions(llvm::Module & module, const
b.CreateRetVoid();
}
static void compileAddIntoAggregateStatesFunctionsSinglePlace(llvm::Module & module, const std::vector<AggregateFunctionWithOffset> & functions, const std::string & name)
{
auto & context = module.getContext();
llvm::IRBuilder<> b(context);
auto * size_type = b.getIntNTy(sizeof(size_t) * 8);
auto * places_type = b.getInt8Ty()->getPointerTo();
auto * column_data_type = llvm::StructType::get(b.getInt8PtrTy(), b.getInt8PtrTy());
auto * aggregate_loop_func_declaration = llvm::FunctionType::get(b.getVoidTy(), { size_type, column_data_type->getPointerTo(), places_type }, false);
auto * aggregate_loop_func_definition = llvm::Function::Create(aggregate_loop_func_declaration, llvm::Function::ExternalLinkage, name, module);
auto * arguments = aggregate_loop_func_definition->args().begin();
llvm::Value * rows_count_arg = arguments++;
llvm::Value * columns_arg = arguments++;
llvm::Value * place_arg = arguments++;
/// Initialize ColumnDataPlaceholder llvm representation of ColumnData
auto * entry = llvm::BasicBlock::Create(b.getContext(), "entry", aggregate_loop_func_definition);
b.SetInsertPoint(entry);
std::vector<ColumnDataPlaceholder> columns;
size_t previous_columns_size = 0;
for (const auto & function : functions)
{
auto argument_types = function.function->getArgumentTypes();
ColumnDataPlaceholder data_placeholder;
size_t function_arguments_size = argument_types.size();
for (size_t column_argument_index = 0; column_argument_index < function_arguments_size; ++column_argument_index)
{
const auto & argument_type = argument_types[column_argument_index];
auto * data = b.CreateLoad(column_data_type, b.CreateConstInBoundsGEP1_64(column_data_type, columns_arg, previous_columns_size + column_argument_index));
data_placeholder.data_init = b.CreatePointerCast(b.CreateExtractValue(data, {0}), toNativeType(b, removeNullable(argument_type))->getPointerTo());
data_placeholder.null_init = argument_type->isNullable() ? b.CreateExtractValue(data, {1}) : nullptr;
columns.emplace_back(data_placeholder);
}
previous_columns_size += function_arguments_size;
}
/// Initialize loop
auto * end = llvm::BasicBlock::Create(b.getContext(), "end", aggregate_loop_func_definition);
auto * loop = llvm::BasicBlock::Create(b.getContext(), "loop", aggregate_loop_func_definition);
b.CreateCondBr(b.CreateICmpEQ(rows_count_arg, llvm::ConstantInt::get(size_type, 0)), end, loop);
b.SetInsertPoint(loop);
auto * counter_phi = b.CreatePHI(rows_count_arg->getType(), 2);
counter_phi->addIncoming(llvm::ConstantInt::get(size_type, 0), entry);
for (auto & col : columns)
{
col.data = b.CreatePHI(col.data_init->getType(), 2);
col.data->addIncoming(col.data_init, entry);
if (col.null_init)
{
col.null = b.CreatePHI(col.null_init->getType(), 2);
col.null->addIncoming(col.null_init, entry);
}
}
previous_columns_size = 0;
for (const auto & function : functions)
{
size_t aggregate_function_offset = function.aggregate_data_offset;
const auto * aggregate_function_ptr = function.function;
auto arguments_types = function.function->getArgumentTypes();
std::vector<llvm::Value *> arguments_values;
size_t function_arguments_size = arguments_types.size();
arguments_values.resize(function_arguments_size);
for (size_t column_argument_index = 0; column_argument_index < function_arguments_size; ++column_argument_index)
{
auto * column_argument_data = columns[previous_columns_size + column_argument_index].data;
auto * column_argument_null_data = columns[previous_columns_size + column_argument_index].null;
auto & argument_type = arguments_types[column_argument_index];
auto * value = b.CreateLoad(toNativeType(b, removeNullable(argument_type)), column_argument_data);
if (!argument_type->isNullable())
{
arguments_values[column_argument_index] = value;
continue;
}
auto * is_null = b.CreateICmpNE(b.CreateLoad(b.getInt8Ty(), column_argument_null_data), b.getInt8(0));
auto * nullable_unitilized = llvm::Constant::getNullValue(toNativeType(b, argument_type));
auto * nullable_value = b.CreateInsertValue(b.CreateInsertValue(nullable_unitilized, value, {0}), is_null, {1});
arguments_values[column_argument_index] = nullable_value;
}
auto * aggregation_place_with_offset = b.CreateConstInBoundsGEP1_64(nullptr, place_arg, aggregate_function_offset);
aggregate_function_ptr->compileAdd(b, aggregation_place_with_offset, arguments_types, arguments_values);
previous_columns_size += function_arguments_size;
}
/// End of loop
auto * cur_block = b.GetInsertBlock();
for (auto & col : columns)
{
col.data->addIncoming(b.CreateConstInBoundsGEP1_64(nullptr, col.data, 1), cur_block);
if (col.null)
col.null->addIncoming(b.CreateConstInBoundsGEP1_64(nullptr, col.null, 1), cur_block);
}
auto * value = b.CreateAdd(counter_phi, llvm::ConstantInt::get(size_type, 1));
counter_phi->addIncoming(value, cur_block);
b.CreateCondBr(b.CreateICmpEQ(value, rows_count_arg), end, loop);
b.SetInsertPoint(end);
b.CreateRetVoid();
}
static void compileMergeAggregatesStates(llvm::Module & module, const std::vector<AggregateFunctionWithOffset> & functions, const std::string & name)
{
auto & context = module.getContext();
@ -569,6 +696,7 @@ CompiledAggregateFunctions compileAggregateFunctions(CHJIT & jit, const std::vec
std::string create_aggregate_states_functions_name = functions_dump_name + "_create";
std::string add_aggregate_states_functions_name = functions_dump_name + "_add";
std::string add_aggregate_states_functions_name_single_place = functions_dump_name + "_add_single_place";
std::string merge_aggregate_states_functions_name = functions_dump_name + "_merge";
std::string insert_aggregate_states_functions_name = functions_dump_name + "_insert";
@ -576,17 +704,20 @@ CompiledAggregateFunctions compileAggregateFunctions(CHJIT & jit, const std::vec
{
compileCreateAggregateStatesFunctions(module, functions, create_aggregate_states_functions_name);
compileAddIntoAggregateStatesFunctions(module, functions, add_aggregate_states_functions_name);
compileAddIntoAggregateStatesFunctionsSinglePlace(module, functions, add_aggregate_states_functions_name_single_place);
compileMergeAggregatesStates(module, functions, merge_aggregate_states_functions_name);
compileInsertAggregatesIntoResultColumns(module, functions, insert_aggregate_states_functions_name);
});
auto create_aggregate_states_function = reinterpret_cast<JITCreateAggregateStatesFunction>(compiled_module.function_name_to_symbol[create_aggregate_states_functions_name]);
auto add_into_aggregate_states_function = reinterpret_cast<JITAddIntoAggregateStatesFunction>(compiled_module.function_name_to_symbol[add_aggregate_states_functions_name]);
auto add_into_aggregate_states_function_single_place = reinterpret_cast<JITAddIntoAggregateStatesFunctionSinglePlace>(compiled_module.function_name_to_symbol[add_aggregate_states_functions_name_single_place]);
auto merge_aggregate_states_function = reinterpret_cast<JITMergeAggregateStatesFunction>(compiled_module.function_name_to_symbol[merge_aggregate_states_functions_name]);
auto insert_aggregate_states_function = reinterpret_cast<JITInsertAggregateStatesIntoColumnsFunction>(compiled_module.function_name_to_symbol[insert_aggregate_states_functions_name]);
assert(create_aggregate_states_function);
assert(add_into_aggregate_states_function);
assert(add_into_aggregate_states_function_single_place);
assert(merge_aggregate_states_function);
assert(insert_aggregate_states_function);
@ -598,6 +729,7 @@ CompiledAggregateFunctions compileAggregateFunctions(CHJIT & jit, const std::vec
{
.create_aggregate_states_function = create_aggregate_states_function,
.add_into_aggregate_states_function = add_into_aggregate_states_function,
.add_into_aggregate_states_function_single_place = add_into_aggregate_states_function_single_place,
.merge_aggregate_states_function = merge_aggregate_states_function,
.insert_aggregates_into_columns_function = insert_aggregate_states_function,

View File

@ -54,6 +54,7 @@ struct AggregateFunctionWithOffset
using JITCreateAggregateStatesFunction = void (*)(AggregateDataPtr);
using JITAddIntoAggregateStatesFunction = void (*)(ColumnDataRowsSize, ColumnData *, AggregateDataPtr *);
using JITAddIntoAggregateStatesFunctionSinglePlace = void (*)(ColumnDataRowsSize, ColumnData *, AggregateDataPtr);
using JITMergeAggregateStatesFunction = void (*)(AggregateDataPtr, AggregateDataPtr);
using JITInsertAggregateStatesIntoColumnsFunction = void (*)(ColumnDataRowsSize, ColumnData *, AggregateDataPtr *);
@ -61,6 +62,8 @@ struct CompiledAggregateFunctions
{
JITCreateAggregateStatesFunction create_aggregate_states_function;
JITAddIntoAggregateStatesFunction add_into_aggregate_states_function;
JITAddIntoAggregateStatesFunctionSinglePlace add_into_aggregate_states_function_single_place;
JITMergeAggregateStatesFunction merge_aggregate_states_function;
JITInsertAggregateStatesIntoColumnsFunction insert_aggregates_into_columns_function;
@ -75,6 +78,7 @@ struct CompiledAggregateFunctions
*
* JITCreateAggregateStatesFunction will initialize aggregate data ptr with initial aggregate states values.
* JITAddIntoAggregateStatesFunction will update aggregate states for aggregate functions with specified ColumnData.
* JITAddIntoAggregateStatesFunctionSinglePlace will update single aggregate state for aggregate functions with specified ColumnData.
* JITMergeAggregateStatesFunction will merge aggregate states for aggregate functions.
* JITInsertAggregateStatesIntoColumnsFunction will insert aggregate states for aggregate functions into result columns.
*/

View File

@ -1075,12 +1075,11 @@ private:
if (!rows_added)
return {};
correctLowcardAndNullability(columns_right);
Block res = result_sample_block.cloneEmpty();
addLeftColumns(res, rows_added);
addRightColumns(res, columns_right);
copySameKeys(res);
correctLowcardAndNullability(res);
return res;
}

View File

@ -57,6 +57,7 @@ NamesAndTypesList QueryLogElement::getNamesAndTypes()
{"current_database", std::make_shared<DataTypeString>()},
{"query", std::make_shared<DataTypeString>()},
{"formatted_query", std::make_shared<DataTypeString>()},
{"normalized_query_hash", std::make_shared<DataTypeUInt64>()},
{"query_kind", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"databases", std::make_shared<DataTypeArray>(
@ -151,6 +152,7 @@ void QueryLogElement::appendToBlock(MutableColumns & columns) const
columns[i++]->insertData(current_database.data(), current_database.size());
columns[i++]->insertData(query.data(), query.size());
columns[i++]->insertData(formatted_query.data(), formatted_query.size());
columns[i++]->insert(normalized_query_hash);
columns[i++]->insertData(query_kind.data(), query_kind.size());

View File

@ -51,6 +51,7 @@ struct QueryLogElement
String current_database;
String query;
String formatted_query;
UInt64 normalized_query_hash{};
String query_kind;

View File

@ -609,6 +609,27 @@ std::vector<const ASTFunction *> getWindowFunctions(ASTPtr & query, const ASTSel
return data.window_functions;
}
class MarkTupleLiteralsAsLegacyData
{
public:
using TypeToVisit = ASTLiteral;
static void visit(ASTLiteral & literal, ASTPtr &)
{
if (literal.value.getType() == Field::Types::Tuple)
literal.use_legacy_column_name_of_tuple = true;
}
};
using MarkTupleLiteralsAsLegacyMatcher = OneTypeMatcher<MarkTupleLiteralsAsLegacyData>;
using MarkTupleLiteralsAsLegacyVisitor = InDepthNodeVisitor<MarkTupleLiteralsAsLegacyMatcher, true>;
void markTupleLiteralsAsLegacy(ASTPtr & query)
{
MarkTupleLiteralsAsLegacyVisitor::Data data;
MarkTupleLiteralsAsLegacyVisitor(data).visit(query);
}
}
TreeRewriterResult::TreeRewriterResult(
@ -927,6 +948,9 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
/// Executing scalar subqueries - replacing them with constant values.
executeScalarSubqueries(query, getContext(), subquery_depth, result.scalars, select_options.only_analyze);
if (settings.legacy_column_name_of_tuple_literal)
markTupleLiteralsAsLegacy(query);
TreeOptimizer::apply(query, result, tables_with_columns, getContext());
/// array_join_alias_to_name, array_join_result_to_source.
@ -994,6 +1018,9 @@ TreeRewriterResultPtr TreeRewriter::analyze(
/// Executing scalar subqueries. Column defaults could be a scalar subquery.
executeScalarSubqueries(query, getContext(), 0, result.scalars, false);
if (settings.legacy_column_name_of_tuple_literal)
markTupleLiteralsAsLegacy(query);
TreeOptimizer::optimizeIf(query, result.aliases, settings.optimize_if_chain_to_multiif);
if (allow_aggregations)

View File

@ -39,7 +39,7 @@ std::pair<Field, std::shared_ptr<const IDataType>> evaluateConstantExpression(co
if (context->getSettingsRef().normalize_function_names)
FunctionNameNormalizer().visit(ast.get());
String name = ast->getColumnName(context->getSettingsRef());
String name = ast->getColumnName();
auto syntax_result = TreeRewriter(context).analyze(ast, source_columns);
ExpressionActionsPtr expr_for_constant_folding = ExpressionAnalyzer(ast, syntax_result, context).getConstActions();

View File

@ -265,7 +265,11 @@ static void onExceptionBeforeStart(const String & query_for_logging, ContextPtr
// Try log query_kind if ast is valid
if (ast)
{
elem.query_kind = ast->getQueryKindString();
if (settings.log_formatted_queries)
elem.formatted_query = queryToString(ast);
}
// We don't calculate databases, tables and columns when the query isn't able to start
@ -641,6 +645,8 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
elem.current_database = context->getCurrentDatabase();
elem.query = query_for_logging;
if (settings.log_formatted_queries)
elem.formatted_query = queryToString(ast);
elem.normalized_query_hash = normalizedQueryHash<false>(query_for_logging);
elem.client_info = client_info;

View File

@ -29,17 +29,6 @@ namespace ErrorCodes
namespace
{
void changeNullability(MutableColumnPtr & mutable_column)
{
ColumnPtr column = std::move(mutable_column);
if (const auto * nullable = checkAndGetColumn<ColumnNullable>(*column))
column = nullable->getNestedColumnPtr();
else
column = makeNullable(column);
mutable_column = IColumn::mutate(std::move(column));
}
ColumnPtr changeLowCardinality(const ColumnPtr & column, const ColumnPtr & dst_sample)
{
if (dst_sample->lowCardinality())
@ -52,6 +41,24 @@ ColumnPtr changeLowCardinality(const ColumnPtr & column, const ColumnPtr & dst_s
return column->convertToFullColumnIfLowCardinality();
}
struct LowcardAndNull
{
bool is_lowcard;
bool is_nullable;
};
LowcardAndNull getLowcardAndNullability(const ColumnPtr & col)
{
if (col->lowCardinality())
{
/// Currently only `LowCardinality(Nullable(T))` is possible, but not `Nullable(LowCardinality(T))`
assert(!col->canBeInsideNullable());
const auto * col_as_lc = assert_cast<const ColumnLowCardinality *>(col.get());
return {true, col_as_lc->nestedIsNullable()};
}
return {false, col->isNullable()};
}
}
namespace JoinCommon
@ -91,35 +98,32 @@ DataTypePtr convertTypeToNullable(const DataTypePtr & type)
if (dict_type->canBeInsideNullable())
return std::make_shared<DataTypeLowCardinality>(makeNullable(dict_type));
}
return makeNullable(type);
if (type->canBeInsideNullable())
return makeNullable(type);
return type;
}
void convertColumnToNullable(ColumnWithTypeAndName & column, bool remove_low_card)
void convertColumnToNullable(ColumnWithTypeAndName & column)
{
if (remove_low_card && column.type->lowCardinality())
{
column.column = recursiveRemoveLowCardinality(column.column);
column.type = recursiveRemoveLowCardinality(column.type);
}
if (column.type->isNullable() || !canBecomeNullable(column.type))
return;
column.type = convertTypeToNullable(column.type);
if (column.column)
if (!column.column)
return;
if (column.column->lowCardinality())
{
if (column.column->lowCardinality())
{
/// Convert nested to nullable, not LowCardinality itself
auto mut_col = IColumn::mutate(std::move(column.column));
ColumnLowCardinality * col_as_lc = assert_cast<ColumnLowCardinality *>(mut_col.get());
if (!col_as_lc->nestedIsNullable())
col_as_lc->nestedToNullable();
column.column = std::move(mut_col);
}
else
column.column = makeNullable(column.column);
/// Convert nested to nullable, not LowCardinality itself
auto mut_col = IColumn::mutate(std::move(column.column));
ColumnLowCardinality * col_as_lc = assert_cast<ColumnLowCardinality *>(mut_col.get());
if (!col_as_lc->nestedIsNullable())
col_as_lc->nestedToNullable();
column.column = std::move(mut_col);
}
else if (column.column->canBeInsideNullable())
{
column.column = makeNullable(column.column);
}
}
@ -146,20 +150,18 @@ void removeColumnNullability(ColumnWithTypeAndName & column)
col_as_lc->nestedRemoveNullable();
column.column = std::move(mut_col);
}
return;
}
if (!column.type->isNullable())
return;
column.type = static_cast<const DataTypeNullable &>(*column.type).getNestedType();
if (column.column)
else
{
const auto * nullable_column = checkAndGetColumn<ColumnNullable>(*column.column);
ColumnPtr nested_column = nullable_column->getNestedColumnPtr();
MutableColumnPtr mutable_column = IColumn::mutate(std::move(nested_column));
column.column = std::move(mutable_column);
column.type = removeNullable(column.type);
if (column.column && column.column->isNullable())
{
const auto * nullable_column = checkAndGetColumn<ColumnNullable>(*column.column);
ColumnPtr nested_column = nullable_column->getNestedColumnPtr();
MutableColumnPtr mutable_column = IColumn::mutate(std::move(nested_column));
column.column = std::move(mutable_column);
}
}
}
@ -534,32 +536,42 @@ void NotJoined::setRightIndex(size_t right_pos, size_t result_position)
void NotJoined::extractColumnChanges(size_t right_pos, size_t result_pos)
{
const auto & src = saved_block_sample.getByPosition(right_pos).column;
const auto & dst = result_sample_block.getByPosition(result_pos).column;
auto src_props = getLowcardAndNullability(saved_block_sample.getByPosition(right_pos).column);
auto dst_props = getLowcardAndNullability(result_sample_block.getByPosition(result_pos).column);
if (!src->isNullable() && dst->isNullable())
right_nullability_adds.push_back(right_pos);
if (src_props.is_nullable != dst_props.is_nullable)
right_nullability_changes.push_back({result_pos, dst_props.is_nullable});
if (src->isNullable() && !dst->isNullable())
right_nullability_removes.push_back(right_pos);
ColumnPtr src_not_null = JoinCommon::emptyNotNullableClone(src);
ColumnPtr dst_not_null = JoinCommon::emptyNotNullableClone(dst);
if (src_not_null->lowCardinality() != dst_not_null->lowCardinality())
right_lowcard_changes.push_back({right_pos, dst_not_null});
if (src_props.is_lowcard != dst_props.is_lowcard)
right_lowcard_changes.push_back({result_pos, dst_props.is_lowcard});
}
void NotJoined::correctLowcardAndNullability(MutableColumns & columns_right)
void NotJoined::correctLowcardAndNullability(Block & block)
{
for (size_t pos : right_nullability_removes)
changeNullability(columns_right[pos]);
for (auto & [pos, added] : right_nullability_changes)
{
auto & col = block.getByPosition(pos);
if (added)
JoinCommon::convertColumnToNullable(col);
else
JoinCommon::removeColumnNullability(col);
}
for (auto & [pos, dst_sample] : right_lowcard_changes)
columns_right[pos] = changeLowCardinality(std::move(columns_right[pos]), dst_sample)->assumeMutable();
for (size_t pos : right_nullability_adds)
changeNullability(columns_right[pos]);
for (auto & [pos, added] : right_lowcard_changes)
{
auto & col = block.getByPosition(pos);
if (added)
{
if (!col.type->lowCardinality())
col.type = std::make_shared<DataTypeLowCardinality>(col.type);
col.column = changeLowCardinality(col.column, col.type->createColumn());
}
else
{
col.column = recursiveRemoveLowCardinality(col.column);
col.type = recursiveRemoveLowCardinality(col.type);
}
}
}
void NotJoined::addLeftColumns(Block & block, size_t rows_added) const
@ -580,11 +592,6 @@ void NotJoined::addRightColumns(Block & block, MutableColumns & columns_right) c
{
auto & right_column = columns_right[pr.first];
auto & result_column = block.getByPosition(pr.second).column;
#ifndef NDEBUG
if (result_column->getName() != right_column->getName())
throw Exception("Wrong columns assign in RIGHT|FULL JOIN: " + result_column->getName() +
" " + right_column->getName(), ErrorCodes::LOGICAL_ERROR);
#endif
result_column = std::move(right_column);
}
}

View File

@ -19,7 +19,7 @@ namespace JoinCommon
{
bool canBecomeNullable(const DataTypePtr & type);
DataTypePtr convertTypeToNullable(const DataTypePtr & type);
void convertColumnToNullable(ColumnWithTypeAndName & column, bool remove_low_card = false);
void convertColumnToNullable(ColumnWithTypeAndName & column);
void convertColumnsToNullable(Block & block, size_t starting_pos = 0);
void removeColumnNullability(ColumnWithTypeAndName & column);
void changeColumnRepresentation(const ColumnPtr & src_column, ColumnPtr & dst_column);
@ -70,7 +70,7 @@ public:
NotJoined(const TableJoin & table_join, const Block & saved_block_sample_, const Block & right_sample_block,
const Block & result_sample_block_, const Names & key_names_left_ = {}, const Names & key_names_right_ = {});
void correctLowcardAndNullability(MutableColumns & columns_right);
void correctLowcardAndNullability(Block & block);
void addLeftColumns(Block & block, size_t rows_added) const;
void addRightColumns(Block & block, MutableColumns & columns_right) const;
void copySameKeys(Block & block) const;
@ -92,10 +92,9 @@ private:
///
std::unordered_map<size_t, size_t> same_result_keys;
/// Which right columns (saved in parent) need nullability change before placing them in result block
std::vector<size_t> right_nullability_adds;
std::vector<size_t> right_nullability_removes;
std::vector<std::pair<size_t, bool>> right_nullability_changes;
/// Which right columns (saved in parent) need LowCardinality change before placing them in result block
std::vector<std::pair<size_t, ColumnPtr>> right_lowcard_changes;
std::vector<std::pair<size_t, bool>> right_lowcard_changes;
void setRightIndex(size_t right_pos, size_t result_position);
void extractColumnChanges(size_t right_pos, size_t result_pos);

View File

@ -24,16 +24,6 @@ namespace ErrorCodes
}
void ASTFunction::appendColumnNameImpl(WriteBuffer & ostr) const
{
appendColumnNameImpl(ostr, nullptr);
}
void ASTFunction::appendColumnNameImpl(WriteBuffer & ostr, const Settings & settings) const
{
appendColumnNameImpl(ostr, &settings);
}
void ASTFunction::appendColumnNameImpl(WriteBuffer & ostr, const Settings * settings) const
{
if (name == "view")
throw Exception("Table function view cannot be used as an expression", ErrorCodes::UNEXPECTED_EXPRESSION);
@ -48,10 +38,7 @@ void ASTFunction::appendColumnNameImpl(WriteBuffer & ostr, const Settings * sett
if (it != parameters->children.begin())
writeCString(", ", ostr);
if (settings)
(*it)->appendColumnName(ostr, *settings);
else
(*it)->appendColumnName(ostr);
(*it)->appendColumnName(ostr);
}
writeChar(')', ostr);
}
@ -64,10 +51,7 @@ void ASTFunction::appendColumnNameImpl(WriteBuffer & ostr, const Settings * sett
if (it != arguments->children.begin())
writeCString(", ", ostr);
if (settings)
(*it)->appendColumnName(ostr, *settings);
else
(*it)->appendColumnName(ostr);
(*it)->appendColumnName(ostr);
}
}

View File

@ -54,10 +54,6 @@ public:
protected:
void formatImplWithoutAlias(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
void appendColumnNameImpl(WriteBuffer & ostr) const override;
void appendColumnNameImpl(WriteBuffer & ostr, const Settings & settings) const override;
private:
void appendColumnNameImpl(WriteBuffer & ostr, const Settings * settings) const;
};

View File

@ -50,16 +50,14 @@ String FieldVisitorToColumnName::operator() (const Tuple & x) const
}
void ASTLiteral::appendColumnNameImpl(WriteBuffer & ostr, const Settings & settings) const
{
if (settings.legacy_column_name_of_tuple_literal)
appendColumnNameImplLegacy(ostr);
else
appendColumnNameImpl(ostr);
}
void ASTLiteral::appendColumnNameImpl(WriteBuffer & ostr) const
{
if (use_legacy_column_name_of_tuple)
{
appendColumnNameImplLegacy(ostr);
return;
}
/// 100 - just arbitrary value.
constexpr auto min_elements_for_hashing = 100;

View File

@ -33,6 +33,10 @@ public:
*/
String unique_column_name;
/// For compatibility reasons in distributed queries,
/// we may need to use legacy column name for tuple literal.
bool use_legacy_column_name_of_tuple = false;
/** Get the text that identifies this element. */
String getID(char delim) const override { return "Literal" + (delim + applyVisitor(FieldVisitorDump(), value)); }
@ -44,7 +48,6 @@ protected:
void formatImplWithoutAlias(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
void appendColumnNameImpl(WriteBuffer & ostr) const override;
void appendColumnNameImpl(WriteBuffer & ostr, const Settings & settings) const override;
private:
/// Legacy version of 'appendColumnNameImpl'. It differs only with tuple literals.

View File

@ -48,14 +48,6 @@ void ASTWithAlias::appendColumnName(WriteBuffer & ostr) const
appendColumnNameImpl(ostr);
}
void ASTWithAlias::appendColumnName(WriteBuffer & ostr, const Settings & settings) const
{
if (prefer_alias_to_column_name && !alias.empty())
writeString(alias, ostr);
else
appendColumnNameImpl(ostr, settings);
}
void ASTWithAlias::appendColumnNameWithoutAlias(WriteBuffer & ostr) const
{
appendColumnNameImpl(ostr);

View File

@ -21,10 +21,8 @@ public:
using IAST::IAST;
void appendColumnName(WriteBuffer & ostr) const final;
void appendColumnName(WriteBuffer & ostr, const Settings & settings) const final;
void appendColumnNameWithoutAlias(WriteBuffer & ostr) const final;
String getAliasOrColumnName() const override { return alias.empty() ? getColumnName() : alias; }
String getAliasOrColumnName(const Settings & settings) const override { return alias.empty() ? getColumnName(settings) : alias; }
String tryGetAlias() const override { return alias; }
void setAlias(const String & to) override { alias = to; }
@ -35,7 +33,6 @@ public:
protected:
virtual void appendColumnNameImpl(WriteBuffer & ostr) const = 0;
virtual void appendColumnNameImpl(WriteBuffer & ostr, const Settings &) const { appendColumnNameImpl(ostr); }
};
/// helper for setting aliases and chaining result to other functions

View File

@ -109,14 +109,6 @@ String IAST::getColumnName() const
}
String IAST::getColumnName(const Settings & settings) const
{
WriteBufferFromOwnString write_buffer;
appendColumnName(write_buffer, settings);
return write_buffer.str();
}
String IAST::getColumnNameWithoutAlias() const
{
WriteBufferFromOwnString write_buffer;

View File

@ -42,7 +42,6 @@ public:
/** Get the canonical name of the column if the element is a column */
String getColumnName() const;
String getColumnName(const Settings & settings) const;
/** Same as the above but ensure no alias names are used. This is for index analysis */
String getColumnNameWithoutAlias() const;
@ -52,8 +51,6 @@ public:
throw Exception("Trying to get name of not a column: " + getID(), ErrorCodes::LOGICAL_ERROR);
}
virtual void appendColumnName(WriteBuffer & ostr, const Settings &) const { appendColumnName(ostr); }
virtual void appendColumnNameWithoutAlias(WriteBuffer &) const
{
throw Exception("Trying to get name of not a column: " + getID(), ErrorCodes::LOGICAL_ERROR);
@ -61,7 +58,6 @@ public:
/** Get the alias, if any, or the canonical name of the column, if it is not. */
virtual String getAliasOrColumnName() const { return getColumnName(); }
virtual String getAliasOrColumnName(const Settings & settings) const { return getColumnName(settings); }
/** Get the alias, if any, or an empty string if it does not exist, or if the element does not support aliases. */
virtual String tryGetAlias() const { return String(); }

View File

@ -992,17 +992,14 @@ void ReadFromMergeTree::initializePipeline(QueryPipeline & pipeline, const Build
});
}
Block cur_header = result_projection ? result_projection->getResultColumns()
: pipe.getHeader();
Block cur_header = pipe.getHeader();
auto append_actions = [&result_projection, &cur_header](ActionsDAGPtr actions)
auto append_actions = [&result_projection](ActionsDAGPtr actions)
{
if (!result_projection)
result_projection = std::move(actions);
else
result_projection = ActionsDAG::merge(std::move(*result_projection), std::move(*actions));
cur_header = result_projection->getResultColumns();
};
/// By the way, if a distributed query or query to a Merge table is made, then the `_sample_factor` column can have different values.
@ -1017,6 +1014,9 @@ void ReadFromMergeTree::initializePipeline(QueryPipeline & pipeline, const Build
append_actions(std::move(adding_column));
}
if (result_projection)
cur_header = result_projection->updateHeader(cur_header);
/// Extra columns may be returned (for example, if sampling is used).
/// Convert pipe to step header structure.
if (!isCompatibleHeader(cur_header, getOutputStream().header))

View File

@ -49,7 +49,7 @@ void SourceWithProgress::setProcessListElement(QueryStatus * elem)
void SourceWithProgress::work()
{
if (!limits.speed_limits.checkTimeLimit(total_stopwatch.elapsed(), limits.timeout_overflow_mode))
if (!limits.speed_limits.checkTimeLimit(total_stopwatch, limits.timeout_overflow_mode))
{
cancel();
}

View File

@ -32,7 +32,7 @@ void LimitsCheckingTransform::transform(Chunk & chunk)
info.started = true;
}
if (!limits.speed_limits.checkTimeLimit(info.total_stopwatch.elapsed(), limits.timeout_overflow_mode))
if (!limits.speed_limits.checkTimeLimit(info.total_stopwatch, limits.timeout_overflow_mode))
{
stopReading();
return;

View File

@ -195,8 +195,8 @@ KeeperTCPHandler::KeeperTCPHandler(IServer & server_, const Poco::Net::StreamSoc
, log(&Poco::Logger::get("NuKeeperTCPHandler"))
, global_context(Context::createCopy(server.context()))
, keeper_dispatcher(global_context->getKeeperStorageDispatcher())
, operation_timeout(0, global_context->getConfigRef().getUInt("test_keeper_server.operation_timeout_ms", Coordination::DEFAULT_OPERATION_TIMEOUT_MS) * 1000)
, session_timeout(0, global_context->getConfigRef().getUInt("test_keeper_server.session_timeout_ms", Coordination::DEFAULT_SESSION_TIMEOUT_MS) * 1000)
, operation_timeout(0, global_context->getConfigRef().getUInt("keeper_server.operation_timeout_ms", Coordination::DEFAULT_OPERATION_TIMEOUT_MS) * 1000)
, session_timeout(0, global_context->getConfigRef().getUInt("keeper_server.session_timeout_ms", Coordination::DEFAULT_SESSION_TIMEOUT_MS) * 1000)
, poll_wrapper(std::make_unique<SocketInterruptablePollWrapper>(socket_))
, responses(std::make_unique<ThreadSafeResponseQueue>())
{

View File

@ -32,7 +32,7 @@ message ExternalTable {
// Data to insert to the external table.
// If a method with streaming input (i.e. ExecuteQueryWithStreamInput() or ExecuteQueryWithStreamIO()) is used,
// then data for insertion to the same external table can be split between multiple QueryInfos.
string data = 3;
bytes data = 3;
// Format of the data to insert to the external table.
string format = 4;
@ -53,10 +53,10 @@ message QueryInfo {
string database = 4;
// Input data, used both as data for INSERT query and as data for the input() function.
string input_data = 5;
bytes input_data = 5;
// Delimiter for input_data, inserted between input_data from adjacent QueryInfos.
string input_data_delimiter = 6;
bytes input_data_delimiter = 6;
// Default output format. If not specified, 'TabSeparated' is used.
string output_format = 7;
@ -128,9 +128,9 @@ message Exception {
// Result of execution of a query which is sent back by the ClickHouse server to the client.
message Result {
// Output of the query, represented in the `output_format` or in a format specified in `query`.
string output = 1;
string totals = 2;
string extremes = 3;
bytes output = 1;
bytes totals = 2;
bytes extremes = 3;
repeated LogEntry logs = 4;
Progress progress = 5;

View File

@ -3213,8 +3213,7 @@ String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr loc
if (!partition_ast.value)
{
if (!MergeTreePartInfo::validatePartitionID(partition_ast.id, format_version))
throw Exception("Invalid partition format: " + partition_ast.id, ErrorCodes::INVALID_PARTITION_VALUE);
MergeTreePartInfo::validatePartitionID(partition_ast.id, format_version);
return partition_ast.id;
}
@ -3225,10 +3224,7 @@ String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr loc
if (partition_lit && partition_lit->value.getType() == Field::Types::String)
{
String partition_id = partition_lit->value.get<String>();
if (partition_id.size() != 6 || !std::all_of(partition_id.begin(), partition_id.end(), isNumericASCII))
throw Exception(
"Invalid partition format: " + partition_id + ". Partition should consist of 6 digits: YYYYMM",
ErrorCodes::INVALID_PARTITION_VALUE);
MergeTreePartInfo::validatePartitionID(partition_id, format_version);
return partition_id;
}
}

View File

@ -9,6 +9,7 @@ namespace DB
namespace ErrorCodes
{
extern const int BAD_DATA_PART_NAME;
extern const int INVALID_PARTITION_VALUE;
}
@ -21,38 +22,25 @@ MergeTreePartInfo MergeTreePartInfo::fromPartName(const String & part_name, Merg
}
bool MergeTreePartInfo::validatePartitionID(const String & partition_id, MergeTreeDataFormatVersion format_version)
void MergeTreePartInfo::validatePartitionID(const String & partition_id, MergeTreeDataFormatVersion format_version)
{
if (partition_id.empty())
return false;
ReadBufferFromString in(partition_id);
throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Partition id is empty");
if (format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING)
{
UInt32 min_yyyymmdd = 0;
UInt32 max_yyyymmdd = 0;
if (!tryReadIntText(min_yyyymmdd, in)
|| !checkChar('_', in)
|| !tryReadIntText(max_yyyymmdd, in)
|| !checkChar('_', in))
{
return false;
}
if (partition_id.size() != 6 || !std::all_of(partition_id.begin(), partition_id.end(), isNumericASCII))
throw Exception(ErrorCodes::INVALID_PARTITION_VALUE,
"Invalid partition format: {}. Partition should consist of 6 digits: YYYYMM",
partition_id);
}
else
{
while (!in.eof())
{
char c;
readChar(c, in);
if (c == '_')
break;
}
auto is_valid_char = [](char c) { return c == '-' || isAlphaNumericASCII(c); };
if (!std::all_of(partition_id.begin(), partition_id.end(), is_valid_char))
throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Invalid partition format: {}", partition_id);
}
return in.eof();
}
bool MergeTreePartInfo::tryParsePartName(const String & part_name, MergeTreePartInfo * part_info, MergeTreeDataFormatVersion format_version)

View File

@ -88,7 +88,7 @@ struct MergeTreePartInfo
}
/// Simple sanity check for partition ID. Checking that it's not too long or too short, doesn't contain a lot of '_'.
static bool validatePartitionID(const String & partition_id, MergeTreeDataFormatVersion format_version);
static void validatePartitionID(const String & partition_id, MergeTreeDataFormatVersion format_version);
static MergeTreePartInfo fromPartName(const String & part_name, MergeTreeDataFormatVersion format_version); // -V1071

View File

@ -124,7 +124,7 @@ struct Settings;
M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \
M(String, storage_policy, "default", "Name of storage disk policy", 0) \
M(Bool, allow_nullable_key, false, "Allow Nullable types as primary keys.", 0) \
M(Bool, allow_remote_fs_zero_copy_replication, false, "Allow Zero-copy replication over remote fs", 0) \
M(Bool, allow_remote_fs_zero_copy_replication, true, "Allow Zero-copy replication over remote fs", 0) \
M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm", 0) \
M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \
M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited. This setting is the default that can be overridden by the query-level setting with the same name.", 0) \

View File

@ -144,9 +144,14 @@ void ReplicatedMergeTreeMergeStrategyPicker::refreshState()
if (current_replica_index_tmp < 0 || active_replicas_tmp.size() < 2)
{
LOG_WARNING(storage.log, "Can't find current replica in the active replicas list, or too few active replicas to use execute_merges_on_single_replica_time_threshold!");
/// we can reset the settings w/o lock (it's atomic)
execute_merges_on_single_replica_time_threshold = 0;
if (execute_merges_on_single_replica_time_threshold > 0)
{
LOG_WARNING(storage.log, "Can't find current replica in the active replicas list, or too few active replicas to use 'execute_merges_on_single_replica_time_threshold'");
/// we can reset the settings w/o lock (it's atomic)
execute_merges_on_single_replica_time_threshold = 0;
}
/// default value of remote_fs_execute_merges_on_single_replica_time_threshold is not 0
/// so we write no warning in log here
remote_fs_execute_merges_on_single_replica_time_threshold = 0;
return;
}

Some files were not shown because too many files have changed in this diff Show More