Merge branch 'master' of github.com:ClickHouse/ClickHouse into fix_issue_22028

This commit is contained in:
Slach 2021-03-28 18:41:22 +01:00
commit 9761db7efb
241 changed files with 3956 additions and 957 deletions

2
.gitmodules vendored
View File

@ -93,7 +93,7 @@
url = https://github.com/ClickHouse-Extras/libunwind.git
[submodule "contrib/simdjson"]
path = contrib/simdjson
url = https://github.com/ClickHouse-Extras/simdjson.git
url = https://github.com/simdjson/simdjson.git
[submodule "contrib/rapidjson"]
path = contrib/rapidjson
url = https://github.com/ClickHouse-Extras/rapidjson

View File

@ -1069,11 +1069,11 @@ public:
}
template <typename DateOrTime>
inline LUTIndex addMonthsIndex(DateOrTime v, Int64 delta) const
inline LUTIndex NO_SANITIZE_UNDEFINED addMonthsIndex(DateOrTime v, Int64 delta) const
{
const Values & values = lut[toLUTIndex(v)];
Int64 month = static_cast<Int64>(values.month) + delta;
Int64 month = values.month + delta;
if (month > 0)
{

2
contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit 3d3683e77753cfe015a05fae95ddf418e19f59e1
Subproject commit 70468326ad5d72e9497944838484c591dae054ea

2
contrib/replxx vendored

@ -1 +1 @@
Subproject commit cdb6e3f2ce4464225daf9c8beeae7db98d590bdc
Subproject commit 2b24f14594d7606792b92544bb112a6322ba34d7

2
contrib/simdjson vendored

@ -1 +1 @@
Subproject commit 3190d66a49059092a1753dc35595923debfc1698
Subproject commit 95b4870e20be5f97d9dcf63b23b1c6f520c366c1

View File

@ -18,6 +18,7 @@ RUN apt-get update \
clickhouse-client=$version \
clickhouse-common-static=$version \
locales \
tzdata \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf \
&& apt-get clean

View File

@ -32,6 +32,7 @@ RUN groupadd -r clickhouse --gid=101 \
clickhouse-server=$version \
locales \
wget \
tzdata \
&& rm -rf \
/var/lib/apt/lists/* \
/var/cache/debconf \

View File

@ -21,7 +21,9 @@ RUN addgroup -S -g 101 clickhouse \
&& chown clickhouse:clickhouse /var/lib/clickhouse \
&& chown root:clickhouse /var/log/clickhouse-server \
&& chmod +x /entrypoint.sh \
&& apk add --no-cache su-exec bash \
&& apk add --no-cache su-exec bash tzdata \
&& cp /usr/share/zoneinfo/UTC /etc/localtime \
&& echo "UTC" > /etc/timezone \
&& chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client
# we need to allow "others" access to clickhouse folder, because docker container

View File

@ -46,9 +46,11 @@ DATA_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --
TMP_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=tmp_path || true)"
USER_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=user_files_path || true)"
LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.log || true)"
LOG_DIR="$(dirname "$LOG_PATH" || true)"
LOG_DIR=""
if [ -n "$LOG_PATH" ]; then LOG_DIR="$(dirname "$LOG_PATH")"; fi
ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.errorlog || true)"
ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH" || true)"
ERROR_LOG_DIR=""
if [ -n "$ERROR_LOG_PATH" ]; then ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi
FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=format_schema_path || true)"
CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"

View File

@ -292,6 +292,7 @@ function run_tests
01318_decrypt # Depends on OpenSSL
01663_aes_msan # Depends on OpenSSL
01667_aes_args_check # Depends on OpenSSL
01776_decrypt_aead_size_check # Depends on OpenSSL
01281_unsucceeded_insert_select_queries_counter
01292_create_user
01294_lazy_database_concurrent

View File

@ -266,14 +266,13 @@ for query_index in queries_to_run:
try:
# Will also detect too long queries during warmup stage
res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': 10})
res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': args.max_query_seconds})
except clickhouse_driver.errors.Error as e:
# Add query id to the exception to make debugging easier.
e.args = (prewarm_id, *e.args)
e.message = prewarm_id + ': ' + e.message
raise
print(f'prewarm\t{query_index}\t{prewarm_id}\t{conn_index}\t{c.last_query.elapsed}')
except KeyboardInterrupt:
raise
@ -320,7 +319,7 @@ for query_index in queries_to_run:
for conn_index, c in enumerate(this_query_connections):
try:
res = c.execute(q, query_id = run_id)
res = c.execute(q, query_id = run_id, settings = {'max_execution_time': args.max_query_seconds})
except clickhouse_driver.errors.Error as e:
# Add query id to the exception to make debugging easier.
e.args = (run_id, *e.args)

View File

@ -2,7 +2,6 @@
FROM ubuntu:20.04
RUN apt-get update --yes && env DEBIAN_FRONTEND=noninteractive apt-get install wget unzip git openjdk-14-jdk maven python3 --yes --no-install-recommends
RUN wget https://github.com/sqlancer/sqlancer/archive/master.zip -O /sqlancer.zip
RUN mkdir /sqlancer && \
cd /sqlancer && \

View File

@ -26,6 +26,7 @@ def process_result(result_folder):
with open(err_path, 'r') as f:
if 'AssertionError' in f.read():
summary.append((test, "FAIL"))
status = 'failure'
else:
summary.append((test, "OK"))

View File

@ -11,7 +11,7 @@ service clickhouse-server start && sleep 5
cd /sqlancer/sqlancer-master
export TIMEOUT=60
export TIMEOUT=300
export NUM_QUERIES=1000
( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPWhere | tee /test_output/TLPWhere.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPWhere.err

View File

@ -3,7 +3,7 @@ toc_priority: 8
toc_title: PostgreSQL
---
# PosgtreSQL {#postgresql}
# PostgreSQL {#postgresql}
The PostgreSQL engine allows you to perform `SELECT` queries on data that is stored on a remote PostgreSQL server.

View File

@ -19,7 +19,7 @@ $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test
Using [CLI interface](../../interfaces/cli.md):
``` bash
$ echo '{"foo":"bar"}' | clickhouse-client ---query="INSERT INTO test FORMAT JSONEachRow"
$ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow"
```
Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead.

View File

@ -50,7 +50,7 @@ The supported formats are:
| [Parquet](#data-format-parquet) | ✔ | ✔ |
| [Arrow](#data-format-arrow) | ✔ | ✔ |
| [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ |
| [ORC](#data-format-orc) | ✔ | |
| [ORC](#data-format-orc) | ✔ | |
| [RowBinary](#rowbinary) | ✔ | ✔ |
| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
| [Native](#native) | ✔ | ✔ |
@ -1284,32 +1284,33 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e
## ORC {#data-format-orc}
[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. You can only insert data in this format to ClickHouse.
[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the [Hadoop](https://hadoop.apache.org/) ecosystem.
### Data Types Matching {#data_types-matching-3}
The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` queries.
The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
| ORC data type (`INSERT`) | ClickHouse data type |
|--------------------------|-----------------------------------------------------|
| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) |
| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) |
| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) |
| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) |
| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) |
| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) |
| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) |
| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) |
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) |
| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) |
| `DATE32` | [Date](../sql-reference/data-types/date.md) |
| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) |
| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) |
| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) |
| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) |
|--------------------------|-----------------------------------------------------|--------------------------|
| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` |
| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` |
| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` |
| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` |
| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` |
| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` |
| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` |
ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type.
Unsupported ORC data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
The data types of ClickHouse table columns dont have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column.
@ -1321,6 +1322,14 @@ You can insert ORC data from a file into ClickHouse table by the following comma
$ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC"
```
### Selecting Data {#selecting-data-2}
You can select data from a ClickHouse table and save them into some file in the ORC format by the following command:
``` bash
$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc}
```
To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-engines/integrations/hdfs.md).
## LineAsString {#lineasstring}

View File

@ -9,7 +9,7 @@ Columns:
- `value` ([UInt64](../../sql-reference/data-types/int-uint.md)) — the number of times this error has been happened.
- `last_error_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — time when the last error happened.
- `last_error_message` ([String](../../sql-reference/data-types/string.md)) — message for the last error.
- `last_error_stacktrace` ([String](../../sql-reference/data-types/string.md)) — stacktrace for the last error.
- `last_error_trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — A [stack trace](https://en.wikipedia.org/wiki/Stack_trace) which represents a list of physical addresses where the called methods are stored.
- `remote` ([UInt8](../../sql-reference/data-types/int-uint.md)) — remote exception (i.e. received during one of the distributed query).
**Example**
@ -25,3 +25,12 @@ LIMIT 1
│ CANNOT_OPEN_FILE │ 76 │ 1 │
└──────────────────┴──────┴───────┘
```
``` sql
WITH arrayMap(x -> demangle(addressToSymbol(x)), last_error_trace) AS all
SELECT name, arrayStringConcat(all, '\n') AS res
FROM system.errors
LIMIT 1
SETTINGS allow_introspection_functions=1\G
```

View File

@ -320,8 +320,6 @@ Similar to `cache`, but stores data on SSD and index in RAM.
<write_buffer_size>1048576</write_buffer_size>
<!-- Path where cache file will be stored. -->
<path>/var/lib/clickhouse/clickhouse_dictionaries/test_dict</path>
<!-- Max number on stored keys in the cache. Rounded up to a power of two. -->
<max_stored_keys>1048576</max_stored_keys>
</ssd_cache>
</layout>
```
@ -329,8 +327,8 @@ Similar to `cache`, but stores data on SSD and index in RAM.
or
``` sql
LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576))
LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict))
```
### complex_key_ssd_cache {#complex-key-ssd-cache}

View File

@ -23,7 +23,9 @@ ClickHouse supports the standard grammar for defining windows and window functio
| `GROUPS` frame | not supported |
| Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | all aggregate functions are supported |
| `rank()`, `dense_rank()`, `row_number()` | supported |
| `lag/lead(value, offset)` | not supported, replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead`|
| `lag/lead(value, offset)` | Not supported. Workarounds: |
| | 1) replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead`|
| | 2) use `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` |
## References

View File

@ -49,7 +49,7 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT
| [Parquet](#data-format-parquet) | ✔ | ✔ |
| [Arrow](#data-format-arrow) | ✔ | ✔ |
| [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ |
| [ORC](#data-format-orc) | ✔ | |
| [ORC](#data-format-orc) | ✔ | |
| [RowBinary](#rowbinary) | ✔ | ✔ |
| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
| [Native](#native) | ✔ | ✔ |
@ -1203,45 +1203,53 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_
## ORC {#data-format-orc}
[Apache ORC](https://orc.apache.org/) - это column-oriented формат данных, распространённый в экосистеме Hadoop. Вы можете только вставлять данные этого формата в ClickHouse.
[Apache ORC](https://orc.apache.org/) — это столбцовый формат данных, распространенный в экосистеме [Hadoop](https://hadoop.apache.org/).
### Соответствие типов данных {#sootvetstvie-tipov-dannykh-1}
Таблица показывает поддержанные типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT`.
Таблица ниже содержит поддерживаемые типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT` и `SELECT`.
| Тип данных ORC (`INSERT`) | Тип данных ClickHouse |
|---------------------------|-----------------------------------------------------|
| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) |
| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) |
| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) |
| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) |
| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) |
| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) |
| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) |
| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) |
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) |
| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) |
| `DATE32` | [Date](../sql-reference/data-types/date.md) |
| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) |
| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) |
| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) |
| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | Тип данных ORC (`SELECT`) |
|---------------------------|-----------------------------------------------------|---------------------------|
| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` |
| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` |
| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` |
| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` |
| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` |
| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` |
| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` |
ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных Parquet `DECIMAL` как `Decimal128`.
ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных ORC `DECIMAL` как `Decimal128`.
Неподдержанные типы данных ORC: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
Неподдерживаемые типы данных ORC: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных, ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse.
Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse.
### Вставка данных {#vstavka-dannykh-1}
Данные ORC можно вставить в таблицу ClickHouse командой:
Чтобы вставить в ClickHouse данные из файла в формате ORC, используйте команду следующего вида:
``` bash
$ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC"
```
Для обмена данных с Hadoop можно использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md).
### Вывод данных {#vyvod-dannykh-1}
Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, используйте команду следующего вида:
``` bash
$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc}
```
Для обмена данных с экосистемой Hadoop вы можете использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md).
## LineAsString {#lineasstring}

View File

@ -318,8 +318,6 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000))
<write_buffer_size>1048576</write_buffer_size>
<!-- Path where cache file will be stored. -->
<path>/var/lib/clickhouse/clickhouse_dictionaries/test_dict</path>
<!-- Max number on stored keys in the cache. Rounded up to a power of two. -->
<max_stored_keys>1048576</max_stored_keys>
</ssd_cache>
</layout>
```
@ -327,8 +325,8 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000))
или
``` sql
LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576))
LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict))
```
### complex_key_ssd_cache {#complex-key-ssd-cache}

View File

@ -672,7 +672,7 @@ neighbor(column, offset[, default_value])
Функция может получить доступ к значению в столбце соседней строки только внутри обрабатываемого в данный момент блока данных.
Порядок строк, используемый при вычислении функции `neighbor`, может отличаться от порядка строк, возвращаемых пользователю.
Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию изне подзапроса.
Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию извне подзапроса.
**Аргументы**

View File

@ -26,7 +26,7 @@ numpy==1.19.2
Pygments==2.5.2
pymdown-extensions==8.0
python-slugify==4.0.1
PyYAML==5.3.1
PyYAML==5.4.1
repackage==0.7.3
requests==2.24.0
singledispatch==3.4.0.3

View File

@ -8,10 +8,10 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
#include <IO/TimeoutSetter.h>
#include <DataStreams/NativeBlockInputStream.h>
#include <DataStreams/NativeBlockOutputStream.h>
#include <Client/Connection.h>
#include <Client/TimeoutSetter.h>
#include <Common/ClickHouseRevision.h>
#include <Common/Exception.h>
#include <Common/NetException.h>

View File

@ -16,7 +16,6 @@ SRCS(
HedgedConnections.cpp
HedgedConnectionsFactory.cpp
MultiplexedConnections.cpp
TimeoutSetter.cpp
)

View File

@ -560,7 +560,7 @@ namespace DB
{
namespace ErrorCodes
{
#define M(VALUE, NAME) extern const Value NAME = VALUE;
#define M(VALUE, NAME) extern const ErrorCode NAME = VALUE;
APPLY_FOR_ERROR_CODES(M)
#undef M
@ -587,7 +587,7 @@ namespace ErrorCodes
ErrorCode end() { return END + 1; }
void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace)
void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace)
{
if (error_code >= end())
{
@ -596,10 +596,10 @@ namespace ErrorCodes
error_code = end() - 1;
}
values[error_code].increment(remote, message, stacktrace);
values[error_code].increment(remote, message, trace);
}
void ErrorPairHolder::increment(bool remote, const std::string & message, const std::string & stacktrace)
void ErrorPairHolder::increment(bool remote, const std::string & message, const FramePointers & trace)
{
const auto now = std::chrono::system_clock::now();
@ -609,7 +609,7 @@ namespace ErrorCodes
++error.count;
error.message = message;
error.stacktrace = stacktrace;
error.trace = trace;
error.error_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
}
ErrorPair ErrorPairHolder::get()

View File

@ -1,11 +1,12 @@
#pragma once
#include <stddef.h>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <mutex>
#include <common/types.h>
#include <string_view>
#include <vector>
#include <common/types.h>
/** Allows to count number of simultaneously happening error codes.
* See also Exception.cpp for incrementing part.
@ -19,6 +20,7 @@ namespace ErrorCodes
/// ErrorCode identifier (index in array).
using ErrorCode = int;
using Value = size_t;
using FramePointers = std::vector<void *>;
/// Get name of error_code by identifier.
/// Returns statically allocated string.
@ -33,7 +35,7 @@ namespace ErrorCodes
/// Message for the last error.
std::string message;
/// Stacktrace for the last error.
std::string stacktrace;
FramePointers trace;
};
struct ErrorPair
{
@ -46,7 +48,7 @@ namespace ErrorCodes
{
public:
ErrorPair get();
void increment(bool remote, const std::string & message, const std::string & stacktrace);
void increment(bool remote, const std::string & message, const FramePointers & trace);
private:
ErrorPair value;
@ -60,7 +62,7 @@ namespace ErrorCodes
ErrorCode end();
/// Add value for specified error_code.
void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace);
void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace);
}
}

View File

@ -36,7 +36,7 @@ namespace ErrorCodes
/// - Aborts the process if error code is LOGICAL_ERROR.
/// - Increments error codes statistics.
void handle_error_code([[maybe_unused]] const std::string & msg, const std::string & stacktrace, int code, bool remote)
void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool remote, const Exception::FramePointers & trace)
{
// In debug builds and builds with sanitizers, treat LOGICAL_ERROR as an assertion failure.
// Log the message before we fail.
@ -47,20 +47,21 @@ void handle_error_code([[maybe_unused]] const std::string & msg, const std::stri
abort();
}
#endif
ErrorCodes::increment(code, remote, msg, stacktrace);
ErrorCodes::increment(code, remote, msg, trace);
}
Exception::Exception(const std::string & msg, int code, bool remote_)
: Poco::Exception(msg, code)
, remote(remote_)
{
handle_error_code(msg, getStackTraceString(), code, remote);
handle_error_code(msg, code, remote, getStackFramePointers());
}
Exception::Exception(const std::string & msg, const Exception & nested, int code)
: Poco::Exception(msg, nested, code)
{
handle_error_code(msg, getStackTraceString(), code, remote);
handle_error_code(msg, code, remote, getStackFramePointers());
}
Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc)
@ -101,6 +102,31 @@ std::string Exception::getStackTraceString() const
#endif
}
Exception::FramePointers Exception::getStackFramePointers() const
{
FramePointers frame_pointers;
#ifdef STD_EXCEPTION_HAS_STACK_TRACE
{
frame_pointers.resize(get_stack_trace_size());
for (size_t i = 0; i < frame_pointers.size(); ++i)
{
frame_pointers[i] = get_stack_trace_frames()[i];
}
}
#else
{
size_t stack_trace_size = trace.getSize();
size_t stack_trace_offset = trace.getOffset();
frame_pointers.reserve(stack_trace_size - stack_trace_offset);
for (size_t i = stack_trace_offset; i < stack_trace_size; ++i)
{
frame_pointers.push_back(trace.getFramePointers()[i]);
}
}
#endif
return frame_pointers;
}
void throwFromErrno(const std::string & s, int code, int the_errno)
{

View File

@ -24,6 +24,8 @@ namespace DB
class Exception : public Poco::Exception
{
public:
using FramePointers = std::vector<void *>;
Exception() = default;
Exception(const std::string & msg, int code, bool remote_ = false);
Exception(const std::string & msg, const Exception & nested, int code);
@ -66,6 +68,8 @@ public:
bool isRemoteException() const { return remote; }
std::string getStackTraceString() const;
/// Used for system.errors
FramePointers getStackFramePointers() const;
private:
#ifndef STD_EXCEPTION_HAS_STACK_TRACE

View File

@ -271,13 +271,13 @@ private:
};
template <typename Key, typename Mapped>
struct DefaultCellDisposer
struct DefaultLRUHashMapCellDisposer
{
void operator()(const Key &, const Mapped &) const {}
};
template <typename Key, typename Value, typename Disposer = DefaultCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
template <typename Key, typename Value, typename Disposer = DefaultLRUHashMapCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
using LRUHashMap = LRUHashMapImpl<Key, Value, Disposer, Hash, false>;
template <typename Key, typename Value, typename Disposer = DefaultCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
template <typename Key, typename Value, typename Disposer = DefaultLRUHashMapCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
using LRUHashMapWithSavedHash = LRUHashMapImpl<Key, Value, Disposer, Hash, true>;

View File

@ -692,6 +692,30 @@ public:
assign(from.begin(), from.end());
}
void erase(const_iterator first, const_iterator last)
{
iterator first_no_const = const_cast<iterator>(first);
iterator last_no_const = const_cast<iterator>(last);
size_t items_to_move = end() - last;
while (items_to_move != 0)
{
*first_no_const = *last_no_const;
++first_no_const;
++last_no_const;
--items_to_move;
}
this->c_end = reinterpret_cast<char *>(first_no_const);
}
void erase(const_iterator pos)
{
this->erase(pos, pos + 1);
}
bool operator== (const PODArray & rhs) const
{

View File

@ -92,3 +92,57 @@ TEST(Common, PODInsertElementSizeNotMultipleOfLeftPadding)
EXPECT_EQ(arr1_initially_empty.size(), items_to_insert_size);
}
TEST(Common, PODErase)
{
{
PaddedPODArray<UInt64> items {0,1,2,3,4,5,6,7,8,9};
PaddedPODArray<UInt64> expected;
expected = {0,1,2,3,4,5,6,7,8,9};
items.erase(items.begin(), items.begin());
EXPECT_EQ(items, expected);
items.erase(items.end(), items.end());
EXPECT_EQ(items, expected);
}
{
PaddedPODArray<UInt64> actual {0,1,2,3,4,5,6,7,8,9};
PaddedPODArray<UInt64> expected;
expected = {0,1,4,5,6,7,8,9};
actual.erase(actual.begin() + 2, actual.begin() + 4);
EXPECT_EQ(actual, expected);
expected = {0,1,4};
actual.erase(actual.begin() + 3, actual.end());
EXPECT_EQ(actual, expected);
expected = {};
actual.erase(actual.begin(), actual.end());
EXPECT_EQ(actual, expected);
for (size_t i = 0; i < 10; ++i)
actual.emplace_back(static_cast<UInt64>(i));
expected = {0,1,4,5,6,7,8,9};
actual.erase(actual.begin() + 2, actual.begin() + 4);
EXPECT_EQ(actual, expected);
expected = {0,1,4};
actual.erase(actual.begin() + 3, actual.end());
EXPECT_EQ(actual, expected);
expected = {};
actual.erase(actual.begin(), actual.end());
EXPECT_EQ(actual, expected);
}
{
PaddedPODArray<UInt64> actual {0,1,2,3,4,5,6,7,8,9};
PaddedPODArray<UInt64> expected;
expected = {1,2,3,4,5,6,7,8,9};
actual.erase(actual.begin());
EXPECT_EQ(actual, expected);
}
}

View File

@ -14,7 +14,7 @@ PEERDIR(
clickhouse/base/common
clickhouse/base/pcg-random
clickhouse/base/widechar_width
contrib/libs/libcpuid/libcpuid
contrib/libs/libcpuid
contrib/libs/openssl
contrib/libs/poco/NetSSL_OpenSSL
contrib/libs/re2

View File

@ -13,7 +13,7 @@ PEERDIR(
clickhouse/base/common
clickhouse/base/pcg-random
clickhouse/base/widechar_width
contrib/libs/libcpuid/libcpuid
contrib/libs/libcpuid
contrib/libs/openssl
contrib/libs/poco/NetSSL_OpenSSL
contrib/libs/re2

View File

@ -51,7 +51,7 @@ bool CachedCompressedReadBuffer::nextImpl()
{
owned_cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer();
owned_cell->data.resize(size_decompressed + owned_cell->additional_bytes);
decompress(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum);
decompressTo(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum);
}

View File

@ -21,7 +21,7 @@ bool CompressedReadBuffer::nextImpl()
memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
return true;
}
@ -48,7 +48,7 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n)
/// If the decompressed block fits entirely where it needs to be copied.
if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read)
{
decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum);
decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum);
bytes_read += size_decompressed;
bytes += size_decompressed;
}
@ -61,9 +61,9 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n)
memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
pos = working_buffer.begin();
decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
pos = working_buffer.begin();
bytes_read += read(to + bytes_read, n - bytes_read);
break;

View File

@ -184,7 +184,7 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed,
}
void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_decompressed, CompressionCodecPtr & codec, bool allow_different_codecs)
{
ProfileEvents::increment(ProfileEvents::CompressedReadBufferBlocks);
ProfileEvents::increment(ProfileEvents::CompressedReadBufferBytes, size_decompressed);
@ -210,11 +210,38 @@ void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, s
ErrorCodes::CANNOT_DECOMPRESS);
}
}
}
/// Decompress the current compressed block into caller-provided memory at `to`.
/// `to` must have room for at least `size_decompressed` bytes.
/// Unlike decompress(BufferBase::Buffer &, ...), this variant always writes into
/// `to`, even for the NONE codec (no zero-copy shortcut), so it is suitable when
/// the caller needs the data at a specific location.
void CompressedReadBufferBase::decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
{
/// Parses the codec header of `compressed_buffer` and initializes `codec` accordingly.
readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);
codec->decompress(compressed_buffer, size_compressed_without_checksum, to);
}
/// Decompress the current compressed block, avoiding a copy when possible.
/// For the NONE codec, `to` is re-pointed at the uncompressed payload that already
/// resides inside `compressed_buffer` (zero-copy); for any other codec, data is
/// decompressed into the memory `to` currently points to.
/// Because `to` may be relocated, this overload is NOT suitable when the caller
/// needs the result at a fixed address -- use decompressTo() for that.
void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum)
{
readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);
if (codec->isNone())
{
/// Shortcut for NONE codec to avoid extra memcpy.
/// We are doing it by changing the buffer `to` to point to existing uncompressed data.
UInt8 header_size = ICompressionCodec::getHeaderSize();
/// Sanity check: the payload must at least contain the codec header.
if (size_compressed_without_checksum < header_size)
throw Exception(ErrorCodes::CORRUPTED_DATA,
"Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})",
size_compressed_without_checksum, static_cast<size_t>(header_size));
/// Point `to` directly at the raw bytes following the header.
to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum);
}
else
codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin());
}
/// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.
CompressedReadBufferBase::CompressedReadBufferBase(ReadBuffer * in, bool allow_different_codecs_)
: compressed_in(in), own_compressed_buffer(0), allow_different_codecs(allow_different_codecs_)

View File

@ -3,6 +3,7 @@
#include <Common/PODArray.h>
#include <Compression/LZ4_decompress_faster.h>
#include <Compression/ICompressionCodec.h>
#include <IO/BufferBase.h>
namespace DB
@ -37,7 +38,12 @@ protected:
/// Returns number of compressed bytes read.
size_t readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum, bool always_copy);
void decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum);
/// Decompress into memory pointed by `to`
void decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum);
/// This method can change location of `to` to avoid unnecessary copy if data is uncompressed.
/// It is more efficient for compression codec NONE but not suitable if you want to decompress into specific location.
void decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum);
public:
/// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.

View File

@ -31,7 +31,7 @@ bool CompressedReadBufferFromFile::nextImpl()
memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
return true;
}
@ -108,7 +108,7 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n)
/// If the decompressed block fits entirely where it needs to be copied.
if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read)
{
decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum);
decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum);
bytes_read += size_decompressed;
bytes += size_decompressed;
}
@ -122,9 +122,9 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n)
memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
pos = working_buffer.begin();
decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
pos = working_buffer.begin();
bytes_read += read(to + bytes_read, n - bytes_read);
break;

View File

@ -98,7 +98,7 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch
UInt8 header_size = getHeaderSize();
if (source_size < header_size)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size ({})", source_size, size_t(header_size));
throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", source_size, static_cast<size_t>(header_size));
uint8_t our_method = getMethodByte();
uint8_t method = source[0];

View File

@ -31,6 +31,8 @@ struct Settings;
M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \
M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \
M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \
M(UInt64, fresh_log_gap, 200, "When node became fresh", 0) \
M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consensus with similar speed", 0) \
M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)

View File

@ -30,6 +30,8 @@ NuKeeperServer::NuKeeperServer(
, state_manager(nuraft::cs_new<NuKeeperStateManager>(server_id, "test_keeper_server", config, coordination_settings))
, responses_queue(responses_queue_)
{
if (coordination_settings->quorum_reads)
LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Quorum reads enabled, NuKeeper will work slower.");
}
void NuKeeperServer::startup()
@ -59,6 +61,7 @@ void NuKeeperServer::startup()
params.reserved_log_items_ = coordination_settings->reserved_log_items;
params.snapshot_distance_ = coordination_settings->snapshot_distance;
params.stale_log_gap_ = coordination_settings->stale_log_gap;
params.fresh_log_gap_ = coordination_settings->fresh_log_gap;
params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds();
params.auto_forwarding_ = coordination_settings->auto_forwarding;
params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2;
@ -106,7 +109,7 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(int64_t session_id, const Coord
void NuKeeperServer::putRequest(const NuKeeperStorage::RequestForSession & request_for_session)
{
auto [session_id, request] = request_for_session;
if (isLeaderAlive() && request->isReadRequest())
if (!coordination_settings->quorum_reads && isLeaderAlive() && request->isReadRequest())
{
state_machine->processReadRequest(request_for_session);
}
@ -185,6 +188,9 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t
if (next_index < last_commited || next_index - last_commited <= 1)
commited_store = true;
if (initialized_flag)
return nuraft::cb_func::ReturnCode::Ok;
auto set_initialized = [this] ()
{
std::unique_lock lock(initialized_mutex);
@ -196,10 +202,27 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t
{
case nuraft::cb_func::BecomeLeader:
{
if (commited_store) /// We become leader and store is empty, ready to serve requests
/// We become leader and store is empty or we already committed it
if (commited_store || initial_batch_committed)
set_initialized();
return nuraft::cb_func::ReturnCode::Ok;
}
case nuraft::cb_func::BecomeFollower:
case nuraft::cb_func::GotAppendEntryReqFromLeader:
{
if (isLeaderAlive())
{
auto leader_index = raft_instance->get_leader_committed_log_idx();
auto our_index = raft_instance->get_committed_log_idx();
/// This may happen when we start RAFT cluster from scratch.
/// Node first became leader, and after that some other node became leader.
/// BecameFresh for this node will not be called because it was already fresh
/// when it was leader.
if (leader_index < our_index + coordination_settings->fresh_log_gap)
set_initialized();
}
return nuraft::cb_func::ReturnCode::Ok;
}
case nuraft::cb_func::BecomeFresh:
{
set_initialized(); /// We are fresh follower, ready to serve requests.
@ -209,6 +232,7 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t
{
if (isLeader()) /// We have committed our log store and we are leader, ready to serve requests.
set_initialized();
initial_batch_committed = true;
return nuraft::cb_func::ReturnCode::Ok;
}
default: /// ignore other events
@ -220,7 +244,7 @@ void NuKeeperServer::waitInit()
{
std::unique_lock lock(initialized_mutex);
int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds();
if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; }))
if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag.load(); }))
throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization");
}

View File

@ -31,8 +31,9 @@ private:
ResponsesQueue & responses_queue;
std::mutex initialized_mutex;
bool initialized_flag = false;
std::atomic<bool> initialized_flag = false;
std::condition_variable initialized_cv;
std::atomic<bool> initial_batch_committed = false;
nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param);

View File

@ -241,9 +241,10 @@ NuKeeperStorageSnapshot::~NuKeeperStorageSnapshot()
storage->disableSnapshotMode();
}
NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_)
NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_)
: snapshots_path(snapshots_path_)
, snapshots_to_keep(snapshots_to_keep_)
, storage_tick_time(storage_tick_time_)
{
namespace fs = std::filesystem;
@ -325,22 +326,24 @@ nuraft::ptr<nuraft::buffer> NuKeeperSnapshotManager::serializeSnapshotToBuffer(c
return writer.getBuffer();
}
SnapshotMetadataPtr NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr<nuraft::buffer> buffer)
SnapshotMetaAndStorage NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const
{
ReadBufferFromNuraftBuffer reader(buffer);
CompressedReadBuffer compressed_reader(reader);
return NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader);
auto storage = std::make_unique<NuKeeperStorage>(storage_tick_time);
auto snapshot_metadata = NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader);
return std::make_pair(snapshot_metadata, std::move(storage));
}
SnapshotMetadataPtr NuKeeperSnapshotManager::restoreFromLatestSnapshot(NuKeeperStorage * storage)
SnapshotMetaAndStorage NuKeeperSnapshotManager::restoreFromLatestSnapshot()
{
if (existing_snapshots.empty())
return nullptr;
return {};
auto buffer = deserializeLatestSnapshotBufferFromDisk();
if (!buffer)
return nullptr;
return deserializeSnapshotFromBuffer(storage, buffer);
return {};
return deserializeSnapshotFromBuffer(buffer);
}
void NuKeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded()

View File

@ -40,17 +40,20 @@ public:
using NuKeeperStorageSnapshotPtr = std::shared_ptr<NuKeeperStorageSnapshot>;
using CreateSnapshotCallback = std::function<void(NuKeeperStorageSnapshotPtr &&)>;
using SnapshotMetaAndStorage = std::pair<SnapshotMetadataPtr, NuKeeperStoragePtr>;
class NuKeeperSnapshotManager
{
public:
NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_);
NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_ = 500);
SnapshotMetadataPtr restoreFromLatestSnapshot(NuKeeperStorage * storage);
SnapshotMetaAndStorage restoreFromLatestSnapshot();
static nuraft::ptr<nuraft::buffer> serializeSnapshotToBuffer(const NuKeeperStorageSnapshot & snapshot);
std::string serializeSnapshotBufferToDisk(nuraft::buffer & buffer, size_t up_to_log_idx);
static SnapshotMetadataPtr deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr<nuraft::buffer> buffer);
SnapshotMetaAndStorage deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const;
nuraft::ptr<nuraft::buffer> deserializeSnapshotBufferFromDisk(size_t up_to_log_idx) const;
nuraft::ptr<nuraft::buffer> deserializeLatestSnapshotBufferFromDisk();
@ -74,6 +77,7 @@ private:
const std::string snapshots_path;
const size_t snapshots_to_keep;
std::map<size_t, std::string> existing_snapshots;
size_t storage_tick_time;
};
struct CreateSnapshotTask

View File

@ -4,6 +4,7 @@
#include <IO/ReadHelpers.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Coordination/NuKeeperSnapshotManager.h>
#include <future>
namespace DB
{
@ -37,8 +38,7 @@ NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data)
NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_)
: coordination_settings(coordination_settings_)
, storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep)
, snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep, coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_)
, last_committed_idx(0)
@ -60,7 +60,7 @@ void NuKeeperStateMachine::init()
try
{
latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index);
latest_snapshot_meta = snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_buf);
std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf);
last_committed_idx = latest_snapshot_meta->get_last_log_idx();
loaded = true;
break;
@ -83,6 +83,9 @@ void NuKeeperStateMachine::init()
{
LOG_DEBUG(log, "No existing snapshots, last committed log index {}", last_committed_idx);
}
if (!storage)
storage = std::make_unique<NuKeeperStorage>(coordination_settings->dead_session_check_period_ms.totalMilliseconds());
}
nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, nuraft::buffer & data)
@ -96,7 +99,7 @@ nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, n
nuraft::buffer_serializer bs(response);
{
std::lock_guard lock(storage_lock);
session_id = storage.getSessionID(session_timeout_ms);
session_id = storage->getSessionID(session_timeout_ms);
bs.put_i64(session_id);
}
LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_timeout_ms);
@ -109,7 +112,7 @@ nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, n
NuKeeperStorage::ResponsesForSessions responses_for_sessions;
{
std::lock_guard lock(storage_lock);
responses_for_sessions = storage.processRequest(request_for_session.request, request_for_session.session_id, log_idx);
responses_for_sessions = storage->processRequest(request_for_session.request, request_for_session.session_id, log_idx);
for (auto & response_for_session : responses_for_sessions)
responses_queue.push(response_for_session);
}
@ -133,7 +136,7 @@ bool NuKeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
{
std::lock_guard lock(storage_lock);
snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_ptr);
std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr);
}
last_committed_idx = s.get_last_log_idx();
return true;
@ -157,7 +160,7 @@ void NuKeeperStateMachine::create_snapshot(
CreateSnapshotTask snapshot_task;
{
std::lock_guard lock(storage_lock);
snapshot_task.snapshot = std::make_shared<NuKeeperStorageSnapshot>(&storage, snapshot_meta_copy);
snapshot_task.snapshot = std::make_shared<NuKeeperStorageSnapshot>(storage.get(), snapshot_meta_copy);
}
snapshot_task.create_snapshot = [this, when_done] (NuKeeperStorageSnapshotPtr && snapshot)
@ -179,7 +182,7 @@ void NuKeeperStateMachine::create_snapshot(
{
/// Must do it with lock (clearing elements from list)
std::lock_guard lock(storage_lock);
storage.clearGarbageAfterSnapshot();
storage->clearGarbageAfterSnapshot();
/// Destroy snapshot with lock
snapshot.reset();
LOG_TRACE(log, "Cleared garbage after snapshot");
@ -214,7 +217,7 @@ void NuKeeperStateMachine::save_logical_snp_obj(
if (obj_id == 0)
{
std::lock_guard lock(storage_lock);
NuKeeperStorageSnapshot snapshot(&storage, s.get_last_log_idx());
NuKeeperStorageSnapshot snapshot(storage.get(), s.get_last_log_idx());
cloned_buffer = snapshot_manager.serializeSnapshotToBuffer(snapshot);
}
else
@ -225,7 +228,28 @@ void NuKeeperStateMachine::save_logical_snp_obj(
nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
cloned_meta = nuraft::snapshot::deserialize(*snp_buf);
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, s.get_last_log_idx());
/// Sometimes NuRaft can call save and create snapshots from different threads
/// at once. To avoid race conditions we serialize snapshots through snapshots_queue
/// TODO: make something better
CreateSnapshotTask snapshot_task;
std::shared_ptr<std::promise<void>> waiter = std::make_shared<std::promise<void>>();
auto future = waiter->get_future();
snapshot_task.snapshot = nullptr;
snapshot_task.create_snapshot = [this, waiter, cloned_buffer, log_idx = s.get_last_log_idx()] (NuKeeperStorageSnapshotPtr &&)
{
try
{
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, log_idx);
LOG_DEBUG(log, "Saved snapshot {} to path {}", log_idx, result_path);
}
catch (...)
{
tryLogCurrentException(log);
}
waiter->set_value();
};
snapshots_queue.push(std::move(snapshot_task));
future.wait();
{
std::lock_guard lock(snapshots_lock);
@ -233,7 +257,6 @@ void NuKeeperStateMachine::save_logical_snp_obj(
latest_snapshot_meta = cloned_meta;
}
LOG_DEBUG(log, "Created snapshot {} with path {}", s.get_last_log_idx(), result_path);
obj_id++;
}
@ -271,7 +294,7 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS
NuKeeperStorage::ResponsesForSessions responses;
{
std::lock_guard lock(storage_lock);
responses = storage.processRequest(request_for_session.request, request_for_session.session_id, std::nullopt);
responses = storage->processRequest(request_for_session.request, request_for_session.session_id, std::nullopt);
}
for (const auto & response : responses)
responses_queue.push(response);
@ -280,13 +303,13 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS
std::unordered_set<int64_t> NuKeeperStateMachine::getDeadSessions()
{
std::lock_guard lock(storage_lock);
return storage.getDeadSessions();
return storage->getDeadSessions();
}
void NuKeeperStateMachine::shutdownStorage()
{
std::lock_guard lock(storage_lock);
storage.finalize();
storage->finalize();
}
}

View File

@ -52,7 +52,7 @@ public:
NuKeeperStorage & getStorage()
{
return storage;
return *storage;
}
void processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session);
@ -68,7 +68,7 @@ private:
CoordinationSettingsPtr coordination_settings;
NuKeeperStorage storage;
NuKeeperStoragePtr storage;
NuKeeperSnapshotManager snapshot_manager;

View File

@ -233,7 +233,7 @@ struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest
struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
{
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t session_id) const override
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t /*session_id*/) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Coordination::ZooKeeperRemoveResponse & response = dynamic_cast<Coordination::ZooKeeperRemoveResponse &>(*response_ptr);
@ -257,7 +257,12 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
{
auto prev_node = it->value;
if (prev_node.stat.ephemeralOwner != 0)
ephemerals[session_id].erase(request.path);
{
auto ephemerals_it = ephemerals.find(prev_node.stat.ephemeralOwner);
ephemerals_it->second.erase(request.path);
if (ephemerals_it->second.empty())
ephemerals.erase(ephemerals_it);
}
auto child_basename = getBaseName(it->key);
container.updateValue(parentPath(request.path), [&child_basename] (NuKeeperStorage::Node & parent)
@ -271,10 +276,10 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
container.erase(request.path);
undo = [prev_node, &container, &ephemerals, session_id, path = request.path, child_basename]
undo = [prev_node, &container, &ephemerals, path = request.path, child_basename]
{
if (prev_node.stat.ephemeralOwner != 0)
ephemerals[session_id].emplace(path);
ephemerals[prev_node.stat.ephemeralOwner].emplace(path);
container.insert(path, prev_node);
container.updateValue(parentPath(path), [&child_basename] (NuKeeperStorage::Node & parent)
@ -377,7 +382,6 @@ struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest
{
return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CHANGED);
}
};
struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest
@ -641,6 +645,13 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor
for (const auto & ephemeral_path : it->second)
{
container.erase(ephemeral_path);
container.updateValue(parentPath(ephemeral_path), [&ephemeral_path] (NuKeeperStorage::Node & parent)
{
--parent.stat.numChildren;
++parent.stat.cversion;
parent.children.erase(getBaseName(ephemeral_path));
});
auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED);
results.insert(results.end(), responses.begin(), responses.end());
}

View File

@ -131,4 +131,6 @@ public:
}
};
using NuKeeperStoragePtr = std::unique_ptr<NuKeeperStorage>;
}

View File

@ -132,6 +132,10 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati
coordination_settings->loadFromConfig("test_keeper_server.coordination_settings", config);
request_thread = ThreadFromGlobalPool([this] { requestThread(); });
responses_thread = ThreadFromGlobalPool([this] { responseThread(); });
snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); });
server = std::make_unique<NuKeeperServer>(myid, coordination_settings, config, responses_queue, snapshots_queue);
try
{
@ -148,10 +152,8 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati
throw;
}
request_thread = ThreadFromGlobalPool([this] { requestThread(); });
responses_thread = ThreadFromGlobalPool([this] { responseThread(); });
session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); });
snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); });
LOG_DEBUG(log, "Dispatcher initialized");
}

View File

@ -897,25 +897,25 @@ TEST(CoordinationTest, TestStorageSnapshotSimple)
manager.serializeSnapshotBufferToDisk(*buf, 2);
EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin"));
DB::NuKeeperStorage restored_storage(500);
auto debuf = manager.deserializeSnapshotBufferFromDisk(2);
manager.deserializeSnapshotFromBuffer(&restored_storage, debuf);
EXPECT_EQ(restored_storage.container.size(), 3);
EXPECT_EQ(restored_storage.container.getValue("/").children.size(), 1);
EXPECT_EQ(restored_storage.container.getValue("/hello").children.size(), 1);
EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").children.size(), 0);
auto [snapshot_meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf);
EXPECT_EQ(restored_storage.container.getValue("/").data, "");
EXPECT_EQ(restored_storage.container.getValue("/hello").data, "world");
EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").data, "somedata");
EXPECT_EQ(restored_storage.session_id_counter, 7);
EXPECT_EQ(restored_storage.zxid, 2);
EXPECT_EQ(restored_storage.ephemerals.size(), 2);
EXPECT_EQ(restored_storage.ephemerals[3].size(), 1);
EXPECT_EQ(restored_storage.ephemerals[1].size(), 1);
EXPECT_EQ(restored_storage.session_and_timeout.size(), 2);
EXPECT_EQ(restored_storage->container.size(), 3);
EXPECT_EQ(restored_storage->container.getValue("/").children.size(), 1);
EXPECT_EQ(restored_storage->container.getValue("/hello").children.size(), 1);
EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").children.size(), 0);
EXPECT_EQ(restored_storage->container.getValue("/").data, "");
EXPECT_EQ(restored_storage->container.getValue("/hello").data, "world");
EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").data, "somedata");
EXPECT_EQ(restored_storage->session_id_counter, 7);
EXPECT_EQ(restored_storage->zxid, 2);
EXPECT_EQ(restored_storage->ephemerals.size(), 2);
EXPECT_EQ(restored_storage->ephemerals[3].size(), 1);
EXPECT_EQ(restored_storage->ephemerals[1].size(), 1);
EXPECT_EQ(restored_storage->session_and_timeout.size(), 2);
}
TEST(CoordinationTest, TestStorageSnapshotMoreWrites)
@ -946,15 +946,14 @@ TEST(CoordinationTest, TestStorageSnapshotMoreWrites)
manager.serializeSnapshotBufferToDisk(*buf, 50);
EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin"));
DB::NuKeeperStorage restored_storage(500);
auto debuf = manager.deserializeSnapshotBufferFromDisk(50);
manager.deserializeSnapshotFromBuffer(&restored_storage, debuf);
auto [meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf);
EXPECT_EQ(restored_storage.container.size(), 51);
EXPECT_EQ(restored_storage->container.size(), 51);
for (size_t i = 0; i < 50; ++i)
{
EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
}
}
@ -987,14 +986,13 @@ TEST(CoordinationTest, TestStorageSnapshotManySnapshots)
EXPECT_TRUE(fs::exists("./snapshots/snapshot_250.bin"));
DB::NuKeeperStorage restored_storage(500);
manager.restoreFromLatestSnapshot(&restored_storage);
auto [meta, restored_storage] = manager.restoreFromLatestSnapshot();
EXPECT_EQ(restored_storage.container.size(), 251);
EXPECT_EQ(restored_storage->container.size(), 251);
for (size_t i = 0; i < 250; ++i)
{
EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
}
}
@ -1040,12 +1038,11 @@ TEST(CoordinationTest, TestStorageSnapshotMode)
EXPECT_FALSE(storage.container.contains("/hello_" + std::to_string(i)));
}
DB::NuKeeperStorage restored_storage(500);
manager.restoreFromLatestSnapshot(&restored_storage);
auto [meta, restored_storage] = manager.restoreFromLatestSnapshot();
for (size_t i = 0; i < 50; ++i)
{
EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
}
}
@ -1071,8 +1068,7 @@ TEST(CoordinationTest, TestStorageSnapshotBroken)
plain_buf.truncate(34);
plain_buf.sync();
DB::NuKeeperStorage restored_storage(500);
EXPECT_THROW(manager.restoreFromLatestSnapshot(&restored_storage), DB::Exception);
EXPECT_THROW(manager.restoreFromLatestSnapshot(), DB::Exception);
}
nuraft::ptr<nuraft::buffer> getBufferFromZKRequest(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
@ -1236,6 +1232,37 @@ TEST(CoordinationTest, TestStateMachineAndLogStore)
}
}
/// Regression test: removing an ephemeral node must erase it from the storage's
/// `ephemerals` bookkeeping keyed by the node's *owner* session, even when the
/// remove request is issued by a different session.
TEST(CoordinationTest, TestEphemeralNodeRemove)
{
using namespace Coordination;
using namespace DB;
/// Temporary snapshot directory, cleaned up by the RAII helper.
ChangelogDirTest snapshots("./snapshots");
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
ResponsesQueue queue;
SnapshotsQueue snapshots_queue{1};
auto state_machine = std::make_shared<NuKeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings);
state_machine->init();
/// Create an ephemeral node "/hello" (presumably owned by session 1 -- the
/// second argument of getLogEntryFromZKRequest; confirm against the helper).
std::shared_ptr<ZooKeeperCreateRequest> request_c = std::make_shared<ZooKeeperCreateRequest>();
request_c->path = "/hello";
request_c->is_ephemeral = true;
auto entry_c = getLogEntryFromZKRequest(0, 1, request_c);
state_machine->commit(1, entry_c->get_buf());
const auto & storage = state_machine->getStorage();
/// The ephemerals map must now track exactly one owner session.
EXPECT_EQ(storage.ephemerals.size(), 1);
std::shared_ptr<ZooKeeperRemoveRequest> request_d = std::make_shared<ZooKeeperRemoveRequest>();
request_d->path = "/hello";
/// Delete from other session
auto entry_d = getLogEntryFromZKRequest(0, 2, request_d);
state_machine->commit(2, entry_d->get_buf());
/// The entry must be gone from the owner's ephemeral set (and the now-empty
/// set removed from the map), not merely from session 2's set.
EXPECT_EQ(storage.ephemerals.size(), 0);
}
int main(int argc, char ** argv)
{
Poco::AutoPtr<Poco::ConsoleChannel> channel(new Poco::ConsoleChannel(std::cerr));

View File

@ -953,3 +953,26 @@ void writeFieldText(const Field & x, WriteBuffer & buf);
String toString(const Field & x);
}
/// fmt specialization so DB::Field can be used directly in format strings,
/// e.g. fmt::format("{}", field). Delegates rendering to DB::toString().
template <>
struct fmt::formatter<DB::Field>
{
    constexpr auto parse(format_parse_context & ctx)
    {
        /// Only the plain replacement field "{}" is accepted; any format spec
        /// other than the closing brace is rejected.
        const auto * pos = ctx.begin();
        if (pos != ctx.end() && *pos != '}')
            throw format_error("invalid format");
        return pos;
    }

    template <typename FormatContext>
    auto format(const DB::Field & x, FormatContext & ctx)
    {
        return format_to(ctx.out(), "{}", toString(x));
    }
};

View File

@ -101,7 +101,7 @@ template <DictionaryKeyType dictionary_key_type>
double CacheDictionary<dictionary_key_type>::getLoadFactor() const
{
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};
return static_cast<double>(cache_storage_ptr->getSize()) / cache_storage_ptr->getMaxSize();
return cache_storage_ptr->getLoadFactor();
}
template <DictionaryKeyType dictionary_key_type>
@ -333,9 +333,7 @@ Columns CacheDictionary<dictionary_key_type>::getColumnsImpl(
FetchResult result_of_fetch_from_storage;
{
/// Write lock on storage
const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request);
}

View File

@ -1,6 +1,7 @@
#pragma once
#include <chrono>
#include <variant>
#include <pcg_random.hpp>
@ -30,28 +31,31 @@ struct CacheDictionaryStorageConfiguration
const DictionaryLifetime lifetime;
};
/** Keys are stored in LRUCache and column values are serialized into arena.
Cell in LRUCache consists of allocated size and place in arena were columns serialized data is stored.
Columns are serialized by rows.
When cell is removed from LRUCache data associated with it is also removed from arena.
In case of complex key we also store key data in arena and it is removed from arena.
*/
/** ICacheDictionaryStorage implementation that keeps key in hash table with fixed collision length.
* Value in hash table point to index in attributes arrays.
*/
template <DictionaryKeyType dictionary_key_type>
class CacheDictionaryStorage final : public ICacheDictionaryStorage
{
static constexpr size_t max_collision_length = 10;
public:
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, UInt64, StringRef>;
static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage");
explicit CacheDictionaryStorage(CacheDictionaryStorageConfiguration & configuration_)
explicit CacheDictionaryStorage(
const DictionaryStructure & dictionary_structure,
CacheDictionaryStorageConfiguration & configuration_)
: configuration(configuration_)
, rnd_engine(randomSeed())
, cache(configuration.max_size_in_cells, false, { arena })
{
size_t cells_size = roundUpToPowerOfTwoOrZero(std::max(configuration.max_size_in_cells, max_collision_length));
cells.resize_fill(cells_size);
size_overlap_mask = cells_size - 1;
setup(dictionary_structure);
}
bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; }
@ -71,9 +75,7 @@ public:
const DictionaryStorageFetchRequest & fetch_request) override
{
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
{
return fetchColumnsForKeysImpl<SimpleKeysStorageFetchResult>(keys, fetch_request);
}
else
throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED);
}
@ -109,9 +111,7 @@ public:
const DictionaryStorageFetchRequest & column_fetch_requests) override
{
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
{
return fetchColumnsForKeysImpl<ComplexKeysStorageFetchResult>(keys, column_fetch_requests);
}
else
throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED);
}
@ -140,79 +140,162 @@ public:
throw Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED);
}
size_t getSize() const override { return cache.size(); }
size_t getSize() const override { return size; }
size_t getMaxSize() const override { return cache.getMaxSize(); }
double getLoadFactor() const override { return static_cast<double>(size) / configuration.max_size_in_cells; }
size_t getBytesAllocated() const override { return arena.size() + cache.getSizeInBytes(); }
size_t getBytesAllocated() const override
{
size_t attributes_size_in_bytes = 0;
size_t attributes_size = attributes.size();
for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index)
{
getAttributeContainer(attribute_index, [&](const auto & container)
{
attributes_size_in_bytes += container.capacity() * sizeof(container[0]);
});
}
return arena.size() + sizeof(Cell) * configuration.max_size_in_cells + attributes_size_in_bytes;
}
private:
struct FetchedKey
{
FetchedKey(size_t element_index_, bool is_default_)
: element_index(element_index_)
, is_default(is_default_)
{}
size_t element_index;
bool is_default;
};
template <typename KeysStorageFetchResult>
ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl(
KeysStorageFetchResult fetchColumnsForKeysImpl(
const PaddedPODArray<KeyType> & keys,
const DictionaryStorageFetchRequest & fetch_request)
{
KeysStorageFetchResult result;
result.fetched_columns = fetch_request.makeAttributesResultColumns();
result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found});
result.key_index_to_state.resize_fill(keys.size());
const auto now = std::chrono::system_clock::now();
const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
size_t fetched_columns_index = 0;
size_t keys_size = keys.size();
std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds);
size_t keys_size = keys.size();
PaddedPODArray<FetchedKey> fetched_keys;
fetched_keys.resize_fill(keys_size);
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
auto key = keys[key_index];
auto * it = cache.find(key);
auto [key_state, cell_index] = getKeyStateAndCellIndex(key, now);
if (it)
if (unlikely(key_state == KeyState::not_found))
{
/// Columns values for key are serialized in cache now deserialize them
const auto & cell = it->getMapped();
result.key_index_to_state[key_index] = {KeyState::not_found};
++result.not_found_keys_size;
continue;
}
bool has_deadline = cellHasDeadline(cell);
auto & cell = cells[cell_index];
if (has_deadline && now > cell.deadline + max_lifetime_seconds)
{
result.key_index_to_state[key_index] = {KeyState::not_found};
++result.not_found_keys_size;
continue;
}
else if (has_deadline && now > cell.deadline)
{
result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index};
++result.expired_keys_size;
}
else
{
result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index};
++result.found_keys_size;
}
result.expired_keys_size += static_cast<size_t>(key_state == KeyState::expired);
++fetched_columns_index;
result.key_index_to_state[key_index] = {key_state, fetched_columns_index};
fetched_keys[fetched_columns_index] = FetchedKey(cell.element_index, cell.is_default);
if (cell.isDefault())
++fetched_columns_index;
result.key_index_to_state[key_index].setDefaultValue(cell.is_default);
result.default_keys_size += cell.is_default;
}
result.found_keys_size = keys_size - (result.expired_keys_size + result.not_found_keys_size);
for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index)
{
if (!fetch_request.shouldFillResultColumnWithIndex(attribute_index))
continue;
auto & attribute = attributes[attribute_index];
const auto & default_value_provider = fetch_request.defaultValueProviderAtIndex(attribute_index);
size_t fetched_keys_size = fetched_keys.size();
auto & fetched_column = *result.fetched_columns[attribute_index];
fetched_column.reserve(fetched_keys_size);
if (unlikely(attribute.is_complex_type))
{
auto & container = std::get<std::vector<Field>>(attribute.attribute_container);
for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index)
{
result.key_index_to_state[key_index].setDefault();
++result.default_keys_size;
insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index);
}
else
{
const char * place_for_serialized_columns = cell.place_for_serialized_columns;
deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns);
auto fetched_key = fetched_keys[fetched_key_index];
if (unlikely(fetched_key.is_default))
fetched_column.insert(default_value_provider.getDefaultValue(fetched_key_index));
else
fetched_column.insert(container[fetched_key.element_index]);
}
}
else
{
result.key_index_to_state[key_index] = {KeyState::not_found};
++result.not_found_keys_size;
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
using ColumnType =
std::conditional_t<std::is_same_v<AttributeType, String>, ColumnString,
std::conditional_t<IsDecimalNumber<AttributeType>, ColumnDecimal<ValueType>,
ColumnVector<AttributeType>>>;
auto & container = std::get<PaddedPODArray<ValueType>>(attribute.attribute_container);
ColumnType & column_typed = static_cast<ColumnType &>(fetched_column);
if constexpr (std::is_same_v<ColumnType, ColumnString>)
{
for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index)
{
auto fetched_key = fetched_keys[fetched_key_index];
if (unlikely(fetched_key.is_default))
column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index));
else
{
auto item = container[fetched_key.element_index];
column_typed.insertData(item.data, item.size);
}
}
}
else
{
auto & data = column_typed.getData();
for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index)
{
auto fetched_key = fetched_keys[fetched_key_index];
if (unlikely(fetched_key.is_default))
column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index));
else
{
auto item = container[fetched_key.element_index];
data.push_back(item);
}
}
}
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
}
@ -221,58 +304,108 @@ private:
void insertColumnsForKeysImpl(const PaddedPODArray<KeyType> & keys, Columns columns)
{
Arena temporary_values_pool;
size_t columns_to_serialize_size = columns.size();
PaddedPODArray<StringRef> temporary_column_data(columns_to_serialize_size);
const auto now = std::chrono::system_clock::now();
size_t keys_size = keys.size();
Field column_value;
for (size_t key_index = 0; key_index < keys_size; ++key_index)
for (size_t key_index = 0; key_index < keys.size(); ++key_index)
{
size_t allocated_size_for_columns = 0;
const char * block_start = nullptr;
auto key = keys[key_index];
auto * it = cache.find(key);
for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index)
size_t cell_index = getCellIndexForInsert(key);
auto & cell = cells[cell_index];
bool cell_was_default = cell.is_default;
cell.is_default = false;
bool was_inserted = cell.deadline == 0;
if (was_inserted)
{
auto & column = columns[column_index];
temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start);
allocated_size_for_columns += temporary_column_data[column_index].size;
}
if constexpr (std::is_same_v<KeyType, StringRef>)
cell.key = copyStringInArena(key);
else
cell.key = key;
char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns);
memcpy(reinterpret_cast<void*>(place_for_serialized_columns), reinterpret_cast<const void*>(block_start), allocated_size_for_columns);
for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index)
{
auto & column = columns[attribute_index];
if (it)
{
/// Cell exists need to free previous serialized place and update deadline
auto & cell = it->getMapped();
getAttributeContainer(attribute_index, [&](auto & container)
{
container.emplace_back();
cell.element_index = container.size() - 1;
if (cell.place_for_serialized_columns)
arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns);
using ElementType = std::decay_t<decltype(container[0])>;
setCellDeadline(cell, now);
cell.allocated_size_for_columns = allocated_size_for_columns;
cell.place_for_serialized_columns = place_for_serialized_columns;
column->get(key_index, column_value);
if constexpr (std::is_same_v<ElementType, Field>)
container.back() = column_value;
else if constexpr (std::is_same_v<ElementType, StringRef>)
{
const String & string_value = column_value.get<String>();
StringRef string_value_ref = StringRef {string_value.data(), string_value.size()};
StringRef inserted_value = copyStringInArena(string_value_ref);
container.back() = inserted_value;
}
else
container.back() = column_value.get<NearestFieldType<ElementType>>();
});
}
++size;
}
else
{
/// No cell exists so create and put in cache
Cell cell;
if (cell.key != key)
{
if constexpr (std::is_same_v<KeyType, StringRef>)
{
char * data = const_cast<char *>(cell.key.data);
arena.free(data, cell.key.size);
cell.key = copyStringInArena(key);
}
else
cell.key = key;
}
setCellDeadline(cell, now);
cell.allocated_size_for_columns = allocated_size_for_columns;
cell.place_for_serialized_columns = place_for_serialized_columns;
/// Put values into existing index
size_t index_to_use = cell.element_index;
insertCellInCache(key, cell);
for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index)
{
auto & column = columns[attribute_index];
getAttributeContainer(attribute_index, [&](auto & container)
{
using ElementType = std::decay_t<decltype(container[0])>;
column->get(key_index, column_value);
if constexpr (std::is_same_v<ElementType, Field>)
container[index_to_use] = column_value;
else if constexpr (std::is_same_v<ElementType, StringRef>)
{
const String & string_value = column_value.get<String>();
StringRef string_ref_value = StringRef {string_value.data(), string_value.size()};
StringRef inserted_value = copyStringInArena(string_ref_value);
if (!cell_was_default)
{
StringRef previous_value = container[index_to_use];
arena.free(const_cast<char *>(previous_value.data), previous_value.size);
}
container[index_to_use] = inserted_value;
}
else
container[index_to_use] = column_value.get<NearestFieldType<ElementType>>();
});
}
}
temporary_values_pool.rollback(allocated_size_for_columns);
setCellDeadline(cell, now);
}
}
@ -280,94 +413,224 @@ private:
{
const auto now = std::chrono::system_clock::now();
for (auto key : keys)
size_t keys_size = keys.size();
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
auto * it = cache.find(key);
auto key = keys[key_index];
if (it)
size_t cell_index = getCellIndexForInsert(key);
auto & cell = cells[cell_index];
bool was_inserted = cell.deadline == 0;
bool cell_was_default = cell.is_default;
cell.is_default = true;
if (was_inserted)
{
auto & cell = it->getMapped();
if constexpr (std::is_same_v<KeyType, StringRef>)
cell.key = copyStringInArena(key);
else
cell.key = key;
setCellDeadline(cell, now);
for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index)
{
getAttributeContainer(attribute_index, [&](auto & container)
{
container.emplace_back();
cell.element_index = container.size() - 1;
});
}
if (cell.place_for_serialized_columns)
arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns);
cell.allocated_size_for_columns = 0;
cell.place_for_serialized_columns = nullptr;
++size;
}
else
{
Cell cell;
for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index)
{
getAttributeContainer(attribute_index, [&](const auto & container)
{
using ElementType = std::decay_t<decltype(container[0])>;
setCellDeadline(cell, now);
cell.allocated_size_for_columns = 0;
cell.place_for_serialized_columns = nullptr;
if constexpr (std::is_same_v<ElementType, StringRef>)
{
if (!cell_was_default)
{
StringRef previous_value = container[cell.element_index];
arena.free(const_cast<char *>(previous_value.data), previous_value.size);
}
}
});
}
insertCellInCache(key, cell);
if (cell.key != key)
{
if constexpr (std::is_same_v<KeyType, StringRef>)
{
char * data = const_cast<char *>(cell.key.data);
arena.free(data, cell.key.size);
cell.key = copyStringInArena(key);
}
else
cell.key = key;
}
}
setCellDeadline(cell, now);
}
}
PaddedPODArray<KeyType> getCachedKeysImpl() const
{
PaddedPODArray<KeyType> result;
result.reserve(cache.size());
result.reserve(size);
for (auto & node : cache)
for (auto & cell : cells)
{
auto & cell = node.getMapped();
if (cell.isDefault())
if (cell.deadline == 0)
continue;
result.emplace_back(node.getKey());
if (cell.is_default)
continue;
result.emplace_back(cell.key);
}
return result;
}
template <typename GetContainerFunc>
void getAttributeContainer(size_t attribute_index, GetContainerFunc && func)
{
auto & attribute = attributes[attribute_index];
auto & attribute_type = attribute.type;
if (unlikely(attribute.is_complex_type))
{
auto & container = std::get<std::vector<Field>>(attribute.attribute_container);
std::forward<GetContainerFunc>(func)(container);
}
else
{
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
auto & container = std::get<PaddedPODArray<ValueType>>(attribute.attribute_container);
std::forward<GetContainerFunc>(func)(container);
};
callOnDictionaryAttributeType(attribute_type, type_call);
}
}
template <typename GetContainerFunc>
void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) const
{
return const_cast<std::decay_t<decltype(*this)> *>(this)->template getAttributeContainer(attribute_index, std::forward<GetContainerFunc>(func));
}
StringRef copyStringInArena(StringRef value_to_copy)
{
size_t value_to_copy_size = value_to_copy.size;
char * place_for_key = arena.alloc(value_to_copy_size);
memcpy(reinterpret_cast<void *>(place_for_key), reinterpret_cast<const void *>(value_to_copy.data), value_to_copy_size);
StringRef updated_value{place_for_key, value_to_copy_size};
return updated_value;
}
void setup(const DictionaryStructure & dictionary_structure)
{
/// For each dictionary attribute create storage attribute
/// For simple attributes create PODArray, for complex vector of Fields
attributes.reserve(dictionary_structure.attributes.size());
for (const auto & dictionary_attribute : dictionary_structure.attributes)
{
auto attribute_type = dictionary_attribute.underlying_type;
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
attributes.emplace_back();
auto & last_attribute = attributes.back();
last_attribute.type = attribute_type;
last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array;
if (dictionary_attribute.is_nullable)
last_attribute.attribute_container = std::vector<Field>();
else
last_attribute.attribute_container = PaddedPODArray<ValueType>();
};
callOnDictionaryAttributeType(attribute_type, type_call);
}
}
using TimePoint = std::chrono::system_clock::time_point;
struct Cell
{
TimePoint deadline;
size_t allocated_size_for_columns;
char * place_for_serialized_columns;
inline bool isDefault() const { return place_for_serialized_columns == nullptr; }
inline void setDefault()
{
place_for_serialized_columns = nullptr;
allocated_size_for_columns = 0;
}
KeyType key;
size_t element_index;
bool is_default;
time_t deadline;
};
void insertCellInCache(KeyType & key, const Cell & cell)
struct Attribute
{
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
{
/// Copy complex key into arena and put in cache
size_t key_size = key.size;
char * place_for_key = arena.alloc(key_size);
memcpy(reinterpret_cast<void *>(place_for_key), reinterpret_cast<const void *>(key.data), key_size);
KeyType updated_key{place_for_key, key_size};
key = updated_key;
}
AttributeUnderlyingType type;
bool is_complex_type;
cache.insert(key, cell);
}
std::variant<
PaddedPODArray<UInt8>,
PaddedPODArray<UInt16>,
PaddedPODArray<UInt32>,
PaddedPODArray<UInt64>,
PaddedPODArray<UInt128>,
PaddedPODArray<Int8>,
PaddedPODArray<Int16>,
PaddedPODArray<Int32>,
PaddedPODArray<Int64>,
PaddedPODArray<Decimal32>,
PaddedPODArray<Decimal64>,
PaddedPODArray<Decimal128>,
PaddedPODArray<Float32>,
PaddedPODArray<Float64>,
PaddedPODArray<StringRef>,
std::vector<Field>> attribute_container;
};
inline static bool cellHasDeadline(const Cell & cell)
{
return cell.deadline != std::chrono::system_clock::from_time_t(0);
}
CacheDictionaryStorageConfiguration configuration;
pcg64 rnd_engine;
size_t size_overlap_mask = 0;
size_t size = 0;
PaddedPODArray<Cell> cells;
ArenaWithFreeLists arena;
std::vector<Attribute> attributes;
inline void setCellDeadline(Cell & cell, TimePoint now)
{
if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0)
{
cell.deadline = std::chrono::system_clock::from_time_t(0);
/// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds
/// to the expiration time. And it overflows pretty well.
auto deadline = std::chrono::time_point<std::chrono::system_clock>::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds);
cell.deadline = std::chrono::system_clock::to_time_t(deadline);
return;
}
@ -375,44 +638,75 @@ private:
size_t max_sec_lifetime = configuration.lifetime.max_sec;
std::uniform_int_distribution<UInt64> distribution{min_sec_lifetime, max_sec_lifetime};
cell.deadline = now + std::chrono::seconds(distribution(rnd_engine));
auto deadline = now + std::chrono::seconds(distribution(rnd_engine));
cell.deadline = std::chrono::system_clock::to_time_t(deadline);
}
template <typename>
friend class ArenaCellDisposer;
CacheDictionaryStorageConfiguration configuration;
ArenaWithFreeLists arena;
pcg64 rnd_engine;
class ArenaCellDisposer
inline size_t getCellIndex(const KeyType key) const
{
public:
ArenaWithFreeLists & arena;
const size_t hash = DefaultHash<KeyType>()(key);
const size_t index = hash & size_overlap_mask;
return index;
}
template <typename Key, typename Value>
void operator()(const Key & key, const Value & value) const
using KeyStateAndCellIndex = std::pair<KeyState::State, size_t>;
inline KeyStateAndCellIndex getKeyStateAndCellIndex(const KeyType key, const time_t now) const
{
size_t place_value = getCellIndex(key);
const size_t place_value_end = place_value + max_collision_length;
time_t max_lifetime_seconds = static_cast<time_t>(configuration.strict_max_lifetime_seconds);
for (; place_value < place_value_end; ++place_value)
{
/// In case of complex key we keep it in arena
if constexpr (std::is_same_v<Key, StringRef>)
arena.free(const_cast<char *>(key.data), key.size);
const auto cell_place_value = place_value & size_overlap_mask;
const auto & cell = cells[cell_place_value];
if (value.place_for_serialized_columns)
arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns);
if (cell.key != key)
continue;
if (unlikely(now > cell.deadline + max_lifetime_seconds))
return std::make_pair(KeyState::not_found, cell_place_value);
if (unlikely(now > cell.deadline))
return std::make_pair(KeyState::expired, cell_place_value);
return std::make_pair(KeyState::found, cell_place_value);
}
};
using SimpleKeyLRUHashMap = LRUHashMap<UInt64, Cell, ArenaCellDisposer>;
using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash<StringRef, Cell, ArenaCellDisposer>;
return std::make_pair(KeyState::not_found, place_value & size_overlap_mask);
}
using CacheLRUHashMap = std::conditional_t<
dictionary_key_type == DictionaryKeyType::simple,
SimpleKeyLRUHashMap,
ComplexKeyLRUHashMap>;
inline size_t getCellIndexForInsert(const KeyType & key) const
{
size_t place_value = getCellIndex(key);
const size_t place_value_end = place_value + max_collision_length;
size_t oldest_place_value = place_value;
CacheLRUHashMap cache;
time_t oldest_time = std::numeric_limits<time_t>::max();
for (; place_value < place_value_end; ++place_value)
{
const size_t cell_place_value = place_value & size_overlap_mask;
const Cell cell = cells[cell_place_value];
if (cell.deadline == 0)
return cell_place_value;
if (cell.key == key)
return cell_place_value;
if (cell.deadline < oldest_time)
{
oldest_time = cell.deadline;
oldest_place_value = cell_place_value;
}
}
return oldest_place_value;
}
};
}

View File

@ -12,9 +12,9 @@ struct KeyState
{
enum State: uint8_t
{
not_found = 2,
expired = 4,
found = 8,
not_found = 0,
expired = 1,
found = 2,
};
KeyState(State state_, size_t fetched_column_index_)
@ -31,9 +31,10 @@ struct KeyState
inline bool isNotFound() const { return state == State::not_found; }
inline bool isDefault() const { return is_default; }
inline void setDefault() { is_default = true; }
inline void setDefaultValue(bool is_default_value) { is_default = is_default_value; }
/// Valid only if keyState is found or expired
inline size_t getFetchedColumnIndex() const { return fetched_column_index; }
inline void setFetchedColumnIndex(size_t fetched_column_index_value) { fetched_column_index = fetched_column_index_value; }
private:
State state = not_found;
size_t fetched_column_index = 0;
@ -111,8 +112,8 @@ public:
/// Return size of keys in storage
virtual size_t getSize() const = 0;
/// Return maximum size of keys in storage
virtual size_t getMaxSize() const = 0;
/// Returns storage load factor
virtual double getLoadFactor() const = 0;
/// Return bytes allocated in storage
virtual size_t getBytesAllocated() const = 0;

View File

@ -17,7 +17,7 @@
#include <Common/Arena.h>
#include <Common/ArenaWithFreeLists.h>
#include <Common/MemorySanitizer.h>
#include <Common/HashTable/LRUHashMap.h>
#include <Common/HashTable/HashMap.h>
#include <IO/AIO.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/ICacheDictionaryStorage.h>
@ -56,7 +56,6 @@ struct SSDCacheDictionaryStorageConfiguration
const std::string file_path;
const size_t max_partitions_count;
const size_t max_stored_keys;
const size_t block_size;
const size_t file_blocks_size;
const size_t read_buffer_blocks_size;
@ -127,7 +126,7 @@ public:
/// Reset block with new block_data
/// block_data must be filled with zeroes if it is new block
ALWAYS_INLINE inline void reset(char * new_block_data)
inline void reset(char * new_block_data)
{
block_data = new_block_data;
current_block_offset = block_header_size;
@ -135,13 +134,13 @@ public:
}
/// Check if it is enough place to write key in block
ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const
inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const
{
return (current_block_offset + (sizeof(cache_key.key) + sizeof(cache_key.size) + cache_key.size)) <= block_size;
}
/// Check if it is enough place to write key in block
ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const
inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const
{
const StringRef & key = cache_key.key;
size_t complex_key_size = sizeof(key.size) + key.size;
@ -152,7 +151,7 @@ public:
/// Write key and returns offset in ssd cache block where data is written
/// It is client responsibility to check if there is enough place in block to write key
/// Returns true if key was written and false if there was not enough place to write key
ALWAYS_INLINE inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block)
inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block)
{
assert(cache_key.size > 0);
@ -181,7 +180,7 @@ public:
return true;
}
ALWAYS_INLINE inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block)
inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block)
{
assert(cache_key.size > 0);
@ -216,20 +215,20 @@ public:
return true;
}
ALWAYS_INLINE inline size_t getKeysSize() const { return keys_size; }
inline size_t getKeysSize() const { return keys_size; }
/// Write keys size into block header
ALWAYS_INLINE inline void writeKeysSize()
inline void writeKeysSize()
{
char * keys_size_offset_data = block_data + block_header_check_sum_size;
std::memcpy(keys_size_offset_data, &keys_size, sizeof(size_t));
}
/// Get check sum from block header
ALWAYS_INLINE inline size_t getCheckSum() const { return unalignedLoad<size_t>(block_data); }
inline size_t getCheckSum() const { return unalignedLoad<size_t>(block_data); }
/// Calculate check sum in block
ALWAYS_INLINE inline size_t calculateCheckSum() const
inline size_t calculateCheckSum() const
{
size_t calculated_check_sum = static_cast<size_t>(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size));
@ -237,7 +236,7 @@ public:
}
/// Check if check sum from block header matched calculated check sum in block
ALWAYS_INLINE inline bool checkCheckSum() const
inline bool checkCheckSum() const
{
size_t calculated_check_sum = calculateCheckSum();
size_t check_sum = getCheckSum();
@ -246,16 +245,16 @@ public:
}
/// Write check sum in block header
ALWAYS_INLINE inline void writeCheckSum()
inline void writeCheckSum()
{
size_t check_sum = static_cast<size_t>(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size));
std::memcpy(block_data, &check_sum, sizeof(size_t));
}
ALWAYS_INLINE inline size_t getBlockSize() const { return block_size; }
inline size_t getBlockSize() const { return block_size; }
/// Returns block data
ALWAYS_INLINE inline char * getBlockData() const { return block_data; }
inline char * getBlockData() const { return block_data; }
/// Read keys that were serialized in block
/// It is client responsibility to ensure that simple or complex keys were written in block
@ -337,9 +336,7 @@ inline bool operator==(const SSDCacheIndex & lhs, const SSDCacheIndex & rhs)
return lhs.block_index == rhs.block_index && lhs.offset_in_block == rhs.offset_in_block;
}
/** SSDCacheMemoryBuffer initialized with block size and memory buffer blocks size.
* Allocate block_size * memory_buffer_blocks_size bytes with page alignment.
* Logically represents multiple memory_buffer_blocks_size blocks and current write block.
/** Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks and current write block.
* If key cannot be written into current_write_block, current block keys size and check summ is written
* and buffer increase index of current_write_block_index.
* If current_write_block_index == memory_buffer_blocks_size write key will always returns true.
@ -444,7 +441,7 @@ private:
size_t current_block_index = 0;
};
/// TODO: Add documentation
/// Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks on file system
template <typename SSDCacheKeyType>
class SSDCacheFileBuffer : private boost::noncopyable
{
@ -614,11 +611,13 @@ public:
}
template <typename FetchBlockFunc>
ALWAYS_INLINE void fetchBlocks(char * read_buffer, size_t read_from_file_buffer_blocks_size, const PaddedPODArray<size_t> & blocks_to_fetch, FetchBlockFunc && func) const
void fetchBlocks(size_t read_from_file_buffer_blocks_size, const PaddedPODArray<size_t> & blocks_to_fetch, FetchBlockFunc && func) const
{
if (blocks_to_fetch.empty())
return;
Memory<Allocator<true>> read_buffer(read_from_file_buffer_blocks_size * block_size, 4096);
size_t blocks_to_fetch_size = blocks_to_fetch.size();
PaddedPODArray<iocb> requests;
@ -631,7 +630,7 @@ public:
{
iocb request{};
char * buffer_place = read_buffer + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size);
char * buffer_place = read_buffer.data() + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size);
#if defined(__FreeBSD__)
request.aio.aio_lio_opcode = LIO_READ;
@ -751,7 +750,7 @@ private:
int fd = -1;
};
ALWAYS_INLINE inline static int preallocateDiskSpace(int fd, size_t offset, size_t len)
inline static int preallocateDiskSpace(int fd, size_t offset, size_t len)
{
#if defined(__FreeBSD__)
return posix_fallocate(fd, offset, len);
@ -760,7 +759,7 @@ private:
#endif
}
ALWAYS_INLINE inline static char * getRequestBuffer(const iocb & request)
inline static char * getRequestBuffer(const iocb & request)
{
char * result = nullptr;
@ -773,7 +772,7 @@ private:
return result;
}
ALWAYS_INLINE inline static ssize_t eventResult(io_event & event)
inline static ssize_t eventResult(io_event & event)
{
ssize_t bytes_written;
@ -795,7 +794,13 @@ private:
size_t current_blocks_size = 0;
};
/// TODO: Add documentation
/** ICacheDictionaryStorage implementation that keeps column data serialized in memory index and in disk partitions.
* Data is first written in memory buffer.
* If memory buffer is full then buffer is flushed to disk partition.
* If memory buffer cannot be flushed to associated disk partition, then if partition
* can be allocated (current partition index < max_partitions_size) storage allocates new partition, if not old partitions are reused.
* Index maps key to partition block and offset.
*/
template <DictionaryKeyType dictionary_key_type>
class SSDCacheDictionaryStorage final : public ICacheDictionaryStorage
{
@ -806,9 +811,7 @@ public:
explicit SSDCacheDictionaryStorage(const SSDCacheDictionaryStorageConfiguration & configuration_)
: configuration(configuration_)
, file_buffer(configuration_.file_path, configuration.block_size, configuration.file_blocks_size)
, read_from_file_buffer(configuration_.block_size * configuration_.read_buffer_blocks_size, 4096)
, rnd_engine(randomSeed())
, index(configuration.max_stored_keys, false, { complex_key_arena })
{
memory_buffer_partitions.emplace_back(configuration.block_size, configuration.write_buffer_blocks_size);
}
@ -897,14 +900,31 @@ public:
size_t getSize() const override { return index.size(); }
size_t getMaxSize() const override {return index.getMaxSize(); }
double getLoadFactor() const override
{
size_t partitions_size = memory_buffer_partitions.size();
if (partitions_size == configuration.max_partitions_count)
return 1.0;
auto & current_memory_partition = memory_buffer_partitions[current_partition_index];
size_t full_partitions = partitions_size - 1;
size_t blocks_in_memory = (full_partitions * configuration.write_buffer_blocks_size) + current_memory_partition.getCurrentBlockIndex();
size_t blocks_on_disk = file_buffer.getCurrentBlockIndex();
size_t max_blocks_size = (configuration.file_blocks_size + configuration.write_buffer_blocks_size) * configuration.max_partitions_count;
double load_factor = static_cast<double>(blocks_in_memory + blocks_on_disk) / max_blocks_size;
return load_factor;
}
size_t getBytesAllocated() const override
{
size_t memory_partitions_bytes_size = memory_buffer_partitions.size() * configuration.write_buffer_blocks_size * configuration.block_size;
size_t file_partitions_bytes_size = memory_buffer_partitions.size() * configuration.file_blocks_size * configuration.block_size;
return index.getSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size;
return index.getBufferSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size;
}
private:
@ -920,8 +940,7 @@ private:
default_value
};
TimePoint deadline;
time_t deadline;
SSDCacheIndex index;
size_t in_memory_partition_index;
CellState state;
@ -933,13 +952,12 @@ private:
struct KeyToBlockOffset
{
KeyToBlockOffset(size_t key_index_, size_t offset_in_block_, bool is_expired_)
: key_index(key_index_), offset_in_block(offset_in_block_), is_expired(is_expired_)
KeyToBlockOffset(size_t key_index_, size_t offset_in_block_)
: key_index(key_index_), offset_in_block(offset_in_block_)
{}
size_t key_index = 0;
size_t offset_in_block = 0;
bool is_expired = false;
};
template <typename Result>
@ -950,20 +968,24 @@ private:
Result result;
result.fetched_columns = fetch_request.makeAttributesResultColumns();
result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found});
result.key_index_to_state.resize_fill(keys.size());
const auto now = std::chrono::system_clock::now();
const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
size_t fetched_columns_index = 0;
using BlockIndexToKeysMap = std::unordered_map<size_t, std::vector<KeyToBlockOffset>, DefaultHash<size_t>>;
using BlockIndexToKeysMap = absl::flat_hash_map<size_t, PaddedPODArray<KeyToBlockOffset>, DefaultHash<size_t>>;
BlockIndexToKeysMap block_to_keys_map;
absl::flat_hash_set<size_t, DefaultHash<size_t>> unique_blocks_to_request;
PaddedPODArray<size_t> blocks_to_request;
std::chrono::seconds strict_max_lifetime_seconds(configuration.strict_max_lifetime_seconds);
time_t strict_max_lifetime_seconds = static_cast<time_t>(configuration.strict_max_lifetime_seconds);
size_t keys_size = keys.size();
for (size_t attribute_size = 0; attribute_size < fetch_request.attributesSize(); ++attribute_size)
if (fetch_request.shouldFillResultColumnWithIndex(attribute_size))
result.fetched_columns[attribute_size]->reserve(keys_size);
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
auto key = keys[key_index];
@ -978,9 +1000,7 @@ private:
const auto & cell = it->getMapped();
bool has_deadline = cellHasDeadline(cell);
if (has_deadline && now > cell.deadline + strict_max_lifetime_seconds)
if (unlikely(now > cell.deadline + strict_max_lifetime_seconds))
{
++result.not_found_keys_size;
continue;
@ -989,14 +1009,14 @@ private:
bool cell_is_expired = false;
KeyState::State key_state = KeyState::found;
if (has_deadline && now > cell.deadline)
if (now > cell.deadline)
{
cell_is_expired = true;
key_state = KeyState::expired;
}
result.expired_keys_size += cell_is_expired;
result.found_keys_size += !cell_is_expired;
result.expired_keys_size += static_cast<size_t>(cell_is_expired);
result.found_keys_size += static_cast<size_t>(!cell_is_expired);
switch (cell.state)
{
@ -1012,13 +1032,20 @@ private:
}
case Cell::on_disk:
{
block_to_keys_map[cell.index.block_index].emplace_back(key_index, cell.index.offset_in_block, cell_is_expired);
PaddedPODArray<KeyToBlockOffset> & keys_block = block_to_keys_map[cell.index.block_index];
keys_block.emplace_back(key_index, cell.index.offset_in_block);
if (!unique_blocks_to_request.contains(cell.index.block_index))
{
KeyState::State state = cell_is_expired ? KeyState::expired : KeyState::found;
/// Fetched column index will be set later during fetch blocks
result.key_index_to_state[key_index] = {state, 0};
auto insert_result = unique_blocks_to_request.insert(cell.index.block_index);
bool was_inserted = insert_result.second;
if (was_inserted)
blocks_to_request.emplace_back(cell.index.block_index);
unique_blocks_to_request.insert(cell.index.block_index);
}
break;
}
case Cell::default_value:
@ -1037,7 +1064,7 @@ private:
/// Sort blocks by offset before start async io requests
std::sort(blocks_to_request.begin(), blocks_to_request.end());
file_buffer.fetchBlocks(read_from_file_buffer.m_data, configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data)
file_buffer.fetchBlocks(configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data)
{
auto & keys_in_block = block_to_keys_map[block_index];
@ -1046,10 +1073,7 @@ private:
char * key_data = block_data + key_in_block.offset_in_block;
deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, key_data);
if (key_in_block.is_expired)
result.key_index_to_state[key_in_block.key_index] = {KeyState::expired, fetched_columns_index};
else
result.key_index_to_state[key_in_block.key_index] = {KeyState::found, fetched_columns_index};
result.key_index_to_state[key_in_block.key_index].setFetchedColumnIndex(fetched_columns_index);
++fetched_columns_index;
}
@ -1087,7 +1111,7 @@ private:
throw Exception("Serialized columns size is greater than allowed block size and metadata", ErrorCodes::UNSUPPORTED_METHOD);
/// We cannot reuse place that is already allocated in file or memory cache so we erase key from index
index.erase(key);
eraseKeyFromIndex(key);
Cell cell;
setCellDeadline(cell, now);
@ -1114,8 +1138,7 @@ private:
for (auto key : keys)
{
/// We cannot reuse place that is already allocated in file or memory cache so we erase key from index
index.erase(key);
eraseKeyFromIndex(key);
Cell cell;
@ -1135,7 +1158,7 @@ private:
key = updated_key;
}
index.insert(key, cell);
index[key] = cell;
}
}
@ -1188,7 +1211,7 @@ private:
cell.index = cache_index;
cell.in_memory_partition_index = current_partition_index;
index.insert(ssd_cache_key.key, cell);
index[ssd_cache_key.key] = cell;
break;
}
else
@ -1218,7 +1241,7 @@ private:
if (old_key_cell.isOnDisk() &&
old_key_block >= block_index_in_file_before_write &&
old_key_block < file_read_end_block_index)
index.erase(old_key);
eraseKeyFromIndex(old_key);
}
}
}
@ -1271,7 +1294,7 @@ private:
cell.index = cache_index;
cell.in_memory_partition_index = current_partition_index;
index.insert(ssd_cache_key.key, cell);
index[ssd_cache_key.key] = cell;
break;
}
else
@ -1296,16 +1319,12 @@ private:
}
}
inline static bool cellHasDeadline(const Cell & cell)
{
return cell.deadline != std::chrono::system_clock::from_time_t(0);
}
inline void setCellDeadline(Cell & cell, TimePoint now)
{
if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0)
{
cell.deadline = std::chrono::system_clock::from_time_t(0);
auto deadline = std::chrono::time_point<std::chrono::system_clock>::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds);
cell.deadline = std::chrono::system_clock::to_time_t(deadline);
return;
}
@ -1313,47 +1332,45 @@ private:
size_t max_sec_lifetime = configuration.lifetime.max_sec;
std::uniform_int_distribution<UInt64> distribution{min_sec_lifetime, max_sec_lifetime};
cell.deadline = now + std::chrono::seconds{distribution(rnd_engine)};
auto deadline = now + std::chrono::seconds(distribution(rnd_engine));
cell.deadline = std::chrono::system_clock::to_time_t(deadline);
}
template <typename>
friend class ArenaCellKeyDisposer;
inline void eraseKeyFromIndex(KeyType key)
{
auto it = index.find(key);
if (it == nullptr)
return;
/// In case of complex key in arena key is serialized from hash table
KeyType key_copy = it->getKey();
index.erase(key);
if constexpr (std::is_same_v<KeyType, StringRef>)
complex_key_arena.free(const_cast<char *>(key_copy.data), key_copy.size);
}
SSDCacheDictionaryStorageConfiguration configuration;
SSDCacheFileBuffer<SSDCacheKeyType> file_buffer;
Memory<Allocator<true>> read_from_file_buffer;
std::vector<SSDCacheMemoryBuffer<SSDCacheKeyType>> memory_buffer_partitions;
pcg64 rnd_engine;
class ArenaCellKeyDisposer
{
public:
ArenaWithFreeLists & arena;
using SimpleKeyHashMap = HashMap<UInt64, Cell>;
using ComplexKeyHashMap = HashMapWithSavedHash<StringRef, Cell>;
template <typename Key, typename Value>
void operator()(const Key & key, const Value &) const
{
/// In case of complex key we keep it in arena
if constexpr (std::is_same_v<Key, StringRef>)
arena.free(const_cast<char *>(key.data), key.size);
}
};
using SimpleKeyLRUHashMap = LRUHashMap<UInt64, Cell, ArenaCellKeyDisposer>;
using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash<StringRef, Cell, ArenaCellKeyDisposer>;
using CacheLRUHashMap = std::conditional_t<
using CacheMap = std::conditional_t<
dictionary_key_type == DictionaryKeyType::simple,
SimpleKeyLRUHashMap,
ComplexKeyLRUHashMap>;
SimpleKeyHashMap,
ComplexKeyHashMap>;
ArenaWithFreeLists complex_key_arena;
CacheLRUHashMap index;
CacheMap index;
size_t current_partition_index = 0;

View File

@ -1,154 +0,0 @@
clickhouse-client --query="DROP TABLE IF EXISTS simple_cache_dictionary_table_source";
clickhouse-client --query="CREATE TABLE simple_cache_dictionary_table_source (id UInt64, value1 String, value2 UInt64, value3 String, value4 Float64, value5 Decimal64(4)) ENGINE=TinyLog;"
clickhouse-client --query="INSERT INTO simple_cache_dictionary_table_source SELECT number, concat('Value1 ', toString(number)), number, concat('Value3 ', toString(number)), toFloat64(number), cast(number, 'Decimal64(4)') FROM system.numbers LIMIT 1000000;"
clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_simple_cache_dictionary (
id UInt64,
value1 String,
value2 UInt64,
value3 String,
value4 Float64,
value5 Decimal64(4)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default'))
LIFETIME(MIN 300 MAX 300)
LAYOUT(CACHE(SIZE_IN_CELLS 100000));"
clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_ssd_simple_cache_dictionary (
id UInt64,
value1 String,
value2 UInt64,
value3 String,
value4 Float64,
value5 Decimal64(4)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default'))
LIFETIME(MIN 300 MAX 300)
LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 WRITE_BUFFER_SIZE 327680 MAX_STORED_KEYS 1048576 PATH '/opt/mkita/ClickHouse/build_release/programs/ssd_cache'));"
clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_dummy_simple_cache_dictionary (
id UInt64,
value1 String,
value2 UInt64,
value3 String,
value4 Float64,
value5 Decimal64(4)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default'))
LIFETIME(MIN 300 MAX 300)
LAYOUT(DUMMY_SIMPLE());"
./clickhouse-benchmark --query="SELECT
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value2', number),
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value3', number),
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value4', number),
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 10000
FORMAT Null"
./clickhouse-benchmark --query="SELECT
dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number)
FROM system.numbers
LIMIT 10000
FORMAT Null"
./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null"
./clickhouse-benchmark --query="SELECT
dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 10000
FORMAT Null"
./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null"
SELECT
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value2', number),
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value3', number),
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value4', number),
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 10000
FORMAT Null
SELECT dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000 FORMAT Null
SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000
FORMAT Null
SELECT
dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number)
FROM system.numbers
LIMIT 10000
FORMAT
Null
SELECT
dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 10000
FORMAT
Null
SELECT
dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number)
FROM system.numbers
LIMIT 10000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value1', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value2', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value3', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value4', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value1', number),
dictGet('clickhouse_simple_cache_dictionary', 'value2', number),
dictGet('clickhouse_simple_cache_dictionary', 'value3', number),
dictGet('clickhouse_simple_cache_dictionary', 'value4', number),
dictGet('clickhouse_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT * FROM clickhouse_simple_cache_dictionary_table;

View File

@ -1,6 +1,6 @@
#include "CacheDictionary.h"
#include "SSDCacheDictionaryStorage.h"
#include "CacheDictionaryStorage.h"
#include "SSDCacheDictionaryStorage.h"
#include <Dictionaries/DictionaryFactory.h>
namespace DB
@ -20,13 +20,13 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration(
const DictionaryLifetime & dict_lifetime,
DictionaryKeyType dictionary_key_type)
{
String dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_cache." : ".cache.";
String dictionary_type_prefix = (dictionary_key_type == DictionaryKeyType::complex) ? ".complex_key_cache." : ".cache.";
String dictionary_configuration_prefix = layout_prefix + dictionary_type_prefix;
const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells");
if (size == 0)
throw Exception(ErrorCodes::TOO_SMALL_BUFFER_SIZE,
"({}: cache dictionary cannot have 0 cells",
"({}): cache dictionary cannot have 0 cells",
full_name);
size_t dict_lifetime_seconds = static_cast<size_t>(dict_lifetime.max_sec);
@ -59,7 +59,6 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration(
static constexpr size_t DEFAULT_READ_BUFFER_SIZE_BYTES = 16 * DEFAULT_SSD_BLOCK_SIZE_BYTES;
static constexpr size_t DEFAULT_WRITE_BUFFER_SIZE_BYTES = DEFAULT_SSD_BLOCK_SIZE_BYTES;
static constexpr size_t DEFAULT_MAX_STORED_KEYS = 100000;
static constexpr size_t DEFAULT_PARTITIONS_COUNT = 16;
const size_t max_partitions_count
@ -94,16 +93,11 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration(
if (directory_path.at(0) != '/')
directory_path = std::filesystem::path{config.getString("path")}.concat(directory_path).string();
const size_t max_stored_keys_in_partition
= config.getInt64(dictionary_configuration_prefix + "max_stored_keys", DEFAULT_MAX_STORED_KEYS);
const size_t rounded_size = roundUpToPowerOfTwoOrZero(max_stored_keys_in_partition);
SSDCacheDictionaryStorageConfiguration configuration{
strict_max_lifetime_seconds,
dict_lifetime,
directory_path,
max_partitions_count,
rounded_size,
block_size,
file_size / block_size,
read_buffer_size / block_size,
@ -194,7 +188,8 @@ DictionaryPtr createCacheDictionaryLayout(
const bool allow_read_expired_keys = config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false);
auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type);
auto storage = std::make_shared<CacheDictionaryStorage<dictionary_key_type>>(storage_configuration);
std::shared_ptr<ICacheDictionaryStorage> storage = std::make_shared<CacheDictionaryStorage<dictionary_key_type>>(dict_struct, storage_configuration);
auto update_queue_configuration = parseCacheDictionaryUpdateQueueConfiguration(full_name, config, layout_prefix, dictionary_key_type);

View File

@ -209,7 +209,13 @@ void DiskCacheWrapper::clearDirectory(const String & path)
void DiskCacheWrapper::moveDirectory(const String & from_path, const String & to_path)
{
if (cache_disk->exists(from_path))
{
/// Destination directory may not be empty if previous directory move attempt was failed.
if (cache_disk->exists(to_path) && cache_disk->isDirectory(to_path))
cache_disk->clearDirectory(to_path);
cache_disk->moveDirectory(from_path, to_path);
}
DiskDecorator::moveDirectory(from_path, to_path);
}

View File

@ -538,8 +538,9 @@ private:
[[maybe_unused]] const auto block_size = static_cast<size_t>(EVP_CIPHER_block_size(evp_cipher));
[[maybe_unused]] const auto iv_size = static_cast<size_t>(EVP_CIPHER_iv_length(evp_cipher));
const auto key_size = static_cast<size_t>(EVP_CIPHER_key_length(evp_cipher));
const auto tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1
const size_t key_size = static_cast<size_t>(EVP_CIPHER_key_length(evp_cipher));
static constexpr size_t tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1
auto decrypted_result_column = ColumnString::create();
auto & decrypted_result_column_data = decrypted_result_column->getChars();
@ -549,9 +550,17 @@ private:
size_t resulting_size = 0;
for (size_t r = 0; r < input_rows_count; ++r)
{
resulting_size += input_column->getDataAt(r).size + 1;
size_t string_size = input_column->getDataAt(r).size;
resulting_size += string_size + 1; /// With terminating zero.
if constexpr (mode == CipherMode::RFC5116_AEAD_AES_GCM)
{
if (string_size < tag_size)
throw Exception("Encrypted data is smaller than the size of additional data for AEAD mode, cannot decrypt.",
ErrorCodes::BAD_ARGUMENTS);
resulting_size -= tag_size;
}
}
#if defined(MEMORY_SANITIZER)
@ -565,6 +574,7 @@ private:
decrypted_result_column_data.resize(resulting_size);
#endif
}
auto * decrypted = decrypted_result_column_data.data();
KeyHolder<mode> key_holder;
@ -631,7 +641,7 @@ private:
// 1.a.2: Set AAD if present
if (aad_column)
{
const auto aad_data = aad_column->getDataAt(r);
StringRef aad_data = aad_column->getDataAt(r);
int tmp_len = 0;
if (aad_data.size != 0 && EVP_DecryptUpdate(evp_ctx, nullptr, &tmp_len,
reinterpret_cast<const unsigned char *>(aad_data.data), aad_data.size) != 1)

View File

@ -42,11 +42,11 @@ struct SimdJSONParser
ALWAYS_INLINE bool isBool() const { return element.type() == simdjson::dom::element_type::BOOL; }
ALWAYS_INLINE bool isNull() const { return element.type() == simdjson::dom::element_type::NULL_VALUE; }
ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().first; }
ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().first; }
ALWAYS_INLINE double getDouble() const { return element.get_double().first; }
ALWAYS_INLINE bool getBool() const { return element.get_bool().first; }
ALWAYS_INLINE std::string_view getString() const { return element.get_string().first; }
ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().value_unsafe(); }
ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().value_unsafe(); }
ALWAYS_INLINE double getDouble() const { return element.get_double().value_unsafe(); }
ALWAYS_INLINE bool getBool() const { return element.get_bool().value_unsafe(); }
ALWAYS_INLINE std::string_view getString() const { return element.get_string().value_unsafe(); }
ALWAYS_INLINE Array getArray() const;
ALWAYS_INLINE Object getObject() const;
@ -75,7 +75,7 @@ struct SimdJSONParser
ALWAYS_INLINE Iterator begin() const { return array.begin(); }
ALWAYS_INLINE Iterator end() const { return array.end(); }
ALWAYS_INLINE size_t size() const { return array.size(); }
ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).first; }
ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).value_unsafe(); }
private:
simdjson::dom::array array;
@ -111,7 +111,7 @@ struct SimdJSONParser
if (x.error())
return false;
result = x.first;
result = x.value_unsafe();
return true;
}
@ -137,7 +137,7 @@ struct SimdJSONParser
if (document.error())
return false;
result = document.first;
result = document.value_unsafe();
return true;
}
@ -155,12 +155,12 @@ private:
inline ALWAYS_INLINE SimdJSONParser::Array SimdJSONParser::Element::getArray() const
{
return element.get_array().first;
return element.get_array().value_unsafe();
}
inline ALWAYS_INLINE SimdJSONParser::Object SimdJSONParser::Element::getObject() const
{
return element.get_object().first;
return element.get_object().value_unsafe();
}
}

View File

@ -49,8 +49,11 @@ public:
{}
template <typename ... Args>
inline auto execute(const DateTime64 & t, Args && ... args) const
inline auto NO_SANITIZE_UNDEFINED execute(const DateTime64 & t, Args && ... args) const
{
/// Type conversion from float to integer may be required.
/// We are Ok with implementation specific result for out of range and denormals conversion.
if constexpr (TransformHasExecuteOverload_v<DateTime64, decltype(scale_multiplier), Args...>)
{
return wrapped_transform.execute(t, scale_multiplier, std::forward<Args>(args)...);

View File

@ -90,7 +90,70 @@ struct ExtractFirstSignificantSubdomain
res_data += last_3_periods[1] + 1 - begin;
res_size = last_3_periods[0] - last_3_periods[1] - 1;
}
}
}
/// The difference with execute() is due to custom TLD list can have records of any level,
/// not only 2-nd level (like non-custom variant), so it requires more lookups.
template <class Lookup>
static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
{
res_data = data;
res_size = 0;
Pos tmp;
size_t domain_length;
ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
if (domain_length == 0)
return;
if (out_domain_end)
*out_domain_end = tmp + domain_length;
/// cut useless dot
if (tmp[domain_length - 1] == '.')
--domain_length;
res_data = tmp;
res_size = domain_length;
auto begin = tmp;
auto end = begin + domain_length;
const char * last_2_periods[2]{};
const char * prev = begin - 1;
auto pos = find_first_symbols<'.'>(begin, end);
while (pos < end)
{
if (lookup(pos + 1, end - pos - 1))
{
res_data += prev + 1 - begin;
res_size = end - 1 - prev;
return;
}
last_2_periods[1] = last_2_periods[0];
last_2_periods[0] = pos;
prev = pos;
pos = find_first_symbols<'.'>(pos + 1, end);
}
/// if there is domain of the first level (i.e. no dots in the hostname) -> return nothing
if (!last_2_periods[0])
return;
/// if there is domain of the second level -> always return itself
if (!last_2_periods[1])
{
res_size = last_2_periods[0] - begin;
return;
}
/// if there is domain of the 3+ level, and zero records in TLD list ->
/// fallback to domain of the second level
res_data += last_2_periods[1] + 1 - begin;
res_size = last_2_periods[0] - last_2_periods[1] - 1;
}
};
}

View File

@ -17,10 +17,10 @@ namespace ErrorCodes
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
struct FirstSignificantSubdomainCustomtLookup
struct FirstSignificantSubdomainCustomLookup
{
const TLDList & tld_list;
FirstSignificantSubdomainCustomtLookup(const std::string & tld_list_name)
FirstSignificantSubdomainCustomLookup(const std::string & tld_list_name)
: tld_list(TLDListsHolder::getInstance().getTldList(tld_list_name))
{
}
@ -63,7 +63,7 @@ public:
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
{
const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
FirstSignificantSubdomainCustomtLookup tld_lookup(column_tld_list_name->getValue<String>());
FirstSignificantSubdomainCustomLookup tld_lookup(column_tld_list_name->getValue<String>());
/// FIXME: convertToFullColumnIfConst() is suboptimal
auto column = arguments[0].column->convertToFullColumnIfConst();
@ -79,7 +79,7 @@ public:
ErrorCodes::ILLEGAL_COLUMN);
}
static void vector(FirstSignificantSubdomainCustomtLookup & tld_lookup,
static void vector(FirstSignificantSubdomainCustomLookup & tld_lookup,
const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
{

View File

@ -10,7 +10,7 @@ struct CutToFirstSignificantSubdomainCustom
{
static size_t getReserveLengthForElement() { return 15; }
static void execute(FirstSignificantSubdomainCustomtLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
static void execute(FirstSignificantSubdomainCustomLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
@ -18,7 +18,7 @@ struct CutToFirstSignificantSubdomainCustom
Pos tmp_data;
size_t tmp_length;
Pos domain_end;
ExtractFirstSignificantSubdomain<without_www>::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
ExtractFirstSignificantSubdomain<without_www>::executeCustom(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
if (tmp_length == 0)
return;

View File

@ -190,7 +190,7 @@ private:
}
static constexpr size_t MAX_ARRAY_SIZE = 1ULL << 30;
if (static_cast<size_t>(max_key - min_key) > MAX_ARRAY_SIZE)
if (static_cast<size_t>(max_key) - static_cast<size_t>(min_key) > MAX_ARRAY_SIZE)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in the result of function {}", getName());
/* fill the result arrays */

View File

@ -16,6 +16,7 @@ namespace ErrorCodes
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int BAD_ARGUMENTS;
}
namespace
@ -110,6 +111,9 @@ public:
arguments[2].column->getFloat64(i),
max_width);
if (!isFinite(width))
throw Exception("Value of width must not be NaN and Inf", ErrorCodes::BAD_ARGUMENTS);
size_t next_size = current_offset + UnicodeBar::getWidthInBytes(width) + 1;
dst_chars.resize(next_size);
UnicodeBar::render(width, reinterpret_cast<char *>(&dst_chars[current_offset]));

View File

@ -41,7 +41,8 @@ void registerFunctionThrowIf(FunctionFactory &);
void registerFunctionVersion(FunctionFactory &);
void registerFunctionBuildId(FunctionFactory &);
void registerFunctionUptime(FunctionFactory &);
void registerFunctionTimeZone(FunctionFactory &);
void registerFunctionTimezone(FunctionFactory &);
void registerFunctionTimezoneOf(FunctionFactory &);
void registerFunctionRunningAccumulate(FunctionFactory &);
void registerFunctionRunningDifference(FunctionFactory &);
void registerFunctionRunningDifferenceStartingWithFirstValue(FunctionFactory &);
@ -111,7 +112,8 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
registerFunctionVersion(factory);
registerFunctionBuildId(factory);
registerFunctionUptime(factory);
registerFunctionTimeZone(factory);
registerFunctionTimezone(factory);
registerFunctionTimezoneOf(factory);
registerFunctionRunningAccumulate(factory);
registerFunctionRunningDifference(factory);
registerFunctionRunningDifferenceStartingWithFirstValue(factory);

View File

@ -12,13 +12,13 @@ namespace
/** Returns the server time zone.
*/
class FunctionTimeZone : public IFunction
class FunctionTimezone : public IFunction
{
public:
static constexpr auto name = "timezone";
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionTimeZone>();
return std::make_shared<FunctionTimezone>();
}
String getName() const override
@ -45,9 +45,10 @@ public:
}
void registerFunctionTimeZone(FunctionFactory & factory)
void registerFunctionTimezone(FunctionFactory & factory)
{
factory.registerFunction<FunctionTimeZone>();
factory.registerFunction<FunctionTimezone>();
factory.registerAlias("timeZone", "timezone");
}
}

View File

@ -0,0 +1,118 @@
#include <Functions/IFunctionImpl.h>
#include <Functions/FunctionFactory.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeDateTime.h>
#include <common/DateLUTImpl.h>
#include <Core/Field.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace
{
/** timezoneOf(x) - get the name of the timezone of DateTime data type.
* Example: Europe/Moscow.
*/
class ExecutableFunctionTimezoneOf : public IExecutableFunctionImpl
{
public:
static constexpr auto name = "timezoneOf";
String getName() const override { return name; }
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForLowCardinalityColumns() const override { return false; }
/// Execute the function on the columns.
ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
DataTypePtr type_no_nullable = removeNullable(arguments[0].type);
return DataTypeString().createColumnConst(input_rows_count,
dynamic_cast<const TimezoneMixin &>(*type_no_nullable).getTimeZone().getTimeZone());
}
};
/// "Prepared" (type-resolved) form of timezoneOf(): stores the resolved argument and
/// result types and creates the executable implementation on demand.
class BaseFunctionTimezoneOf : public IFunctionBaseImpl
{
public:
BaseFunctionTimezoneOf(DataTypes argument_types_, DataTypePtr return_type_)
: argument_types(std::move(argument_types_)), return_type(std::move(return_type_)) {}
static constexpr auto name = "timezoneOf";
String getName() const override { return name; }
/// The result is fully determined by the argument's type, so the function is deterministic.
bool isDeterministic() const override { return true; }
bool isDeterministicInScopeOfQuery() const override { return true; }
const DataTypes & getArgumentTypes() const override { return argument_types; }
const DataTypePtr & getResultType() const override { return return_type; }
ExecutableFunctionImplPtr prepare(const ColumnsWithTypeAndName &) const override
{
return std::make_unique<ExecutableFunctionTimezoneOf>();
}
/// Allows constant folding: since the answer depends only on the argument's type,
/// it can be computed once (as a single-row constant) without executing per block.
ColumnPtr getResultIfAlwaysReturnsConstantAndHasArguments(const ColumnsWithTypeAndName & arguments) const override
{
/// Same extraction as in execute(): unwrap Nullable, then read the zone from TimezoneMixin.
DataTypePtr type_no_nullable = removeNullable(arguments[0].type);
return DataTypeString().createColumnConst(1,
dynamic_cast<const TimezoneMixin &>(*type_no_nullable).getTimeZone().getTimeZone());
}
private:
DataTypes argument_types;
DataTypePtr return_type;
};
/// Overload resolver for timezoneOf(): validates that the single argument is
/// DateTime or DateTime64 (possibly Nullable) and builds the prepared function.
class FunctionTimezoneOfBuilder : public IFunctionOverloadResolverImpl
{
public:
static constexpr auto name = "timezoneOf";
String getName() const override { return name; }
static FunctionOverloadResolverImplPtr create(const Context &) { return std::make_unique<FunctionTimezoneOfBuilder>(); }
size_t getNumberOfArguments() const override { return 1; }
/// Returns String for DateTime/DateTime64 arguments; throws BAD_ARGUMENTS otherwise.
/// Note: the Nullable wrapper is stripped before the check, so Nullable(DateTime) is accepted.
DataTypePtr getReturnType(const DataTypes & types) const override
{
DataTypePtr type_no_nullable = removeNullable(types[0]);
if (isDateTime(type_no_nullable) || isDateTime64(type_no_nullable))
return std::make_shared<DataTypeString>();
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad argument for function {}, should be DateTime or DateTime64", name);
}
FunctionBaseImplPtr build(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
{
return std::make_unique<BaseFunctionTimezoneOf>(DataTypes{arguments[0].type}, return_type);
}
/// Nulls/LowCardinality are handled by the implementation itself, and a Nullable
/// argument does not imply a Nullable result (the zone name is always known from the type).
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForLowCardinalityColumns() const override { return false; }
ColumnNumbers getArgumentsThatDontImplyNullableReturnType(size_t /*number_of_arguments*/) const override { return {0}; }
};
}
/// Registers timezoneOf() in the function factory, with the camel-case
/// spelling "timeZoneOf" as an alias.
void registerFunctionTimezoneOf(FunctionFactory & factory)
{
factory.registerFunction<FunctionTimezoneOfBuilder>();
factory.registerAlias("timeZoneOf", "timezoneOf");
}
}

View File

@ -21,11 +21,11 @@ namespace
{
/// Just changes time zone information for data type. The calculation is free.
class FunctionToTimeZone : public IFunction
class FunctionToTimezone : public IFunction
{
public:
static constexpr auto name = "toTimeZone";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionToTimeZone>(); }
static constexpr auto name = "toTimezone";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionToTimezone>(); }
String getName() const override
{
@ -64,7 +64,8 @@ public:
void registerFunctionToTimeZone(FunctionFactory & factory)
{
factory.registerFunction<FunctionToTimeZone>();
factory.registerFunction<FunctionToTimezone>();
factory.registerAlias("toTimeZone", "toTimezone");
}
}

View File

@ -467,6 +467,7 @@ SRCS(
timeSlot.cpp
timeSlots.cpp
timezone.cpp
timezoneOf.cpp
timezoneOffset.cpp
toColumnTypeName.cpp
toCustomWeek.cpp
@ -506,7 +507,7 @@ SRCS(
toStartOfTenMinutes.cpp
toStartOfYear.cpp
toTime.cpp
toTimeZone.cpp
toTimezone.cpp
toTypeName.cpp
toUnixTimestamp64Micro.cpp
toUnixTimestamp64Milli.cpp

View File

@ -106,7 +106,7 @@ void BrotliWriteBuffer::finish()
try
{
finishImpl();
out->next();
out->finalize();
finished = true;
}
catch (...)

View File

@ -105,7 +105,7 @@ void LZMADeflatingWriteBuffer::finish()
try
{
finishImpl();
out->next();
out->finalize();
finished = true;
}
catch (...)

View File

@ -82,6 +82,7 @@ bool PeekableReadBuffer::peekNext()
checkpoint.emplace(memory.data());
checkpoint_in_own_memory = true;
}
if (currentlyReadFromOwnMemory())
{
/// Update buffer size
@ -99,7 +100,6 @@ bool PeekableReadBuffer::peekNext()
pos_offset = 0;
}
BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset);
}
peeked_size += bytes_to_copy;
@ -113,12 +113,21 @@ void PeekableReadBuffer::rollbackToCheckpoint(bool drop)
{
checkStateCorrect();
if (!checkpoint)
throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR);
else if (checkpointInOwnMemory() == currentlyReadFromOwnMemory())
assert(checkpoint);
if (checkpointInOwnMemory() == currentlyReadFromOwnMemory())
{
/// Both checkpoint and position are in the same buffer.
pos = *checkpoint;
else /// Checkpoint is in own memory and pos is not. Switch to reading from own memory
}
else
{
/// Checkpoint is in own memory and position is not.
assert(checkpointInOwnMemory());
/// Switch to reading from own memory.
BufferBase::set(memory.data(), peeked_size, *checkpoint - memory.data());
}
if (drop)
dropCheckpoint();
@ -134,6 +143,7 @@ bool PeekableReadBuffer::nextImpl()
checkStateCorrect();
bool res;
bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end() && currentlyReadFromOwnMemory();
if (checkpoint)
{
@ -163,6 +173,13 @@ bool PeekableReadBuffer::nextImpl()
BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset());
nextimpl_working_buffer_offset = sub_buf.offset();
if (checkpoint_at_end)
{
checkpoint.emplace(working_buffer.begin());
peeked_size = 0;
checkpoint_in_own_memory = false;
}
checkStateCorrect();
return res;
}

View File

@ -43,10 +43,7 @@ public:
/// Forget checkpoint and all data between checkpoint and position
ALWAYS_INLINE inline void dropCheckpoint()
{
#ifndef NDEBUG
if (!checkpoint)
throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR);
#endif
assert(checkpoint);
if (!currentlyReadFromOwnMemory())
{
/// Don't need to store unread data anymore

View File

@ -1,6 +1,7 @@
#include <Poco/Net/NetException.h>
#include <IO/ReadBufferFromPocoSocket.h>
#include <IO/TimeoutSetter.h>
#include <Common/Exception.h>
#include <Common/NetException.h>
#include <Common/Stopwatch.h>
@ -27,23 +28,23 @@ bool ReadBufferFromPocoSocket::nextImpl()
ssize_t bytes_read = 0;
Stopwatch watch;
int flags = 0;
if (async_callback)
flags |= MSG_DONTWAIT;
/// Add more details to exceptions.
try
{
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
/// If async_callback is specified, and read is blocking, run async_callback and try again later.
/// If async_callback is specified, and read will block, run async_callback and try again later.
/// It is expected that file descriptor may be polled externally.
/// Note that receive timeout is not checked here. External code should check it while polling.
while (bytes_read < 0 && async_callback && errno == EAGAIN)
{
while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_READ))
async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), socket_description);
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
}
/// receiveBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout),
/// but we want to get this exception exactly after receive_timeout. So, set send_timeout = receive_timeout
/// before receiveBytes.
std::unique_ptr<TimeoutSetter> timeout_setter = nullptr;
if (socket.secure())
timeout_setter = std::make_unique<TimeoutSetter>(dynamic_cast<Poco::Net::StreamSocket &>(socket), socket.getReceiveTimeout(), socket.getReceiveTimeout());
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size());
}
catch (const Poco::Net::NetException & e)
{

View File

@ -1,4 +1,4 @@
#include "TimeoutSetter.h"
#include <IO/TimeoutSetter.h>
#include <common/logger_useful.h>

View File

@ -1,6 +1,7 @@
#include <Poco/Net/NetException.h>
#include <IO/WriteBufferFromPocoSocket.h>
#include <IO/TimeoutSetter.h>
#include <Common/Exception.h>
#include <Common/NetException.h>
@ -40,6 +41,13 @@ void WriteBufferFromPocoSocket::nextImpl()
/// Add more details to exceptions.
try
{
/// sendBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout),
/// but we want to get this exception exactly after send_timeout. So, set receive_timeout = send_timeout
/// before sendBytes.
std::unique_ptr<TimeoutSetter> timeout_setter = nullptr;
if (socket.secure())
timeout_setter = std::make_unique<TimeoutSetter>(dynamic_cast<Poco::Net::StreamSocket &>(socket), socket.getSendTimeout(), socket.getSendTimeout());
res = socket.impl()->sendBytes(working_buffer.begin() + bytes_written, offset() - bytes_written);
}
catch (const Poco::Net::NetException & e)

View File

@ -120,7 +120,7 @@ WriteBufferFromS3::~WriteBufferFromS3()
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
tryLogCurrentException(log);
}
}

View File

@ -107,7 +107,7 @@ void ZlibDeflatingWriteBuffer::finish()
try
{
finishImpl();
out->next();
out->finalize();
finished = true;
}
catch (...)

View File

@ -94,7 +94,7 @@ void ZstdDeflatingWriteBuffer::finish()
try
{
finishImpl();
out->next();
out->finalize();
finished = true;
}
catch (...)

View File

@ -6,11 +6,6 @@
#include <IO/ConcatReadBuffer.h>
#include <IO/PeekableReadBuffer.h>
namespace DB::ErrorCodes
{
extern const int LOGICAL_ERROR;
}
static void readAndAssert(DB::ReadBuffer & buf, const char * str)
{
size_t n = strlen(str);
@ -48,20 +43,6 @@ try
readAndAssert(peekable, "01234");
}
#ifndef ABORT_ON_LOGICAL_ERROR
bool exception = false;
try
{
peekable.rollbackToCheckpoint();
}
catch (DB::Exception & e)
{
if (e.code() != DB::ErrorCodes::LOGICAL_ERROR)
throw;
exception = true;
}
ASSERT_TRUE(exception);
#endif
assertAvailable(peekable, "56789");
readAndAssert(peekable, "56");

View File

@ -50,6 +50,7 @@ SRCS(
ReadBufferFromPocoSocket.cpp
ReadHelpers.cpp
SeekAvoidingReadBuffer.cpp
TimeoutSetter.cpp
UseSSL.cpp
WriteBufferFromFile.cpp
WriteBufferFromFileBase.cpp

View File

@ -818,13 +818,10 @@ private:
if (!min_id)
min_id = getMinIDToFinishLoading(forced_to_reload);
if (info->state_id >= min_id)
return true; /// stop
if (info->loading_id < min_id)
startLoading(*info, forced_to_reload, *min_id);
/// Wait for the next event if loading wasn't completed, and stop otherwise.
/// Wait for the next event if loading wasn't completed, or stop otherwise.
return (info->state_id >= min_id);
};
@ -850,9 +847,6 @@ private:
if (filter && !filter(name))
continue;
if (info.state_id >= min_id)
continue;
if (info.loading_id < min_id)
startLoading(info, forced_to_reload, *min_id);

View File

@ -260,7 +260,8 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
renamed = true;
}
database->loadStoredObjects(context, has_force_restore_data_flag, create.attach && force_attach);
/// We use global context here, because storages lifetime is bigger than query context lifetime
database->loadStoredObjects(context.getGlobalContext(), has_force_restore_data_flag, create.attach && force_attach);
}
catch (...)
{
@ -970,7 +971,8 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create,
if (create.as_table_function)
{
const auto & factory = TableFunctionFactory::instance();
res = factory.get(create.as_table_function, context)->execute(create.as_table_function, context, create.table, properties.columns);
auto table_func = factory.get(create.as_table_function, context);
res = table_func->execute(create.as_table_function, context, create.table, properties.columns);
res->renameInMemory({create.database, create.table, create.uuid});
}
else

View File

@ -393,7 +393,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
view = nullptr;
}
if (try_move_to_prewhere && storage && query.where() && !query.prewhere() && !query.final())
if (try_move_to_prewhere && storage && query.where() && !query.prewhere())
{
/// PREWHERE optimization: transfer some condition from WHERE to PREWHERE if enabled and viable
if (const auto & column_sizes = storage->getColumnSizes(); !column_sizes.empty())

View File

@ -9,8 +9,6 @@
#include <Common/ActionBlocker.h>
#include <common/types.h>
#include <Poco/Net/HTMLForm.h>
#include <atomic>
#include <map>
#include <shared_mutex>

View File

@ -1,5 +1,6 @@
#include <Interpreters/WindowDescription.h>
#include <Core/Field.h>
#include <IO/Operators.h>
#include <Parsers/ASTFunction.h>
@ -60,7 +61,7 @@ void WindowFrame::toString(WriteBuffer & buf) const
}
else
{
buf << abs(begin_offset);
buf << applyVisitor(FieldVisitorToString(), begin_offset);
buf << " "
<< (begin_preceding ? "PRECEDING" : "FOLLOWING");
}
@ -77,7 +78,7 @@ void WindowFrame::toString(WriteBuffer & buf) const
}
else
{
buf << abs(end_offset);
buf << applyVisitor(FieldVisitorToString(), end_offset);
buf << " "
<< (end_preceding ? "PRECEDING" : "FOLLOWING");
}
@ -121,23 +122,33 @@ void WindowFrame::checkValid() const
if (end_type == BoundaryType::Offset
&& begin_type == BoundaryType::Offset)
{
// Frame starting with following rows can't have preceding rows.
if (!(end_preceding && !begin_preceding))
// Frame start offset must be less or equal that the frame end offset.
bool begin_less_equal_end;
if (begin_preceding && end_preceding)
{
// Frame start offset must be less or equal that the frame end offset.
const bool begin_before_end
= begin_offset * (begin_preceding ? -1 : 1)
<= end_offset * (end_preceding ? -1 : 1);
if (!begin_before_end)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame start offset {} {} does not precede the frame end offset {} {}",
begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING",
end_offset, end_preceding ? "PRECEDING" : "FOLLOWING");
}
return;
begin_less_equal_end = begin_offset >= end_offset;
}
else if (begin_preceding && !end_preceding)
{
begin_less_equal_end = true;
}
else if (!begin_preceding && end_preceding)
{
begin_less_equal_end = false;
}
else /* if (!begin_preceding && !end_preceding) */
{
begin_less_equal_end = begin_offset <= end_offset;
}
if (!begin_less_equal_end)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame start offset {} {} does not precede the frame end offset {} {}",
begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING",
end_offset, end_preceding ? "PRECEDING" : "FOLLOWING");
}
return;
}
throw Exception(ErrorCodes::BAD_ARGUMENTS,

View File

@ -44,14 +44,13 @@ struct WindowFrame
// Offset might be both preceding and following, controlled by begin_preceding,
// but the offset value must be positive.
BoundaryType begin_type = BoundaryType::Unbounded;
// This should have been a Field but I'm getting some crazy linker errors.
int64_t begin_offset = 0;
Field begin_offset = 0;
bool begin_preceding = true;
// Here as well, Unbounded can only be UNBOUNDED FOLLOWING, and end_preceding
// must be false.
BoundaryType end_type = BoundaryType::Current;
int64_t end_offset = 0;
Field end_offset = 0;
bool end_preceding = false;

View File

@ -377,6 +377,11 @@ Field convertFieldToType(const Field & from_value, const IDataType & to_type, co
else if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(&to_type))
{
const IDataType & nested_type = *nullable_type->getNestedType();
/// NULL remains NULL after any conversion.
if (WhichDataType(nested_type).isNothing())
return {};
if (from_type_hint && from_type_hint->equals(nested_type))
return from_value;
return convertFieldToTypeImpl(from_value, nested_type, from_type_hint);

View File

@ -290,8 +290,6 @@ std::optional<Blocks> evaluateExpressionOverConstantCondition(const ASTPtr & nod
{
Blocks result;
// TODO: `node` may be always-false literal.
if (const auto * fn = node->as<ASTFunction>())
{
const auto dnf = analyzeFunction(fn, target_expr);
@ -350,6 +348,14 @@ std::optional<Blocks> evaluateExpressionOverConstantCondition(const ASTPtr & nod
}
}
}
else if (const auto * literal = node->as<ASTLiteral>())
{
// Check if it's always true or false.
if (literal->value.getType() == Field::Types::UInt64 && literal->value.get<UInt64>() == 0)
return {result};
else
return {};
}
return {result};
}

View File

@ -137,8 +137,8 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F
if (window())
{
s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str <<
"WINDOW " << (s.hilite ? hilite_none : "");
window()->formatImpl(s, state, frame);
"WINDOW" << (s.hilite ? hilite_none : "");
window()->as<ASTExpressionList &>().formatImplMultiline(s, state, frame);
}
if (orderBy())

View File

@ -35,6 +35,8 @@ String ASTWindowDefinition::getID(char) const
void ASTWindowDefinition::formatImpl(const FormatSettings & settings,
FormatState & state, FormatStateStacked format_frame) const
{
format_frame.expression_list_prepend_whitespace = false;
if (partition_by)
{
settings.ostr << "PARTITION BY ";
@ -70,7 +72,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings,
}
else
{
settings.ostr << abs(frame.begin_offset);
settings.ostr << applyVisitor(FieldVisitorToString(),
frame.begin_offset);
settings.ostr << " "
<< (!frame.begin_preceding ? "FOLLOWING" : "PRECEDING");
}
@ -85,7 +88,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings,
}
else
{
settings.ostr << abs(frame.end_offset);
settings.ostr << applyVisitor(FieldVisitorToString(),
frame.end_offset);
settings.ostr << " "
<< (!frame.end_preceding ? "FOLLOWING" : "PRECEDING");
}

View File

@ -581,30 +581,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p
else if (parser_literal.parse(pos, ast_literal, expected))
{
const Field & value = ast_literal->as<ASTLiteral &>().value;
if (!isInt64FieldType(value.getType()))
if ((node->frame.type == WindowFrame::FrameType::Rows
|| node->frame.type == WindowFrame::FrameType::Groups)
&& !(value.getType() == Field::Types::UInt64
|| (value.getType() == Field::Types::Int64
&& value.get<Int64>() >= 0)))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Only integer frame offsets are supported, '{}' is not supported.",
"Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.",
WindowFrame::toString(node->frame.type),
applyVisitor(FieldVisitorToString(), value),
Field::Types::toString(value.getType()));
}
node->frame.begin_offset = value.get<Int64>();
node->frame.begin_offset = value;
node->frame.begin_type = WindowFrame::BoundaryType::Offset;
// We can easily get a UINT64_MAX here, which doesn't even fit into
// int64_t. Not sure what checks we are going to need here after we
// support floats and dates.
if (node->frame.begin_offset > INT_MAX || node->frame.begin_offset < INT_MIN)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame offset must be between {} and {}, but {} is given",
INT_MAX, INT_MIN, node->frame.begin_offset);
}
if (node->frame.begin_offset < 0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame start offset must be greater than zero, {} given",
node->frame.begin_offset);
}
}
else
{
@ -652,28 +642,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p
else if (parser_literal.parse(pos, ast_literal, expected))
{
const Field & value = ast_literal->as<ASTLiteral &>().value;
if (!isInt64FieldType(value.getType()))
if ((node->frame.type == WindowFrame::FrameType::Rows
|| node->frame.type == WindowFrame::FrameType::Groups)
&& !(value.getType() == Field::Types::UInt64
|| (value.getType() == Field::Types::Int64
&& value.get<Int64>() >= 0)))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Only integer frame offsets are supported, '{}' is not supported.",
"Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.",
WindowFrame::toString(node->frame.type),
applyVisitor(FieldVisitorToString(), value),
Field::Types::toString(value.getType()));
}
node->frame.end_offset = value.get<Int64>();
node->frame.end_offset = value;
node->frame.end_type = WindowFrame::BoundaryType::Offset;
if (node->frame.end_offset > INT_MAX || node->frame.end_offset < INT_MIN)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame offset must be between {} and {}, but {} is given",
INT_MAX, INT_MIN, node->frame.end_offset);
}
if (node->frame.end_offset < 0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame end offset must be greater than zero, {} given",
node->frame.end_offset);
}
}
else
{

View File

@ -275,7 +275,8 @@ Token Lexer::nextTokenImpl()
else
++pos;
}
return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end);
pos = end;
return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, pos);
}
}
return Token(TokenType::Slash, token_begin, pos);

View File

@ -3,6 +3,7 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <Common/Arena.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/getLeastSupertype.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/convertFieldToType.h>
@ -27,7 +28,8 @@ public:
virtual ~IWindowFunction() = default;
// Must insert the result for current_row.
virtual void windowInsertResultInto(IColumn & to, const WindowTransform * transform) = 0;
virtual void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) = 0;
};
// Compares ORDER BY column values at given rows to find the boundaries of frame:
@ -37,7 +39,7 @@ template <typename ColumnType>
static int compareValuesWithOffset(const IColumn * _compared_column,
size_t compared_row, const IColumn * _reference_column,
size_t reference_row,
uint64_t _offset,
const Field & _offset,
bool offset_is_preceding)
{
// Casting the columns to the known type here makes it faster, probably
@ -46,7 +48,8 @@ static int compareValuesWithOffset(const IColumn * _compared_column,
_compared_column);
const auto * reference_column = assert_cast<const ColumnType *>(
_reference_column);
const auto offset = static_cast<typename ColumnType::ValueType>(_offset);
const auto offset = _offset.get<typename ColumnType::ValueType>();
assert(offset >= 0);
const auto compared_value_data = compared_column->getDataAt(compared_row);
assert(compared_value_data.size == sizeof(typename ColumnType::ValueType));
@ -101,6 +104,53 @@ static int compareValuesWithOffset(const IColumn * _compared_column,
}
}
// A specialization of compareValuesWithOffset for floats.
template <typename ColumnType>
static int compareValuesWithOffsetFloat(const IColumn * _compared_column,
size_t compared_row, const IColumn * _reference_column,
size_t reference_row,
const Field & _offset,
bool offset_is_preceding)
{
/// Compares compared_column[compared_row] against reference_column[reference_row]
/// shifted by _offset (subtracted for PRECEDING, added for FOLLOWING).
/// Returns -1 / 0 / 1 like a three-way comparison.
// Casting the columns to the known type here makes it faster, probably
// because the getData call can be devirtualized.
const auto * compared_column = assert_cast<const ColumnType *>(
_compared_column);
const auto * reference_column = assert_cast<const ColumnType *>(
_reference_column);
/// The offset was converted to the ORDER BY column type upstream; it must be nonnegative.
const auto offset = _offset.get<typename ColumnType::ValueType>();
assert(offset >= 0);
const auto compared_value_data = compared_column->getDataAt(compared_row);
assert(compared_value_data.size == sizeof(typename ColumnType::ValueType));
auto compared_value = unalignedLoad<typename ColumnType::ValueType>(
compared_value_data.data);
const auto reference_value_data = reference_column->getDataAt(reference_row);
assert(reference_value_data.size == sizeof(typename ColumnType::ValueType));
auto reference_value = unalignedLoad<typename ColumnType::ValueType>(
reference_value_data.data);
// Floats overflow to Inf and the comparison will work normally, so we don't
// have to do anything.
if (offset_is_preceding)
{
reference_value -= offset;
}
else
{
reference_value += offset;
}
const auto result = compared_value < reference_value ? -1
: compared_value == reference_value ? 0 : 1;
// fmt::print(stderr, "compared {}, offset {}, reference {}, result {}\n",
// compared_value, offset, reference_value, result);
return result;
}
// Helper macros to dispatch on type of the ORDER BY column
#define APPLY_FOR_ONE_TYPE(FUNCTION, TYPE) \
else if (typeid_cast<const TYPE *>(column)) \
@ -114,14 +164,20 @@ if (false) /* NOLINT */ \
{ \
/* Do nothing, a starter condition. */ \
} \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int8>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt8>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int16>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt16>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int32>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt32>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int64>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt64>) \
\
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int8>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int16>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int32>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int64>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int128>) \
\
APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector<Float32>) \
APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector<Float64>) \
\
else \
{ \
throw Exception(ErrorCodes::NOT_IMPLEMENTED, \
@ -193,9 +249,28 @@ WindowTransform::WindowTransform(const Block & input_header_,
== WindowFrame::BoundaryType::Offset))
{
assert(order_by_indices.size() == 1);
const IColumn * column = input_header.getByPosition(
order_by_indices[0]).column.get();
const auto & entry = input_header.getByPosition(order_by_indices[0]);
const IColumn * column = entry.column.get();
APPLY_FOR_TYPES(compareValuesWithOffset)
// Check that the offset type matches the window type.
// Convert the offsets to the ORDER BY column type. We can't just check
// that it matches, because e.g. the int literals are always (U)Int64,
// but the column might be Int8 and so on.
if (window_description.frame.begin_type
== WindowFrame::BoundaryType::Offset)
{
window_description.frame.begin_offset = convertFieldToTypeOrThrow(
window_description.frame.begin_offset,
*entry.type);
}
if (window_description.frame.end_type
== WindowFrame::BoundaryType::Offset)
{
window_description.frame.end_offset = convertFieldToTypeOrThrow(
window_description.frame.end_offset,
*entry.type);
}
}
}
@ -391,7 +466,7 @@ void WindowTransform::advanceFrameStartRowsOffset()
{
// Just recalculate it each time by walking blocks.
const auto [moved_row, offset_left] = moveRowNumber(current_row,
window_description.frame.begin_offset
window_description.frame.begin_offset.get<UInt64>()
* (window_description.frame.begin_preceding ? -1 : 1));
frame_start = moved_row;
@ -638,7 +713,7 @@ void WindowTransform::advanceFrameEndRowsOffset()
// Walk the specified offset from the current row. The "+1" is needed
// because the frame_end is a past-the-end pointer.
const auto [moved_row, offset_left] = moveRowNumber(current_row,
window_description.frame.end_offset
window_description.frame.end_offset.get<UInt64>()
* (window_description.frame.end_preceding ? -1 : 1)
+ 1);
@ -852,14 +927,14 @@ void WindowTransform::writeOutCurrentRow()
for (size_t wi = 0; wi < workspaces.size(); ++wi)
{
auto & ws = workspaces[wi];
IColumn * result_column = block.output_columns[wi].get();
if (ws.window_function_impl)
{
ws.window_function_impl->windowInsertResultInto(*result_column, this);
ws.window_function_impl->windowInsertResultInto(this, wi);
}
else
{
IColumn * result_column = block.output_columns[wi].get();
const auto * a = ws.aggregate_function.get();
auto * buf = ws.aggregate_function_state.data();
// FIXME does it also allocate the result on the arena?
@ -1280,8 +1355,11 @@ struct WindowFunctionRank final : public WindowFunction
DataTypePtr getReturnType() const override
{ return std::make_shared<DataTypeUInt64>(); }
void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override
void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) override
{
IColumn & to = *transform->blockAt(transform->current_row)
.output_columns[function_index];
assert_cast<ColumnUInt64 &>(to).getData().push_back(
transform->peer_group_start_row_number);
}
@ -1297,8 +1375,11 @@ struct WindowFunctionDenseRank final : public WindowFunction
DataTypePtr getReturnType() const override
{ return std::make_shared<DataTypeUInt64>(); }
void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override
void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) override
{
IColumn & to = *transform->blockAt(transform->current_row)
.output_columns[function_index];
assert_cast<ColumnUInt64 &>(to).getData().push_back(
transform->peer_group_number);
}
@ -1314,13 +1395,123 @@ struct WindowFunctionRowNumber final : public WindowFunction
DataTypePtr getReturnType() const override
{ return std::make_shared<DataTypeUInt64>(); }
void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override
void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) override
{
IColumn & to = *transform->blockAt(transform->current_row)
.output_columns[function_index];
assert_cast<ColumnUInt64 &>(to).getData().push_back(
transform->current_row_number);
}
};
// ClickHouse-specific variant of lag/lead that respects the window frame.
template <bool is_lead>
struct WindowFunctionLagLeadInFrame final : public WindowFunction
{
/// Validates the signature: value[, offset[, default]].
/// - no function parameters allowed;
/// - at least 1 and at most 3 arguments;
/// - the offset argument (2nd) must be an integer type;
/// - the default argument (3rd) must share a least supertype with the value argument.
WindowFunctionLagLeadInFrame(const std::string & name_,
const DataTypes & argument_types_, const Array & parameters_)
: WindowFunction(name_, argument_types_, parameters_)
{
if (!parameters.empty())
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function {} cannot be parameterized", name_);
}
if (argument_types.empty())
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function {} takes at least one argument", name_);
}
if (argument_types.size() == 1)
{
return;
}
/// The type's default value is used as a cheap probe for the underlying Field type.
if (!isInt64FieldType(argument_types[1]->getDefault().getType()))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Offset must be an integer, '{}' given",
argument_types[1]->getName());
}
if (argument_types.size() == 2)
{
return;
}
if (!getLeastSupertype({argument_types[0], argument_types[2]}))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"The default value type '{}' is not convertible to the argument type '{}'",
argument_types[2]->getName(),
argument_types[0]->getName());
}
if (argument_types.size() > 3)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function '{}' accepts at most 3 arguments, {} given",
name, argument_types.size());
}
}
/// The result has the same type as the value argument.
DataTypePtr getReturnType() const override
{ return argument_types[0]; }
/// Writes the lag/lead value for the current row into this function's output column.
/// Unlike standard lag/lead, the looked-up row must lie inside the window frame;
/// otherwise the default (3rd argument) or the type's default value is inserted.
void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) override
{
const auto & current_block = transform->blockAt(transform->current_row);
IColumn & to = *current_block.output_columns[function_index];
const auto & workspace = transform->workspaces[function_index];
/// Offset defaults to 1; when given, it is read per-row and must be nonnegative.
int offset = 1;
if (argument_types.size() > 1)
{
offset = (*current_block.input_columns[
workspace.argument_column_indices[1]])[
transform->current_row.row].get<Int64>();
if (offset < 0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"The offset for function {} must be nonnegative, {} given",
getName(), offset);
}
}
/// Walk forward for lead, backward for lag; offset_left != 0 means we ran off the data.
const auto [target_row, offset_left] = transform->moveRowNumber(
transform->current_row, offset * (is_lead ? 1 : -1));
if (offset_left != 0
|| target_row < transform->frame_start
|| transform->frame_end <= target_row)
{
// Offset is outside the frame.
if (argument_types.size() > 2)
{
// Column with default values is specified.
to.insertFrom(*current_block.input_columns[
workspace.argument_column_indices[2]],
transform->current_row.row);
}
else
{
to.insertDefault();
}
}
else
{
// Offset is inside the frame.
to.insertFrom(*transform->blockAt(target_row).input_columns[
workspace.argument_column_indices[0]],
target_row.row);
}
}
};
void registerWindowFunctions(AggregateFunctionFactory & factory)
{
// Why didn't I implement lag/lead yet? Because they are a mess. I imagine
@ -1332,9 +1523,10 @@ void registerWindowFunctions(AggregateFunctionFactory & factory)
// the whole partition like Postgres does, because using a linear amount
// of additional memory is not an option when we have a lot of data. We must
// be able to process at least the lag/lead in streaming fashion.
// Our best bet is probably rewriting, say `lag(value, offset)` to
// `any(value) over (rows between offset preceding and offset preceding)`,
// at the query planning stage.
// A partial solution for constant offsets is rewriting, say `lag(value, offset)
// to `any(value) over (rows between offset preceding and offset preceding)`.
// We also implement non-standard functions `lag/leadInFrame`, that are
// analogous to `lag/lead`, but respect the frame.
// Functions like cume_dist() do require materializing the entire
// partition, but it's probably also simpler to implement them by rewriting
// to a (rows between unbounded preceding and unbounded following) frame,
@ -1360,6 +1552,20 @@ void registerWindowFunctions(AggregateFunctionFactory & factory)
return std::make_shared<WindowFunctionRowNumber>(name, argument_types,
parameters);
});
factory.registerFunction("lagInFrame", [](const std::string & name,
const DataTypes & argument_types, const Array & parameters)
{
return std::make_shared<WindowFunctionLagLeadInFrame<false>>(
name, argument_types, parameters);
});
factory.registerFunction("leadInFrame", [](const std::string & name,
const DataTypes & argument_types, const Array & parameters)
{
return std::make_shared<WindowFunctionLagLeadInFrame<true>>(
name, argument_types, parameters);
});
}
}

View File

@ -110,7 +110,9 @@ public:
Status prepare() override;
void work() override;
private:
/*
* Implementation details.
*/
void advancePartitionEnd();
bool arePeers(const RowNumber & x, const RowNumber & y) const;
@ -321,10 +323,7 @@ public:
int (* compare_values_with_offset) (
const IColumn * compared_column, size_t compared_row,
const IColumn * reference_column, size_t reference_row,
// We can make it a Field later if we need the Decimals. Now we only
// have ints and datetime, and the underlying Field type for them is
// uint64_t anyway.
uint64_t offset,
const Field & offset,
bool offset_is_preceding);
};

Some files were not shown because too many files have changed in this diff Show More