diff --git a/.gitmodules b/.gitmodules
index 7a2c5600e65..f9bc8a56a5c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -93,7 +93,7 @@
 url = https://github.com/ClickHouse-Extras/libunwind.git
 [submodule "contrib/simdjson"]
     path = contrib/simdjson
-    url = https://github.com/ClickHouse-Extras/simdjson.git
+    url = https://github.com/simdjson/simdjson.git
 [submodule "contrib/rapidjson"]
     path = contrib/rapidjson
     url = https://github.com/ClickHouse-Extras/rapidjson
diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h
index 3c895202c09..363f281584e 100644
--- a/base/common/DateLUTImpl.h
+++ b/base/common/DateLUTImpl.h
@@ -1105,11 +1105,11 @@ public:
     }
 
     template <typename DateOrTime>
-    inline LUTIndex addMonthsIndex(DateOrTime v, Int64 delta) const
+    inline LUTIndex NO_SANITIZE_UNDEFINED addMonthsIndex(DateOrTime v, Int64 delta) const
    {
        const Values & values = lut[toLUTIndex(v)];

-        Int64 month = static_cast<Int64>(values.month) + delta;
+        Int64 month = values.month + delta;

        if (month > 0)
        {
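A note on the `DateLUTImpl` hunk above: dropping the cast lets `values.month + delta` feed the surrounding normalization logic directly, which is why the function is now marked `NO_SANITIZE_UNDEFINED`. A minimal standalone sketch of the month normalization this code relies on; all names below are illustrative, not ClickHouse's:

```cpp
#include <cassert>
#include <cstdint>

// Normalize a 1-based month plus a signed delta into a (year, month) pair.
// This mirrors the "if (month > 0)" branching in addMonthsIndex.
struct YearMonth { int64_t year; int64_t month; };

YearMonth addMonths(int64_t year, int64_t month, int64_t delta)
{
    int64_t m = month + delta;      // may leave 1..12, may go non-positive
    if (m > 0)
    {
        year += (m - 1) / 12;       // carry whole years forward
        m = ((m - 1) % 12) + 1;     // back into 1..12
    }
    else
    {
        year -= (-m) / 12 + 1;      // borrow whole years
        m = 12 - ((-m) % 12);       // back into 1..12
    }
    return {year, m};
}

int main()
{
    assert(addMonths(2021, 11, 3).year == 2022);   // Nov 2021 + 3 = Feb 2022
    assert(addMonths(2021, 11, 3).month == 2);
    assert(addMonths(2021, 1, -1).year == 2020);   // Jan 2021 - 1 = Dec 2020
    assert(addMonths(2021, 1, -1).month == 12);
}
```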
diff --git a/contrib/NuRaft b/contrib/NuRaft
index 3d3683e7775..70468326ad5 160000
--- a/contrib/NuRaft
+++ b/contrib/NuRaft
@@ -1 +1 @@
-Subproject commit 3d3683e77753cfe015a05fae95ddf418e19f59e1
+Subproject commit 70468326ad5d72e9497944838484c591dae054ea
diff --git a/contrib/replxx b/contrib/replxx
index cdb6e3f2ce4..2b24f14594d 160000
--- a/contrib/replxx
+++ b/contrib/replxx
@@ -1 +1 @@
-Subproject commit cdb6e3f2ce4464225daf9c8beeae7db98d590bdc
+Subproject commit 2b24f14594d7606792b92544bb112a6322ba34d7
diff --git a/contrib/simdjson b/contrib/simdjson
index 3190d66a490..95b4870e20b 160000
--- a/contrib/simdjson
+++ b/contrib/simdjson
@@ -1 +1 @@
-Subproject commit 3190d66a49059092a1753dc35595923debfc1698
+Subproject commit 95b4870e20be5f97d9dcf63b23b1c6f520c366c1
diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile
index 8443eae691b..d9cd68254b7 100644
--- a/docker/client/Dockerfile
+++ b/docker/client/Dockerfile
@@ -18,6 +18,7 @@ RUN apt-get update \
         clickhouse-client=$version \
         clickhouse-common-static=$version \
         locales \
+        tzdata \
     && rm -rf /var/lib/apt/lists/* /var/cache/debconf \
     && apt-get clean
diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile
index 295784a6184..414eb23d044 100644
--- a/docker/server/Dockerfile
+++ b/docker/server/Dockerfile
@@ -32,6 +32,7 @@ RUN groupadd -r clickhouse --gid=101 \
         clickhouse-server=$version \
         locales \
         wget \
+        tzdata \
     && rm -rf \
         /var/lib/apt/lists/* \
         /var/cache/debconf \
diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine
index 0f9de1996ab..cd192c0c9da 100644
--- a/docker/server/Dockerfile.alpine
+++ b/docker/server/Dockerfile.alpine
@@ -21,7 +21,9 @@ RUN addgroup -S -g 101 clickhouse \
    && chown clickhouse:clickhouse /var/lib/clickhouse \
    && chown root:clickhouse /var/log/clickhouse-server \
    && chmod +x /entrypoint.sh \
-    && apk add --no-cache su-exec bash \
+    && apk add --no-cache su-exec bash tzdata \
+    && cp /usr/share/zoneinfo/UTC /etc/localtime \
+    && echo "UTC" > /etc/timezone \
    && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client

# we need to allow "others" access to clickhouse folder, because docker container
diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh
index 0138a165505..81e04bd7874 100755
--- a/docker/server/entrypoint.sh
+++ b/docker/server/entrypoint.sh
@@ -46,9 +46,11 @@ DATA_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --
 TMP_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=tmp_path || true)"
 USER_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=user_files_path || true)"
 LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.log || true)"
-LOG_DIR="$(dirname "$LOG_PATH" || true)"
+LOG_DIR=""
+if [ -n "$LOG_PATH" ]; then LOG_DIR="$(dirname "$LOG_PATH")"; fi
 ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.errorlog || true)"
-ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH" || true)"
+ERROR_LOG_DIR=""
+if [ -n "$ERROR_LOG_PATH" ]; then ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi
 FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=format_schema_path || true)"
 CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"
diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh
index 649f9f812e1..bbd5443ffb6 100755
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@@ -292,6 +292,7 @@ function run_tests
            01318_decrypt # Depends on OpenSSL
            01663_aes_msan # Depends on OpenSSL
            01667_aes_args_check # Depends on OpenSSL
+            01776_decrypt_aead_size_check # Depends on OpenSSL
            01281_unsucceeded_insert_select_queries_counter
            01292_create_user
            01294_lazy_database_concurrent
diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py
index 3ddaf99b879..4727f485943 100755
--- a/docker/test/performance-comparison/perf.py
+++ b/docker/test/performance-comparison/perf.py
@@ -266,14 +266,13 @@ for query_index in queries_to_run:
        try:
            # Will also detect too long queries during warmup stage
-            res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': 10})
+            res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': args.max_query_seconds})
        except clickhouse_driver.errors.Error as e:
            # Add query id to the exception to make debugging easier.
            e.args = (prewarm_id, *e.args)
            e.message = prewarm_id + ': ' + e.message
            raise

-        print(f'prewarm\t{query_index}\t{prewarm_id}\t{conn_index}\t{c.last_query.elapsed}')
    except KeyboardInterrupt:
        raise
@@ -320,7 +319,7 @@ for query_index in queries_to_run:
    for conn_index, c in enumerate(this_query_connections):
        try:
-            res = c.execute(q, query_id = run_id)
+            res = c.execute(q, query_id = run_id, settings = {'max_execution_time': args.max_query_seconds})
        except clickhouse_driver.errors.Error as e:
            # Add query id to the exception to make debugging easier.
            e.args = (run_id, *e.args)
diff --git a/docker/test/sqlancer/Dockerfile b/docker/test/sqlancer/Dockerfile
index 6bcdc3df5cd..253ca1b729a 100644
--- a/docker/test/sqlancer/Dockerfile
+++ b/docker/test/sqlancer/Dockerfile
@@ -2,7 +2,6 @@ FROM ubuntu:20.04

 RUN apt-get update --yes && env DEBIAN_FRONTEND=noninteractive apt-get install wget unzip git openjdk-14-jdk maven python3 --yes --no-install-recommends
-
 RUN wget https://github.com/sqlancer/sqlancer/archive/master.zip -O /sqlancer.zip
 RUN mkdir /sqlancer && \
    cd /sqlancer && \
diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md
index 1a2ccf3e0dc..8326038407f 100644
--- a/docs/en/engines/table-engines/integrations/postgresql.md
+++ b/docs/en/engines/table-engines/integrations/postgresql.md
@@ -3,7 +3,7 @@ toc_priority: 8
 toc_title: PostgreSQL
 ---

-# PosgtreSQL {#postgresql}
+# PostgreSQL {#postgresql}

 The PostgreSQL engine allows you to perform `SELECT` queries on data that is stored on a remote PostgreSQL server.
diff --git a/docs/en/faq/integration/json-import.md b/docs/en/faq/integration/json-import.md
index 7038cc539d2..3fa026c794a 100644
--- a/docs/en/faq/integration/json-import.md
+++ b/docs/en/faq/integration/json-import.md
@@ -19,7 +19,7 @@ $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test
 Using [CLI interface](../../interfaces/cli.md):

 ``` bash
-$ echo '{"foo":"bar"}' | clickhouse-client ---query="INSERT INTO test FORMAT JSONEachRow"
+$ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow"
 ```

 Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead.
diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md
index ee2235b7861..5987ba0f676 100644
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -50,7 +50,7 @@ The supported formats are:
 | [Parquet](#data-format-parquet) | ✔ | ✔ |
 | [Arrow](#data-format-arrow) | ✔ | ✔ |
 | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ |
-| [ORC](#data-format-orc) | ✔ | ✗ |
+| [ORC](#data-format-orc) | ✔ | ✔ |
 | [RowBinary](#rowbinary) | ✔ | ✔ |
 | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
 | [Native](#native) | ✔ | ✔ |
@@ -1284,32 +1284,33 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e

 ## ORC {#data-format-orc}

-[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. You can only insert data in this format to ClickHouse.
+[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the [Hadoop](https://hadoop.apache.org/) ecosystem.

 ### Data Types Matching {#data_types-matching-3}

-The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` queries.
+The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.

-| ORC data type (`INSERT`) | ClickHouse data type |
-|--------------------------|-----------------------------------------------------|
-| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) |
-| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) |
-| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) |
-| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) |
-| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) |
-| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) |
-| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) |
-| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) |
-| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) |
-| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) |
-| `DATE32` | [Date](../sql-reference/data-types/date.md) |
-| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) |
-| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) |
-| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) |
+| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) |
+|--------------------------|-----------------------------------------------------|--------------------------|
+| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` |
+| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` |
+| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` |
+| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` |
+| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` |
+| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` |
+| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` |
+| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` |
+| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` |
+| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` |
+| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` |
+| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` |
+| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` |
+| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` |
+| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` |

 ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type.

-Unsupported ORC data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
+Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.

 The data types of ClickHouse table columns don’t have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column.
@@ -1321,6 +1322,14 @@ You can insert ORC data from a file into ClickHouse table by the following comma
 $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC"
 ```

+### Selecting Data {#selecting-data-2}
+
+You can select data from a ClickHouse table and save it to a file in the ORC format with the following command:
+
+``` bash
+$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc}
+```
+
 To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-engines/integrations/hdfs.md).

 ## LineAsString {#lineasstring}
diff --git a/docs/en/operations/system-tables/errors.md b/docs/en/operations/system-tables/errors.md
index 72a537f15b9..583cce88ca4 100644
--- a/docs/en/operations/system-tables/errors.md
+++ b/docs/en/operations/system-tables/errors.md
@@ -9,7 +9,7 @@ Columns:
 - `value` ([UInt64](../../sql-reference/data-types/int-uint.md)) — the number of times this error has been happened.
 - `last_error_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — time when the last error happened.
 - `last_error_message` ([String](../../sql-reference/data-types/string.md)) — message for the last error.
-- `last_error_stacktrace` ([String](../../sql-reference/data-types/string.md)) — stacktrace for the last error.
+- `last_error_trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — A [stack trace](https://en.wikipedia.org/wiki/Stack_trace) which represents a list of physical addresses where the called methods are stored.
 - `remote` ([UInt8](../../sql-reference/data-types/int-uint.md)) — remote exception (i.e. received during one of the distributed query).

 **Example**
@@ -25,3 +25,12 @@ LIMIT 1
 │ CANNOT_OPEN_FILE │   76 │     1 │
 └──────────────────┴──────┴───────┘
 ```
+
+``` sql
+WITH arrayMap(x -> demangle(addressToSymbol(x)), last_error_trace) AS all
+SELECT name, arrayStringConcat(all, '\n') AS res
+FROM system.errors
+LIMIT 1
+SETTINGS allow_introspection_functions=1\G
+```
+
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
index 337586a2e10..de6a780235f 100644
--- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
+++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
@@ -320,8 +320,6 @@ Similar to `cache`, but stores data on SSD and index in RAM.
     <read_buffer_size>1048576</read_buffer_size>
     <path>/var/lib/clickhouse/clickhouse_dictionaries/test_dict</path>
-
-    <max_stored_keys>1048576</max_stored_keys>
 </ssd_cache>
 ```
@@ -329,8 +327,8 @@ Similar to `cache`, but stores data on SSD and index in RAM.

 or

 ``` sql
-LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
-    PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576))
+LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
+    PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict))
 ```

 ### complex_key_ssd_cache {#complex-key-ssd-cache}
diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md
index cbf03a44d46..a646347ea60 100644
--- a/docs/en/sql-reference/window-functions/index.md
+++ b/docs/en/sql-reference/window-functions/index.md
@@ -23,7 +23,9 @@ ClickHouse supports the standard grammar for defining windows and window functio
 | `GROUPS` frame | not supported |
 | Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | all aggregate functions are supported |
 | `rank()`, `dense_rank()`, `row_number()` | supported |
-| `lag/lead(value, offset)` | not supported, replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead`|
+| `lag/lead(value, offset)` | Not supported. Workarounds: |
+| | 1) replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead`|
+| | 2) use `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` |

 ## References
diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md
index 67cc80f5cd8..f67997b58d6 100644
--- a/docs/ru/interfaces/formats.md
+++ b/docs/ru/interfaces/formats.md
@@ -49,7 +49,7 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT
 | [Parquet](#data-format-parquet) | ✔ | ✔ |
 | [Arrow](#data-format-arrow) | ✔ | ✔ |
 | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ |
-| [ORC](#data-format-orc) | ✔ | ✗ |
+| [ORC](#data-format-orc) | ✔ | ✔ |
 | [RowBinary](#rowbinary) | ✔ | ✔ |
 | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
 | [Native](#native) | ✔ | ✔ |
@@ -1203,45 +1203,53 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_

 ## ORC {#data-format-orc}

-[Apache ORC](https://orc.apache.org/) - это column-oriented формат данных, распространённый в экосистеме Hadoop. Вы можете только вставлять данные этого формата в ClickHouse.
+[Apache ORC](https://orc.apache.org/) — это столбцовый формат данных, распространенный в экосистеме [Hadoop](https://hadoop.apache.org/).

 ### Соответствие типов данных {#sootvetstvie-tipov-dannykh-1}

-Таблица показывает поддержанные типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT`.
+Таблица ниже содержит поддерживаемые типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT` и `SELECT`.

-| Тип данных ORC (`INSERT`) | Тип данных ClickHouse |
-|---------------------------|-----------------------------------------------------|
-| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) |
-| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) |
-| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) |
-| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) |
-| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) |
-| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) |
-| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) |
-| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) |
-| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) |
-| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) |
-| `DATE32` | [Date](../sql-reference/data-types/date.md) |
-| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) |
-| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) |
-| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) |
+| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | Тип данных ORC (`SELECT`) |
+|---------------------------|-----------------------------------------------------|---------------------------|
+| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` |
+| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` |
+| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` |
+| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` |
+| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` |
+| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` |
+| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` |
+| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` |
+| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` |
+| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` |
+| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` |
+| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` |
+| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` |
+| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` |
+| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` |

-ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных Parquet `DECIMAL` как `Decimal128`.
+ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных ORC `DECIMAL` как `Decimal128`.

-Неподдержанные типы данных ORC: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
+Неподдерживаемые типы данных ORC: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.

-Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных, ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse.
+Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse.

 ### Вставка данных {#vstavka-dannykh-1}

-Данные ORC можно вставить в таблицу ClickHouse командой:
+Чтобы вставить в ClickHouse данные из файла в формате ORC, используйте команду следующего вида:

 ``` bash
 $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC"
 ```

-Для обмена данных с Hadoop можно использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md).
+### Вывод данных {#vyvod-dannykh-1}
+
+Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, используйте команду следующего вида:
+
+``` bash
+$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc}
+```
+
+Для обмена данных с экосистемой Hadoop вы можете использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md).

 ## LineAsString {#lineasstring}
diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
index 1d1e46250e2..285982565c2 100644
--- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
+++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
@@ -318,8 +318,6 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000))
     <read_buffer_size>1048576</read_buffer_size>
     <path>/var/lib/clickhouse/clickhouse_dictionaries/test_dict</path>
-
-    <max_stored_keys>1048576</max_stored_keys>
 </ssd_cache>
 ```
@@ -327,8 +325,8 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000))

 или

 ``` sql
-LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
-    PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576))
+LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
+    PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict))
 ```

 ### complex_key_ssd_cache {#complex-key-ssd-cache}
diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md
index 54537b7735d..f9b3e5c3e68 100644
--- a/docs/ru/sql-reference/functions/other-functions.md
+++ b/docs/ru/sql-reference/functions/other-functions.md
@@ -672,7 +672,7 @@ neighbor(column, offset[, default_value])

 Функция может получить доступ к значению в столбце соседней строки только внутри обрабатываемого в данный момент блока данных.
 Порядок строк, используемый при вычислении функции `neighbor`, может отличаться от порядка строк, возвращаемых пользователю.
-Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию изне подзапроса.
+Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию извне подзапроса.

 **Аргументы**
diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt
index 470bc5e8719..9605525edbf 100644
--- a/docs/tools/requirements.txt
+++ b/docs/tools/requirements.txt
@@ -26,7 +26,7 @@ numpy==1.19.2
 Pygments==2.5.2
 pymdown-extensions==8.0
 python-slugify==4.0.1
-PyYAML==5.3.1
+PyYAML==5.4.1
 repackage==0.7.3
 requests==2.24.0
 singledispatch==3.4.0.3
diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp
index 80d44a336a5..939a48d949f 100644
--- a/src/Client/Connection.cpp
+++ b/src/Client/Connection.cpp
@@ -8,10 +8,10 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/src/Client/ya.make b/src/Client/ya.make
index af1dd05f1d4..4201203a8e9 100644
--- a/src/Client/ya.make
+++ b/src/Client/ya.make
@@ -16,7 +16,6 @@ SRCS(
    HedgedConnections.cpp
    HedgedConnectionsFactory.cpp
    MultiplexedConnections.cpp
-    TimeoutSetter.cpp
 )
diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp
index 586c0fbde4d..918bc301754 100644
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@@ -560,7 +560,7 @@ namespace DB
 {
 namespace ErrorCodes
 {
-#define M(VALUE, NAME) extern const Value NAME = VALUE;
+#define M(VALUE, NAME) extern const ErrorCode NAME = VALUE;
     APPLY_FOR_ERROR_CODES(M)
 #undef M

@@ -587,7 +587,7 @@ namespace ErrorCodes

    ErrorCode end() { return END + 1; }

-    void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace)
+    void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace)
    {
        if (error_code >= end())
        {
@@ -596,10 +596,10 @@
            error_code = end() - 1;
        }

-        values[error_code].increment(remote, message, stacktrace);
+        values[error_code].increment(remote, message, trace);
    }

-    void ErrorPairHolder::increment(bool remote, const std::string & message, const std::string & stacktrace)
+    void ErrorPairHolder::increment(bool remote, const std::string & message, const FramePointers & trace)
    {
        const auto now = std::chrono::system_clock::now();

@@ -609,7 +609,7 @@
        ++error.count;
        error.message = message;
-        error.stacktrace = stacktrace;
+        error.trace = trace;
        error.error_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
    }

    ErrorPair ErrorPairHolder::get()
diff --git a/src/Common/ErrorCodes.h b/src/Common/ErrorCodes.h
index edb9be9e0c0..ffd0b8b8619 100644
--- a/src/Common/ErrorCodes.h
+++ b/src/Common/ErrorCodes.h
@@ -1,11 +1,12 @@
 #pragma once

-#include
+#include
 #include
 #include
 #include
-#include
 #include
+#include
+#include

/** Allows to count number of simultaneously happening error codes.
  * See also Exception.cpp for incrementing part.
@@ -19,6 +20,7 @@ namespace ErrorCodes
    /// ErrorCode identifier (index in array).
    using ErrorCode = int;
    using Value = size_t;
+    using FramePointers = std::vector<void *>;

    /// Get name of error_code by identifier.
    /// Returns statically allocated string.
@@ -33,7 +35,7 @@
        /// Message for the last error.
        std::string message;
        /// Stacktrace for the last error.
-        std::string stacktrace;
+        FramePointers trace;
    };

    struct ErrorPair
    {
@@ -46,7 +48,7 @@
    {
    public:
        ErrorPair get();
-        void increment(bool remote, const std::string & message, const std::string & stacktrace);
+        void increment(bool remote, const std::string & message, const FramePointers & trace);

    private:
        ErrorPair value;
@@ -60,7 +62,7 @@
    ErrorCode end();

    /// Add value for specified error_code.
-    void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace);
+    void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace);
 }

 }
diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp
index 08afd0397f5..e8a98021588 100644
--- a/src/Common/Exception.cpp
+++ b/src/Common/Exception.cpp
@@ -36,7 +36,7 @@ namespace ErrorCodes
 /// - Aborts the process if error code is LOGICAL_ERROR.
 /// - Increments error codes statistics.

-void handle_error_code([[maybe_unused]] const std::string & msg, const std::string & stacktrace, int code, bool remote)
+void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool remote, const Exception::FramePointers & trace)
 {
    // In debug builds and builds with sanitizers, treat LOGICAL_ERROR as an assertion failure.
    // Log the message before we fail.
@@ -47,20 +47,21 @@
        abort();
    }
 #endif
-    ErrorCodes::increment(code, remote, msg, stacktrace);
+
+    ErrorCodes::increment(code, remote, msg, trace);
 }

 Exception::Exception(const std::string & msg, int code, bool remote_)
    : Poco::Exception(msg, code)
    , remote(remote_)
 {
-    handle_error_code(msg, getStackTraceString(), code, remote);
+    handle_error_code(msg, code, remote, getStackFramePointers());
 }

 Exception::Exception(const std::string & msg, const Exception & nested, int code)
    : Poco::Exception(msg, nested, code)
 {
-    handle_error_code(msg, getStackTraceString(), code, remote);
+    handle_error_code(msg, code, remote, getStackFramePointers());
 }

 Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc)
@@ -101,6 +102,31 @@ std::string Exception::getStackTraceString() const
 #endif
 }

+Exception::FramePointers Exception::getStackFramePointers() const
+{
+    FramePointers frame_pointers;
+#ifdef STD_EXCEPTION_HAS_STACK_TRACE
+    {
+        frame_pointers.resize(get_stack_trace_size());
+        for (size_t i = 0; i < frame_pointers.size(); ++i)
+        {
+            frame_pointers[i] = get_stack_trace_frames()[i];
+        }
+    }
+#else
+    {
+        size_t stack_trace_size = trace.getSize();
+        size_t stack_trace_offset = trace.getOffset();
+        frame_pointers.reserve(stack_trace_size - stack_trace_offset);
+        for (size_t i = stack_trace_offset; i < stack_trace_size; ++i)
+        {
+            frame_pointers.push_back(trace.getFramePointers()[i]);
+        }
+    }
+#endif
+    return frame_pointers;
+}
+

 void throwFromErrno(const std::string & s, int code, int the_errno)
 {
diff --git a/src/Common/Exception.h b/src/Common/Exception.h
index e487badafa5..79b4394948a 100644
--- a/src/Common/Exception.h
+++ b/src/Common/Exception.h
@@ -24,6 +24,8 @@ namespace DB
 class Exception : public Poco::Exception
 {
 public:
+    using FramePointers = std::vector<void *>;
+
    Exception() = default;
    Exception(const std::string & msg, int code, bool remote_ = false);
    Exception(const std::string & msg, const Exception & nested, int code);
@@ -66,6 +68,8 @@ public:
    bool isRemoteException() const { return remote; }

    std::string getStackTraceString() const;
+    /// Used for system.errors
+    FramePointers getStackFramePointers() const;

 private:
 #ifndef STD_EXCEPTION_HAS_STACK_TRACE
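The `Exception`/`ErrorCodes` hunks above stop formatting a stack trace string eagerly and instead store raw frame pointers (`std::vector<void *>`), leaving symbolization to query time (`demangle(addressToSymbol(...))` in the `system.errors` docs hunk). A portable sketch of the same capture-now, symbolize-later idea using POSIX `backtrace` rather than ClickHouse's own `StackTrace` class:

```cpp
#include <execinfo.h>   // POSIX backtrace / backtrace_symbols (glibc)
#include <cstdio>
#include <cstdlib>
#include <vector>

using FramePointers = std::vector<void *>;

// Capture raw return addresses; cheap compared to formatting a string.
FramePointers captureFrames()
{
    void * buf[64];
    int n = backtrace(buf, 64);
    return FramePointers(buf, buf + n);
}

// Symbolize lazily, only when somebody actually inspects the trace.
void printFrames(const FramePointers & frames)
{
    char ** symbols = backtrace_symbols(frames.data(), static_cast<int>(frames.size()));
    if (!symbols)
        return;
    for (size_t i = 0; i < frames.size(); ++i)
        std::printf("%zu: %s\n", i, symbols[i]);
    std::free(symbols);
}

int main()
{
    printFrames(captureFrames());
}
```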
diff --git a/src/Common/HashTable/LRUHashMap.h b/src/Common/HashTable/LRUHashMap.h
index df9766c5ee8..870fb219523 100644
--- a/src/Common/HashTable/LRUHashMap.h
+++ b/src/Common/HashTable/LRUHashMap.h
@@ -271,13 +271,13 @@ private:
 };

 template <typename Key, typename Mapped>
-struct DefaultCellDisposer
+struct DefaultLRUHashMapCellDisposer
 {
    void operator()(const Key &, const Mapped &) const {}
 };

-template <typename Key, typename Value, typename Disposer = DefaultCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
+template <typename Key, typename Value, typename Disposer = DefaultLRUHashMapCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
 using LRUHashMap = LRUHashMapImpl<Key, Value, Disposer, Hash, false>;

-template <typename Key, typename Value, typename Disposer = DefaultCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
+template <typename Key, typename Value, typename Disposer = DefaultLRUHashMapCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
 using LRUHashMapWithSavedHash = LRUHashMapImpl<Key, Value, Disposer, Hash, true>;
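The rename above only gives the disposer a less collision-prone name; its job is unchanged: it is invoked for each (key, mapped) cell the LRU policy evicts. A self-contained sketch of that concept (this is an illustration, not ClickHouse's `LRUHashMapImpl`; `TinyLRU` and `NoopDisposer` are made up here):

```cpp
#include <cstdio>
#include <list>
#include <string>
#include <unordered_map>

// Stand-in for DefaultLRUHashMapCellDisposer: do nothing on eviction.
template <typename Key, typename Mapped>
struct NoopDisposer
{
    void operator()(const Key &, const Mapped &) const {}
};

// Toy LRU map; re-insertion of existing keys is ignored for brevity.
template <typename Key, typename Mapped, typename Disposer = NoopDisposer<Key, Mapped>>
class TinyLRU
{
public:
    explicit TinyLRU(size_t max_size_, Disposer disposer_ = {}) : max_size(max_size_), disposer(disposer_) {}

    void insert(const Key & key, const Mapped & value)
    {
        order.push_back(key);
        map[key] = value;
        if (map.size() > max_size)
        {
            Key oldest = order.front();
            order.pop_front();
            disposer(oldest, map[oldest]);  // let the owner release resources
            map.erase(oldest);
        }
    }

private:
    size_t max_size;
    Disposer disposer;
    std::list<Key> order;
    std::unordered_map<Key, Mapped> map;
};

int main()
{
    auto log_eviction = [](const int & k, const std::string & v)
    { std::printf("evicted %d -> %s\n", k, v.c_str()); };

    TinyLRU<int, std::string, decltype(log_eviction)> cache(2, log_eviction);
    cache.insert(1, "one");
    cache.insert(2, "two");
    cache.insert(3, "three");  // evicts key 1, disposer fires
}
```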
diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h
index 163a6503d2e..57ad3d46177 100644
--- a/src/Common/PODArray.h
+++ b/src/Common/PODArray.h
@@ -692,6 +692,30 @@ public:
        assign(from.begin(), from.end());
    }

+    void erase(const_iterator first, const_iterator last)
+    {
+        iterator first_no_const = const_cast<iterator>(first);
+        iterator last_no_const = const_cast<iterator>(last);
+
+        size_t items_to_move = end() - last;
+
+        while (items_to_move != 0)
+        {
+            *first_no_const = *last_no_const;
+
+            ++first_no_const;
+            ++last_no_const;
+
+            --items_to_move;
+        }
+
+        this->c_end = reinterpret_cast<char *>(first_no_const);
+    }
+
+    void erase(const_iterator pos)
+    {
+        this->erase(pos, pos + 1);
+    }

    bool operator== (const PODArray & rhs) const
    {
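The new `PODArray::erase` above is the classic shift-left erase: copy the tail over the erased range, then truncate. Because `PODArray` holds POD elements, no destructors run, so a plain copy loop suffices. The same semantics expressed on `std::vector` (a sketch, not the PR's code):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Erase [first, last): shift the tail left, then drop the unused suffix.
template <typename T>
void podStyleErase(std::vector<T> & v,
                   typename std::vector<T>::const_iterator first,
                   typename std::vector<T>::const_iterator last)
{
    auto dst = v.begin() + (first - v.cbegin());
    auto src = v.begin() + (last - v.cbegin());
    std::copy(src, v.end(), dst);            // forward copy is safe: dst precedes src
    v.resize(v.size() - (last - first));     // truncate, like resetting c_end
}

int main()
{
    std::vector<int> v{0, 1, 2, 3, 4, 5};
    podStyleErase<int>(v, v.cbegin() + 2, v.cbegin() + 4);  // erase {2, 3}
    assert((v == std::vector<int>{0, 1, 4, 5}));
}
```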
diff --git a/src/Common/tests/gtest_pod_array.cpp b/src/Common/tests/gtest_pod_array.cpp
index 53b3e207a22..63cf7026757 100644
--- a/src/Common/tests/gtest_pod_array.cpp
+++ b/src/Common/tests/gtest_pod_array.cpp
@@ -92,3 +92,57 @@ TEST(Common, PODInsertElementSizeNotMultipleOfLeftPadding)

    EXPECT_EQ(arr1_initially_empty.size(), items_to_insert_size);
 }
+
+TEST(Common, PODErase)
+{
+    {
+        PaddedPODArray<UInt64> items {0,1,2,3,4,5,6,7,8,9};
+        PaddedPODArray<UInt64> expected;
+        expected = {0,1,2,3,4,5,6,7,8,9};
+
+        items.erase(items.begin(), items.begin());
+        EXPECT_EQ(items, expected);
+
+        items.erase(items.end(), items.end());
+        EXPECT_EQ(items, expected);
+    }
+    {
+        PaddedPODArray<UInt64> actual {0,1,2,3,4,5,6,7,8,9};
+        PaddedPODArray<UInt64> expected;
+
+        expected = {0,1,4,5,6,7,8,9};
+        actual.erase(actual.begin() + 2, actual.begin() + 4);
+        EXPECT_EQ(actual, expected);
+
+        expected = {0,1,4};
+        actual.erase(actual.begin() + 3, actual.end());
+        EXPECT_EQ(actual, expected);
+
+        expected = {};
+        actual.erase(actual.begin(), actual.end());
+        EXPECT_EQ(actual, expected);
+
+        for (size_t i = 0; i < 10; ++i)
+            actual.emplace_back(static_cast<UInt64>(i));
+
+        expected = {0,1,4,5,6,7,8,9};
+        actual.erase(actual.begin() + 2, actual.begin() + 4);
+        EXPECT_EQ(actual, expected);
+
+        expected = {0,1,4};
+        actual.erase(actual.begin() + 3, actual.end());
+        EXPECT_EQ(actual, expected);
+
+        expected = {};
+        actual.erase(actual.begin(), actual.end());
+        EXPECT_EQ(actual, expected);
+    }
+    {
+        PaddedPODArray<UInt64> actual {0,1,2,3,4,5,6,7,8,9};
+        PaddedPODArray<UInt64> expected;
+
+        expected = {1,2,3,4,5,6,7,8,9};
+        actual.erase(actual.begin());
+        EXPECT_EQ(actual, expected);
+    }
+}
diff --git a/src/Compression/CachedCompressedReadBuffer.cpp b/src/Compression/CachedCompressedReadBuffer.cpp
index 4b4d33954a9..0548de07859 100644
--- a/src/Compression/CachedCompressedReadBuffer.cpp
+++ b/src/Compression/CachedCompressedReadBuffer.cpp
@@ -51,7 +51,7 @@ bool CachedCompressedReadBuffer::nextImpl()
        {
            owned_cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer();
            owned_cell->data.resize(size_decompressed + owned_cell->additional_bytes);
-            decompress(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum);
+            decompressTo(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum);
        }
diff --git a/src/Compression/CompressedReadBuffer.cpp b/src/Compression/CompressedReadBuffer.cpp
index 6a082164231..78241ec1b69 100644
--- a/src/Compression/CompressedReadBuffer.cpp
+++ b/src/Compression/CompressedReadBuffer.cpp
@@ -21,7 +21,7 @@ bool CompressedReadBuffer::nextImpl()
    memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
    working_buffer = Buffer(memory.data(), &memory[size_decompressed]);

-    decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
+    decompress(working_buffer, size_decompressed, size_compressed_without_checksum);

    return true;
 }
@@ -48,7 +48,7 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n)
        /// If the decompressed block fits entirely where it needs to be copied.
        if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read)
        {
-            decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum);
+            decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum);
            bytes_read += size_decompressed;
            bytes += size_decompressed;
        }
@@ -61,9 +61,9 @@
            memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
            working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
-            pos = working_buffer.begin();

-            decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
+            decompress(working_buffer, size_decompressed, size_compressed_without_checksum);

+            pos = working_buffer.begin();
            bytes_read += read(to + bytes_read, n - bytes_read);
            break;
diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp
index 8f5b779e4bc..79757d6f151 100644
--- a/src/Compression/CompressedReadBufferBase.cpp
+++ b/src/Compression/CompressedReadBufferBase.cpp
@@ -184,7 +184,7 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed,
 }

-void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
+static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_decompressed, CompressionCodecPtr & codec, bool allow_different_codecs)
 {
    ProfileEvents::increment(ProfileEvents::CompressedReadBufferBlocks);
    ProfileEvents::increment(ProfileEvents::CompressedReadBufferBytes, size_decompressed);
@@ -210,11 +210,38 @@
                ErrorCodes::CANNOT_DECOMPRESS);
        }
    }
+}
+
+void CompressedReadBufferBase::decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
+{
+    readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);

    codec->decompress(compressed_buffer, size_compressed_without_checksum, to);
 }

+void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum)
+{
+    readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);
+
+    if (codec->isNone())
+    {
+        /// Shortcut for NONE codec to avoid extra memcpy.
+        /// We do it by changing the buffer `to` to point to the existing uncompressed data.
+
+        UInt8 header_size = ICompressionCodec::getHeaderSize();
+        if (size_compressed_without_checksum < header_size)
+            throw Exception(ErrorCodes::CORRUPTED_DATA,
+                "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})",
+                size_compressed_without_checksum, static_cast<size_t>(header_size));
+
+        to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum);
+    }
+    else
+        codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin());
+}
+
+
 /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.
 CompressedReadBufferBase::CompressedReadBufferBase(ReadBuffer * in, bool allow_different_codecs_)
    : compressed_in(in), own_compressed_buffer(0), allow_different_codecs(allow_different_codecs_)
diff --git a/src/Compression/CompressedReadBufferBase.h b/src/Compression/CompressedReadBufferBase.h
index 60b8847f639..c1e928039ef 100644
--- a/src/Compression/CompressedReadBufferBase.h
+++ b/src/Compression/CompressedReadBufferBase.h
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include

 namespace DB
@@ -37,7 +38,12 @@ protected:
    /// Returns number of compressed bytes read.
    size_t readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum, bool always_copy);

-    void decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum);
+    /// Decompress into memory pointed by `to`
+    void decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum);
+
+    /// This method can change location of `to` to avoid unnecessary copy if data is uncompressed.
+    /// It is more efficient for compression codec NONE but not suitable if you want to decompress into specific location.
+    void decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum);

 public:
    /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.
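The split into `decompressTo` (copy-out, fixed destination) and `decompress` (may repoint the buffer) exists so the NONE codec can skip a memcpy entirely: the caller's buffer is aliased into the already-read frame, just past the codec header. A standalone sketch of that repointing; the 9-byte header (1 method byte + two UInt32 sizes) matches the ClickHouse frame layout, everything else is illustrative:

```cpp
#include <cassert>
#include <cstring>
#include <vector>

struct Buffer { char * begin; char * end; };   // stand-in for BufferBase::Buffer

constexpr size_t header_size = 9;              // method byte + compressed + decompressed sizes

// NONE-codec shortcut: alias the frame's payload instead of copying it out.
void decompressNone(Buffer & to, char * frame, size_t frame_size)
{
    assert(frame_size >= header_size);         // the real code throws CORRUPTED_DATA here
    to.begin = frame + header_size;
    to.end = frame + frame_size;
}

int main()
{
    std::vector<char> frame(header_size + 5);
    std::memcpy(frame.data() + header_size, "hello", 5);

    Buffer out{nullptr, nullptr};
    decompressNone(out, frame.data(), frame.size());
    assert(out.end - out.begin == 5 && std::memcmp(out.begin, "hello", 5) == 0);
}
```

This is also why `readBig` now sets `pos = working_buffer.begin()` only after calling `decompress`: the buffer may point somewhere new.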
diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp
index 54f360f417b..3a75ea14166 100644
--- a/src/Compression/CompressedReadBufferFromFile.cpp
+++ b/src/Compression/CompressedReadBufferFromFile.cpp
@@ -31,7 +31,7 @@ bool CompressedReadBufferFromFile::nextImpl()
    memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
    working_buffer = Buffer(memory.data(), &memory[size_decompressed]);

-    decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
+    decompress(working_buffer, size_decompressed, size_compressed_without_checksum);

    return true;
 }
@@ -108,7 +108,7 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n)
        /// If the decompressed block fits entirely where it needs to be copied.
        if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read)
        {
-            decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum);
+            decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum);
            bytes_read += size_decompressed;
            bytes += size_decompressed;
        }
@@ -122,9 +122,9 @@
            memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
            working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
-            pos = working_buffer.begin();

-            decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
+            decompress(working_buffer, size_decompressed, size_compressed_without_checksum);

+            pos = working_buffer.begin();
            bytes_read += read(to + bytes_read, n - bytes_read);
            break;
diff --git a/src/Compression/ICompressionCodec.cpp b/src/Compression/ICompressionCodec.cpp
index dec2b633046..46a12e50828 100644
--- a/src/Compression/ICompressionCodec.cpp
+++ b/src/Compression/ICompressionCodec.cpp
@@ -98,7 +98,7 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch
    UInt8 header_size = getHeaderSize();

    if (source_size < header_size)
-        throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size ({})", source_size, size_t(header_size));
+        throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", source_size, static_cast<size_t>(header_size));

    uint8_t our_method = getMethodByte();
    uint8_t method = source[0];
diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h
index dcfb13c359e..45eb1348ac6 100644
--- a/src/Coordination/CoordinationSettings.h
+++ b/src/Coordination/CoordinationSettings.h
@@ -31,6 +31,8 @@ struct Settings;
    M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \
    M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \
    M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \
+    M(UInt64, fresh_log_gap, 200, "When node became fresh", 0) \
+    M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consensus with similar speed", 0) \
    M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0)

 DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)
diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp
index edda26613dd..7e6c10ca125 100644
--- a/src/Coordination/NuKeeperServer.cpp
+++ b/src/Coordination/NuKeeperServer.cpp
@@ -30,6 +30,8 @@ NuKeeperServer::NuKeeperServer(
    , state_manager(nuraft::cs_new(server_id, "test_keeper_server", config, coordination_settings))
    , responses_queue(responses_queue_)
 {
+    if (coordination_settings->quorum_reads)
+        LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Quorum reads enabled, NuKeeper will work slower.");
 }

 void NuKeeperServer::startup()
@@ -59,6 +61,7 @@
    params.reserved_log_items_ = coordination_settings->reserved_log_items;
    params.snapshot_distance_ = coordination_settings->snapshot_distance;
    params.stale_log_gap_ = coordination_settings->stale_log_gap;
+    params.fresh_log_gap_ = coordination_settings->fresh_log_gap;
    params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds();
    params.auto_forwarding_ = coordination_settings->auto_forwarding;
    params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2;
@@ -106,7 +109,7 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(int64_t session_id, const Coord

 void NuKeeperServer::putRequest(const NuKeeperStorage::RequestForSession & request_for_session)
 {
    auto [session_id, request] = request_for_session;
-    if (isLeaderAlive() && request->isReadRequest())
+    if (!coordination_settings->quorum_reads && isLeaderAlive() && request->isReadRequest())
    {
        state_machine->processReadRequest(request_for_session);
    }
@@ -185,6 +188,9 @@
    if (next_index < last_commited || next_index - last_commited <= 1)
        commited_store = true;

+    if (initialized_flag)
+        return nuraft::cb_func::ReturnCode::Ok;
+
    auto set_initialized = [this] ()
    {
        std::unique_lock lock(initialized_mutex);
@@ -196,10 +202,27 @@
    {
        case nuraft::cb_func::BecomeLeader:
        {
-            if (commited_store) /// We become leader and store is empty, ready to serve requests
+            /// We become leader and store is empty or we already committed it
+            if (commited_store || initial_batch_committed)
                set_initialized();
            return nuraft::cb_func::ReturnCode::Ok;
        }
+        case nuraft::cb_func::BecomeFollower:
+        case nuraft::cb_func::GotAppendEntryReqFromLeader:
+        {
+            if (isLeaderAlive())
+            {
+                auto leader_index = raft_instance->get_leader_committed_log_idx();
+                auto our_index = raft_instance->get_committed_log_idx();
+                /// This may happen when we start RAFT cluster from scratch.
+                /// Node first became leader, and after that some other node became leader.
+                /// BecameFresh for this node will not be called because it was already fresh
+                /// when it was leader.
+                if (leader_index < our_index + coordination_settings->fresh_log_gap)
+                    set_initialized();
+            }
+            return nuraft::cb_func::ReturnCode::Ok;
+        }
        case nuraft::cb_func::BecomeFresh:
        {
            set_initialized(); /// We are fresh follower, ready to serve requests.
@@ -209,6 +232,7 @@
        {
            if (isLeader()) /// We have committed our log store and we are leader, ready to serve requests.
                set_initialized();
+            initial_batch_committed = true;
            return nuraft::cb_func::ReturnCode::Ok;
        }
        default: /// ignore other events
@@ -220,7 +244,7 @@ void NuKeeperServer::waitInit()
 {
    std::unique_lock lock(initialized_mutex);
    int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds();
-    if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; }))
+    if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag.load(); }))
        throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization");
 }
diff --git a/src/Coordination/NuKeeperServer.h b/src/Coordination/NuKeeperServer.h
index 17099045640..b5c13e62212 100644
--- a/src/Coordination/NuKeeperServer.h
+++ b/src/Coordination/NuKeeperServer.h
@@ -31,8 +31,9 @@ private:
    ResponsesQueue & responses_queue;

    std::mutex initialized_mutex;
-    bool initialized_flag = false;
+    std::atomic<bool> initialized_flag = false;
    std::condition_variable initialized_cv;
+    std::atomic<bool> initial_batch_committed = false;

    nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param);
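`initialized_flag` becomes `std::atomic<bool>` because `callbackFunc` now checks it lock-free on its hot path (the early return at the top), while `waitInit` still pairs the condition variable with the mutex and calls `.load()` in the predicate. A minimal sketch of that handshake pattern; names are illustrative, not NuKeeperServer's:

```cpp
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <thread>

std::mutex m;
std::condition_variable cv;
std::atomic<bool> initialized{false};

void onRaftEvent()  // hot path: lock-free early exit once initialized
{
    if (initialized)
        return;
    {
        std::lock_guard<std::mutex> lock(m);
        initialized = true;   // set under the mutex so waiters can't miss it
    }
    cv.notify_all();
}

bool waitInit(std::chrono::milliseconds timeout)
{
    std::unique_lock<std::mutex> lock(m);
    return cv.wait_for(lock, timeout, [] { return initialized.load(); });
}

int main()
{
    std::thread t(onRaftEvent);
    bool ok = waitInit(std::chrono::milliseconds(1000));
    t.join();
    return ok ? 0 : 1;
}
```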
diff --git a/src/Coordination/NuKeeperSnapshotManager.cpp b/src/Coordination/NuKeeperSnapshotManager.cpp
index f5a97619976..1caa1ea94b8 100644
--- a/src/Coordination/NuKeeperSnapshotManager.cpp
+++ b/src/Coordination/NuKeeperSnapshotManager.cpp
@@ -241,9 +241,10 @@ NuKeeperStorageSnapshot::~NuKeeperStorageSnapshot()
        storage->disableSnapshotMode();
 }

-NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_)
+NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_)
    : snapshots_path(snapshots_path_)
    , snapshots_to_keep(snapshots_to_keep_)
+    , storage_tick_time(storage_tick_time_)
 {
    namespace fs = std::filesystem;

@@ -325,22 +326,24 @@ nuraft::ptr<nuraft::buffer> NuKeeperSnapshotManager::serializeSnapshotToBuffer(c
    return writer.getBuffer();
 }

-SnapshotMetadataPtr NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr<nuraft::buffer> buffer)
+SnapshotMetaAndStorage NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const
 {
    ReadBufferFromNuraftBuffer reader(buffer);
    CompressedReadBuffer compressed_reader(reader);
-    return NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader);
+    auto storage = std::make_unique<NuKeeperStorage>(storage_tick_time);
+    auto snapshot_metadata = NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader);
+    return std::make_pair(snapshot_metadata, std::move(storage));
 }

-SnapshotMetadataPtr NuKeeperSnapshotManager::restoreFromLatestSnapshot(NuKeeperStorage * storage)
+SnapshotMetaAndStorage NuKeeperSnapshotManager::restoreFromLatestSnapshot()
 {
    if (existing_snapshots.empty())
-        return nullptr;
+        return {};

    auto buffer = deserializeLatestSnapshotBufferFromDisk();
    if (!buffer)
-        return nullptr;
-    return deserializeSnapshotFromBuffer(storage, buffer);
+        return {};
+    return deserializeSnapshotFromBuffer(buffer);
 }

 void NuKeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded()
diff --git a/src/Coordination/NuKeeperSnapshotManager.h b/src/Coordination/NuKeeperSnapshotManager.h
index 422baf11a65..d844a52eaf4 100644
--- a/src/Coordination/NuKeeperSnapshotManager.h
+++ b/src/Coordination/NuKeeperSnapshotManager.h
@@ -40,17 +40,20 @@ public:
 using NuKeeperStorageSnapshotPtr = std::shared_ptr<NuKeeperStorageSnapshot>;
 using CreateSnapshotCallback = std::function<void(NuKeeperStorageSnapshotPtr &&)>;

+
+using SnapshotMetaAndStorage = std::pair<SnapshotMetadataPtr, NuKeeperStoragePtr>;
+
 class NuKeeperSnapshotManager
 {
 public:
-    NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_);
+    NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_ = 500);

-    SnapshotMetadataPtr restoreFromLatestSnapshot(NuKeeperStorage * storage);
+    SnapshotMetaAndStorage restoreFromLatestSnapshot();

    static nuraft::ptr<nuraft::buffer> serializeSnapshotToBuffer(const NuKeeperStorageSnapshot & snapshot);
    std::string serializeSnapshotBufferToDisk(nuraft::buffer & buffer, size_t up_to_log_idx);

-    static SnapshotMetadataPtr deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr<nuraft::buffer> buffer);
+    SnapshotMetaAndStorage deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const;

    nuraft::ptr<nuraft::buffer> deserializeSnapshotBufferFromDisk(size_t up_to_log_idx) const;
    nuraft::ptr<nuraft::buffer> deserializeLatestSnapshotBufferFromDisk();
@@ -74,6 +77,7 @@ private:
    const std::string snapshots_path;
    const size_t snapshots_to_keep;
    std::map<size_t, std::string> existing_snapshots;
+    size_t storage_tick_time;
 };

 struct CreateSnapshotTask
diff --git a/src/Coordination/NuKeeperStateMachine.cpp b/src/Coordination/NuKeeperStateMachine.cpp
index 58a7ca3d5bc..a7037b8d644 100644
--- a/src/Coordination/NuKeeperStateMachine.cpp
+++ b/src/Coordination/NuKeeperStateMachine.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include

 namespace DB
 {
@@ -37,8 +38,7 @@ NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data)

 NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_)
    : coordination_settings(coordination_settings_)
-    , storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds())
-    , snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep)
+    , snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep, coordination_settings->dead_session_check_period_ms.totalMilliseconds())
    , responses_queue(responses_queue_)
    , snapshots_queue(snapshots_queue_)
    , last_committed_idx(0)
@@ -60,7 +60,7 @@ void NuKeeperStateMachine::init()
        try
        {
            latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index);
-            latest_snapshot_meta = snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_buf);
+            std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf);
            last_committed_idx = latest_snapshot_meta->get_last_log_idx();
            loaded = true;
            break;
@@ -83,6 +83,9 @@
    {
        LOG_DEBUG(log, "No existing snapshots, last committed log index {}", last_committed_idx);
    }
+
+    if (!storage)
+        storage = std::make_unique<NuKeeperStorage>(coordination_settings->dead_session_check_period_ms.totalMilliseconds());
 }

 nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, nuraft::buffer & data)
@@ -96,7 +99,7 @@
        nuraft::buffer_serializer bs(response);
        {
            std::lock_guard lock(storage_lock);
-            session_id = storage.getSessionID(session_timeout_ms);
+            session_id = storage->getSessionID(session_timeout_ms);
            bs.put_i64(session_id);
        }
        LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_timeout_ms);
@@ -109,7 +112,7 @@
        NuKeeperStorage::ResponsesForSessions responses_for_sessions;
        {
            std::lock_guard lock(storage_lock);
-            responses_for_sessions = storage.processRequest(request_for_session.request, request_for_session.session_id, log_idx);
+            responses_for_sessions = storage->processRequest(request_for_session.request, request_for_session.session_id, log_idx);
            for (auto & response_for_session : responses_for_sessions)
                responses_queue.push(response_for_session);
        }
@@ -133,7 +136,7 @@ bool NuKeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
    {
        std::lock_guard lock(storage_lock);
-        snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_ptr);
+        std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr);
    }
    last_committed_idx = s.get_last_log_idx();
    return true;
@@ -157,7 +160,7 @@ void NuKeeperStateMachine::create_snapshot(
    CreateSnapshotTask snapshot_task;
    {
        std::lock_guard lock(storage_lock);
-        snapshot_task.snapshot = std::make_shared<NuKeeperStorageSnapshot>(&storage, snapshot_meta_copy);
+        snapshot_task.snapshot = std::make_shared<NuKeeperStorageSnapshot>(storage.get(), snapshot_meta_copy);
    }

    snapshot_task.create_snapshot = [this, when_done] (NuKeeperStorageSnapshotPtr && snapshot)
@@ -179,7 +182,7 @@
            {
                /// Must do it with lock (clearing elements from list)
                std::lock_guard lock(storage_lock);
-                storage.clearGarbageAfterSnapshot();
+                storage->clearGarbageAfterSnapshot();
                /// Destroy snapshot with lock
                snapshot.reset();
                LOG_TRACE(log, "Cleared garbage after snapshot");
@@ -214,7 +217,7 @@ void NuKeeperStateMachine::save_logical_snp_obj(
    if (obj_id == 0)
    {
        std::lock_guard lock(storage_lock);
-        NuKeeperStorageSnapshot snapshot(&storage, s.get_last_log_idx());
+        NuKeeperStorageSnapshot snapshot(storage.get(), s.get_last_log_idx());
        cloned_buffer = snapshot_manager.serializeSnapshotToBuffer(snapshot);
    }
    else
@@ -225,7 +228,28 @@
    nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
    cloned_meta = nuraft::snapshot::deserialize(*snp_buf);

-    auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, s.get_last_log_idx());
+    /// Sometimes NuRaft can call save and create snapshots from different threads
+    /// at once. To avoid race conditions we serialize snapshots through snapshots_queue
+    /// TODO: make something better
+    CreateSnapshotTask snapshot_task;
+    std::shared_ptr<std::promise<void>> waiter = std::make_shared<std::promise<void>>();
+    auto future = waiter->get_future();
+    snapshot_task.snapshot = nullptr;
+    snapshot_task.create_snapshot = [this, waiter, cloned_buffer, log_idx = s.get_last_log_idx()] (NuKeeperStorageSnapshotPtr &&)
+    {
+        try
+        {
+            auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, log_idx);
+            LOG_DEBUG(log, "Saved snapshot {} to path {}", log_idx, result_path);
+        }
+        catch (...)
+        {
+            tryLogCurrentException(log);
+        }
+        waiter->set_value();
+    };
+    snapshots_queue.push(std::move(snapshot_task));
+    future.wait();

    {
        std::lock_guard lock(snapshots_lock);
@@ -233,7 +257,6 @@
        latest_snapshot_meta = cloned_meta;
    }

-    LOG_DEBUG(log, "Created snapshot {} with path {}", s.get_last_log_idx(), result_path);
    obj_id++;
 }
@@ -271,7 +294,7 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS
    NuKeeperStorage::ResponsesForSessions responses;
    {
        std::lock_guard lock(storage_lock);
-        responses = storage.processRequest(request_for_session.request, request_for_session.session_id, std::nullopt);
+        responses = storage->processRequest(request_for_session.request, request_for_session.session_id, std::nullopt);
    }
    for (const auto & response : responses)
        responses_queue.push(response);
@@ -280,13 +303,13 @@
 std::unordered_set<int64_t> NuKeeperStateMachine::getDeadSessions()
 {
    std::lock_guard lock(storage_lock);
-    return storage.getDeadSessions();
+    return storage->getDeadSessions();
 }

 void NuKeeperStateMachine::shutdownStorage()
 {
    std::lock_guard lock(storage_lock);
-    storage.finalize();
+    storage->finalize();
 }

 }
diff --git a/src/Coordination/NuKeeperStateMachine.h b/src/Coordination/NuKeeperStateMachine.h
index 905f3448c1a..af9ad6de4d2 100644
--- a/src/Coordination/NuKeeperStateMachine.h
+++ b/src/Coordination/NuKeeperStateMachine.h
@@ -52,7 +52,7 @@ public:

    NuKeeperStorage & getStorage()
    {
-        return storage;
+        return *storage;
    }

    void processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session);
@@ -68,7 +68,7 @@ private:

    CoordinationSettingsPtr coordination_settings;

-    NuKeeperStorage storage;
+    NuKeeperStoragePtr storage;

    NuKeeperSnapshotManager snapshot_manager;
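The `save_logical_snp_obj` change funnels the disk write through `snapshots_queue` and blocks on a `std::promise`/`std::future` pair, so that snapshot serialization is effectively single-threaded even when NuRaft calls save and create concurrently. A self-contained sketch of that funneling pattern; the queue, worker, and function names here are illustrative:

```cpp
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>

std::mutex m;
std::condition_variable cv;
std::queue<std::function<void()>> tasks;
bool done = false;

void worker()  // the single thread allowed to touch the disk
{
    while (true)
    {
        std::function<void()> task;
        {
            std::unique_lock<std::mutex> lock(m);
            cv.wait(lock, [] { return done || !tasks.empty(); });
            if (tasks.empty())
                return;
            task = std::move(tasks.front());
            tasks.pop();
        }
        task();
    }
}

void saveSnapshotBlocking(int log_idx)  // may be called from any thread
{
    auto waiter = std::make_shared<std::promise<void>>();
    auto future = waiter->get_future();
    {
        std::lock_guard<std::mutex> lock(m);
        tasks.push([waiter, log_idx]
        {
            std::printf("serializing snapshot %d to disk\n", log_idx);  // stands in for the real write
            waiter->set_value();
        });
    }
    cv.notify_one();
    future.wait();  // block until the worker actually wrote it
}

int main()
{
    std::thread t(worker);
    saveSnapshotBlocking(42);
    {
        std::lock_guard<std::mutex> lock(m);
        done = true;
    }
    cv.notify_one();
    t.join();
}
```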
[prev_node, &container, &ephemerals, path = request.path, child_basename] { if (prev_node.stat.ephemeralOwner != 0) - ephemerals[session_id].emplace(path); + ephemerals[prev_node.stat.ephemeralOwner].emplace(path); container.insert(path, prev_node); container.updateValue(parentPath(path), [&child_basename] (NuKeeperStorage::Node & parent) @@ -377,7 +382,6 @@ struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest { return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CHANGED); } - }; struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest @@ -641,6 +645,13 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor for (const auto & ephemeral_path : it->second) { container.erase(ephemeral_path); + container.updateValue(parentPath(ephemeral_path), [&ephemeral_path] (NuKeeperStorage::Node & parent) + { + --parent.stat.numChildren; + ++parent.stat.cversion; + parent.children.erase(getBaseName(ephemeral_path)); + }); + auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED); results.insert(results.end(), responses.begin(), responses.end()); } diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index c49df88159f..058eed55cab 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -131,4 +131,6 @@ public: } }; +using NuKeeperStoragePtr = std::unique_ptr; + } diff --git a/src/Coordination/NuKeeperStorageDispatcher.cpp b/src/Coordination/NuKeeperStorageDispatcher.cpp index 3aed0d99568..5b35b9c4829 100644 --- a/src/Coordination/NuKeeperStorageDispatcher.cpp +++ b/src/Coordination/NuKeeperStorageDispatcher.cpp @@ -132,6 +132,10 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati coordination_settings->loadFromConfig("test_keeper_server.coordination_settings", config); + request_thread = ThreadFromGlobalPool([this] { requestThread(); }); + responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); + snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); }); + server = std::make_unique(myid, coordination_settings, config, responses_queue, snapshots_queue); try { @@ -148,10 +152,8 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati throw; } - request_thread = ThreadFromGlobalPool([this] { requestThread(); }); - responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); + session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); }); - snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); }); LOG_DEBUG(log, "Dispatcher initialized"); } diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 01146248f63..cc3dcc04e53 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -897,25 +897,25 @@ TEST(CoordinationTest, TestStorageSnapshotSimple) manager.serializeSnapshotBufferToDisk(*buf, 2); EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin")); - DB::NuKeeperStorage restored_storage(500); auto debuf = manager.deserializeSnapshotBufferFromDisk(2); - manager.deserializeSnapshotFromBuffer(&restored_storage, debuf); - EXPECT_EQ(restored_storage.container.size(), 3); - EXPECT_EQ(restored_storage.container.getValue("/").children.size(), 1); - EXPECT_EQ(restored_storage.container.getValue("/hello").children.size(), 1); - 
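The remove path above now erases the node from the set of its ephemeralOwner rather than of the session that issued the remove, and drops the per-session set once it becomes empty. The same bookkeeping in isolation, with plain standard containers standing in for the real NuKeeperStorage types:

#include <cstdint>
#include <string>
#include <unordered_map>
#include <unordered_set>

using Ephemerals = std::unordered_map<int64_t, std::unordered_set<std::string>>;

/// Erase the path from the set of the session that owns the node,
/// which is not necessarily the session performing the remove.
void removeEphemeral(Ephemerals & ephemerals, int64_t ephemeral_owner, const std::string & path)
{
    auto it = ephemerals.find(ephemeral_owner);
    if (it == ephemerals.end())
        return;

    it->second.erase(path);
    if (it->second.empty())
        ephemerals.erase(it); /// Keep the map free of empty per-session sets.
}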
EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").children.size(), 0); + auto [snapshot_meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage.container.getValue("/").data, ""); - EXPECT_EQ(restored_storage.container.getValue("/hello").data, "world"); - EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").data, "somedata"); - EXPECT_EQ(restored_storage.session_id_counter, 7); - EXPECT_EQ(restored_storage.zxid, 2); - EXPECT_EQ(restored_storage.ephemerals.size(), 2); - EXPECT_EQ(restored_storage.ephemerals[3].size(), 1); - EXPECT_EQ(restored_storage.ephemerals[1].size(), 1); - EXPECT_EQ(restored_storage.session_and_timeout.size(), 2); + EXPECT_EQ(restored_storage->container.size(), 3); + EXPECT_EQ(restored_storage->container.getValue("/").children.size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello").children.size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").children.size(), 0); + + EXPECT_EQ(restored_storage->container.getValue("/").data, ""); + EXPECT_EQ(restored_storage->container.getValue("/hello").data, "world"); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").data, "somedata"); + EXPECT_EQ(restored_storage->session_id_counter, 7); + EXPECT_EQ(restored_storage->zxid, 2); + EXPECT_EQ(restored_storage->ephemerals.size(), 2); + EXPECT_EQ(restored_storage->ephemerals[3].size(), 1); + EXPECT_EQ(restored_storage->ephemerals[1].size(), 1); + EXPECT_EQ(restored_storage->session_and_timeout.size(), 2); } TEST(CoordinationTest, TestStorageSnapshotMoreWrites) @@ -946,15 +946,14 @@ TEST(CoordinationTest, TestStorageSnapshotMoreWrites) manager.serializeSnapshotBufferToDisk(*buf, 50); EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin")); - DB::NuKeeperStorage restored_storage(500); auto debuf = manager.deserializeSnapshotBufferFromDisk(50); - manager.deserializeSnapshotFromBuffer(&restored_storage, debuf); + auto [meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage.container.size(), 51); + EXPECT_EQ(restored_storage->container.size(), 51); for (size_t i = 0; i < 50; ++i) { - EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); } } @@ -987,14 +986,13 @@ TEST(CoordinationTest, TestStorageSnapshotManySnapshots) EXPECT_TRUE(fs::exists("./snapshots/snapshot_250.bin")); - DB::NuKeeperStorage restored_storage(500); - manager.restoreFromLatestSnapshot(&restored_storage); + auto [meta, restored_storage] = manager.restoreFromLatestSnapshot(); - EXPECT_EQ(restored_storage.container.size(), 251); + EXPECT_EQ(restored_storage->container.size(), 251); for (size_t i = 0; i < 250; ++i) { - EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); } } @@ -1040,12 +1038,11 @@ TEST(CoordinationTest, TestStorageSnapshotMode) EXPECT_FALSE(storage.container.contains("/hello_" + std::to_string(i))); } - DB::NuKeeperStorage restored_storage(500); - manager.restoreFromLatestSnapshot(&restored_storage); + auto [meta, restored_storage] = manager.restoreFromLatestSnapshot(); for (size_t i = 0; i < 50; ++i) { - EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, 
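These test updates track the new snapshot-manager contract: deserialization allocates and returns both the snapshot metadata and the storage, instead of filling a caller-provided object. A rough sketch of that calling convention, with all type names hypothetical:

#include <cstdint>
#include <memory>
#include <utility>

struct SnapshotMeta { uint64_t up_to_log_idx = 0; };
struct Storage { /* container, ephemerals, session state, ... */ };

using SnapshotMetaPtr = std::shared_ptr<SnapshotMeta>;
using StoragePtr = std::unique_ptr<Storage>;

/// Deserialization owns allocation: the caller never sees a half-filled storage.
std::pair<SnapshotMetaPtr, StoragePtr> restoreFromLatestSnapshot()
{
    auto meta = std::make_shared<SnapshotMeta>();
    auto storage = std::make_unique<Storage>();
    /// ... fill both from the snapshot buffer ...
    return {std::move(meta), std::move(storage)};
}

/// At the call site, structured bindings unpack the pair, as in the tests:
///     auto [meta, restored_storage] = restoreFromLatestSnapshot();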
"world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); } } @@ -1071,8 +1068,7 @@ TEST(CoordinationTest, TestStorageSnapshotBroken) plain_buf.truncate(34); plain_buf.sync(); - DB::NuKeeperStorage restored_storage(500); - EXPECT_THROW(manager.restoreFromLatestSnapshot(&restored_storage), DB::Exception); + EXPECT_THROW(manager.restoreFromLatestSnapshot(), DB::Exception); } nuraft::ptr getBufferFromZKRequest(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) @@ -1236,6 +1232,37 @@ TEST(CoordinationTest, TestStateMachineAndLogStore) } } +TEST(CoordinationTest, TestEphemeralNodeRemove) +{ + using namespace Coordination; + using namespace DB; + + ChangelogDirTest snapshots("./snapshots"); + CoordinationSettingsPtr settings = std::make_shared(); + + ResponsesQueue queue; + SnapshotsQueue snapshots_queue{1}; + auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings); + state_machine->init(); + + std::shared_ptr request_c = std::make_shared(); + request_c->path = "/hello"; + request_c->is_ephemeral = true; + auto entry_c = getLogEntryFromZKRequest(0, 1, request_c); + state_machine->commit(1, entry_c->get_buf()); + const auto & storage = state_machine->getStorage(); + + EXPECT_EQ(storage.ephemerals.size(), 1); + std::shared_ptr request_d = std::make_shared(); + request_d->path = "/hello"; + /// Delete from other session + auto entry_d = getLogEntryFromZKRequest(0, 2, request_d); + state_machine->commit(2, entry_d->get_buf()); + + EXPECT_EQ(storage.ephemerals.size(), 0); +} + + int main(int argc, char ** argv) { Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); diff --git a/src/Core/Field.h b/src/Core/Field.h index 558e1fafd74..81d06693a7f 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -953,3 +953,26 @@ void writeFieldText(const Field & x, WriteBuffer & buf); String toString(const Field & x); } + +template <> +struct fmt::formatter +{ + constexpr auto parse(format_parse_context & ctx) + { + auto it = ctx.begin(); + auto end = ctx.end(); + + /// Only support {}. 
+ if (it != end && *it != '}') + throw format_error("invalid format"); + + return it; + } + + template <typename FormatContext> + auto format(const DB::Field & x, FormatContext & ctx) + { + return format_to(ctx.out(), "{}", toString(x)); + } +}; + diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index e96ce1824d2..7b1779d4346 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index fe777355ca1..eedf4dd3d87 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -101,7 +101,7 @@ template <DictionaryKeyType dictionary_key_type> double CacheDictionary<dictionary_key_type>::getLoadFactor() const { const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - return static_cast<double>(cache_storage_ptr->getSize()) / cache_storage_ptr->getMaxSize(); + return cache_storage_ptr->getLoadFactor(); } template <DictionaryKeyType dictionary_key_type> @@ -333,9 +333,7 @@ Columns CacheDictionary<dictionary_key_type>::getColumnsImpl( FetchResult result_of_fetch_from_storage; { - /// Write lock on storage - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); } diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index cf0b74e8bd2..f0028dd8848 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -30,28 +31,31 @@ struct CacheDictionaryStorageConfiguration const DictionaryLifetime lifetime; }; -/** Keys are stored in LRUCache and column values are serialized into arena. - - Cell in LRUCache consists of allocated size and place in arena where columns' serialized data is stored. - - Columns are serialized by rows. - - When cell is removed from LRUCache data associated with it is also removed from arena. - - In case of complex key we also store key data in arena and it is removed from arena. -*/ +/** ICacheDictionaryStorage implementation that keeps keys in a hash table with fixed collision length. + * Values in the hash table point to indexes in the attribute arrays.
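The formatter specialization above is the whole story for making a type printable through fmt; the same two-method shape works for any type. A sketch for a made-up Point type, kept to the bare {} spec exactly like the DB::Field version (fmt API as of the style used here, which may differ in newer fmt releases):

#include <fmt/format.h>

struct Point { int x = 0; int y = 0; };

template <>
struct fmt::formatter<Point>
{
    constexpr auto parse(format_parse_context & ctx)
    {
        auto it = ctx.begin();
        if (it != ctx.end() && *it != '}')
            throw format_error("invalid format"); /// Bare {} only, as above.
        return it;
    }

    template <typename FormatContext>
    auto format(const Point & p, FormatContext & ctx)
    {
        return format_to(ctx.out(), "({}, {})", p.x, p.y);
    }
};

/// Usage: fmt::format("point: {}", Point{1, 2}) yields "point: (1, 2)".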
+ */ template class CacheDictionaryStorage final : public ICacheDictionaryStorage { + + static constexpr size_t max_collision_length = 10; + public: using KeyType = std::conditional_t; static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); - explicit CacheDictionaryStorage(CacheDictionaryStorageConfiguration & configuration_) + explicit CacheDictionaryStorage( + const DictionaryStructure & dictionary_structure, + CacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , rnd_engine(randomSeed()) - , cache(configuration.max_size_in_cells, false, { arena }) { + size_t cells_size = roundUpToPowerOfTwoOrZero(std::max(configuration.max_size_in_cells, max_collision_length)); + + cells.resize_fill(cells_size); + size_overlap_mask = cells_size - 1; + + setup(dictionary_structure); } bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } @@ -71,9 +75,7 @@ public: const DictionaryStorageFetchRequest & fetch_request) override { if constexpr (dictionary_key_type == DictionaryKeyType::simple) - { return fetchColumnsForKeysImpl(keys, fetch_request); - } else throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); } @@ -109,9 +111,7 @@ public: const DictionaryStorageFetchRequest & column_fetch_requests) override { if constexpr (dictionary_key_type == DictionaryKeyType::complex) - { return fetchColumnsForKeysImpl(keys, column_fetch_requests); - } else throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); } @@ -140,79 +140,162 @@ public: throw Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); } - size_t getSize() const override { return cache.size(); } + size_t getSize() const override { return size; } - size_t getMaxSize() const override { return cache.getMaxSize(); } + double getLoadFactor() const override { return static_cast(size) / configuration.max_size_in_cells; } - size_t getBytesAllocated() const override { return arena.size() + cache.getSizeInBytes(); } + size_t getBytesAllocated() const override + { + size_t attributes_size_in_bytes = 0; + size_t attributes_size = attributes.size(); + + for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index) + { + getAttributeContainer(attribute_index, [&](const auto & container) + { + attributes_size_in_bytes += container.capacity() * sizeof(container[0]); + }); + } + + return arena.size() + sizeof(Cell) * configuration.max_size_in_cells + attributes_size_in_bytes; + } private: + struct FetchedKey + { + FetchedKey(size_t element_index_, bool is_default_) + : element_index(element_index_) + , is_default(is_default_) + {} + + size_t element_index; + bool is_default; + }; + template - ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl( + KeysStorageFetchResult fetchColumnsForKeysImpl( const PaddedPODArray & keys, const DictionaryStorageFetchRequest & fetch_request) { KeysStorageFetchResult result; result.fetched_columns = fetch_request.makeAttributesResultColumns(); - result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); + result.key_index_to_state.resize_fill(keys.size()); - const auto now = std::chrono::system_clock::now(); + const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); size_t fetched_columns_index = 0; + size_t keys_size = keys.size(); 
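The members introduced above (cells, size_overlap_mask, max_collision_length) define an open-addressing table with a bounded probe window: hash the key, mask into a power-of-two array, scan at most max_collision_length slots. A stripped-down sketch of that lookup, with a simplified Cell and std::hash in place of the real containers:

#include <cstddef>
#include <cstdint>
#include <ctime>
#include <functional>
#include <vector>

struct Cell { uint64_t key = 0; time_t deadline = 0; };

static constexpr size_t max_collision_length = 10;

/// cells.size() must be a power of two so that `hash & mask` replaces modulo.
const Cell * find(const std::vector<Cell> & cells, size_t size_overlap_mask, uint64_t key)
{
    size_t start = std::hash<uint64_t>{}(key) & size_overlap_mask;

    for (size_t i = 0; i < max_collision_length; ++i)
    {
        const Cell & cell = cells[(start + i) & size_overlap_mask];
        if (cell.key == key)
            return &cell;
    }

    return nullptr; /// At most max_collision_length slots are ever touched.
}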
std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds); - size_t keys_size = keys.size(); + PaddedPODArray fetched_keys; + fetched_keys.resize_fill(keys_size); for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; - auto * it = cache.find(key); + auto [key_state, cell_index] = getKeyStateAndCellIndex(key, now); - if (it) + if (unlikely(key_state == KeyState::not_found)) { - /// Columns values for key are serialized in cache now deserialize them - const auto & cell = it->getMapped(); + result.key_index_to_state[key_index] = {KeyState::not_found}; + ++result.not_found_keys_size; + continue; + } - bool has_deadline = cellHasDeadline(cell); + auto & cell = cells[cell_index]; - if (has_deadline && now > cell.deadline + max_lifetime_seconds) - { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; - continue; - } - else if (has_deadline && now > cell.deadline) - { - result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index}; - ++result.expired_keys_size; - } - else - { - result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index}; - ++result.found_keys_size; - } + result.expired_keys_size += static_cast(key_state == KeyState::expired); - ++fetched_columns_index; + result.key_index_to_state[key_index] = {key_state, fetched_columns_index}; + fetched_keys[fetched_columns_index] = FetchedKey(cell.element_index, cell.is_default); - if (cell.isDefault()) + ++fetched_columns_index; + + result.key_index_to_state[key_index].setDefaultValue(cell.is_default); + result.default_keys_size += cell.is_default; + } + + result.found_keys_size = keys_size - (result.expired_keys_size + result.not_found_keys_size); + + for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) + { + if (!fetch_request.shouldFillResultColumnWithIndex(attribute_index)) + continue; + + auto & attribute = attributes[attribute_index]; + const auto & default_value_provider = fetch_request.defaultValueProviderAtIndex(attribute_index); + + size_t fetched_keys_size = fetched_keys.size(); + auto & fetched_column = *result.fetched_columns[attribute_index]; + fetched_column.reserve(fetched_keys_size); + + if (unlikely(attribute.is_complex_type)) + { + auto & container = std::get>(attribute.attribute_container); + + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) { - result.key_index_to_state[key_index].setDefault(); - ++result.default_keys_size; - insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index); - } - else - { - const char * place_for_serialized_columns = cell.place_for_serialized_columns; - deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns); + auto fetched_key = fetched_keys[fetched_key_index]; + + if (unlikely(fetched_key.is_default)) + fetched_column.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + fetched_column.insert(container[fetched_key.element_index]); } } else { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnType = + std::conditional_t, ColumnString, + std::conditional_t, ColumnDecimal, + ColumnVector>>; + + auto & container = 
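Structurally, the rewritten fetch is two passes: the loop above only resolves each key to an (element_index, is_default) pair, and the per-attribute pass then appends into each result column, so every typed container is walked contiguously. A schematic of the second pass, reduced to plain vectors:

#include <cstddef>
#include <vector>

struct FetchedKey { size_t element_index; bool is_default; };

/// Second pass: one contiguous walk per attribute, appending into the result column.
template <typename T>
void fillColumn(
    const std::vector<FetchedKey> & fetched_keys,
    const std::vector<T> & container,      /// Typed storage for this attribute.
    const T & default_value,               /// From the request's default-value provider.
    std::vector<T> & result_column)
{
    result_column.reserve(fetched_keys.size());

    for (const auto & fetched_key : fetched_keys)
    {
        if (fetched_key.is_default)
            result_column.push_back(default_value);
        else
            result_column.push_back(container[fetched_key.element_index]);
    }
}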
std::get>(attribute.attribute_container); + ColumnType & column_typed = static_cast(fetched_column); + + if constexpr (std::is_same_v) + { + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + + if (unlikely(fetched_key.is_default)) + column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + { + auto item = container[fetched_key.element_index]; + column_typed.insertData(item.data, item.size); + } + } + } + else + { + auto & data = column_typed.getData(); + + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + + if (unlikely(fetched_key.is_default)) + column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + { + auto item = container[fetched_key.element_index]; + data.push_back(item); + } + } + } + }; + + callOnDictionaryAttributeType(attribute.type, type_call); } } @@ -221,58 +304,108 @@ private: void insertColumnsForKeysImpl(const PaddedPODArray & keys, Columns columns) { - Arena temporary_values_pool; - - size_t columns_to_serialize_size = columns.size(); - PaddedPODArray temporary_column_data(columns_to_serialize_size); - const auto now = std::chrono::system_clock::now(); - size_t keys_size = keys.size(); + Field column_value; - for (size_t key_index = 0; key_index < keys_size; ++key_index) + for (size_t key_index = 0; key_index < keys.size(); ++key_index) { - size_t allocated_size_for_columns = 0; - const char * block_start = nullptr; - auto key = keys[key_index]; - auto * it = cache.find(key); - for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index) + size_t cell_index = getCellIndexForInsert(key); + auto & cell = cells[cell_index]; + + bool cell_was_default = cell.is_default; + cell.is_default = false; + + bool was_inserted = cell.deadline == 0; + + if (was_inserted) { - auto & column = columns[column_index]; - temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start); - allocated_size_for_columns += temporary_column_data[column_index].size; - } + if constexpr (std::is_same_v) + cell.key = copyStringInArena(key); + else + cell.key = key; - char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns); - memcpy(reinterpret_cast(place_for_serialized_columns), reinterpret_cast(block_start), allocated_size_for_columns); + for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index) + { + auto & column = columns[attribute_index]; - if (it) - { - /// Cell exists need to free previous serialized place and update deadline - auto & cell = it->getMapped(); + getAttributeContainer(attribute_index, [&](auto & container) + { + container.emplace_back(); + cell.element_index = container.size() - 1; - if (cell.place_for_serialized_columns) - arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); + using ElementType = std::decay_t; - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; + column->get(key_index, column_value); + + if constexpr (std::is_same_v) + container.back() = column_value; + else if constexpr (std::is_same_v) + { + const String & string_value = column_value.get(); + StringRef string_value_ref = StringRef {string_value.data(), string_value.size()}; + 
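Note the ownership rule in the string branch here: the new value is copied into the arena first, and the previous allocation is freed only when the cell was not a default, since default cells own no arena memory. The same exchange in isolation, with a trivial new[]/delete[] arena standing in for ArenaWithFreeLists:

#include <cstddef>
#include <cstring>
#include <string_view>

/// Trivial stand-in for ArenaWithFreeLists, just to make the sketch runnable.
struct Arena
{
    char * alloc(size_t size) { return new char[size]; }
    void free(char * data, size_t /*size*/) { delete[] data; }
};

std::string_view replaceValue(
    Arena & arena, std::string_view previous, bool was_default, std::string_view new_value)
{
    /// Copy the new value in first ...
    char * place = arena.alloc(new_value.size());
    std::memcpy(place, new_value.data(), new_value.size());

    /// ... then release the old one, but only if the cell really owned memory.
    if (!was_default)
        arena.free(const_cast<char *>(previous.data()), previous.size());

    return {place, new_value.size()};
}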
StringRef inserted_value = copyStringInArena(string_value_ref); + container.back() = inserted_value; + } + else + container.back() = column_value.get>(); + }); + } + + ++size; } else { - /// No cell exists so create and put in cache - Cell cell; + if (cell.key != key) + { + if constexpr (std::is_same_v) + { + char * data = const_cast(cell.key.data); + arena.free(data, cell.key.size); + cell.key = copyStringInArena(key); + } + else + cell.key = key; + } - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; + /// Put values into existing index + size_t index_to_use = cell.element_index; - insertCellInCache(key, cell); + for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index) + { + auto & column = columns[attribute_index]; + + getAttributeContainer(attribute_index, [&](auto & container) + { + using ElementType = std::decay_t; + + column->get(key_index, column_value); + + if constexpr (std::is_same_v) + container[index_to_use] = column_value; + else if constexpr (std::is_same_v) + { + const String & string_value = column_value.get(); + StringRef string_ref_value = StringRef {string_value.data(), string_value.size()}; + StringRef inserted_value = copyStringInArena(string_ref_value); + + if (!cell_was_default) + { + StringRef previous_value = container[index_to_use]; + arena.free(const_cast(previous_value.data), previous_value.size); + } + + container[index_to_use] = inserted_value; + } + else + container[index_to_use] = column_value.get>(); + }); + } } - temporary_values_pool.rollback(allocated_size_for_columns); + setCellDeadline(cell, now); } } @@ -280,94 +413,224 @@ private: { const auto now = std::chrono::system_clock::now(); - for (auto key : keys) + size_t keys_size = keys.size(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) { - auto * it = cache.find(key); + auto key = keys[key_index]; - if (it) + size_t cell_index = getCellIndexForInsert(key); + auto & cell = cells[cell_index]; + + bool was_inserted = cell.deadline == 0; + bool cell_was_default = cell.is_default; + + cell.is_default = true; + + if (was_inserted) { - auto & cell = it->getMapped(); + if constexpr (std::is_same_v) + cell.key = copyStringInArena(key); + else + cell.key = key; - setCellDeadline(cell, now); + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [&](auto & container) + { + container.emplace_back(); + cell.element_index = container.size() - 1; + }); + } - if (cell.place_for_serialized_columns) - arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); - - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; + ++size; } else { - Cell cell; + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [&](const auto & container) + { + using ElementType = std::decay_t; - setCellDeadline(cell, now); - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; + if constexpr (std::is_same_v) + { + if (!cell_was_default) + { + StringRef previous_value = container[cell.element_index]; + arena.free(const_cast(previous_value.data), previous_value.size); + } + } + }); + } - insertCellInCache(key, cell); + if (cell.key != key) + { + if constexpr (std::is_same_v) + { + char * data = const_cast(cell.key.data); + arena.free(data, 
cell.key.size); + cell.key = copyStringInArena(key); + } + else + cell.key = key; + } } + + setCellDeadline(cell, now); } } PaddedPODArray getCachedKeysImpl() const { PaddedPODArray result; - result.reserve(cache.size()); + result.reserve(size); - for (auto & node : cache) + for (auto & cell : cells) { - auto & cell = node.getMapped(); - - if (cell.isDefault()) + if (cell.deadline == 0) continue; - result.emplace_back(node.getKey()); + if (cell.is_default) + continue; + + result.emplace_back(cell.key); } return result; } + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) + { + auto & attribute = attributes[attribute_index]; + auto & attribute_type = attribute.type; + + if (unlikely(attribute.is_complex_type)) + { + auto & container = std::get>(attribute.attribute_container); + std::forward(func)(container); + } + else + { + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + auto & container = std::get>(attribute.attribute_container); + std::forward(func)(container); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } + } + + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) const + { + return const_cast *>(this)->template getAttributeContainer(attribute_index, std::forward(func)); + } + + StringRef copyStringInArena(StringRef value_to_copy) + { + size_t value_to_copy_size = value_to_copy.size; + char * place_for_key = arena.alloc(value_to_copy_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); + StringRef updated_value{place_for_key, value_to_copy_size}; + + return updated_value; + } + + void setup(const DictionaryStructure & dictionary_structure) + { + /// For each dictionary attribute create storage attribute + /// For simple attributes create PODArray, for complex vector of Fields + + attributes.reserve(dictionary_structure.attributes.size()); + + for (const auto & dictionary_attribute : dictionary_structure.attributes) + { + auto attribute_type = dictionary_attribute.underlying_type; + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + attributes.emplace_back(); + auto & last_attribute = attributes.back(); + last_attribute.type = attribute_type; + last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array; + + if (dictionary_attribute.is_nullable) + last_attribute.attribute_container = std::vector(); + else + last_attribute.attribute_container = PaddedPODArray(); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } + } + using TimePoint = std::chrono::system_clock::time_point; struct Cell { - TimePoint deadline; - size_t allocated_size_for_columns; - char * place_for_serialized_columns; - - inline bool isDefault() const { return place_for_serialized_columns == nullptr; } - inline void setDefault() - { - place_for_serialized_columns = nullptr; - allocated_size_for_columns = 0; - } + KeyType key; + size_t element_index; + bool is_default; + time_t deadline; }; - void insertCellInCache(KeyType & key, const Cell & cell) + struct Attribute { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - { - /// Copy complex key into arena and put in cache - size_t key_size = key.size; - char * 
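getAttributeContainer above is a thin dispatcher over the attribute_container variant; conceptually it is std::visit specialized by the dictionary attribute type. A generic sketch of the idea, with the variant reduced to three alternatives:

#include <cstdint>
#include <string>
#include <variant>
#include <vector>

/// One properly typed, contiguous container per attribute.
using AttributeContainer = std::variant<
    std::vector<uint64_t>,
    std::vector<double>,
    std::vector<std::string>>;

/// std::visit plays the role that callOnDictionaryAttributeType plays above.
template <typename Func>
void withContainer(AttributeContainer & container, Func && func)
{
    std::visit([&](auto & typed_container) { func(typed_container); }, container);
}

/// Usage: append a default-constructed element, whatever the underlying type.
void emplaceBack(AttributeContainer & container)
{
    withContainer(container, [](auto & typed) { typed.emplace_back(); });
}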
place_for_key = arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - KeyType updated_key{place_for_key, key_size}; - key = updated_key; - } + AttributeUnderlyingType type; + bool is_complex_type; - cache.insert(key, cell); - } + std::variant< + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + std::vector> attribute_container; + }; - inline static bool cellHasDeadline(const Cell & cell) - { - return cell.deadline != std::chrono::system_clock::from_time_t(0); - } + CacheDictionaryStorageConfiguration configuration; + + pcg64 rnd_engine; + + size_t size_overlap_mask = 0; + + size_t size = 0; + + PaddedPODArray cells; + + ArenaWithFreeLists arena; + + std::vector attributes; inline void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { - cell.deadline = std::chrono::system_clock::from_time_t(0); + /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds + /// to the expiration time. And it overflows pretty well. + auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); return; } @@ -375,44 +638,75 @@ private: size_t max_sec_lifetime = configuration.lifetime.max_sec; std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - cell.deadline = now + std::chrono::seconds(distribution(rnd_engine)); + + auto deadline = now + std::chrono::seconds(distribution(rnd_engine)); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); } - template - friend class ArenaCellDisposer; - - CacheDictionaryStorageConfiguration configuration; - - ArenaWithFreeLists arena; - - pcg64 rnd_engine; - - class ArenaCellDisposer + inline size_t getCellIndex(const KeyType key) const { - public: - ArenaWithFreeLists & arena; + const size_t hash = DefaultHash()(key); + const size_t index = hash & size_overlap_mask; + return index; + } - template - void operator()(const Key & key, const Value & value) const + using KeyStateAndCellIndex = std::pair; + + inline KeyStateAndCellIndex getKeyStateAndCellIndex(const KeyType key, const time_t now) const + { + size_t place_value = getCellIndex(key); + const size_t place_value_end = place_value + max_collision_length; + + time_t max_lifetime_seconds = static_cast(configuration.strict_max_lifetime_seconds); + + for (; place_value < place_value_end; ++place_value) { - /// In case of complex key we keep it in arena - if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); + const auto cell_place_value = place_value & size_overlap_mask; + const auto & cell = cells[cell_place_value]; - if (value.place_for_serialized_columns) - arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns); + if (cell.key != key) + continue; + + if (unlikely(now > cell.deadline + max_lifetime_seconds)) + return std::make_pair(KeyState::not_found, cell_place_value); + + if (unlikely(now > cell.deadline)) + return std::make_pair(KeyState::expired, cell_place_value); + + return std::make_pair(KeyState::found, cell_place_value); } - }; - using SimpleKeyLRUHashMap = LRUHashMap; - using 
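The deadline comment above is worth unpacking: the expiry checks add strict_max_lifetime_seconds to the stored deadline, so a naive "never expires" sentinel at the maximum time point would overflow; the code therefore backs the sentinel off by twice that margin. The computation in isolation:

#include <chrono>
#include <cstddef>
#include <ctime>

/// Sentinel for "no lifetime configured": far enough below time_point::max()
/// that `deadline + strict_max_lifetime_seconds` cannot overflow in the checks.
time_t neverExpiresDeadline(size_t strict_max_lifetime_seconds)
{
    using Clock = std::chrono::system_clock;

    auto deadline = Clock::time_point::max()
        - 2 * std::chrono::seconds(strict_max_lifetime_seconds);

    return Clock::to_time_t(deadline);
}

/// The expiry checks then need no special case:
///     now > deadline + strict_max  ->  treat as not found
///     now > deadline               ->  expired, may still be served if allowed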
ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; + return std::make_pair(KeyState::not_found, place_value & size_overlap_mask); + } - using CacheLRUHashMap = std::conditional_t< - dictionary_key_type == DictionaryKeyType::simple, - SimpleKeyLRUHashMap, - ComplexKeyLRUHashMap>; + inline size_t getCellIndexForInsert(const KeyType & key) const + { + size_t place_value = getCellIndex(key); + const size_t place_value_end = place_value + max_collision_length; + size_t oldest_place_value = place_value; - CacheLRUHashMap cache; + time_t oldest_time = std::numeric_limits::max(); + + for (; place_value < place_value_end; ++place_value) + { + const size_t cell_place_value = place_value & size_overlap_mask; + const Cell cell = cells[cell_place_value]; + + if (cell.deadline == 0) + return cell_place_value; + + if (cell.key == key) + return cell_place_value; + + if (cell.deadline < oldest_time) + { + oldest_time = cell.deadline; + oldest_place_value = cell_place_value; + } + } + + return oldest_place_value; + } }; } diff --git a/src/Dictionaries/ICacheDictionaryStorage.h b/src/Dictionaries/ICacheDictionaryStorage.h index 8db2dab536c..72b3ef76f11 100644 --- a/src/Dictionaries/ICacheDictionaryStorage.h +++ b/src/Dictionaries/ICacheDictionaryStorage.h @@ -12,9 +12,9 @@ struct KeyState { enum State: uint8_t { - not_found = 2, - expired = 4, - found = 8, + not_found = 0, + expired = 1, + found = 2, }; KeyState(State state_, size_t fetched_column_index_) @@ -31,9 +31,10 @@ struct KeyState inline bool isNotFound() const { return state == State::not_found; } inline bool isDefault() const { return is_default; } inline void setDefault() { is_default = true; } + inline void setDefaultValue(bool is_default_value) { is_default = is_default_value; } /// Valid only if keyState is found or expired inline size_t getFetchedColumnIndex() const { return fetched_column_index; } - + inline void setFetchedColumnIndex(size_t fetched_column_index_value) { fetched_column_index = fetched_column_index_value; } private: State state = not_found; size_t fetched_column_index = 0; @@ -111,8 +112,8 @@ public: /// Return size of keys in storage virtual size_t getSize() const = 0; - /// Return maximum size of keys in storage - virtual size_t getMaxSize() const = 0; + /// Returns storage load factor + virtual double getLoadFactor() const = 0; /// Return bytes allocated in storage virtual size_t getBytesAllocated() const = 0; diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index 16a8954de58..67f0465a2c7 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -56,7 +56,6 @@ struct SSDCacheDictionaryStorageConfiguration const std::string file_path; const size_t max_partitions_count; - const size_t max_stored_keys; const size_t block_size; const size_t file_blocks_size; const size_t read_buffer_blocks_size; @@ -127,7 +126,7 @@ public: /// Reset block with new block_data /// block_data must be filled with zeroes if it is new block - ALWAYS_INLINE inline void reset(char * new_block_data) + inline void reset(char * new_block_data) { block_data = new_block_data; current_block_offset = block_header_size; @@ -135,13 +134,13 @@ public: } /// Check if it is enough place to write key in block - ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const + inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & 
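getCellIndexForInsert, shown above, completes the probing scheme: within the same bounded window it reuses an empty slot or the key's own slot, and otherwise evicts the cell with the oldest deadline. The slot-selection policy on its own, reusing the simplified Cell from the earlier sketch:

#include <cstddef>
#include <cstdint>
#include <ctime>
#include <limits>
#include <vector>

struct Cell { uint64_t key = 0; time_t deadline = 0; };

static constexpr size_t max_collision_length = 10;

size_t cellIndexForInsert(const std::vector<Cell> & cells, size_t size_overlap_mask, uint64_t key, size_t start)
{
    size_t oldest_place = start & size_overlap_mask;
    time_t oldest_time = std::numeric_limits<time_t>::max();

    for (size_t i = 0; i < max_collision_length; ++i)
    {
        size_t place = (start + i) & size_overlap_mask;
        const Cell & cell = cells[place];

        if (cell.deadline == 0)  /// Never occupied: take it immediately.
            return place;

        if (cell.key == key)     /// Same key: update in place.
            return place;

        if (cell.deadline < oldest_time)  /// Remember the stalest candidate.
        {
            oldest_time = cell.deadline;
            oldest_place = place;
        }
    }

    return oldest_place; /// Window exhausted: evict the oldest cell.
}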
cache_key) const { return (current_block_offset + (sizeof(cache_key.key) + sizeof(cache_key.size) + cache_key.size)) <= block_size; } /// Check if it is enough place to write key in block - ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const + inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const { const StringRef & key = cache_key.key; size_t complex_key_size = sizeof(key.size) + key.size; @@ -152,7 +151,7 @@ public: /// Write key and returns offset in ssd cache block where data is written /// It is client responsibility to check if there is enough place in block to write key /// Returns true if key was written and false if there was not enough place to write key - ALWAYS_INLINE inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block) + inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block) { assert(cache_key.size > 0); @@ -181,7 +180,7 @@ public: return true; } - ALWAYS_INLINE inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block) + inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block) { assert(cache_key.size > 0); @@ -216,20 +215,20 @@ public: return true; } - ALWAYS_INLINE inline size_t getKeysSize() const { return keys_size; } + inline size_t getKeysSize() const { return keys_size; } /// Write keys size into block header - ALWAYS_INLINE inline void writeKeysSize() + inline void writeKeysSize() { char * keys_size_offset_data = block_data + block_header_check_sum_size; std::memcpy(keys_size_offset_data, &keys_size, sizeof(size_t)); } /// Get check sum from block header - ALWAYS_INLINE inline size_t getCheckSum() const { return unalignedLoad(block_data); } + inline size_t getCheckSum() const { return unalignedLoad(block_data); } /// Calculate check sum in block - ALWAYS_INLINE inline size_t calculateCheckSum() const + inline size_t calculateCheckSum() const { size_t calculated_check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size)); @@ -237,7 +236,7 @@ public: } /// Check if check sum from block header matched calculated check sum in block - ALWAYS_INLINE inline bool checkCheckSum() const + inline bool checkCheckSum() const { size_t calculated_check_sum = calculateCheckSum(); size_t check_sum = getCheckSum(); @@ -246,16 +245,16 @@ public: } /// Write check sum in block header - ALWAYS_INLINE inline void writeCheckSum() + inline void writeCheckSum() { size_t check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size)); std::memcpy(block_data, &check_sum, sizeof(size_t)); } - ALWAYS_INLINE inline size_t getBlockSize() const { return block_size; } + inline size_t getBlockSize() const { return block_size; } /// Returns block data - ALWAYS_INLINE inline char * getBlockData() const { return block_data; } + inline char * getBlockData() const { return block_data; } /// Read keys that were serialized in block /// It is client responsibility to ensure that simple or complex keys were written in block @@ -337,9 +336,7 @@ inline bool operator==(const SSDCacheIndex & lhs, const SSDCacheIndex & rhs) return lhs.block_index == rhs.block_index && lhs.offset_in_block == rhs.offset_in_block; } -/** SSDCacheMemoryBuffer initialized with block size and memory buffer blocks size. - * Allocate block_size * memory_buffer_blocks_size bytes with page alignment. 
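The block helpers above keep a small header at the start of every block, check sum first and then the keys count, with the check sum covering everything behind its own slot. A sketch of that header discipline, with std::hash standing in for CityHash and no real key serialization:

#include <cstddef>
#include <cstring>
#include <functional>
#include <string_view>

static constexpr size_t check_sum_size = sizeof(size_t);
static constexpr size_t keys_size_size = sizeof(size_t);
static constexpr size_t header_size = check_sum_size + keys_size_size;

/// Serialized keys start right after the two header fields.
const char * keysBegin(const char * block) { return block + header_size; }

/// The check sum covers everything after its own slot, keys count included.
size_t calculateCheckSum(const char * block, size_t block_size)
{
    return std::hash<std::string_view>{}(
        std::string_view(block + check_sum_size, block_size - check_sum_size));
}

void writeCheckSum(char * block, size_t block_size)
{
    size_t check_sum = calculateCheckSum(block, block_size);
    std::memcpy(block, &check_sum, sizeof(check_sum));
}

bool checkCheckSum(const char * block, size_t block_size)
{
    size_t stored = 0;
    std::memcpy(&stored, block, sizeof(stored));
    return stored == calculateCheckSum(block, block_size);
}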
- * Logically represents multiple memory_buffer_blocks_size blocks and current write block. +/** Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks and current write block. * If key cannot be written into current_write_block, current block keys size and check summ is written * and buffer increase index of current_write_block_index. * If current_write_block_index == memory_buffer_blocks_size write key will always returns true. @@ -444,7 +441,7 @@ private: size_t current_block_index = 0; }; -/// TODO: Add documentation +/// Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks on file system template class SSDCacheFileBuffer : private boost::noncopyable { @@ -614,11 +611,13 @@ public: } template - ALWAYS_INLINE void fetchBlocks(char * read_buffer, size_t read_from_file_buffer_blocks_size, const PaddedPODArray & blocks_to_fetch, FetchBlockFunc && func) const + void fetchBlocks(size_t read_from_file_buffer_blocks_size, const PaddedPODArray & blocks_to_fetch, FetchBlockFunc && func) const { if (blocks_to_fetch.empty()) return; + Memory> read_buffer(read_from_file_buffer_blocks_size * block_size, 4096); + size_t blocks_to_fetch_size = blocks_to_fetch.size(); PaddedPODArray requests; @@ -631,7 +630,7 @@ public: { iocb request{}; - char * buffer_place = read_buffer + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size); + char * buffer_place = read_buffer.data() + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size); #if defined(__FreeBSD__) request.aio.aio_lio_opcode = LIO_READ; @@ -751,7 +750,7 @@ private: int fd = -1; }; - ALWAYS_INLINE inline static int preallocateDiskSpace(int fd, size_t offset, size_t len) + inline static int preallocateDiskSpace(int fd, size_t offset, size_t len) { #if defined(__FreeBSD__) return posix_fallocate(fd, offset, len); @@ -760,7 +759,7 @@ private: #endif } - ALWAYS_INLINE inline static char * getRequestBuffer(const iocb & request) + inline static char * getRequestBuffer(const iocb & request) { char * result = nullptr; @@ -773,7 +772,7 @@ private: return result; } - ALWAYS_INLINE inline static ssize_t eventResult(io_event & event) + inline static ssize_t eventResult(io_event & event) { ssize_t bytes_written; @@ -795,7 +794,13 @@ private: size_t current_blocks_size = 0; }; -/// TODO: Add documentation +/** ICacheDictionaryStorage implementation that keeps column data serialized in memory index and in disk partitions. + * Data is first written in memory buffer. + * If memory buffer is full then buffer is flushed to disk partition. + * If memory buffer cannot be flushed to associated disk partition, then if partition + * can be allocated (current partition index < max_partitions_size) storage allocates new partition, if not old partitions are reused. + * Index maps key to partition block and offset. 
+ */ template class SSDCacheDictionaryStorage final : public ICacheDictionaryStorage { @@ -806,9 +811,7 @@ public: explicit SSDCacheDictionaryStorage(const SSDCacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , file_buffer(configuration_.file_path, configuration.block_size, configuration.file_blocks_size) - , read_from_file_buffer(configuration_.block_size * configuration_.read_buffer_blocks_size, 4096) , rnd_engine(randomSeed()) - , index(configuration.max_stored_keys, false, { complex_key_arena }) { memory_buffer_partitions.emplace_back(configuration.block_size, configuration.write_buffer_blocks_size); } @@ -897,14 +900,31 @@ public: size_t getSize() const override { return index.size(); } - size_t getMaxSize() const override {return index.getMaxSize(); } + double getLoadFactor() const override + { + size_t partitions_size = memory_buffer_partitions.size(); + + if (partitions_size == configuration.max_partitions_count) + return 1.0; + + auto & current_memory_partition = memory_buffer_partitions[current_partition_index]; + + size_t full_partitions = partitions_size - 1; + size_t blocks_in_memory = (full_partitions * configuration.write_buffer_blocks_size) + current_memory_partition.getCurrentBlockIndex(); + size_t blocks_on_disk = file_buffer.getCurrentBlockIndex(); + + size_t max_blocks_size = (configuration.file_blocks_size + configuration.write_buffer_blocks_size) * configuration.max_partitions_count; + + double load_factor = static_cast(blocks_in_memory + blocks_on_disk) / max_blocks_size; + return load_factor; + } size_t getBytesAllocated() const override { size_t memory_partitions_bytes_size = memory_buffer_partitions.size() * configuration.write_buffer_blocks_size * configuration.block_size; size_t file_partitions_bytes_size = memory_buffer_partitions.size() * configuration.file_blocks_size * configuration.block_size; - return index.getSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size; + return index.getBufferSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size; } private: @@ -920,8 +940,7 @@ private: default_value }; - TimePoint deadline; - + time_t deadline; SSDCacheIndex index; size_t in_memory_partition_index; CellState state; @@ -933,13 +952,12 @@ private: struct KeyToBlockOffset { - KeyToBlockOffset(size_t key_index_, size_t offset_in_block_, bool is_expired_) - : key_index(key_index_), offset_in_block(offset_in_block_), is_expired(is_expired_) + KeyToBlockOffset(size_t key_index_, size_t offset_in_block_) + : key_index(key_index_), offset_in_block(offset_in_block_) {} size_t key_index = 0; size_t offset_in_block = 0; - bool is_expired = false; }; template @@ -950,20 +968,24 @@ private: Result result; result.fetched_columns = fetch_request.makeAttributesResultColumns(); - result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); + result.key_index_to_state.resize_fill(keys.size()); - const auto now = std::chrono::system_clock::now(); + const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); size_t fetched_columns_index = 0; - using BlockIndexToKeysMap = std::unordered_map, DefaultHash>; + using BlockIndexToKeysMap = absl::flat_hash_map, DefaultHash>; BlockIndexToKeysMap block_to_keys_map; absl::flat_hash_set> unique_blocks_to_request; PaddedPODArray blocks_to_request; - std::chrono::seconds strict_max_lifetime_seconds(configuration.strict_max_lifetime_seconds); + time_t strict_max_lifetime_seconds = 
static_cast(configuration.strict_max_lifetime_seconds); size_t keys_size = keys.size(); + for (size_t attribute_size = 0; attribute_size < fetch_request.attributesSize(); ++attribute_size) + if (fetch_request.shouldFillResultColumnWithIndex(attribute_size)) + result.fetched_columns[attribute_size]->reserve(keys_size); + for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; @@ -978,9 +1000,7 @@ private: const auto & cell = it->getMapped(); - bool has_deadline = cellHasDeadline(cell); - - if (has_deadline && now > cell.deadline + strict_max_lifetime_seconds) + if (unlikely(now > cell.deadline + strict_max_lifetime_seconds)) { ++result.not_found_keys_size; continue; @@ -989,14 +1009,14 @@ private: bool cell_is_expired = false; KeyState::State key_state = KeyState::found; - if (has_deadline && now > cell.deadline) + if (now > cell.deadline) { cell_is_expired = true; key_state = KeyState::expired; } - result.expired_keys_size += cell_is_expired; - result.found_keys_size += !cell_is_expired; + result.expired_keys_size += static_cast(cell_is_expired); + result.found_keys_size += static_cast(!cell_is_expired); switch (cell.state) { @@ -1012,13 +1032,20 @@ private: } case Cell::on_disk: { - block_to_keys_map[cell.index.block_index].emplace_back(key_index, cell.index.offset_in_block, cell_is_expired); + PaddedPODArray & keys_block = block_to_keys_map[cell.index.block_index]; + keys_block.emplace_back(key_index, cell.index.offset_in_block); - if (!unique_blocks_to_request.contains(cell.index.block_index)) - { + KeyState::State state = cell_is_expired ? KeyState::expired : KeyState::found; + + /// Fetched column index will be set later during fetch blocks + result.key_index_to_state[key_index] = {state, 0}; + + auto insert_result = unique_blocks_to_request.insert(cell.index.block_index); + bool was_inserted = insert_result.second; + + if (was_inserted) blocks_to_request.emplace_back(cell.index.block_index); - unique_blocks_to_request.insert(cell.index.block_index); - } + break; } case Cell::default_value: @@ -1037,7 +1064,7 @@ private: /// Sort blocks by offset before start async io requests std::sort(blocks_to_request.begin(), blocks_to_request.end()); - file_buffer.fetchBlocks(read_from_file_buffer.m_data, configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data) + file_buffer.fetchBlocks(configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data) { auto & keys_in_block = block_to_keys_map[block_index]; @@ -1046,10 +1073,7 @@ private: char * key_data = block_data + key_in_block.offset_in_block; deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, key_data); - if (key_in_block.is_expired) - result.key_index_to_state[key_in_block.key_index] = {KeyState::expired, fetched_columns_index}; - else - result.key_index_to_state[key_in_block.key_index] = {KeyState::found, fetched_columns_index}; + result.key_index_to_state[key_in_block.key_index].setFetchedColumnIndex(fetched_columns_index); ++fetched_columns_index; } @@ -1087,7 +1111,7 @@ private: throw Exception("Serialized columns size is greater than allowed block size and metadata", ErrorCodes::UNSUPPORTED_METHOD); /// We cannot reuse place that is already allocated in file or memory cache so we erase key from index - index.erase(key); + eraseKeyFromIndex(key); Cell cell; setCellDeadline(cell, now); @@ -1114,8 +1138,7 @@ private: for (auto key : keys) { - /// We cannot reuse place that is already allocated in file or 
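The on-disk fetch path above reads every needed block exactly once: key positions are bucketed by block index, the block list is deduplicated and sorted so the AIO requests scan the file forward, and each callback writes results back per key offset. The planning step alone, with standard containers in place of the absl maps and PODArrays:

#include <algorithm>
#include <cstddef>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

struct KeyToBlockOffset { size_t key_index; size_t offset_in_block; };

/// Bucket keys by block, deduplicate the blocks, and order them for a forward scan.
void planBlockReads(
    const std::vector<std::pair<size_t, size_t>> & key_locations, /// (block_index, offset) per key.
    std::unordered_map<size_t, std::vector<KeyToBlockOffset>> & block_to_keys,
    std::vector<size_t> & blocks_to_request)
{
    std::unordered_set<size_t> unique_blocks;

    for (size_t key_index = 0; key_index < key_locations.size(); ++key_index)
    {
        const auto & [block_index, offset] = key_locations[key_index];
        block_to_keys[block_index].push_back({key_index, offset});

        if (unique_blocks.insert(block_index).second) /// First sighting of this block.
            blocks_to_request.push_back(block_index);
    }

    std::sort(blocks_to_request.begin(), blocks_to_request.end());
}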
memory cache so we erase key from index - index.erase(key); + eraseKeyFromIndex(key); Cell cell; @@ -1135,7 +1158,7 @@ private: key = updated_key; } - index.insert(key, cell); + index[key] = cell; } } @@ -1188,7 +1211,7 @@ private: cell.index = cache_index; cell.in_memory_partition_index = current_partition_index; - index.insert(ssd_cache_key.key, cell); + index[ssd_cache_key.key] = cell; break; } else @@ -1218,7 +1241,7 @@ private: if (old_key_cell.isOnDisk() && old_key_block >= block_index_in_file_before_write && old_key_block < file_read_end_block_index) - index.erase(old_key); + eraseKeyFromIndex(old_key); } } } @@ -1271,7 +1294,7 @@ private: cell.index = cache_index; cell.in_memory_partition_index = current_partition_index; - index.insert(ssd_cache_key.key, cell); + index[ssd_cache_key.key] = cell; break; } else @@ -1296,16 +1319,12 @@ private: } } - inline static bool cellHasDeadline(const Cell & cell) - { - return cell.deadline != std::chrono::system_clock::from_time_t(0); - } - inline void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { - cell.deadline = std::chrono::system_clock::from_time_t(0); + auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); return; } @@ -1313,47 +1332,45 @@ private: size_t max_sec_lifetime = configuration.lifetime.max_sec; std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - cell.deadline = now + std::chrono::seconds{distribution(rnd_engine)}; + auto deadline = now + std::chrono::seconds(distribution(rnd_engine)); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); } - template - friend class ArenaCellKeyDisposer; + inline void eraseKeyFromIndex(KeyType key) + { + auto it = index.find(key); + + if (it == nullptr) + return; + + /// In case of complex key in arena key is serialized from hash table + KeyType key_copy = it->getKey(); + + index.erase(key); + + if constexpr (std::is_same_v) + complex_key_arena.free(const_cast(key_copy.data), key_copy.size); + } SSDCacheDictionaryStorageConfiguration configuration; SSDCacheFileBuffer file_buffer; - Memory> read_from_file_buffer; - std::vector> memory_buffer_partitions; pcg64 rnd_engine; - class ArenaCellKeyDisposer - { - public: - ArenaWithFreeLists & arena; + using SimpleKeyHashMap = HashMap; + using ComplexKeyHashMap = HashMapWithSavedHash; - template - void operator()(const Key & key, const Value &) const - { - /// In case of complex key we keep it in arena - if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); - } - }; - - using SimpleKeyLRUHashMap = LRUHashMap; - using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; - - using CacheLRUHashMap = std::conditional_t< + using CacheMap = std::conditional_t< dictionary_key_type == DictionaryKeyType::simple, - SimpleKeyLRUHashMap, - ComplexKeyLRUHashMap>; + SimpleKeyHashMap, + ComplexKeyHashMap>; ArenaWithFreeLists complex_key_arena; - CacheLRUHashMap index; + CacheMap index; size_t current_partition_index = 0; diff --git a/src/Dictionaries/benchmark b/src/Dictionaries/benchmark deleted file mode 100644 index 37d0d92ac14..00000000000 --- a/src/Dictionaries/benchmark +++ /dev/null @@ -1,154 +0,0 @@ -clickhouse-client --query="DROP TABLE IF EXISTS simple_cache_dictionary_table_source"; -clickhouse-client --query="CREATE TABLE simple_cache_dictionary_table_source (id UInt64, value1 String, value2 
UInt64, value3 String, value4 Float64, value5 Decimal64(4)) ENGINE=TinyLog;" -clickhouse-client --query="INSERT INTO simple_cache_dictionary_table_source SELECT number, concat('Value1 ', toString(number)), number, concat('Value3 ', toString(number)), toFloat64(number), cast(number, 'Decimal64(4)') FROM system.numbers LIMIT 1000000;" - -clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_simple_cache_dictionary ( - id UInt64, - value1 String, - value2 UInt64, - value3 String, - value4 Float64, - value5 Decimal64(4) -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default')) -LIFETIME(MIN 300 MAX 300) -LAYOUT(CACHE(SIZE_IN_CELLS 100000));" - -clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_ssd_simple_cache_dictionary ( - id UInt64, - value1 String, - value2 UInt64, - value3 String, - value4 Float64, - value5 Decimal64(4) -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default')) -LIFETIME(MIN 300 MAX 300) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 WRITE_BUFFER_SIZE 327680 MAX_STORED_KEYS 1048576 PATH '/opt/mkita/ClickHouse/build_release/programs/ssd_cache'));" - -clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_dummy_simple_cache_dictionary ( - id UInt64, - value1 String, - value2 UInt64, - value3 String, - value4 Float64, - value5 Decimal64(4) -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default')) -LIFETIME(MIN 300 MAX 300) -LAYOUT(DUMMY_SIMPLE());" - -./clickhouse-benchmark --query="SELECT - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 10000 -FORMAT Null" - -./clickhouse-benchmark --query="SELECT - dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) -FROM system.numbers -LIMIT 10000 -FORMAT Null" - -./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null" - -./clickhouse-benchmark --query="SELECT - dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 10000 -FORMAT Null" - -./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null" - -SELECT - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value3', number), - 
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value5', number) -FROM system.numbers - LIMIT 10000 -FORMAT Null - -SELECT dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000 FORMAT Null - -SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000 -FORMAT Null - -SELECT - dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) -FROM system.numbers - LIMIT 10000 -FORMAT - Null - -SELECT - dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers - LIMIT 10000 -FORMAT - Null - -SELECT - dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number) -FROM system.numbers -LIMIT 10000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value1', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value2', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value3', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value4', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('clickhouse_simple_cache_dictionary', 'value2', number), - dictGet('clickhouse_simple_cache_dictionary', 'value3', number), - dictGet('clickhouse_simple_cache_dictionary', 'value4', number), - dictGet('clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT * FROM clickhouse_simple_cache_dictionary_table; \ No newline at end of file diff --git a/src/Dictionaries/registerCacheDictionaries.cpp b/src/Dictionaries/registerCacheDictionaries.cpp index 92e6eb97b63..b93a08acb76 100644 --- a/src/Dictionaries/registerCacheDictionaries.cpp +++ b/src/Dictionaries/registerCacheDictionaries.cpp @@ -1,6 +1,6 @@ #include "CacheDictionary.h" -#include "SSDCacheDictionaryStorage.h" #include "CacheDictionaryStorage.h" +#include "SSDCacheDictionaryStorage.h" #include namespace DB @@ -20,13 +20,13 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration( const DictionaryLifetime & dict_lifetime, DictionaryKeyType dictionary_key_type) { - String dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_cache." : ".cache."; + String dictionary_type_prefix = (dictionary_key_type == DictionaryKeyType::complex) ? ".complex_key_cache." 
: ".cache."; String dictionary_configuration_prefix = layout_prefix + dictionary_type_prefix; const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells"); if (size == 0) throw Exception(ErrorCodes::TOO_SMALL_BUFFER_SIZE, - "({}: cache dictionary cannot have 0 cells", + "({}): cache dictionary cannot have 0 cells", full_name); size_t dict_lifetime_seconds = static_cast(dict_lifetime.max_sec); @@ -59,7 +59,6 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration( static constexpr size_t DEFAULT_READ_BUFFER_SIZE_BYTES = 16 * DEFAULT_SSD_BLOCK_SIZE_BYTES; static constexpr size_t DEFAULT_WRITE_BUFFER_SIZE_BYTES = DEFAULT_SSD_BLOCK_SIZE_BYTES; - static constexpr size_t DEFAULT_MAX_STORED_KEYS = 100000; static constexpr size_t DEFAULT_PARTITIONS_COUNT = 16; const size_t max_partitions_count @@ -94,16 +93,11 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration( if (directory_path.at(0) != '/') directory_path = std::filesystem::path{config.getString("path")}.concat(directory_path).string(); - const size_t max_stored_keys_in_partition - = config.getInt64(dictionary_configuration_prefix + "max_stored_keys", DEFAULT_MAX_STORED_KEYS); - const size_t rounded_size = roundUpToPowerOfTwoOrZero(max_stored_keys_in_partition); - SSDCacheDictionaryStorageConfiguration configuration{ strict_max_lifetime_seconds, dict_lifetime, directory_path, max_partitions_count, - rounded_size, block_size, file_size / block_size, read_buffer_size / block_size, @@ -194,7 +188,8 @@ DictionaryPtr createCacheDictionaryLayout( const bool allow_read_expired_keys = config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false); auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type); - auto storage = std::make_shared>(storage_configuration); + + std::shared_ptr storage = std::make_shared>(dict_struct, storage_configuration); auto update_queue_configuration = parseCacheDictionaryUpdateQueueConfiguration(full_name, config, layout_prefix, dictionary_key_type); diff --git a/src/Functions/FunctionsAES.h b/src/Functions/FunctionsAES.h index 132e94907f5..8af4a27ecc9 100644 --- a/src/Functions/FunctionsAES.h +++ b/src/Functions/FunctionsAES.h @@ -538,8 +538,9 @@ private: [[maybe_unused]] const auto block_size = static_cast(EVP_CIPHER_block_size(evp_cipher)); [[maybe_unused]] const auto iv_size = static_cast(EVP_CIPHER_iv_length(evp_cipher)); - const auto key_size = static_cast(EVP_CIPHER_key_length(evp_cipher)); - const auto tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1 + + const size_t key_size = static_cast(EVP_CIPHER_key_length(evp_cipher)); + static constexpr size_t tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1 auto decrypted_result_column = ColumnString::create(); auto & decrypted_result_column_data = decrypted_result_column->getChars(); @@ -549,9 +550,17 @@ private: size_t resulting_size = 0; for (size_t r = 0; r < input_rows_count; ++r) { - resulting_size += input_column->getDataAt(r).size + 1; + size_t string_size = input_column->getDataAt(r).size; + resulting_size += string_size + 1; /// With terminating zero. 
+ if constexpr (mode == CipherMode::RFC5116_AEAD_AES_GCM) + { + if (string_size < tag_size) + throw Exception("Encrypted data is smaller than the size of additional data for AEAD mode, cannot decrypt.", + ErrorCodes::BAD_ARGUMENTS); + resulting_size -= tag_size; + } } #if defined(MEMORY_SANITIZER) @@ -565,6 +574,7 @@ private: decrypted_result_column_data.resize(resulting_size); #endif } + auto * decrypted = decrypted_result_column_data.data(); KeyHolder key_holder; @@ -631,7 +641,7 @@ private: // 1.a.2: Set AAD if present if (aad_column) { - const auto aad_data = aad_column->getDataAt(r); + StringRef aad_data = aad_column->getDataAt(r); int tmp_len = 0; if (aad_data.size != 0 && EVP_DecryptUpdate(evp_ctx, nullptr, &tmp_len, reinterpret_cast(aad_data.data), aad_data.size) != 1) diff --git a/src/Functions/SimdJSONParser.h b/src/Functions/SimdJSONParser.h index a9adfa27e2c..7ff3c45130d 100644 --- a/src/Functions/SimdJSONParser.h +++ b/src/Functions/SimdJSONParser.h @@ -42,11 +42,11 @@ struct SimdJSONParser ALWAYS_INLINE bool isBool() const { return element.type() == simdjson::dom::element_type::BOOL; } ALWAYS_INLINE bool isNull() const { return element.type() == simdjson::dom::element_type::NULL_VALUE; } - ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().first; } - ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().first; } - ALWAYS_INLINE double getDouble() const { return element.get_double().first; } - ALWAYS_INLINE bool getBool() const { return element.get_bool().first; } - ALWAYS_INLINE std::string_view getString() const { return element.get_string().first; } + ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().value_unsafe(); } + ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().value_unsafe(); } + ALWAYS_INLINE double getDouble() const { return element.get_double().value_unsafe(); } + ALWAYS_INLINE bool getBool() const { return element.get_bool().value_unsafe(); } + ALWAYS_INLINE std::string_view getString() const { return element.get_string().value_unsafe(); } ALWAYS_INLINE Array getArray() const; ALWAYS_INLINE Object getObject() const; @@ -75,7 +75,7 @@ struct SimdJSONParser ALWAYS_INLINE Iterator begin() const { return array.begin(); } ALWAYS_INLINE Iterator end() const { return array.end(); } ALWAYS_INLINE size_t size() const { return array.size(); } - ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).first; } + ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).value_unsafe(); } private: simdjson::dom::array array; @@ -111,7 +111,7 @@ struct SimdJSONParser if (x.error()) return false; - result = x.first; + result = x.value_unsafe(); return true; } @@ -137,7 +137,7 @@ struct SimdJSONParser if (document.error()) return false; - result = document.first; + result = document.value_unsafe(); return true; } @@ -155,12 +155,12 @@ private: inline ALWAYS_INLINE SimdJSONParser::Array SimdJSONParser::Element::getArray() const { - return element.get_array().first; + return element.get_array().value_unsafe(); } inline ALWAYS_INLINE SimdJSONParser::Object SimdJSONParser::Element::getObject() const { - return element.get_object().first; + return element.get_object().value_unsafe(); } } diff --git a/src/Functions/URL/ExtractFirstSignificantSubdomain.h b/src/Functions/URL/ExtractFirstSignificantSubdomain.h index c13b5f50156..974574058e9 100644 --- a/src/Functions/URL/ExtractFirstSignificantSubdomain.h +++ 
b/src/Functions/URL/ExtractFirstSignificantSubdomain.h @@ -90,7 +90,70 @@ struct ExtractFirstSignificantSubdomain res_data += last_3_periods[1] + 1 - begin; res_size = last_3_periods[0] - last_3_periods[1] - 1; } - } + } + + /// The difference with execute() is due to custom TLD list can have records of any level, + /// not only 2-nd level (like non-custom variant), so it requires more lookups. + template + static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr) + { + res_data = data; + res_size = 0; + + Pos tmp; + size_t domain_length; + ExtractDomain::execute(data, size, tmp, domain_length); + + if (domain_length == 0) + return; + + if (out_domain_end) + *out_domain_end = tmp + domain_length; + + /// cut useless dot + if (tmp[domain_length - 1] == '.') + --domain_length; + + res_data = tmp; + res_size = domain_length; + + auto begin = tmp; + auto end = begin + domain_length; + const char * last_2_periods[2]{}; + const char * prev = begin - 1; + + auto pos = find_first_symbols<'.'>(begin, end); + while (pos < end) + { + if (lookup(pos + 1, end - pos - 1)) + { + res_data += prev + 1 - begin; + res_size = end - 1 - prev; + return; + } + + last_2_periods[1] = last_2_periods[0]; + last_2_periods[0] = pos; + prev = pos; + pos = find_first_symbols<'.'>(pos + 1, end); + } + + /// if there is domain of the first level (i.e. no dots in the hostname) -> return nothing + if (!last_2_periods[0]) + return; + + /// if there is domain of the second level -> always return itself + if (!last_2_periods[1]) + { + res_size = last_2_periods[0] - begin; + return; + } + + /// if there is domain of the 3+ level, and zero records in TLD list -> + /// fallback to domain of the second level + res_data += last_2_periods[1] + 1 - begin; + res_size = last_2_periods[0] - last_2_periods[1] - 1; + } }; } diff --git a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h index 244b32459c1..d6868834f75 100644 --- a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h +++ b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h @@ -17,10 +17,10 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; } -struct FirstSignificantSubdomainCustomtLookup +struct FirstSignificantSubdomainCustomLookup { const TLDList & tld_list; - FirstSignificantSubdomainCustomtLookup(const std::string & tld_list_name) + FirstSignificantSubdomainCustomLookup(const std::string & tld_list_name) : tld_list(TLDListsHolder::getInstance().getTldList(tld_list_name)) { } @@ -63,7 +63,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override { const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - FirstSignificantSubdomainCustomtLookup tld_lookup(column_tld_list_name->getValue()); + FirstSignificantSubdomainCustomLookup tld_lookup(column_tld_list_name->getValue()); /// FIXME: convertToFullColumnIfConst() is suboptimal auto column = arguments[0].column->convertToFullColumnIfConst(); @@ -79,7 +79,7 @@ public: ErrorCodes::ILLEGAL_COLUMN); } - static void vector(FirstSignificantSubdomainCustomtLookup & tld_lookup, + static void vector(FirstSignificantSubdomainCustomLookup & tld_lookup, const ColumnString::Chars & data, const ColumnString::Offsets & offsets, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { diff --git 
a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp index 11fd27e317b..7532ddd00f2 100644 --- a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp +++ b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp @@ -10,7 +10,7 @@ struct CutToFirstSignificantSubdomainCustom { static size_t getReserveLengthForElement() { return 15; } - static void execute(FirstSignificantSubdomainCustomtLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size) + static void execute(FirstSignificantSubdomainCustomLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size) { res_data = data; res_size = 0; @@ -18,7 +18,7 @@ struct CutToFirstSignificantSubdomainCustom Pos tmp_data; size_t tmp_length; Pos domain_end; - ExtractFirstSignificantSubdomain::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end); + ExtractFirstSignificantSubdomain::executeCustom(tld_lookup, data, size, tmp_data, tmp_length, &domain_end); if (tmp_length == 0) return; diff --git a/src/Functions/array/mapPopulateSeries.cpp b/src/Functions/array/mapPopulateSeries.cpp index 2050e0c28ab..c025117af69 100644 --- a/src/Functions/array/mapPopulateSeries.cpp +++ b/src/Functions/array/mapPopulateSeries.cpp @@ -190,7 +190,7 @@ private: } static constexpr size_t MAX_ARRAY_SIZE = 1ULL << 30; - if (static_cast(max_key - min_key) > MAX_ARRAY_SIZE) + if (static_cast(max_key) - static_cast(min_key) > MAX_ARRAY_SIZE) throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in the result of function {}", getName()); /* fill the result arrays */ diff --git a/src/Functions/bar.cpp b/src/Functions/bar.cpp index 7364311a1be..6f5298a8c5e 100644 --- a/src/Functions/bar.cpp +++ b/src/Functions/bar.cpp @@ -16,6 +16,7 @@ namespace ErrorCodes { extern const int ARGUMENT_OUT_OF_BOUND; extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; } namespace @@ -110,6 +111,9 @@ public: arguments[2].column->getFloat64(i), max_width); + if (!isFinite(width)) + throw Exception("Value of width must not be NaN or Inf", ErrorCodes::BAD_ARGUMENTS); + size_t next_size = current_offset + UnicodeBar::getWidthInBytes(width) + 1; dst_chars.resize(next_size); UnicodeBar::render(width, reinterpret_cast(&dst_chars[current_offset])); diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index 592f0d6774d..ca9bc32486e 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -41,7 +41,8 @@ void registerFunctionThrowIf(FunctionFactory &); void registerFunctionVersion(FunctionFactory &); void registerFunctionBuildId(FunctionFactory &); void registerFunctionUptime(FunctionFactory &); -void registerFunctionTimeZone(FunctionFactory &); +void registerFunctionTimezone(FunctionFactory &); +void registerFunctionTimezoneOf(FunctionFactory &); void registerFunctionRunningAccumulate(FunctionFactory &); void registerFunctionRunningDifference(FunctionFactory &); void registerFunctionRunningDifferenceStartingWithFirstValue(FunctionFactory &); @@ -111,7 +112,8 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) { registerFunctionVersion(factory); registerFunctionBuildId(factory); registerFunctionUptime(factory); - registerFunctionTimeZone(factory); + registerFunctionTimezone(factory); + registerFunctionTimezoneOf(factory);
registerFunctionRunningAccumulate(factory); registerFunctionRunningDifference(factory); registerFunctionRunningDifferenceStartingWithFirstValue(factory); diff --git a/src/Functions/timezone.cpp b/src/Functions/timezone.cpp index 4522f21c8b2..2cd0c28612b 100644 --- a/src/Functions/timezone.cpp +++ b/src/Functions/timezone.cpp @@ -12,13 +12,13 @@ namespace /** Returns the server time zone. */ -class FunctionTimeZone : public IFunction +class FunctionTimezone : public IFunction { public: static constexpr auto name = "timezone"; static FunctionPtr create(const Context &) { - return std::make_shared(); + return std::make_shared(); } String getName() const override @@ -45,9 +45,10 @@ public: } -void registerFunctionTimeZone(FunctionFactory & factory) +void registerFunctionTimezone(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); + factory.registerAlias("timeZone", "timezone"); } } diff --git a/src/Functions/timezoneOf.cpp b/src/Functions/timezoneOf.cpp new file mode 100644 index 00000000000..1d007a6e10e --- /dev/null +++ b/src/Functions/timezoneOf.cpp @@ -0,0 +1,118 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + + +namespace +{ + + +/** timezoneOf(x) - get the name of the timezone of DateTime data type. + * Example: Europe/Moscow. + */ +class ExecutableFunctionTimezoneOf : public IExecutableFunctionImpl +{ +public: + static constexpr auto name = "timezoneOf"; + String getName() const override { return name; } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + + /// Execute the function on the columns. 
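+ /// The result depends only on the argument type, never on the row values,
+ /// so it is materialized as a constant String column of the requested height.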
+ ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + DataTypePtr type_no_nullable = removeNullable(arguments[0].type); + + return DataTypeString().createColumnConst(input_rows_count, + dynamic_cast(*type_no_nullable).getTimeZone().getTimeZone()); + } +}; + + +class BaseFunctionTimezoneOf : public IFunctionBaseImpl +{ +public: + BaseFunctionTimezoneOf(DataTypes argument_types_, DataTypePtr return_type_) + : argument_types(std::move(argument_types_)), return_type(std::move(return_type_)) {} + + static constexpr auto name = "timezoneOf"; + String getName() const override { return name; } + + bool isDeterministic() const override { return true; } + bool isDeterministicInScopeOfQuery() const override { return true; } + + const DataTypes & getArgumentTypes() const override { return argument_types; } + const DataTypePtr & getResultType() const override { return return_type; } + + ExecutableFunctionImplPtr prepare(const ColumnsWithTypeAndName &) const override + { + return std::make_unique(); + } + + ColumnPtr getResultIfAlwaysReturnsConstantAndHasArguments(const ColumnsWithTypeAndName & arguments) const override + { + DataTypePtr type_no_nullable = removeNullable(arguments[0].type); + + return DataTypeString().createColumnConst(1, + dynamic_cast(*type_no_nullable).getTimeZone().getTimeZone()); + } + +private: + DataTypes argument_types; + DataTypePtr return_type; +}; + + +class FunctionTimezoneOfBuilder : public IFunctionOverloadResolverImpl +{ +public: + static constexpr auto name = "timezoneOf"; + String getName() const override { return name; } + static FunctionOverloadResolverImplPtr create(const Context &) { return std::make_unique(); } + + size_t getNumberOfArguments() const override { return 1; } + + DataTypePtr getReturnType(const DataTypes & types) const override + { + DataTypePtr type_no_nullable = removeNullable(types[0]); + + if (isDateTime(type_no_nullable) || isDateTime64(type_no_nullable)) + return std::make_shared(); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad argument for function {}, should be DateTime or DateTime64", name); + } + + FunctionBaseImplPtr build(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override + { + return std::make_unique(DataTypes{arguments[0].type}, return_type); + } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + ColumnNumbers getArgumentsThatDontImplyNullableReturnType(size_t /*number_of_arguments*/) const override { return {0}; } +}; + +} + +void registerFunctionTimezoneOf(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerAlias("timeZoneOf", "timezoneOf"); +} + +} + diff --git a/src/Functions/toTimeZone.cpp b/src/Functions/toTimezone.cpp similarity index 90% rename from src/Functions/toTimeZone.cpp rename to src/Functions/toTimezone.cpp index fbf3a0778a6..d12f926b284 100644 --- a/src/Functions/toTimeZone.cpp +++ b/src/Functions/toTimezone.cpp @@ -21,11 +21,11 @@ namespace { /// Just changes time zone information for data type. The calculation is free. 
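/// For example, toTimezone(now(), 'UTC') merely retags the value as DateTime('UTC');
/// the stored Unix timestamp is not touched, so no per-row computation is needed.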
-class FunctionToTimeZone : public IFunction +class FunctionToTimezone : public IFunction { public: - static constexpr auto name = "toTimeZone"; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static constexpr auto name = "toTimezone"; + static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { @@ -64,7 +64,8 @@ public: void registerFunctionToTimeZone(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); + factory.registerAlias("toTimeZone", "toTimezone"); } } diff --git a/src/Functions/ya.make b/src/Functions/ya.make index 3ac64828b9c..aed2bd9b70d 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -467,6 +467,7 @@ SRCS( timeSlot.cpp timeSlots.cpp timezone.cpp + timezoneOf.cpp timezoneOffset.cpp toColumnTypeName.cpp toCustomWeek.cpp @@ -506,7 +507,7 @@ SRCS( toStartOfTenMinutes.cpp toStartOfYear.cpp toTime.cpp - toTimeZone.cpp + toTimezone.cpp toTypeName.cpp toUnixTimestamp64Micro.cpp toUnixTimestamp64Milli.cpp diff --git a/src/IO/BrotliWriteBuffer.cpp b/src/IO/BrotliWriteBuffer.cpp index e87eeb1a2be..512ed5fc93f 100644 --- a/src/IO/BrotliWriteBuffer.cpp +++ b/src/IO/BrotliWriteBuffer.cpp @@ -106,7 +106,7 @@ void BrotliWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) diff --git a/src/IO/LZMADeflatingWriteBuffer.cpp b/src/IO/LZMADeflatingWriteBuffer.cpp index 96f1d34b01b..7ea4f7945dc 100644 --- a/src/IO/LZMADeflatingWriteBuffer.cpp +++ b/src/IO/LZMADeflatingWriteBuffer.cpp @@ -105,7 +105,7 @@ void LZMADeflatingWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) diff --git a/src/IO/PeekableReadBuffer.cpp b/src/IO/PeekableReadBuffer.cpp index 1d999d586b2..15fdd9448ec 100644 --- a/src/IO/PeekableReadBuffer.cpp +++ b/src/IO/PeekableReadBuffer.cpp @@ -82,6 +82,7 @@ bool PeekableReadBuffer::peekNext() checkpoint.emplace(memory.data()); checkpoint_in_own_memory = true; } + if (currentlyReadFromOwnMemory()) { /// Update buffer size @@ -99,7 +100,6 @@ bool PeekableReadBuffer::peekNext() pos_offset = 0; } BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset); - } peeked_size += bytes_to_copy; @@ -113,12 +113,21 @@ void PeekableReadBuffer::rollbackToCheckpoint(bool drop) { checkStateCorrect(); - if (!checkpoint) - throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); - else if (checkpointInOwnMemory() == currentlyReadFromOwnMemory()) + assert(checkpoint); + + if (checkpointInOwnMemory() == currentlyReadFromOwnMemory()) + { + /// Both checkpoint and position are in the same buffer. pos = *checkpoint; - else /// Checkpoint is in own memory and pos is not. Switch to reading from own memory + } + else + { + /// Checkpoint is in own memory and position is not. + assert(checkpointInOwnMemory()); + + /// Switch to reading from own memory. 
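+ /// peekNext() copied the checkpointed bytes into own memory, so the position
+ /// is restored relative to memory.data() rather than to the sub-buffer.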
BufferBase::set(memory.data(), peeked_size, *checkpoint - memory.data()); + } if (drop) dropCheckpoint(); @@ -134,6 +143,7 @@ bool PeekableReadBuffer::nextImpl() checkStateCorrect(); bool res; + bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end() && currentlyReadFromOwnMemory(); if (checkpoint) { @@ -163,6 +173,13 @@ bool PeekableReadBuffer::nextImpl() BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset()); nextimpl_working_buffer_offset = sub_buf.offset(); + if (checkpoint_at_end) + { + checkpoint.emplace(working_buffer.begin()); + peeked_size = 0; + checkpoint_in_own_memory = false; + } + checkStateCorrect(); return res; } diff --git a/src/IO/PeekableReadBuffer.h b/src/IO/PeekableReadBuffer.h index 4f6e669b31d..4515c6f8ce5 100644 --- a/src/IO/PeekableReadBuffer.h +++ b/src/IO/PeekableReadBuffer.h @@ -43,10 +43,7 @@ public: /// Forget checkpoint and all data between checkpoint and position ALWAYS_INLINE inline void dropCheckpoint() { -#ifndef NDEBUG - if (!checkpoint) - throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); -#endif + assert(checkpoint); if (!currentlyReadFromOwnMemory()) { /// Don't need to store unread data anymore diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index 37896a387bb..c70993c5c3a 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -27,23 +28,23 @@ bool ReadBufferFromPocoSocket::nextImpl() ssize_t bytes_read = 0; Stopwatch watch; - int flags = 0; - if (async_callback) - flags |= MSG_DONTWAIT; - /// Add more details to exceptions. try { - bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags); - - /// If async_callback is specified, and read is blocking, run async_callback and try again later. + /// If async_callback is specified, and read will block, run async_callback and try again later. /// It is expected that file descriptor may be polled externally. /// Note that receive timeout is not checked here. External code should check it while polling. - while (bytes_read < 0 && async_callback && errno == EAGAIN) - { + while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_READ)) async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), socket_description); - bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags); - } + + /// receiveBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout), + /// but we want to get this exception exactly after receive_timeout. So, set send_timeout = receive_timeout + /// before receiveBytes. 
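+ /// Assuming TimeoutSetter acts as an RAII guard (saving the old timeouts and
+ /// restoring them in its destructor), the original send timeout comes back as
+ /// soon as the unique_ptr below goes out of scope.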
+ std::unique_ptr timeout_setter = nullptr; + if (socket.secure()) + timeout_setter = std::make_unique(dynamic_cast(socket), socket.getReceiveTimeout(), socket.getReceiveTimeout()); + + bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size()); } catch (const Poco::Net::NetException & e) { diff --git a/src/Client/TimeoutSetter.cpp b/src/IO/TimeoutSetter.cpp similarity index 97% rename from src/Client/TimeoutSetter.cpp rename to src/IO/TimeoutSetter.cpp index 87368f93ba3..f06cafecff8 100644 --- a/src/Client/TimeoutSetter.cpp +++ b/src/IO/TimeoutSetter.cpp @@ -1,4 +1,4 @@ -#include "TimeoutSetter.h" +#include #include diff --git a/src/Client/TimeoutSetter.h b/src/IO/TimeoutSetter.h similarity index 100% rename from src/Client/TimeoutSetter.h rename to src/IO/TimeoutSetter.h diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp index 284fa5dbd97..4edfc8a2795 100644 --- a/src/IO/WriteBufferFromPocoSocket.cpp +++ b/src/IO/WriteBufferFromPocoSocket.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -40,6 +41,13 @@ void WriteBufferFromPocoSocket::nextImpl() /// Add more details to exceptions. try { + /// sendBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout), + /// but we want to get this exception exactly after send_timeout. So, set receive_timeout = send_timeout + /// before sendBytes. + std::unique_ptr timeout_setter = nullptr; + if (socket.secure()) + timeout_setter = std::make_unique(dynamic_cast(socket), socket.getSendTimeout(), socket.getSendTimeout()); + res = socket.impl()->sendBytes(working_buffer.begin() + bytes_written, offset() - bytes_written); } catch (const Poco::Net::NetException & e) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 7373b24991a..93aaf9456b5 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -120,7 +120,7 @@ WriteBufferFromS3::~WriteBufferFromS3() } catch (...) { - tryLogCurrentException(__PRETTY_FUNCTION__); + tryLogCurrentException(log); } } diff --git a/src/IO/ZlibDeflatingWriteBuffer.cpp b/src/IO/ZlibDeflatingWriteBuffer.cpp index 5da82b52279..7e91820f298 100644 --- a/src/IO/ZlibDeflatingWriteBuffer.cpp +++ b/src/IO/ZlibDeflatingWriteBuffer.cpp @@ -107,7 +107,7 @@ void ZlibDeflatingWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) diff --git a/src/IO/ZstdDeflatingWriteBuffer.cpp b/src/IO/ZstdDeflatingWriteBuffer.cpp index 27694797db6..5b97588b33e 100644 --- a/src/IO/ZstdDeflatingWriteBuffer.cpp +++ b/src/IO/ZstdDeflatingWriteBuffer.cpp @@ -94,7 +94,7 @@ void ZstdDeflatingWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) 
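The same one-line fix recurs in BrotliWriteBuffer, LZMADeflatingWriteBuffer, ZlibDeflatingWriteBuffer, and ZstdDeflatingWriteBuffer: after the codec trailer is written, the downstream buffer must be finalized rather than merely flushed, since a plain next() can leave the tail of the stream sitting in the downstream writer. A minimal sketch of the shared shape, with an illustrative class name rather than the exact ClickHouse code:

void DeflatingWriteBufferSketch::finish()
{
    if (finished)
        return;

    finishImpl();     /// Write the codec's trailer bytes into `out`.
    out->finalize();  /// Complete the downstream buffer; next() would only flush the current chunk.
    finished = true;
}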
diff --git a/src/IO/tests/gtest_peekable_read_buffer.cpp b/src/IO/tests/gtest_peekable_read_buffer.cpp index 8c491338bd3..2e5ca47c0aa 100644 --- a/src/IO/tests/gtest_peekable_read_buffer.cpp +++ b/src/IO/tests/gtest_peekable_read_buffer.cpp @@ -6,11 +6,6 @@ #include #include -namespace DB::ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - static void readAndAssert(DB::ReadBuffer & buf, const char * str) { size_t n = strlen(str); @@ -48,20 +43,6 @@ try readAndAssert(peekable, "01234"); } -#ifndef ABORT_ON_LOGICAL_ERROR - bool exception = false; - try - { - peekable.rollbackToCheckpoint(); - } - catch (DB::Exception & e) - { - if (e.code() != DB::ErrorCodes::LOGICAL_ERROR) - throw; - exception = true; - } - ASSERT_TRUE(exception); -#endif assertAvailable(peekable, "56789"); readAndAssert(peekable, "56"); diff --git a/src/IO/ya.make b/src/IO/ya.make index 6605cf64277..58df027c561 100644 --- a/src/IO/ya.make +++ b/src/IO/ya.make @@ -50,6 +50,7 @@ SRCS( ReadBufferFromPocoSocket.cpp ReadHelpers.cpp SeekAvoidingReadBuffer.cpp + TimeoutSetter.cpp UseSSL.cpp WriteBufferFromFile.cpp WriteBufferFromFileBase.cpp diff --git a/src/Interpreters/ExternalLoader.cpp b/src/Interpreters/ExternalLoader.cpp index 73257ba5185..853fe296d1c 100644 --- a/src/Interpreters/ExternalLoader.cpp +++ b/src/Interpreters/ExternalLoader.cpp @@ -818,13 +818,10 @@ private: if (!min_id) min_id = getMinIDToFinishLoading(forced_to_reload); - if (info->state_id >= min_id) - return true; /// stop - if (info->loading_id < min_id) startLoading(*info, forced_to_reload, *min_id); - /// Wait for the next event if loading wasn't completed, and stop otherwise. + /// Wait for the next event if loading wasn't completed, or stop otherwise. return (info->state_id >= min_id); }; @@ -850,9 +847,6 @@ private: if (filter && !filter(name)) continue; - if (info.state_id >= min_id) - continue; - if (info.loading_id < min_id) startLoading(info, forced_to_reload, *min_id); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index d1af86e7b11..f8bcbf02ab4 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -260,7 +260,8 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) renamed = true; } - database->loadStoredObjects(context, has_force_restore_data_flag, create.attach && force_attach); + /// We use global context here, because storages lifetime is bigger than query context lifetime + database->loadStoredObjects(context.getGlobalContext(), has_force_restore_data_flag, create.attach && force_attach); } catch (...) 
{ @@ -970,7 +971,8 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (create.as_table_function) { const auto & factory = TableFunctionFactory::instance(); - res = factory.get(create.as_table_function, context)->execute(create.as_table_function, context, create.table, properties.columns); + auto table_func = factory.get(create.as_table_function, context); + res = table_func->execute(create.as_table_function, context, create.table, properties.columns); res->renameInMemory({create.database, create.table, create.uuid}); } else diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index b4f64528471..1f6b0c37437 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -393,7 +393,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( view = nullptr; } - if (try_move_to_prewhere && storage && query.where() && !query.prewhere() && !query.final()) + if (try_move_to_prewhere && storage && query.where() && !query.prewhere()) { /// PREWHERE optimization: transfer some condition from WHERE to PREWHERE if enabled and viable if (const auto & column_sizes = storage->getColumnSizes(); !column_sizes.empty()) diff --git a/src/Interpreters/InterserverIOHandler.h b/src/Interpreters/InterserverIOHandler.h index b4768c30f32..b0c95ed3835 100644 --- a/src/Interpreters/InterserverIOHandler.h +++ b/src/Interpreters/InterserverIOHandler.h @@ -9,8 +9,6 @@ #include #include -#include - #include #include #include diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index e922f49c896..a97ef41204a 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -60,7 +61,7 @@ void WindowFrame::toString(WriteBuffer & buf) const } else { - buf << abs(begin_offset); + buf << applyVisitor(FieldVisitorToString(), begin_offset); buf << " " << (begin_preceding ? "PRECEDING" : "FOLLOWING"); } @@ -77,7 +78,7 @@ void WindowFrame::toString(WriteBuffer & buf) const } else { - buf << abs(end_offset); + buf << applyVisitor(FieldVisitorToString(), end_offset); buf << " " << (end_preceding ? "PRECEDING" : "FOLLOWING"); } @@ -121,23 +122,33 @@ void WindowFrame::checkValid() const if (end_type == BoundaryType::Offset && begin_type == BoundaryType::Offset) { - // Frame starting with following rows can't have preceding rows. - if (!(end_preceding && !begin_preceding)) + // Frame start offset must be less or equal that the frame end offset. + bool begin_less_equal_end; + if (begin_preceding && end_preceding) { - // Frame start offset must be less or equal that the frame end offset. - const bool begin_before_end - = begin_offset * (begin_preceding ? -1 : 1) - <= end_offset * (end_preceding ? -1 : 1); - - if (!begin_before_end) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame start offset {} {} does not precede the frame end offset {} {}", - begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING", - end_offset, end_preceding ? 
"PRECEDING" : "FOLLOWING"); - } - return; + begin_less_equal_end = begin_offset >= end_offset; } + else if (begin_preceding && !end_preceding) + { + begin_less_equal_end = true; + } + else if (!begin_preceding && end_preceding) + { + begin_less_equal_end = false; + } + else /* if (!begin_preceding && !end_preceding) */ + { + begin_less_equal_end = begin_offset <= end_offset; + } + + if (!begin_less_equal_end) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Frame start offset {} {} does not precede the frame end offset {} {}", + begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING", + end_offset, end_preceding ? "PRECEDING" : "FOLLOWING"); + } + return; } throw Exception(ErrorCodes::BAD_ARGUMENTS, diff --git a/src/Interpreters/WindowDescription.h b/src/Interpreters/WindowDescription.h index faad4649f91..70a4e0e44e0 100644 --- a/src/Interpreters/WindowDescription.h +++ b/src/Interpreters/WindowDescription.h @@ -44,14 +44,13 @@ struct WindowFrame // Offset might be both preceding and following, controlled by begin_preceding, // but the offset value must be positive. BoundaryType begin_type = BoundaryType::Unbounded; - // This should have been a Field but I'm getting some crazy linker errors. - int64_t begin_offset = 0; + Field begin_offset = 0; bool begin_preceding = true; // Here as well, Unbounded can only be UNBOUNDED FOLLOWING, and end_preceding // must be false. BoundaryType end_type = BoundaryType::Current; - int64_t end_offset = 0; + Field end_offset = 0; bool end_preceding = false; diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index d47f64cb1dc..5d124add0df 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -377,6 +377,11 @@ Field convertFieldToType(const Field & from_value, const IDataType & to_type, co else if (const auto * nullable_type = typeid_cast(&to_type)) { const IDataType & nested_type = *nullable_type->getNestedType(); + + /// NULL remains NULL after any conversion. + if (WhichDataType(nested_type).isNothing()) + return {}; + if (from_type_hint && from_type_hint->equals(nested_type)) return from_value; return convertFieldToTypeImpl(from_value, nested_type, from_type_hint); diff --git a/src/Parsers/ASTSelectQuery.cpp b/src/Parsers/ASTSelectQuery.cpp index aa5508bf190..4715c7f201b 100644 --- a/src/Parsers/ASTSelectQuery.cpp +++ b/src/Parsers/ASTSelectQuery.cpp @@ -137,8 +137,8 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F if (window()) { s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << - "WINDOW " << (s.hilite ? hilite_none : ""); - window()->formatImpl(s, state, frame); + "WINDOW" << (s.hilite ? 
hilite_none : ""); + window()->as().formatImplMultiline(s, state, frame); } if (orderBy()) diff --git a/src/Parsers/ASTWindowDefinition.cpp b/src/Parsers/ASTWindowDefinition.cpp index aee951fc1f3..35374df6177 100644 --- a/src/Parsers/ASTWindowDefinition.cpp +++ b/src/Parsers/ASTWindowDefinition.cpp @@ -35,6 +35,8 @@ String ASTWindowDefinition::getID(char) const void ASTWindowDefinition::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked format_frame) const { + format_frame.expression_list_prepend_whitespace = false; + if (partition_by) { settings.ostr << "PARTITION BY "; @@ -70,7 +72,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, } else { - settings.ostr << abs(frame.begin_offset); + settings.ostr << applyVisitor(FieldVisitorToString(), + frame.begin_offset); settings.ostr << " " << (!frame.begin_preceding ? "FOLLOWING" : "PRECEDING"); } @@ -85,7 +88,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, } else { - settings.ostr << abs(frame.end_offset); + settings.ostr << applyVisitor(FieldVisitorToString(), + frame.end_offset); settings.ostr << " " << (!frame.end_preceding ? "FOLLOWING" : "PRECEDING"); } diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index a54573432a1..913813d5486 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -581,30 +581,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p else if (parser_literal.parse(pos, ast_literal, expected)) { const Field & value = ast_literal->as().value; - if (!isInt64FieldType(value.getType())) + if ((node->frame.type == WindowFrame::FrameType::Rows + || node->frame.type == WindowFrame::FrameType::Groups) + && !(value.getType() == Field::Types::UInt64 + || (value.getType() == Field::Types::Int64 + && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Only integer frame offsets are supported, '{}' is not supported.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + WindowFrame::toString(node->frame.type), + applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); } - node->frame.begin_offset = value.get(); + node->frame.begin_offset = value; node->frame.begin_type = WindowFrame::BoundaryType::Offset; - // We can easily get a UINT64_MAX here, which doesn't even fit into - // int64_t. Not sure what checks we are going to need here after we - // support floats and dates. 
- if (node->frame.begin_offset > INT_MAX || node->frame.begin_offset < INT_MIN) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset must be between {} and {}, but {} is given", - INT_MAX, INT_MIN, node->frame.begin_offset); - } - - if (node->frame.begin_offset < 0) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame start offset must be greater than zero, {} given", - node->frame.begin_offset); - } } else { @@ -652,28 +642,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p else if (parser_literal.parse(pos, ast_literal, expected)) { const Field & value = ast_literal->as().value; - if (!isInt64FieldType(value.getType())) + if ((node->frame.type == WindowFrame::FrameType::Rows + || node->frame.type == WindowFrame::FrameType::Groups) + && !(value.getType() == Field::Types::UInt64 + || (value.getType() == Field::Types::Int64 + && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Only integer frame offsets are supported, '{}' is not supported.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + WindowFrame::toString(node->frame.type), + applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); } - node->frame.end_offset = value.get(); + node->frame.end_offset = value; node->frame.end_type = WindowFrame::BoundaryType::Offset; - - if (node->frame.end_offset > INT_MAX || node->frame.end_offset < INT_MIN) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset must be between {} and {}, but {} is given", - INT_MAX, INT_MIN, node->frame.end_offset); - } - - if (node->frame.end_offset < 0) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame end offset must be greater than zero, {} given", - node->frame.end_offset); - } } else { diff --git a/src/Parsers/Lexer.cpp b/src/Parsers/Lexer.cpp index ffa8250a3f3..1fa4d396113 100644 --- a/src/Parsers/Lexer.cpp +++ b/src/Parsers/Lexer.cpp @@ -275,7 +275,8 @@ Token Lexer::nextTokenImpl() else ++pos; } - return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end); + pos = end; + return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, pos); } } return Token(TokenType::Slash, token_begin, pos); diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 1fc51bd4112..4a5282c1e6b 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,8 @@ public: virtual ~IWindowFunction() = default; // Must insert the result for current_row. 
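// The signature change below passes the transform and the function's own index
// instead of a bare result column, so that implementations can reach both the
// input columns and the output column of the block holding current_row.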
- virtual void windowInsertResultInto(IColumn & to, const WindowTransform * transform) = 0; + virtual void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) = 0; }; // Compares ORDER BY column values at given rows to find the boundaries of frame: @@ -37,7 +39,7 @@ template static int compareValuesWithOffset(const IColumn * _compared_column, size_t compared_row, const IColumn * _reference_column, size_t reference_row, - uint64_t _offset, + const Field & _offset, bool offset_is_preceding) { // Casting the columns to the known type here makes it faster, probably @@ -46,7 +48,8 @@ static int compareValuesWithOffset(const IColumn * _compared_column, _compared_column); const auto * reference_column = assert_cast( _reference_column); - const auto offset = static_cast(_offset); + const auto offset = _offset.get(); + assert(offset >= 0); const auto compared_value_data = compared_column->getDataAt(compared_row); assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); @@ -101,6 +104,53 @@ static int compareValuesWithOffset(const IColumn * _compared_column, } } +// A specialization of compareValuesWithOffset for floats. +template +static int compareValuesWithOffsetFloat(const IColumn * _compared_column, + size_t compared_row, const IColumn * _reference_column, + size_t reference_row, + const Field & _offset, + bool offset_is_preceding) +{ + // Casting the columns to the known type here makes it faster, probably + // because the getData call can be devirtualized. + const auto * compared_column = assert_cast( + _compared_column); + const auto * reference_column = assert_cast( + _reference_column); + const auto offset = _offset.get(); + assert(offset >= 0); + + const auto compared_value_data = compared_column->getDataAt(compared_row); + assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); + auto compared_value = unalignedLoad( + compared_value_data.data); + + const auto reference_value_data = reference_column->getDataAt(reference_row); + assert(reference_value_data.size == sizeof(typename ColumnType::ValueType)); + auto reference_value = unalignedLoad( + reference_value_data.data); + + // Floats overflow to Inf and the comparison will work normally, so we don't + // have to do anything. + if (offset_is_preceding) + { + reference_value -= offset; + } + else + { + reference_value += offset; + } + + const auto result = compared_value < reference_value ? -1 + : compared_value == reference_value ? 0 : 1; + +// fmt::print(stderr, "compared {}, offset {}, reference {}, result {}\n", +// compared_value, offset, reference_value, result); + + return result; +} + // Helper macros to dispatch on type of the ORDER BY column #define APPLY_FOR_ONE_TYPE(FUNCTION, TYPE) \ else if (typeid_cast(column)) \ @@ -114,14 +164,20 @@ if (false) /* NOLINT */ \ { \ /* Do nothing, a starter condition. 
*/ \ } \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +\ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +\ +APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector) \ +\ else \ { \ throw Exception(ErrorCodes::NOT_IMPLEMENTED, \ @@ -193,9 +249,28 @@ WindowTransform::WindowTransform(const Block & input_header_, == WindowFrame::BoundaryType::Offset)) { assert(order_by_indices.size() == 1); - const IColumn * column = input_header.getByPosition( - order_by_indices[0]).column.get(); + const auto & entry = input_header.getByPosition(order_by_indices[0]); + const IColumn * column = entry.column.get(); APPLY_FOR_TYPES(compareValuesWithOffset) + + // Check that the offset type matches the window type. + // Convert the offsets to the ORDER BY column type. We can't just check + // that it matches, because e.g. the int literals are always (U)Int64, + // but the column might be Int8 and so on. + if (window_description.frame.begin_type + == WindowFrame::BoundaryType::Offset) + { + window_description.frame.begin_offset = convertFieldToTypeOrThrow( + window_description.frame.begin_offset, + *entry.type); + } + if (window_description.frame.end_type + == WindowFrame::BoundaryType::Offset) + { + window_description.frame.end_offset = convertFieldToTypeOrThrow( + window_description.frame.end_offset, + *entry.type); + } } } @@ -391,7 +466,7 @@ void WindowTransform::advanceFrameStartRowsOffset() { // Just recalculate it each time by walking blocks. const auto [moved_row, offset_left] = moveRowNumber(current_row, - window_description.frame.begin_offset + window_description.frame.begin_offset.get() * (window_description.frame.begin_preceding ? -1 : 1)); frame_start = moved_row; @@ -638,7 +713,7 @@ void WindowTransform::advanceFrameEndRowsOffset() // Walk the specified offset from the current row. The "+1" is needed // because the frame_end is a past-the-end pointer. const auto [moved_row, offset_left] = moveRowNumber(current_row, - window_description.frame.end_offset + window_description.frame.end_offset.get() * (window_description.frame.end_preceding ? -1 : 1) + 1); @@ -852,14 +927,14 @@ void WindowTransform::writeOutCurrentRow() for (size_t wi = 0; wi < workspaces.size(); ++wi) { auto & ws = workspaces[wi]; - IColumn * result_column = block.output_columns[wi].get(); if (ws.window_function_impl) { - ws.window_function_impl->windowInsertResultInto(*result_column, this); + ws.window_function_impl->windowInsertResultInto(this, wi); } else { + IColumn * result_column = block.output_columns[wi].get(); const auto * a = ws.aggregate_function.get(); auto * buf = ws.aggregate_function_state.data(); // FIXME does it also allocate the result on the arena? 
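The float specialization above leans on IEEE 754 semantics: shifting the reference value by the offset may overflow to +/-Inf, and the subsequent three-way comparison still orders values correctly. A self-contained sketch of that comparison with plain doubles (the real code reads the values out of ClickHouse columns):

#include <cassert>

/// Three-way comparison of `compared` against `reference` shifted by `offset`,
/// mirroring compareValuesWithOffsetFloat: overflow to +/-Inf is harmless here,
/// because Inf still compares correctly against any finite float.
static int compareWithOffsetSketch(double compared, double reference,
                                   double offset, bool offset_is_preceding)
{
    assert(offset >= 0);
    reference += offset_is_preceding ? -offset : offset;
    if (compared < reference)
        return -1;
    return compared == reference ? 0 : 1;
}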
@@ -1280,8 +1355,11 @@ struct WindowFunctionRank final : public WindowFunction DataTypePtr getReturnType() const override { return std::make_shared(); } - void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override { + IColumn & to = *transform->blockAt(transform->current_row) + .output_columns[function_index]; assert_cast(to).getData().push_back( transform->peer_group_start_row_number); } @@ -1297,8 +1375,11 @@ struct WindowFunctionDenseRank final : public WindowFunction DataTypePtr getReturnType() const override { return std::make_shared(); } - void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override { + IColumn & to = *transform->blockAt(transform->current_row) + .output_columns[function_index]; assert_cast(to).getData().push_back( transform->peer_group_number); } @@ -1314,13 +1395,123 @@ struct WindowFunctionRowNumber final : public WindowFunction DataTypePtr getReturnType() const override { return std::make_shared(); } - void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override { + IColumn & to = *transform->blockAt(transform->current_row) + .output_columns[function_index]; assert_cast(to).getData().push_back( transform->current_row_number); } }; +// ClickHouse-specific variant of lag/lead that respects the window frame. +template +struct WindowFunctionLagLeadInFrame final : public WindowFunction +{ + WindowFunctionLagLeadInFrame(const std::string & name_, + const DataTypes & argument_types_, const Array & parameters_) + : WindowFunction(name_, argument_types_, parameters_) + { + if (!parameters.empty()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Function {} cannot be parameterized", name_); + } + + if (argument_types.empty()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Function {} takes at least one argument", name_); + } + + if (argument_types.size() == 1) + { + return; + } + + if (!isInt64FieldType(argument_types[1]->getDefault().getType())) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Offset must be an integer, '{}' given", + argument_types[1]->getName()); + } + + if (argument_types.size() == 2) + { + return; + } + + if (!getLeastSupertype({argument_types[0], argument_types[2]})) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "The default value type '{}' is not convertible to the argument type '{}'", + argument_types[2]->getName(), + argument_types[0]->getName()); + } + + if (argument_types.size() > 3) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Function '{}' accepts at most 3 arguments, {} given", + name, argument_types.size()); + } + } + + DataTypePtr getReturnType() const override + { return argument_types[0]; } + + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override + { + const auto & current_block = transform->blockAt(transform->current_row); + IColumn & to = *current_block.output_columns[function_index]; + const auto & workspace = transform->workspaces[function_index]; + + int offset = 1; + if (argument_types.size() > 1) + { + offset = (*current_block.input_columns[ + workspace.argument_column_indices[1]])[ + transform->current_row.row].get(); + if (offset < 0) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "The offset 
for function {} must be nonnegative, {} given", + getName(), offset); + } + } + + const auto [target_row, offset_left] = transform->moveRowNumber( + transform->current_row, offset * (is_lead ? 1 : -1)); + + if (offset_left != 0 + || target_row < transform->frame_start + || transform->frame_end <= target_row) + { + // Offset is outside the frame. + if (argument_types.size() > 2) + { + // Column with default values is specified. + to.insertFrom(*current_block.input_columns[ + workspace.argument_column_indices[2]], + transform->current_row.row); + } + else + { + to.insertDefault(); + } + } + else + { + // Offset is inside the frame. + to.insertFrom(*transform->blockAt(target_row).input_columns[ + workspace.argument_column_indices[0]], + target_row.row); + } + } +}; + void registerWindowFunctions(AggregateFunctionFactory & factory) { // Why didn't I implement lag/lead yet? Because they are a mess. I imagine @@ -1332,9 +1523,10 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) // the whole partition like Postgres does, because using a linear amount // of additional memory is not an option when we have a lot of data. We must // be able to process at least the lag/lead in streaming fashion. - // Our best bet is probably rewriting, say `lag(value, offset)` to - // `any(value) over (rows between offset preceding and offset preceding)`, - // at the query planning stage. + // A partial solution for constant offsets is rewriting, say `lag(value, offset) + // to `any(value) over (rows between offset preceding and offset preceding)`. + // We also implement non-standard functions `lag/leadInFrame`, that are + // analogous to `lag/lead`, but respect the frame. // Functions like cume_dist() do require materializing the entire // partition, but it's probably also simpler to implement them by rewriting // to a (rows between unbounded preceding and unbounded following) frame, @@ -1360,6 +1552,20 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) return std::make_shared(name, argument_types, parameters); }); + + factory.registerFunction("lagInFrame", [](const std::string & name, + const DataTypes & argument_types, const Array & parameters) + { + return std::make_shared>( + name, argument_types, parameters); + }); + + factory.registerFunction("leadInFrame", [](const std::string & name, + const DataTypes & argument_types, const Array & parameters) + { + return std::make_shared>( + name, argument_types, parameters); + }); } } diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h index 5001b984e9a..882bf429c0a 100644 --- a/src/Processors/Transforms/WindowTransform.h +++ b/src/Processors/Transforms/WindowTransform.h @@ -110,7 +110,9 @@ public: Status prepare() override; void work() override; -private: + /* + * Implementation details. + */ void advancePartitionEnd(); bool arePeers(const RowNumber & x, const RowNumber & y) const; @@ -321,10 +323,7 @@ public: int (* compare_values_with_offset) ( const IColumn * compared_column, size_t compared_row, const IColumn * reference_column, size_t reference_row, - // We can make it a Field later if we need the Decimals. Now we only - // have ints and datetime, and the underlying Field type for them is - // uint64_t anyway. 
- uint64_t offset, + const Field & offset, bool offset_is_preceding); }; diff --git a/src/Server/HTTP/HTMLForm.cpp b/src/Server/HTTP/HTMLForm.cpp index ca407858c33..a00950c8e27 100644 --- a/src/Server/HTTP/HTMLForm.cpp +++ b/src/Server/HTTP/HTMLForm.cpp @@ -369,6 +369,11 @@ bool HTMLForm::MultipartReadBuffer::nextImpl() else boundary_hit = startsWith(line, boundary); + if (!line.empty()) + /// If we don't make sure that memory is contiguous then situation may happen, when part of the line is inside internal memory + /// and other part is inside sub-buffer, thus we'll be unable to setup our working buffer properly. + in.makeContinuousMemoryFromCheckpointToPos(); + in.rollbackToCheckpoint(true); /// Rolling back to checkpoint may change underlying buffers. diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index 740072e8e9f..426e4ca2138 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -107,6 +107,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe } catch (...) { + tryLogCurrentException(log); out.finalize(); } }; @@ -116,6 +117,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe if (auto [message, success] = checkAuthentication(request); success) { processQuery(request, response, used_output); + used_output.out->finalize(); LOG_DEBUG(log, "Done processing query"); } else diff --git a/src/Server/NuKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp index b283356d27d..b676331f6c0 100644 --- a/src/Server/NuKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -240,16 +240,10 @@ Poco::Timespan NuKeeperTCPHandler::receiveHandshake() throw Exception("Unexpected protocol version: " + toString(protocol_version), ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); Coordination::read(last_zxid_seen, *in); - - if (last_zxid_seen != 0) - throw Exception("Non zero last_zxid_seen is not supported", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); - Coordination::read(timeout_ms, *in); + + /// TODO Stop ignoring this value Coordination::read(previous_session_id, *in); - - if (previous_session_id != 0) - throw Exception("Non zero previous session id is not supported", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); - Coordination::read(passwd, *in); int8_t readonly; diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index ee2f7c96b5a..c3dd8346c8e 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -8,10 +8,10 @@ #include #include #include +#include #include #include #include -#include #include "IServer.h" diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/HDFS/ReadBufferFromHDFS.cpp index f3b0e3022f1..affb76314b1 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp @@ -26,7 +26,7 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl HDFSBuilderWrapper builder; HDFSFSPtr fs; - explicit ReadBufferFromHDFSImpl(const std::string & hdfs_name_, + ReadBufferFromHDFSImpl(const std::string & hdfs_name_, const Poco::Util::AbstractConfiguration & config_) : hdfs_uri(hdfs_name_), builder(createHDFSBuilder(hdfs_uri, config_)) diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 2cbc36e02fe..39f6d1f632e 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -1,8 +1,5 @@ #include -#include -#include - #include #include #include diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp 
b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 7f2a9cdb1f6..3d15681a27e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -333,40 +333,49 @@ IMergeTreeDataPart::State IMergeTreeDataPart::getState() const } -DayNum IMergeTreeDataPart::getMinDate() const +std::pair IMergeTreeDataPart::getMinMaxDate() const { if (storage.minmax_idx_date_column_pos != -1 && minmax_idx.initialized) - return DayNum(minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos].left.get()); + { + const auto & hyperrectangle = minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos]; + return {DayNum(hyperrectangle.left.get()), DayNum(hyperrectangle.right.get())}; + } else - return DayNum(); + return {}; } - -DayNum IMergeTreeDataPart::getMaxDate() const -{ - if (storage.minmax_idx_date_column_pos != -1 && minmax_idx.initialized) - return DayNum(minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos].right.get()); - else - return DayNum(); -} - -time_t IMergeTreeDataPart::getMinTime() const +std::pair IMergeTreeDataPart::getMinMaxTime() const { if (storage.minmax_idx_time_column_pos != -1 && minmax_idx.initialized) - return minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos].left.get(); + { + const auto & hyperrectangle = minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos]; + + /// The case of DateTime + if (hyperrectangle.left.getType() == Field::Types::UInt64) + { + assert(hyperrectangle.right.getType() == Field::Types::UInt64); + return {hyperrectangle.left.get(), hyperrectangle.right.get()}; + } + /// The case of DateTime64 + else if (hyperrectangle.left.getType() == Field::Types::Decimal64) + { + assert(hyperrectangle.right.getType() == Field::Types::Decimal64); + + auto left = hyperrectangle.left.get>(); + auto right = hyperrectangle.right.get>(); + + assert(left.getScale() == right.getScale()); + + return { left.getValue() / left.getScaleMultiplier(), right.getValue() / right.getScaleMultiplier() }; + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Part minmax index by time is neither DateTime nor DateTime64"); + } else - return 0; + return {}; } -time_t IMergeTreeDataPart::getMaxTime() const -{ - if (storage.minmax_idx_time_column_pos != -1 && minmax_idx.initialized) - return minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos].right.get(); - else - return 0; -} - void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns) { columns = new_columns; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 83f8c672001..92b05e5cbd2 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -155,13 +155,11 @@ public: bool contains(const IMergeTreeDataPart & other) const { return info.contains(other.info); } - /// If the partition key includes date column (a common case), these functions will return min and max values for this column. - DayNum getMinDate() const; - DayNum getMaxDate() const; + /// If the partition key includes date column (a common case), this function will return min and max values for that column. + std::pair getMinMaxDate() const; - /// otherwise, if the partition key includes dateTime column (also a common case), these functions will return min and max values for this column.
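The `DateTime64` branch of `getMinMaxTime()` above divides the stored `Decimal64` value by its scale multiplier to get whole seconds. A small worked example of that arithmetic, in plain Python:

```python
# Worked example of the DateTime64 branch of getMinMaxTime(). The minmax index
# stores a Decimal64 whose integer value is the timestamp scaled by 10^scale;
# dividing by the scale multiplier truncates it back to whole seconds, which is
# what the returned time_t pair holds.
scale = 3                          # DateTime64(3): millisecond precision
scale_multiplier = 10 ** scale     # analogous to left.getScaleMultiplier()
raw_value = 1609459200123          # 2021-01-01 00:00:00.123 as a scaled Int64

seconds = raw_value // scale_multiplier
assert seconds == 1609459200       # 2021-01-01 00:00:00; sub-second part dropped
```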
- time_t getMinTime() const; - time_t getMaxTime() const; + /// otherwise, if the partition key includes dateTime column (also a common case), this function will return min and max values for that column. + std::pair getMinMaxTime() const; bool isEmpty() const { return rows_count == 0; } diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 8f5dec8077d..6833d2e2fd4 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -444,7 +444,8 @@ bool KeyCondition::addCondition(const String & column, const Range & range) */ bool KeyCondition::getConstant(const ASTPtr & expr, Block & block_with_constants, Field & out_value, DataTypePtr & out_type) { - String column_name = expr->getColumnNameWithoutAlias(); + // Constant expr should use alias names if any + String column_name = expr->getColumnName(); if (const auto * lit = expr->as()) { @@ -607,7 +608,8 @@ bool KeyCondition::canConstantBeWrappedByMonotonicFunctions( if (strict) return false; - String expr_name = node->getColumnNameWithoutAlias(); + // Constant expr should use alias names if any + String expr_name = node->getColumnName(); const auto & sample_block = key_expr->getSampleBlock(); if (!sample_block.has(expr_name)) return false; @@ -675,7 +677,8 @@ bool KeyCondition::canConstantBeWrappedByFunctions( if (strict) return false; - String expr_name = ast->getColumnNameWithoutAlias(); + // Constant expr should use alias names if any + String expr_name = ast->getColumnName(); const auto & sample_block = key_expr->getSampleBlock(); if (!sample_block.has(expr_name)) return false; @@ -1011,6 +1014,8 @@ bool KeyCondition::isKeyPossiblyWrappedByMonotonicFunctionsImpl( * Therefore, use the full name of the expression for search. */ const auto & sample_block = key_expr->getSampleBlock(); + + // Key columns should use canonical names for index analysis String name = node->getColumnNameWithoutAlias(); auto it = key_columns.find(name); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 654262542b4..09b7dcd3a78 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -469,15 +469,19 @@ void MergeTreeData::checkPartitionKeyAndInitMinMax(const KeyDescription & new_pa DataTypes minmax_idx_columns_types = getMinMaxColumnsTypes(new_partition_key); /// Try to find the date column in columns used by the partition key (a common case). - bool encountered_date_column = false; + /// If there is no Date column, a DateTime or DateTime64 column would also suffice.
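To make the comment above concrete: after this change a `DateTime` or `DateTime64` column in the partition key also feeds the part-level min/max index, not only a `Date` column. A hedged sketch, assuming a local server and the `clickhouse_driver` client used by this PR's tests (the table name is illustrative):

```python
# Illustrative only: a DateTime64 column in the partition key now populates
# minmax_idx_time_column_pos, so parts get meaningful min/max time bounds.
from clickhouse_driver import Client

client = Client('localhost')
client.execute("""
    CREATE TABLE IF NOT EXISTS t_dt64_parts
    (
        ts DateTime64(3),
        x  UInt64
    )
    ENGINE = MergeTree
    PARTITION BY toYYYYMM(ts)  -- the minmax index is built over the raw ts column
    ORDER BY x
""")
client.execute("INSERT INTO t_dt64_parts VALUES (now64(3), 1)")

# The bounds surface through system.parts (see the StorageSystemParts changes
# further down in this diff).
rows = client.execute(
    "SELECT min_time, max_time FROM system.parts "
    "WHERE table = 't_dt64_parts' AND active")
```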
+ + bool has_date_column = false; + bool has_datetime_column = false; + for (size_t i = 0; i < minmax_idx_columns_types.size(); ++i) { - if (typeid_cast(minmax_idx_columns_types[i].get())) + if (isDate(minmax_idx_columns_types[i])) { - if (!encountered_date_column) + if (!has_date_column) { minmax_idx_date_column_pos = i; - encountered_date_column = true; + has_date_column = true; } else { @@ -486,16 +490,18 @@ void MergeTreeData::checkPartitionKeyAndInitMinMax(const KeyDescription & new_pa } } } - if (!encountered_date_column) + if (!has_date_column) { for (size_t i = 0; i < minmax_idx_columns_types.size(); ++i) { - if (typeid_cast(minmax_idx_columns_types[i].get())) + if (isDateTime(minmax_idx_columns_types[i]) + || isDateTime64(minmax_idx_columns_types[i]) + ) { - if (!encountered_date_column) + if (!has_datetime_column) { minmax_idx_time_column_pos = i; - encountered_date_column = true; + has_datetime_column = true; } else { diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 747819c77eb..96a3dba12f7 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -551,11 +551,6 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( .checksum_on_read = settings.checksum_on_read, }; - /// PREWHERE - String prewhere_column; - if (select.prewhere()) - prewhere_column = select.prewhere()->getColumnName(); - struct DataSkippingIndexAndCondition { MergeTreeIndexPtr index; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 34cac56d74c..692d2ac4b94 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -37,6 +37,8 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( : table_columns{ext::map( metadata_snapshot->getColumns().getAllPhysical(), [](const NameAndTypePair & col) { return col.name; })} , queried_columns{queried_columns_} + , sorting_key_names{NameSet( + metadata_snapshot->getSortingKey().column_names.begin(), metadata_snapshot->getSortingKey().column_names.end())} , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)} , log{log_} , column_sizes{std::move(column_sizes_)} @@ -114,12 +116,12 @@ static bool isConditionGood(const ASTPtr & condition) } -void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) const +void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool is_final) const { if (const auto * func_and = node->as(); func_and && func_and->name == "and") { for (const auto & elem : func_and->arguments->children) - analyzeImpl(res, elem); + analyzeImpl(res, elem, is_final); } else { @@ -133,7 +135,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) cond.viable = /// Condition depend on some column. Constant expressions are not moved. !cond.identifiers.empty() - && !cannotBeMoved(node) + && !cannotBeMoved(node, is_final) /// Do not take into consideration the conditions consisting only of the first primary key column && !hasPrimaryKeyAtoms(node) /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded. @@ -149,10 +151,10 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) } /// Transform conjunctions chain in WHERE expression to Conditions list. 
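The `is_final` flag threaded through the optimizer above implements one rule: under `SELECT ... FINAL`, only conditions on sorting-key columns may be moved to PREWHERE, because a non-key column can differ between the row versions that FINAL collapses, and filtering before the collapse could change the result. A sketch of the situation, with an illustrative table name and the same `clickhouse_driver` client as elsewhere:

```python
# Illustrative only: shows the kind of query the is_final restriction targets.
from clickhouse_driver import Client

client = Client('localhost')
client.execute("""
    CREATE TABLE IF NOT EXISTS t_final_opt
    (
        k UInt64,  -- sorting key: safe to move into PREWHERE under FINAL
        v UInt64   -- non-key column: may differ between row versions
    )
    ENGINE = ReplacingMergeTree ORDER BY k
""")

# 'k = 1' may move to PREWHERE even with FINAL; 'v = 2' must stay in WHERE,
# since a stale row version could match v = 2 while the collapsed row doesn't.
client.execute("SELECT * FROM t_final_opt FINAL WHERE k = 1 AND v = 2")
```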
-MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression) const +MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool is_final) const { Conditions res; - analyzeImpl(res, expression); + analyzeImpl(res, expression, is_final); return res; } @@ -183,7 +185,7 @@ void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const if (!select.where() || select.prewhere()) return; - Conditions where_conditions = analyze(select.where()); + Conditions where_conditions = analyze(select.where(), select.final()); Conditions prewhere_conditions; UInt64 total_size_of_moved_conditions = 0; @@ -300,6 +302,12 @@ bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const } +bool MergeTreeWhereOptimizer::isSortingKey(const String & column_name) const +{ + return sorting_key_names.count(column_name); +} + + bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const { const auto column_name = expr->getColumnName(); @@ -319,7 +327,7 @@ bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers } -bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const +bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool is_final) const { if (const auto * function_ptr = ptr->as()) { @@ -336,12 +344,13 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const { /// disallow moving result of ARRAY JOIN to PREWHERE if (array_joined_names.count(*opt_name) || - array_joined_names.count(Nested::extractTableName(*opt_name))) + array_joined_names.count(Nested::extractTableName(*opt_name)) || + (is_final && !isSortingKey(*opt_name))) return true; } for (const auto & child : ptr->children) - if (cannotBeMoved(child)) + if (cannotBeMoved(child, is_final)) return true; return false; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index cad77fb9eed..8fd973e9ba3 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -67,10 +67,10 @@ private: using Conditions = std::list; - void analyzeImpl(Conditions & res, const ASTPtr & node) const; + void analyzeImpl(Conditions & res, const ASTPtr & node, bool is_final) const; /// Transform conjunctions chain in WHERE expression to Conditions list. - Conditions analyze(const ASTPtr & expression) const; + Conditions analyze(const ASTPtr & expression, bool is_final) const; /// Transform Conditions list to WHERE or PREWHERE expression. static ASTPtr reconstruct(const Conditions & conditions); @@ -85,6 +85,8 @@ private: bool isPrimaryKeyAtom(const ASTPtr & ast) const; + bool isSortingKey(const String & column_name) const; + bool isConstant(const ASTPtr & expr) const; bool isSubsetOfTableColumns(const NameSet & identifiers) const; @@ -95,7 +97,7 @@ private: * * Also, disallow moving expressions with GLOBAL [NOT] IN. 
*/ - bool cannotBeMoved(const ASTPtr & ptr) const; + bool cannotBeMoved(const ASTPtr & ptr, bool is_final) const; void determineArrayJoinedNames(ASTSelectQuery & select); @@ -104,6 +106,7 @@ private: String first_primary_key_column; const StringSet table_columns; const Names queried_columns; + const NameSet sorting_key_names; const Block block_with_constants; Poco::Logger * log; std::unordered_map column_sizes; diff --git a/src/Storages/StorageFactory.cpp b/src/Storages/StorageFactory.cpp index 85f3bea9e0c..7aaec9b7e76 100644 --- a/src/Storages/StorageFactory.cpp +++ b/src/Storages/StorageFactory.cpp @@ -179,6 +179,7 @@ StoragePtr StorageFactory::get( .attach = query.attach, .has_force_restore_data_flag = has_force_restore_data_flag }; + assert(&arguments.context == &arguments.context.getGlobalContext()); auto res = storages.at(name).creator_fn(arguments); if (!empty_engine_args.empty()) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9b93d7183fd..0849f65477d 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4932,7 +4932,7 @@ bool StorageReplicatedMergeTree::waitForTableReplicaToProcessLogEntry( const auto & stop_waiting = [&]() { - bool stop_waiting_itself = waiting_itself && is_dropped; + bool stop_waiting_itself = waiting_itself && (partial_shutdown_called || is_dropped); bool stop_waiting_non_active = !wait_for_non_active && !getZooKeeper()->exists(table_zookeeper_path + "/replicas/" + replica + "/is_active"); return stop_waiting_itself || stop_waiting_non_active; }; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index ca984f9ece9..2d3879340dc 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -33,7 +33,7 @@ namespace ErrorCodes IStorageURLBase::IStorageURLBase( const Poco::URI & uri_, - const Context & context_, + const Context & /*context_*/, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -42,13 +42,10 @@ IStorageURLBase::IStorageURLBase( const String & compression_method_) : IStorage(table_id_) , uri(uri_) - , context_global(context_) , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) { - context_global.getRemoteHostFilter().checkURL(uri); - StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); @@ -237,14 +234,28 @@ Pipe IStorageURLBase::read( chooseCompressionMethod(request_uri.getPath(), compression_method))); } -BlockOutputStreamPtr IStorageURLBase::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/) +BlockOutputStreamPtr IStorageURLBase::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & context) { return std::make_shared(uri, format_name, - format_settings, metadata_snapshot->getSampleBlock(), context_global, - ConnectionTimeouts::getHTTPTimeouts(context_global), + format_settings, metadata_snapshot->getSampleBlock(), context, + ConnectionTimeouts::getHTTPTimeouts(context), chooseCompressionMethod(uri.toString(), compression_method)); } +StorageURL::StorageURL(const Poco::URI & uri_, + const StorageID & table_id_, + const String & format_name_, + const std::optional & format_settings_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + Context & context_, + const String & compression_method_) + : 
IStorageURLBase(uri_, context_, table_id_, format_name_, + format_settings_, columns_, constraints_, compression_method_) +{ + context_.getRemoteHostFilter().checkURL(uri); +} + void registerStorageURL(StorageFactory & factory) { factory.registerStorage("URL", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 21b2e3e27a1..2b2384b1043 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -45,7 +45,6 @@ protected: const String & compression_method_); Poco::URI uri; - const Context & context_global; String compression_method; String format_name; // For URL engine, we use format settings from server context + `SETTINGS` @@ -114,11 +113,7 @@ public: const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, Context & context_, - const String & compression_method_) - : IStorageURLBase(uri_, context_, table_id_, format_name_, - format_settings_, columns_, constraints_, compression_method_) - { - } + const String & compression_method_); String getName() const override { diff --git a/src/Storages/System/StorageSystemErrors.cpp b/src/Storages/System/StorageSystemErrors.cpp index 5243cb11aa3..09d0aaddb3d 100644 --- a/src/Storages/System/StorageSystemErrors.cpp +++ b/src/Storages/System/StorageSystemErrors.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -16,7 +17,7 @@ NamesAndTypesList StorageSystemErrors::getNamesAndTypes() { "value", std::make_shared() }, { "last_error_time", std::make_shared() }, { "last_error_message", std::make_shared() }, - { "last_error_stacktrace", std::make_shared() }, + { "last_error_trace", std::make_shared(std::make_shared()) }, { "remote", std::make_shared() }, }; } @@ -34,7 +35,14 @@ void StorageSystemErrors::fillData(MutableColumns & res_columns, const Context & res_columns[col_num++]->insert(error.count); res_columns[col_num++]->insert(error.error_time_ms / 1000); res_columns[col_num++]->insert(error.message); - res_columns[col_num++]->insert(error.stacktrace); + { + Array trace_array; + trace_array.reserve(error.trace.size()); + for (size_t i = 0; i < error.trace.size(); ++i) + trace_array.emplace_back(reinterpret_cast(error.trace[i])); + + res_columns[col_num++]->insert(trace_array); + } res_columns[col_num++]->insert(remote); } }; diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index eece092206d..6a643dbe1b9 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -137,14 +137,17 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(static_cast(part.use_count() - 1)); + auto min_max_date = part->getMinMaxDate(); + auto min_max_time = part->getMinMaxTime(); + if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getMinDate()); + columns[res_index++]->insert(min_max_date.first); if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getMaxDate()); + columns[res_index++]->insert(min_max_date.second); if (columns_mask[src_index++]) - columns[res_index++]->insert(static_cast(part->getMinTime())); + columns[res_index++]->insert(static_cast(min_max_time.first)); if (columns_mask[src_index++]) - columns[res_index++]->insert(static_cast(part->getMaxTime())); + columns[res_index++]->insert(static_cast(min_max_time.second)); if (columns_mask[src_index++]) columns[res_index++]->insert(part->info.partition_id); if (columns_mask[src_index++]) diff --git 
a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index 8754e424281..703de70d17f 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -32,6 +32,8 @@ StorageSystemPartsColumns::StorageSystemPartsColumns(const StorageID & table_id_ {"refcount", std::make_shared()}, {"min_date", std::make_shared()}, {"max_date", std::make_shared()}, + {"min_time", std::make_shared()}, + {"max_time", std::make_shared()}, {"partition_id", std::make_shared()}, {"min_block_number", std::make_shared()}, {"max_block_number", std::make_shared()}, @@ -95,8 +97,10 @@ void StorageSystemPartsColumns::processNextStorage( /// For convenience, in returned refcount, don't add references that was due to local variables in this method: all_parts, active_parts. auto use_count = part.use_count() - 1; - auto min_date = part->getMinDate(); - auto max_date = part->getMaxDate(); + + auto min_max_date = part->getMinMaxDate(); + auto min_max_time = part->getMinMaxTime(); + auto index_size_in_bytes = part->getIndexSizeInBytes(); auto index_size_in_allocated_bytes = part->getIndexSizeInAllocatedBytes(); @@ -141,9 +145,14 @@ void StorageSystemPartsColumns::processNextStorage( columns[res_index++]->insert(UInt64(use_count)); if (columns_mask[src_index++]) - columns[res_index++]->insert(min_date); + columns[res_index++]->insert(min_max_date.first); if (columns_mask[src_index++]) - columns[res_index++]->insert(max_date); + columns[res_index++]->insert(min_max_date.second); + if (columns_mask[src_index++]) + columns[res_index++]->insert(static_cast(min_max_time.first)); + if (columns_mask[src_index++]) + columns[res_index++]->insert(static_cast(min_max_time.second)); + if (columns_mask[src_index++]) columns[res_index++]->insert(part->info.partition_id); if (columns_mask[src_index++]) diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index 804a5b232ec..b637838c6da 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -20,12 +20,20 @@ StoragePtr ITableFunction::execute(const ASTPtr & ast_function, const Context & ProfileEvents::increment(ProfileEvents::TableFunctionExecute); context.checkAccess(AccessType::CREATE_TEMPORARY_TABLE | StorageFactory::instance().getSourceAccessType(getStorageTypeName())); - if (cached_columns.empty() || (hasStaticStructure() && cached_columns == getActualTableStructure(context))) + if (cached_columns.empty()) return executeImpl(ast_function, context, table_name, std::move(cached_columns)); - auto get_storage = [=, tf = shared_from_this()]() -> StoragePtr + /// We have table structure, so it's CREATE AS table_function(). + /// We should use global context here because there will be no query context on server startup + /// and because storage lifetime is bigger than query context lifetime. 
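The comment above refers to tables created with `CREATE TABLE ... AS table_function(...)`: the resulting storage outlives the query that created it and must be re-creatable at server startup, when only the global context exists. A hypothetical sketch of that usage pattern (table name illustrative; `numbers(10)` is just one example of the form):

```python
# Illustrative only: the table below is backed by a table function, so its
# underlying storage is constructed lazily from the *global* context rather
# than the query context of the CREATE statement.
from clickhouse_driver import Client

client = Client('localhost')
client.execute("CREATE TABLE IF NOT EXISTS t_from_tf AS numbers(10)")

# After a server restart there is no query context for t_from_tf; re-attaching
# it relies on the global context captured in ITableFunction::execute().
print(client.execute("SELECT count() FROM t_from_tf"))
```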
+ const Context & global_context = context.getGlobalContext(); + if (hasStaticStructure() && cached_columns == getActualTableStructure(context)) + return executeImpl(ast_function, global_context, table_name, std::move(cached_columns)); + + auto this_table_function = shared_from_this(); + auto get_storage = [=, &global_context]() -> StoragePtr { - return tf->executeImpl(ast_function, context, table_name, cached_columns); + return this_table_function->executeImpl(ast_function, global_context, table_name, cached_columns); }; /// It will request actual table structure and create underlying storage lazily diff --git a/src/TableFunctions/ITableFunctionXDBC.cpp b/src/TableFunctions/ITableFunctionXDBC.cpp index e04a86b5abf..21c78d199db 100644 --- a/src/TableFunctions/ITableFunctionXDBC.cpp +++ b/src/TableFunctions/ITableFunctionXDBC.cpp @@ -55,15 +55,21 @@ void ITableFunctionXDBC::parseArguments(const ASTPtr & ast_function, const Conte connection_string = args[0]->as().value.safeGet(); remote_table_name = args[1]->as().value.safeGet(); } +} - /// Have to const_cast, because bridges store their commands inside context - helper = createBridgeHelper(const_cast(context), context.getSettingsRef().http_receive_timeout.value, connection_string); - helper->startBridgeSync(); +void ITableFunctionXDBC::startBridgeIfNot(const Context & context) const +{ + if (!helper) + { + /// Have to const_cast, because bridges store their commands inside context + helper = createBridgeHelper(const_cast(context), context.getSettingsRef().http_receive_timeout.value, connection_string); + helper->startBridgeSync(); + } } ColumnsDescription ITableFunctionXDBC::getActualTableStructure(const Context & context) const { - assert(helper); + startBridgeIfNot(context); /* Infer external table structure */ Poco::URI columns_info_uri = helper->getColumnsInfoURI(); @@ -87,7 +93,7 @@ ColumnsDescription ITableFunctionXDBC::getActualTableStructure(const Context & c StoragePtr ITableFunctionXDBC::executeImpl(const ASTPtr & /*ast_function*/, const Context & context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - assert(helper); + startBridgeIfNot(context); auto columns = getActualTableStructure(context); auto result = std::make_shared(StorageID(getDatabaseName(), table_name), schema_name, remote_table_name, columns, context, helper); result->startup(); diff --git a/src/TableFunctions/ITableFunctionXDBC.h b/src/TableFunctions/ITableFunctionXDBC.h index fb0a0fd1185..f3ff64c2f2d 100644 --- a/src/TableFunctions/ITableFunctionXDBC.h +++ b/src/TableFunctions/ITableFunctionXDBC.h @@ -29,10 +29,12 @@ private: void parseArguments(const ASTPtr & ast_function, const Context & context) override; + void startBridgeIfNot(const Context & context) const; + String connection_string; String schema_name; String remote_table_name; - BridgeHelperPtr helper; + mutable BridgeHelperPtr helper; }; class TableFunctionJDBC : public ITableFunctionXDBC diff --git a/tests/integration/helpers/dictionary.py b/tests/integration/helpers/dictionary.py index b3f7a729777..41d87180c8a 100644 --- a/tests/integration/helpers/dictionary.py +++ b/tests/integration/helpers/dictionary.py @@ -7,12 +7,12 @@ class Layout(object): 'flat': '', 'hashed': '', 'cache': '128', - 'ssd_cache': '/etc/clickhouse/dictionaries/all128', + 'ssd_cache': '/etc/clickhouse/dictionaries/all', 'complex_key_hashed': '', 'complex_key_hashed_one_key': '', 'complex_key_hashed_two_keys': '', 'complex_key_cache': '128', - 'complex_key_ssd_cache': 
'/etc/clickhouse/dictionaries/all128', + 'complex_key_ssd_cache': '/etc/clickhouse/dictionaries/all', 'range_hashed': '', 'direct': '', 'complex_key_direct': '' diff --git a/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml b/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml index 85f811d2d85..c8fdbcbe0ef 100644 --- a/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml +++ b/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml @@ -42,7 +42,6 @@ 131072 1048576 /etc/clickhouse/dictionaries/radars - 1048576 1 diff --git a/tests/integration/test_dictionaries_postgresql/test.py b/tests/integration/test_dictionaries_postgresql/test.py index 5ceb6496b90..10d9f4213e1 100644 --- a/tests/integration/test_dictionaries_postgresql/test.py +++ b/tests/integration/test_dictionaries_postgresql/test.py @@ -80,7 +80,7 @@ def test_load_dictionaries(started_cluster): create_dict(table_name) dict_name = 'dict0' - node1.query("SYSTEM RELOAD DICTIONARIES") + node1.query("SYSTEM RELOAD DICTIONARY {}".format(dict_name)) assert node1.query("SELECT count() FROM `test`.`dict_table_{}`".format(table_name)).rstrip() == '10000' assert node1.query("SELECT dictGetUInt32('{}', 'id', toUInt64(0))".format(dict_name)) == '0\n' assert node1.query("SELECT dictGetUInt32('{}', 'value', toUInt64(9999))".format(dict_name)) == '9999\n' diff --git a/tests/integration/test_dictionaries_update_and_reload/test.py b/tests/integration/test_dictionaries_update_and_reload/test.py index 5c8abcda38e..533a29dc245 100644 --- a/tests/integration/test_dictionaries_update_and_reload/test.py +++ b/tests/integration/test_dictionaries_update_and_reload/test.py @@ -141,7 +141,8 @@ def test_reload_after_loading(started_cluster): time.sleep(1) # see the comment above replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '81', '82') replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '101', '102') - query("SYSTEM RELOAD DICTIONARIES") + query("SYSTEM RELOAD DICTIONARY 'file'") + query("SYSTEM RELOAD DICTIONARY 'executable'") assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "82\n" assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "102\n" diff --git a/tests/integration/test_distributed_inter_server_secret/test.py b/tests/integration/test_distributed_inter_server_secret/test.py index b1daf2271d0..1a0e5a3dd91 100644 --- a/tests/integration/test_distributed_inter_server_secret/test.py +++ b/tests/integration/test_distributed_inter_server_secret/test.py @@ -97,12 +97,14 @@ def test_insecure(): n1.query('SELECT * FROM dist_insecure') def test_insecure_insert_async(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_insecure SELECT * FROM numbers(2)') n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER insecure dist_insecure') assert int(n1.query('SELECT count() FROM dist_insecure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER insecure') def test_insecure_insert_sync(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_insecure SELECT * FROM numbers(2)', settings={'insert_distributed_sync': 1}) assert int(n1.query('SELECT count() FROM dist_insecure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER secure') @@ -111,12 +113,14 @@ def test_secure(): n1.query('SELECT * FROM dist_secure') def 
test_secure_insert_async(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure SELECT * FROM numbers(2)') n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER secure dist_secure') assert int(n1.query('SELECT count() FROM dist_secure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER secure') def test_secure_insert_sync(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure SELECT * FROM numbers(2)', settings={'insert_distributed_sync': 1}) assert int(n1.query('SELECT count() FROM dist_secure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER secure') @@ -126,6 +130,7 @@ def test_secure_insert_sync(): # Buffer() flush happens with global context, that does not have user # And so Context::user/ClientInfo::current_user/ClientInfo::initial_user will be empty def test_secure_insert_buffer_async(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure_buffer SELECT * FROM numbers(2)') n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER secure dist_secure') # no Buffer flush happened @@ -141,6 +146,7 @@ def test_secure_disagree(): n1.query('SELECT * FROM dist_secure_disagree') def test_secure_disagree_insert(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure_disagree SELECT * FROM numbers(2)') with pytest.raises(QueryRuntimeException, match='.*Hash mismatch.*'): n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER secure_disagree dist_secure_disagree') diff --git a/tests/integration/test_max_http_connections_for_replication/test.py b/tests/integration/test_max_http_connections_for_replication/test.py index 2dc4e2a8810..634697c8668 100644 --- a/tests/integration/test_max_http_connections_for_replication/test.py +++ b/tests/integration/test_max_http_connections_for_replication/test.py @@ -43,6 +43,8 @@ def start_small_cluster(): def test_single_endpoint_connections_count(start_small_cluster): + node1.query("TRUNCATE TABLE test_table") + node2.query("SYSTEM SYNC REPLICA test_table") def task(count): print(("Inserting ten times from {}".format(count))) for i in range(count, count + 10): @@ -58,9 +60,11 @@ def test_single_endpoint_connections_count(start_small_cluster): def test_keepalive_timeout(start_small_cluster): - current_count = int(node1.query("select count() from test_table").strip()) + node1.query("TRUNCATE TABLE test_table") + node2.query("SYSTEM SYNC REPLICA test_table") + node1.query("insert into test_table values ('2017-06-16', 777, 0)") - assert_eq_with_retry(node2, "select count() from test_table", str(current_count + 1)) + assert_eq_with_retry(node2, "select count() from test_table", str(1)) # Server keepAliveTimeout is 3 seconds, default client session timeout is 8 # lets sleep in that interval time.sleep(4) @@ -69,7 +73,7 @@ def test_keepalive_timeout(start_small_cluster): time.sleep(3) - assert_eq_with_retry(node2, "select count() from test_table", str(current_count + 2)) + assert_eq_with_retry(node2, "select count() from test_table", str(2)) assert not node2.contains_in_log("No message received"), "Found 'No message received' in clickhouse-server.log" diff --git a/tests/integration/test_odbc_interaction/test.py b/tests/integration/test_odbc_interaction/test.py index 6bb6a6ee777..2ef71927bdf 100644 --- a/tests/integration/test_odbc_interaction/test.py +++ b/tests/integration/test_odbc_interaction/test.py @@ -74,6 +74,9 @@ def started_cluster(): node1.exec_in_container( ["bash", "-c", "echo 'CREATE TABLE t4(X INTEGER PRIMARY KEY ASC, Y, Z);' | sqlite3 {}".format(sqlite_db)], privileged=True, user='root') + node1.exec_in_container( + 
["bash", "-c", "echo 'CREATE TABLE tf1(x INTEGER PRIMARY KEY ASC, y, z);' | sqlite3 {}".format(sqlite_db)], + privileged=True, user='root') print("sqlite tables created") mysql_conn = get_mysql_conn() print("mysql connection received") @@ -177,6 +180,21 @@ def test_sqlite_simple_select_function_works(started_cluster): assert node1.query( "select count(), sum(x) from odbc('DSN={}', '{}') group by x".format(sqlite_setup["DSN"], 't1')) == "1\t1\n" +def test_sqlite_table_function(started_cluster): + sqlite_setup = node1.odbc_drivers["SQLite3"] + sqlite_db = sqlite_setup["Database"] + + node1.exec_in_container(["bash", "-c", "echo 'INSERT INTO tf1 values(1, 2, 3);' | sqlite3 {}".format(sqlite_db)], + privileged=True, user='root') + node1.query("create table odbc_tf as odbc('DSN={}', '{}')".format(sqlite_setup["DSN"], 'tf1')) + assert node1.query("select * from odbc_tf") == "1\t2\t3\n" + + assert node1.query("select y from odbc_tf") == "2\n" + assert node1.query("select z from odbc_tf") == "3\n" + assert node1.query("select x from odbc_tf") == "1\n" + assert node1.query("select x, y from odbc_tf") == "1\t2\n" + assert node1.query("select z, x, y from odbc_tf") == "3\t1\t2\n" + assert node1.query("select count(), sum(x) from odbc_tf group by x") == "1\t1\n" def test_sqlite_simple_select_storage_works(started_cluster): sqlite_setup = node1.odbc_drivers["SQLite3"] @@ -342,6 +360,7 @@ def test_bridge_dies_with_parent(started_cluster): assert clickhouse_pid is None assert bridge_pid is None + node1.start_clickhouse(20) def test_odbc_postgres_date_data_type(started_cluster): diff --git a/tests/integration/test_secure_socket/__init__.py b/tests/integration/test_secure_socket/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml b/tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml new file mode 100644 index 00000000000..0c109d6d768 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml @@ -0,0 +1,14 @@ + + 9440 + + + + + node2 + 9440 + 1 + + + + + diff --git a/tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml b/tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml new file mode 100644 index 00000000000..fe39e3712b8 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml @@ -0,0 +1,18 @@ + + + + /etc/clickhouse-server/config.d/server.crt + /etc/clickhouse-server/config.d/server.key + /etc/clickhouse-server/config.d/dhparam.pem + none + true + + + true + none + + AcceptCertificateHandler + + + + diff --git a/tests/integration/test_secure_socket/configs_secure/dhparam.pem b/tests/integration/test_secure_socket/configs_secure/dhparam.pem new file mode 100644 index 00000000000..2e6cee0798d --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/dhparam.pem @@ -0,0 +1,8 @@ +-----BEGIN DH PARAMETERS----- +MIIBCAKCAQEAua92DDli13gJ+//ZXyGaggjIuidqB0crXfhUlsrBk9BV1hH3i7fR +XGP9rUdk2ubnB3k2ejBStL5oBrkHm9SzUFSQHqfDjLZjKoUpOEmuDc4cHvX1XTR5 +Pr1vf5cd0yEncJWG5W4zyUB8k++SUdL2qaeslSs+f491HBLDYn/h8zCgRbBvxhxb +9qeho1xcbnWeqkN6Kc9bgGozA16P9NLuuLttNnOblkH+lMBf42BSne/TWt3AlGZf +slKmmZcySUhF8aKfJnLKbkBCFqOtFRh8zBA9a7g+BT/lSANATCDPaAk1YVih2EKb +dpc3briTDbRsiqg2JKMI7+VdULY9bh3EawIBAg== +-----END DH PARAMETERS----- diff --git a/tests/integration/test_secure_socket/configs_secure/server.crt b/tests/integration/test_secure_socket/configs_secure/server.crt new 
file mode 100644 index 00000000000..7ade2d96273 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/server.crt @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- +MIIC/TCCAeWgAwIBAgIJANjx1QSR77HBMA0GCSqGSIb3DQEBCwUAMBQxEjAQBgNV +BAMMCWxvY2FsaG9zdDAgFw0xODA3MzAxODE2MDhaGA8yMjkyMDUxNDE4MTYwOFow +FDESMBAGA1UEAwwJbG9jYWxob3N0MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB +CgKCAQEAs9uSo6lJG8o8pw0fbVGVu0tPOljSWcVSXH9uiJBwlZLQnhN4SFSFohfI +4K8U1tBDTnxPLUo/V1K9yzoLiRDGMkwVj6+4+hE2udS2ePTQv5oaMeJ9wrs+5c9T +4pOtlq3pLAdm04ZMB1nbrEysceVudHRkQbGHzHp6VG29Fw7Ga6YpqyHQihRmEkTU +7UCYNA+Vk7aDPdMS/khweyTpXYZimaK9f0ECU3/VOeG3fH6Sp2X6FN4tUj/aFXEj +sRmU5G2TlYiSIUMF2JPdhSihfk1hJVALrHPTU38SOL+GyyBRWdNcrIwVwbpvsvPg +pryMSNxnpr0AK0dFhjwnupIv5hJIOQIDAQABo1AwTjAdBgNVHQ4EFgQUjPLb3uYC +kcamyZHK4/EV8jAP0wQwHwYDVR0jBBgwFoAUjPLb3uYCkcamyZHK4/EV8jAP0wQw +DAYDVR0TBAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAM/ocuDvfPus/KpMVD51j +4IdlU8R0vmnYLQ+ygzOAo7+hUWP5j0yvq4ILWNmQX6HNvUggCgFv9bjwDFhb/5Vr +85ieWfTd9+LTjrOzTw4avdGwpX9G+6jJJSSq15tw5ElOIFb/qNA9O4dBiu8vn03C +L/zRSXrARhSqTW5w/tZkUcSTT+M5h28+Lgn9ysx4Ff5vi44LJ1NnrbJbEAIYsAAD ++UA+4MBFKx1r6hHINULev8+lCfkpwIaeS8RL+op4fr6kQPxnULw8wT8gkuc8I4+L +P9gg/xDHB44T3ADGZ5Ib6O0DJaNiToO6rnoaaxs0KkotbvDWvRoxEytSbXKoYjYp +0g== +-----END CERTIFICATE----- diff --git a/tests/integration/test_secure_socket/configs_secure/server.key b/tests/integration/test_secure_socket/configs_secure/server.key new file mode 100644 index 00000000000..f0fb61ac443 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/server.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQCz25KjqUkbyjyn +DR9tUZW7S086WNJZxVJcf26IkHCVktCeE3hIVIWiF8jgrxTW0ENOfE8tSj9XUr3L +OguJEMYyTBWPr7j6ETa51LZ49NC/mhox4n3Cuz7lz1Pik62WreksB2bThkwHWdus +TKxx5W50dGRBsYfMenpUbb0XDsZrpimrIdCKFGYSRNTtQJg0D5WTtoM90xL+SHB7 +JOldhmKZor1/QQJTf9U54bd8fpKnZfoU3i1SP9oVcSOxGZTkbZOViJIhQwXYk92F +KKF+TWElUAusc9NTfxI4v4bLIFFZ01ysjBXBum+y8+CmvIxI3GemvQArR0WGPCe6 +ki/mEkg5AgMBAAECggEATrbIBIxwDJOD2/BoUqWkDCY3dGevF8697vFuZKIiQ7PP +TX9j4vPq0DfsmDjHvAPFkTHiTQXzlroFik3LAp+uvhCCVzImmHq0IrwvZ9xtB43f +7Pkc5P6h1l3Ybo8HJ6zRIY3TuLtLxuPSuiOMTQSGRL0zq3SQ5DKuGwkz+kVjHXUN +MR2TECFwMHKQ5VLrC+7PMpsJYyOMlDAWhRfUalxC55xOXTpaN8TxNnwQ8K2ISVY5 +212Jz/a4hn4LdwxSz3Tiu95PN072K87HLWx3EdT6vW4Ge5P/A3y+smIuNAlanMnu +plHBRtpATLiTxZt/n6npyrfQVbYjSH7KWhB8hBHtaQKBgQDh9Cq1c/KtqDtE0Ccr +/r9tZNTUwBE6VP+3OJeKdEdtsfuxjOCkS1oAjgBJiSDOiWPh1DdoDeVZjPKq6pIu +Mq12OE3Doa8znfCXGbkSzEKOb2unKZMJxzrz99kXt40W5DtrqKPNb24CNqTiY8Aa +CjtcX+3weat82VRXvph6U8ltMwKBgQDLxjiQQzNoY7qvg7CwJCjf9qq8jmLK766g +1FHXopqS+dTxDLM8eJSRrpmxGWJvNeNc1uPhsKsKgotqAMdBUQTf7rSTbt4MyoH5 +bUcRLtr+0QTK9hDWMOOvleqNXha68vATkohWYfCueNsC60qD44o8RZAS6UNy3ENq +cM1cxqe84wKBgQDKkHutWnooJtajlTxY27O/nZKT/HA1bDgniMuKaz4R4Gr1PIez +on3YW3V0d0P7BP6PWRIm7bY79vkiMtLEKdiKUGWeyZdo3eHvhDb/3DCawtau8L2K +GZsHVp2//mS1Lfz7Qh8/L/NedqCQ+L4iWiPnZ3THjjwn3CoZ05ucpvrAMwKBgB54 +nay039MUVq44Owub3KDg+dcIU62U+cAC/9oG7qZbxYPmKkc4oL7IJSNecGHA5SbU +2268RFdl/gLz6tfRjbEOuOHzCjFPdvAdbysanpTMHLNc6FefJ+zxtgk9sJh0C4Jh +vxFrw9nTKKzfEl12gQ1SOaEaUIO0fEBGbe8ZpauRAoGAMAlGV+2/K4ebvAJKOVTa +dKAzQ+TD2SJmeR1HZmKDYddNqwtZlzg3v4ZhCk4eaUmGeC1Bdh8MDuB3QQvXz4Dr +vOIP4UVaOr+uM+7TgAgVnP4/K6IeJGzUDhX93pmpWhODfdu/oojEKVcpCojmEmS1 +KCBtmIrQLqzMpnBpLNuSY+Q= +-----END PRIVATE KEY----- diff --git a/tests/integration/test_secure_socket/configs_secure/users.d/users.xml b/tests/integration/test_secure_socket/configs_secure/users.d/users.xml new file mode 100644 index 00000000000..479017f6370 --- /dev/null +++ 
b/tests/integration/test_secure_socket/configs_secure/users.d/users.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py new file mode 100644 index 00000000000..c2bad80bca0 --- /dev/null +++ b/tests/integration/test_secure_socket/test.py @@ -0,0 +1,84 @@ +import os.path +import time + +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV + +cluster = ClickHouseCluster(__file__) + +NODES = {'node' + str(i): None for i in (1, 2)} + +config = ''' + + + {sleep_in_send_data_ms} + + +''' + + +@pytest.fixture(scope="module") +def started_cluster(): + cluster.__with_ssl_config = True + main_configs = [ + "configs_secure/config.d/remote_servers.xml", + "configs_secure/server.crt", + "configs_secure/server.key", + "configs_secure/dhparam.pem", + "configs_secure/config.d/ssl_conf.xml", + ] + + NODES['node1'] = cluster.add_instance('node1', main_configs=main_configs) + NODES['node2'] = cluster.add_instance('node2', main_configs=main_configs, user_configs=["configs_secure/users.d/users.xml"]) + + try: + cluster.start() + NODES['node2'].query("CREATE TABLE base_table (x UInt64) ENGINE = MergeTree ORDER BY x;") + NODES['node2'].query("INSERT INTO base_table VALUES (5);") + NODES['node1'].query("CREATE TABLE distributed_table (x UInt64) ENGINE = Distributed(test_cluster, default, base_table);") + + yield cluster + + finally: + cluster.shutdown() + + +def test(started_cluster): + NODES['node2'].replace_config('/etc/clickhouse-server/users.d/users.xml', config.format(sleep_in_send_data_ms=1000000)) + + attempts = 0 + while attempts < 1000: + setting = NODES['node2'].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data_ms'") + if int(setting) == 1000000: + break + time.sleep(0.1) + attempts += 1 + + assert attempts < 1000 + + + start = time.time() + NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0, async_socket_for_remote=0;') + end = time.time() + assert end - start < 10 + + start = time.time() + error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0;') + end = time.time() + + assert end - start < 10 + + # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl(). + assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1 + + start = time.time() + error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5;') + end = time.time() + + assert end - start < 10 + + # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl(). 
+ assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1 + + diff --git a/tests/integration/test_system_clusters_actual_information/configs/users.xml b/tests/integration/test_system_clusters_actual_information/configs/users.xml deleted file mode 100644 index 156cd3a6b59..00000000000 --- a/tests/integration/test_system_clusters_actual_information/configs/users.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - 5 - - - diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index 389e249790f..67614b88029 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -396,6 +396,10 @@ def test_ttl_compatibility(started_cluster, node_left, node_right, num_run): node_right.query("OPTIMIZE TABLE test_ttl_group_by FINAL") node_right.query("OPTIMIZE TABLE test_ttl_where FINAL") + node_left.query("SYSTEM SYNC REPLICA test_ttl_delete", timeout=20) + node_left.query("SYSTEM SYNC REPLICA test_ttl_group_by", timeout=20) + node_left.query("SYSTEM SYNC REPLICA test_ttl_where", timeout=20) + assert node_left.query("SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" assert node_right.query("SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" diff --git a/tests/jepsen.nukeeper/.gitignore b/tests/jepsen.nukeeper/.gitignore new file mode 100644 index 00000000000..d956ab0a125 --- /dev/null +++ b/tests/jepsen.nukeeper/.gitignore @@ -0,0 +1,13 @@ +/target +/classes +/checkouts +profiles.clj +pom.xml +pom.xml.asc +*.jar +*.class +/.lein-* +/.nrepl-port +/.prepl-port +.hgignore +.hg/ diff --git a/tests/jepsen.nukeeper/LICENSE b/tests/jepsen.nukeeper/LICENSE new file mode 100644 index 00000000000..231512650b9 --- /dev/null +++ b/tests/jepsen.nukeeper/LICENSE @@ -0,0 +1,280 @@ +Eclipse Public License - v 2.0 + + THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE + PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION + OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + +1. DEFINITIONS + +"Contribution" means: + + a) in the case of the initial Contributor, the initial content + Distributed under this Agreement, and + + b) in the case of each subsequent Contributor: + i) changes to the Program, and + ii) additions to the Program; + where such changes and/or additions to the Program originate from + and are Distributed by that particular Contributor. A Contribution + "originates" from a Contributor if it was added to the Program by + such Contributor itself or anyone acting on such Contributor's behalf. + Contributions do not include changes or additions to the Program that + are not Modified Works. + +"Contributor" means any person or entity that Distributes the Program. + +"Licensed Patents" mean patent claims licensable by a Contributor which +are necessarily infringed by the use or sale of its Contribution alone +or when combined with the Program. + +"Program" means the Contributions Distributed in accordance with this +Agreement. + +"Recipient" means anyone who receives the Program under this Agreement +or any Secondary License (as applicable), including Contributors. + +"Derivative Works" shall mean any work, whether in Source Code or other +form, that is based on (or derived from) the Program and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. 
+ +"Modified Works" shall mean any work in Source Code or other form that +results from an addition to, deletion from, or modification of the +contents of the Program, including, for purposes of clarity any new file +in Source Code form that contains any contents of the Program. Modified +Works shall not include works that contain only declarations, +interfaces, types, classes, structures, or files of the Program solely +in each case in order to link to, bind by name, or subclass the Program +or Modified Works thereof. + +"Distribute" means the acts of a) distributing or b) making available +in any manner that enables the transfer of a copy. + +"Source Code" means the form of a Program preferred for making +modifications, including but not limited to software source code, +documentation source, and configuration files. + +"Secondary License" means either the GNU General Public License, +Version 2.0, or any later versions of that license, including any +exceptions or additional permissions as identified by the initial +Contributor. + +2. GRANT OF RIGHTS + + a) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free copyright + license to reproduce, prepare Derivative Works of, publicly display, + publicly perform, Distribute and sublicense the Contribution of such + Contributor, if any, and such Derivative Works. + + b) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free patent + license under Licensed Patents to make, use, sell, offer to sell, + import and otherwise transfer the Contribution of such Contributor, + if any, in Source Code or other form. This patent license shall + apply to the combination of the Contribution and the Program if, at + the time the Contribution is added by the Contributor, such addition + of the Contribution causes such combination to be covered by the + Licensed Patents. The patent license shall not apply to any other + combinations which include the Contribution. No hardware per se is + licensed hereunder. + + c) Recipient understands that although each Contributor grants the + licenses to its Contributions set forth herein, no assurances are + provided by any Contributor that the Program does not infringe the + patent or other intellectual property rights of any other entity. + Each Contributor disclaims any liability to Recipient for claims + brought by any other entity based on infringement of intellectual + property rights or otherwise. As a condition to exercising the + rights and licenses granted hereunder, each Recipient hereby + assumes sole responsibility to secure any other intellectual + property rights needed, if any. For example, if a third party + patent license is required to allow Recipient to Distribute the + Program, it is Recipient's responsibility to acquire that license + before distributing the Program. + + d) Each Contributor represents that to its knowledge it has + sufficient copyright rights in its Contribution, if any, to grant + the copyright license set forth in this Agreement. + + e) Notwithstanding the terms of any Secondary License, no + Contributor makes additional grants to any Recipient (other than + those set forth in this Agreement) as a result of such Recipient's + receipt of the Program under the terms of a Secondary License + (if permitted under the terms of Section 3). + +3. 
REQUIREMENTS + +3.1 If a Contributor Distributes the Program in any form, then: + + a) the Program must also be made available as Source Code, in + accordance with section 3.2, and the Contributor must accompany + the Program with a statement that the Source Code for the Program + is available under this Agreement, and informs Recipients how to + obtain it in a reasonable manner on or through a medium customarily + used for software exchange; and + + b) the Contributor may Distribute the Program under a license + different than this Agreement, provided that such license: + i) effectively disclaims on behalf of all other Contributors all + warranties and conditions, express and implied, including + warranties or conditions of title and non-infringement, and + implied warranties or conditions of merchantability and fitness + for a particular purpose; + + ii) effectively excludes on behalf of all other Contributors all + liability for damages, including direct, indirect, special, + incidental and consequential damages, such as lost profits; + + iii) does not attempt to limit or alter the recipients' rights + in the Source Code under section 3.2; and + + iv) requires any subsequent distribution of the Program by any + party to be under a license that satisfies the requirements + of this section 3. + +3.2 When the Program is Distributed as Source Code: + + a) it must be made available under this Agreement, or if the + Program (i) is combined with other material in a separate file or + files made available under a Secondary License, and (ii) the initial + Contributor attached to the Source Code the notice described in + Exhibit A of this Agreement, then the Program may be made available + under the terms of such Secondary Licenses, and + + b) a copy of this Agreement must be included with each copy of + the Program. + +3.3 Contributors may not remove or alter any copyright, patent, +trademark, attribution notices, disclaimers of warranty, or limitations +of liability ("notices") contained within the Program from any copy of +the Program which they Distribute, provided that Contributors may add +their own appropriate notices. + +4. COMMERCIAL DISTRIBUTION + +Commercial distributors of software may accept certain responsibilities +with respect to end users, business partners and the like. While this +license is intended to facilitate the commercial use of the Program, +the Contributor who includes the Program in a commercial product +offering should do so in a manner which does not create potential +liability for other Contributors. Therefore, if a Contributor includes +the Program in a commercial product offering, such Contributor +("Commercial Contributor") hereby agrees to defend and indemnify every +other Contributor ("Indemnified Contributor") against any losses, +damages and costs (collectively "Losses") arising from claims, lawsuits +and other legal actions brought by a third party against the Indemnified +Contributor to the extent caused by the acts or omissions of such +Commercial Contributor in connection with its distribution of the Program +in a commercial product offering. The obligations in this section do not +apply to any claims or Losses relating to any actual or alleged +intellectual property infringement. 
In order to qualify, an Indemnified +Contributor must: a) promptly notify the Commercial Contributor in +writing of such claim, and b) allow the Commercial Contributor to control, +and cooperate with the Commercial Contributor in, the defense and any +related settlement negotiations. The Indemnified Contributor may +participate in any such claim at its own expense. + +For example, a Contributor might include the Program in a commercial +product offering, Product X. That Contributor is then a Commercial +Contributor. If that Commercial Contributor then makes performance +claims, or offers warranties related to Product X, those performance +claims and warranties are such Commercial Contributor's responsibility +alone. Under this section, the Commercial Contributor would have to +defend claims against the other Contributors related to those performance +claims and warranties, and if a court requires any other Contributor to +pay any damages as a result, the Commercial Contributor must pay +those damages. + +5. NO WARRANTY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, THE PROGRAM IS PROVIDED ON AN "AS IS" +BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR +IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF +TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR +PURPOSE. Each Recipient is solely responsible for determining the +appropriateness of using and distributing the Program and assumes all +risks associated with its exercise of rights under this Agreement, +including but not limited to the risks and costs of program errors, +compliance with applicable laws, damage to or loss of data, programs +or equipment, and unavailability or interruption of operations. + +6. DISCLAIMER OF LIABILITY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, NEITHER RECIPIENT NOR ANY CONTRIBUTORS +SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST +PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE +EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + +7. GENERAL + +If any provision of this Agreement is invalid or unenforceable under +applicable law, it shall not affect the validity or enforceability of +the remainder of the terms of this Agreement, and without further +action by the parties hereto, such provision shall be reformed to the +minimum extent necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against any entity +(including a cross-claim or counterclaim in a lawsuit) alleging that the +Program itself (excluding combinations of the Program with other software +or hardware) infringes such Recipient's patent(s), then such Recipient's +rights granted under Section 2(b) shall terminate as of the date such +litigation is filed. + +All Recipient's rights under this Agreement shall terminate if it +fails to comply with any of the material terms or conditions of this +Agreement and does not cure such failure in a reasonable period of +time after becoming aware of such noncompliance. 
If all Recipient's
+rights under this Agreement terminate, Recipient agrees to cease use
+and distribution of the Program as soon as reasonably practicable.
+However, Recipient's obligations under this Agreement and any licenses
+granted by Recipient relating to the Program shall continue and survive.
+
+Everyone is permitted to copy and distribute copies of this Agreement,
+but in order to avoid inconsistency the Agreement is copyrighted and
+may only be modified in the following manner. The Agreement Steward
+reserves the right to publish new versions (including revisions) of
+this Agreement from time to time. No one other than the Agreement
+Steward has the right to modify this Agreement. The Eclipse Foundation
+is the initial Agreement Steward. The Eclipse Foundation may assign the
+responsibility to serve as the Agreement Steward to a suitable separate
+entity. Each new version of the Agreement will be given a distinguishing
+version number. The Program (including Contributions) may always be
+Distributed subject to the version of the Agreement under which it was
+received. In addition, after a new version of the Agreement is published,
+Contributor may elect to Distribute the Program (including its
+Contributions) under the new version.
+
+Except as expressly stated in Sections 2(a) and 2(b) above, Recipient
+receives no rights or licenses to the intellectual property of any
+Contributor under this Agreement, whether expressly, by implication,
+estoppel or otherwise. All rights in the Program not expressly granted
+under this Agreement are reserved. Nothing in this Agreement is intended
+to be enforceable by any entity that is not a Contributor or Recipient.
+No third-party beneficiary rights are created under this Agreement.
+
+Exhibit A - Form of Secondary Licenses Notice
+
+"This Source Code may also be made available under the following
+Secondary Licenses when the conditions for such availability set forth
+in the Eclipse Public License, v. 2.0 are satisfied: GNU General Public
+License as published by the Free Software Foundation, either version 2
+of the License, or (at your option) any later version, with the GNU
+Classpath Exception which is available at
+https://www.gnu.org/software/classpath/license.html."
+
+  Simply including a copy of this Agreement, including this Exhibit A
+  is not sufficient to license the Source Code under Secondary Licenses.
+
+  If it is not possible or desirable to put the notice in a particular
+  file, then You may include the notice in a location (such as a LICENSE
+  file in a relevant directory) where a recipient would be likely to
+  look for such a notice.
+
+  You may add additional accurate notices of copyright ownership.
diff --git a/tests/jepsen.nukeeper/README.md b/tests/jepsen.nukeeper/README.md
new file mode 100644
index 00000000000..8f3754b8f7b
--- /dev/null
+++ b/tests/jepsen.nukeeper/README.md
@@ -0,0 +1,155 @@
+# Jepsen tests for ClickHouse Keeper
+
+A Clojure library designed to test the ZooKeeper-like implementation inside ClickHouse.
+
+## Test scenarios (workloads)
+
+### CAS register
+
+The CAS register has three operations: read a number, write a number, and compare-and-swap a number. The register is simulated as a single ZooKeeper node. A read translates into ZooKeeper's `getData` request and a write into the `set` request. Compare-and-swap is implemented as `getData`, a comparison in code, then a `set` of the new value with the `version` received from `getData`; a sketch of this step follows below.
+
+In this test, we use a linearizable checker, so Jepsen validates that the history was linearizable. This is one of the heaviest workloads.
+
+Strictly requires `quorum_reads` to be true.
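+
+A minimal sketch of the compare-and-swap step, mirroring the `zk-cas` helper that this patch adds in `src/jepsen/nukeeper/utils.clj` (`zk-get-str`, `zk-set`, and `parse-long` are small helpers from the same namespace, thin wrappers around zookeeper-clj):
+
+```clojure
+(defn zk-cas
+  "Optimistic CAS: read the node, compare in code, then write back with
+  the version we read. A concurrent writer bumps the version, so our
+  versioned set fails instead of clobbering the newer value."
+  [conn path old-value new-value]
+  (let [current (zk-get-str conn path)]
+    (when (= (parse-long (:data current)) old-value)
+      (zk-set conn path new-value (:version (:stat current)))
+      true)))
+```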
+
+### Set
+
+Set has two operations: add a number to the set and read all values from the set. This workload is simulated on a single ZooKeeper node with a string value that represents a Clojure set data structure. The add operation is very similar to compare-and-swap: we read the string value from the ZooKeeper node with `getData`, parse it into a Clojure set, add the new value to the set, and try to write it back with the received version.
+
+In this test, Jepsen validates that all successfully added values can be read. The generator for this workload performs only add operations until a timeout, and after that tries to read the set once.
+
+### Unique IDs
+
+In the Unique IDs workload we have only one operation: generate a new unique number. It is implemented using ZooKeeper's sequential nodes. For each generate request, the client just creates a new sequential node in ZooKeeper with a fixed prefix, then cuts the prefix off the returned path and parses the number from the remaining part.
+
+Jepsen checks that all returned IDs were unique.
+
+### Counter
+
+The counter workload has two operations: read the counter value and add some number to the counter. Its implementation is quite weird. We add a number `N` to the counter by creating `N` sequential nodes in a single ZooKeeper transaction. A counter read is implemented as a `getChildren` ZooKeeper request plus a count of all returned nodes.
+
+Jepsen checks that the counter value lies in the interval of possible values. Strictly requires `quorum_reads` to be true.
+
+### Total queue
+
+Simulates an unordered queue with three operations: enqueue a number, dequeue, and drain. The enqueue operation uses a `create` request with the node name equal to the number. The `dequeue` operation is more interesting. We list (`getChildren`) all nodes and remember the parent node version. After that we choose the smallest one and prepare the transaction: `check` the parent node version + `set` an empty value on the parent node + `delete` the smallest child node. The drain operation is just `getChildren` on the parent path.
+
+Jepsen checks that all enqueued values were dequeued or drained. Duplicates are allowed because Jepsen doesn't know the outcome of an unknown-status (`:info`) dequeue operation. So when we try to `dequeue` some element, we should return it even if our delete transaction failed with a `Connection loss` error.
+
+### Linear queue
+
+Same as the total queue, but without the drain operation. Checks linearizability between enqueue and dequeue. Validation sometimes consumes more than 10GB of memory even for very short histories.
+
+
+## Nemesis
+
+We use almost all standard nemeses with small changes for our storage.
+
+### Random node killer (random-node-killer)
+
+Sleeps for 5 seconds, kills a random node, sleeps for 5 more seconds, and starts it back.
+
+### All nodes killer (all-nodes-killer)
+
+Kills all nodes at once, sleeps for 5 seconds, and starts them back.
+
+### Simple partitioner (simple-partitioner)
+
+Partitions one node from the others using iptables. No one can see the victim and the victim cannot see anybody.
+
+### Random node stop (random-node-hammer-time)
+
+Sends `SIGSTOP` to a random node, sleeps for 5 seconds, then sends `SIGCONT`.
+
+### All nodes stop (all-nodes-hammer-time)
+
+Sends `SIGSTOP` to all nodes, sleeps for 5 seconds, then sends `SIGCONT`.
+
+### Logs corruptor (logs-corruptor)
+
+Corrupts the latest log (changes one random byte) in `clickhouse_path/coordination/logs`, then restarts the nodes. The corruption primitive is sketched below.
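+
+A sketch of that single-byte corruption, mirroring the `corrupt-file` and `random-file-pos` helpers this patch adds in `src/jepsen/nukeeper/nemesis.clj` (`c/exec` runs the command on the victim node over SSH; logging and the empty-filename guard are omitted here):
+
+```clojure
+(defn random-file-pos
+  "Pick a random byte offset inside the file."
+  [fname]
+  (rand-int (Integer/parseInt (c/exec :du :-b fname :| :cut :-f1))))
+
+(defn corrupt-file
+  "Overwrite one byte at a random offset with a zero byte,
+  without truncating the file."
+  [fname]
+  (c/exec :dd "if=/dev/zero" (str "of=" fname)
+          "bs=1" "count=1"
+          (str "seek=" (random-file-pos fname)) "conv=notrunc"))
+```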
+
+### Snapshots corruptor (snapshots-corruptor)
+
+Corrupts the latest snapshot (changes one random byte) in `clickhouse_path/coordination/snapshots`, then restarts the nodes.
+
+### Logs and snapshots corruptor (logs-and-snapshots-corruptor)
+
+Corrupts both the latest log and the latest snapshot, then restarts the nodes.
+
+### Drop data corruptor (drop-data-corruptor)
+
+Drops all data from `clickhouse_path/coordination`, then restarts the nodes.
+
+### Bridge partitioner (bridge-partitioner)
+
+Two nodes cannot see each other, but each can see the last node, and the last node can see both of them.
+
+### Blind node partitioner (blind-node-partitioner)
+
+One of the nodes cannot see the others, but they can see it.
+
+### Blind others partitioner (blind-others-partitioner)
+
+Two nodes cannot see one node, but that node can see both of them.
+
+## Usage
+
+### Dependencies
+
+- leiningen (https://leiningen.org/)
+- clojure (https://clojure.org/)
+- JVM
+
+### Options for `lein run`
+
+- `test` Run a single test.
+- `test-all` Run all available tests from the test set.
+- `-w (--workload)` One of the workloads. Option for a single `test`.
+- `--nemesis` One of the nemeses. Option for a single `test`.
+- `-q (--quorum)` Run the test with quorum reads.
+- `-r (--rate)` How many operations per second Jepsen will generate in a single thread.
+- `-s (--snapshot-distance)` ClickHouse Keeper setting. How often we will create a new snapshot.
+- `--stale-log-gap` ClickHouse Keeper setting. The leader will send a snapshot instead of logs to a node whose committed index is behind the leader's by more than this value.
+- `--reserved-log-items` ClickHouse Keeper setting. How many log items to keep after the snapshot.
+- `--ops-per-key` Option for the CAS register workload. The total number of operations that will be generated for a single register.
+- `--lightweight-run` Run some lightweight tests without linearizability checks. Option for a `test-all` run.
+- `--reuse-binary` Don't download the clickhouse binary if it already exists on the node.
+- `--clickhouse-source` URL to a clickhouse `.deb`, `.tgz` or binary.
+- `--time-limit` (in seconds) How long Jepsen will generate new operations.
+- `--nodes-file` File with nodes for SSH. Newline separated.
+- `--username` SSH username for nodes.
+- `--password` SSH password for nodes.
+- `--concurrency` How many threads Jepsen will use for concurrent requests.
+- `--test-count` How many times to run a single test, or how many tests to run from the test set.
+
+
+### Examples:
+
+1. Run the `Set` workload with `logs-and-snapshots-corruptor` ten times:
+
+```sh
+$ lein run test --nodes-file nodes.txt --username root --password '' --time-limit 30 --concurrency 50 -r 50 --workload set --nemesis logs-and-snapshots-corruptor --clickhouse-source 'https://clickhouse-builds.s3.yandex.net/someurl/clickhouse-common-static_21.4.1.6321_amd64.deb' -q --test-count 10 --reuse-binary
+```
+
+2. Run ten random tests from the `--lightweight-run` subset with some custom Keeper settings:
+
+```sh
+$ lein run test-all --nodes-file nodes.txt --username root --password '' --time-limit 30 --concurrency 50 -r 50 --snapshot-distance 100 --stale-log-gap 100 --reserved-log-items 10 --lightweight-run --clickhouse-source 'someurl' -q --reuse-binary --test-count 10
+```
+
+
+## License
+
+Copyright © 2021 FIXME
+
+This program and the accompanying materials are made available under the
+terms of the Eclipse Public License 2.0 which is available at
+http://www.eclipse.org/legal/epl-2.0.
+
+This Source Code may also be made available under the following Secondary
+Licenses when the conditions for such availability set forth in the Eclipse
+Public License, v. 2.0 are satisfied: GNU General Public License as published by
+the Free Software Foundation, either version 2 of the License, or (at your
+option) any later version, with the GNU Classpath Exception which is available
+at https://www.gnu.org/software/classpath/license.html.
diff --git a/tests/jepsen.nukeeper/doc/intro.md b/tests/jepsen.nukeeper/doc/intro.md
new file mode 100644
index 00000000000..c6e5ccbd04a
--- /dev/null
+++ b/tests/jepsen.nukeeper/doc/intro.md
@@ -0,0 +1,3 @@
+# Introduction to jepsen.nukeeper
+
+TODO: write [great documentation](http://jacobian.org/writing/what-to-write/)
diff --git a/tests/jepsen.nukeeper/project.clj b/tests/jepsen.nukeeper/project.clj
new file mode 100644
index 00000000000..e7150c9e5d4
--- /dev/null
+++ b/tests/jepsen.nukeeper/project.clj
@@ -0,0 +1,13 @@
+(defproject jepsen.nukeeper "0.1.0-SNAPSHOT"
+  :injections [(.. System (setProperty "zookeeper.request.timeout" "10000"))]
+  :description "Jepsen tests for ClickHouse NuKeeper"
+  :url "https://clickhouse.tech/"
+  :license {:name "EPL-2.0"
+            :url "https://www.eclipse.org/legal/epl-2.0/"}
+  :main jepsen.nukeeper.main
+  :plugins [[lein-cljfmt "0.7.0"]]
+  :dependencies [[org.clojure/clojure "1.10.1"]
+                 [jepsen "0.2.3"]
+                 [zookeeper-clj "0.9.4"]
+                 [org.apache.zookeeper/zookeeper "3.6.1" :exclusions [org.slf4j/slf4j-log4j12]]]
+  :repl-options {:init-ns jepsen.nukeeper.main})
diff --git a/tests/jepsen.nukeeper/resources/config.xml b/tests/jepsen.nukeeper/resources/config.xml
new file mode 120000
index 00000000000..c7596baa075
--- /dev/null
+++ b/tests/jepsen.nukeeper/resources/config.xml
@@ -0,0 +1 @@
+../../../programs/server/config.xml
\ No newline at end of file
diff --git a/tests/jepsen.nukeeper/resources/listen.xml b/tests/jepsen.nukeeper/resources/listen.xml
new file mode 100644
index 00000000000..de8c737ff75
--- /dev/null
+++ b/tests/jepsen.nukeeper/resources/listen.xml
@@ -0,0 +1,3 @@
+<yandex>
+    <listen_host>::</listen_host>
+</yandex>
diff --git a/tests/jepsen.nukeeper/resources/test_keeper_config.xml b/tests/jepsen.nukeeper/resources/test_keeper_config.xml
new file mode 100644
index 00000000000..c69fb0f228c
--- /dev/null
+++ b/tests/jepsen.nukeeper/resources/test_keeper_config.xml
@@ -0,0 +1,36 @@
+<yandex>
+    <test_keeper_server>
+        <tcp_port>9181</tcp_port>
+        <server_id>{id}</server_id>
+
+        <coordination_settings>
+            <operation_timeout_ms>10000</operation_timeout_ms>
+            <session_timeout_ms>30000</session_timeout_ms>
+            <force_sync>false</force_sync>
+            <startup_timeout>120000</startup_timeout>
+            <raft_logs_level>trace</raft_logs_level>
+            <quorum_reads>{quorum_reads}</quorum_reads>
+            <snapshot_distance>{snapshot_distance}</snapshot_distance>
+            <stale_log_gap>{stale_log_gap}</stale_log_gap>
+            <reserved_log_items>{reserved_log_items}</reserved_log_items>
+        </coordination_settings>
+
+        <raft_configuration>
+            <server>
+                <id>1</id>
+                <hostname>{srv1}</hostname>
+                <port>9444</port>
+            </server>
+            <server>
+                <id>2</id>
+                <hostname>{srv2}</hostname>
+                <port>9444</port>
+            </server>
+            <server>
+                <id>3</id>
+                <hostname>{srv3}</hostname>
+                <port>9444</port>
+            </server>
+        </raft_configuration>
+    </test_keeper_server>
+</yandex>
diff --git a/tests/jepsen.nukeeper/resources/users.xml b/tests/jepsen.nukeeper/resources/users.xml
new file mode 120000
index 00000000000..41b137a130f
--- /dev/null
+++ b/tests/jepsen.nukeeper/resources/users.xml
@@ -0,0 +1 @@
+../../../programs/server/users.xml
\ No newline at end of file
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj
new file mode 100644
index 00000000000..d6245d450f5
--- /dev/null
+++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj
@@ -0,0 +1,18 @@
+(ns jepsen.nukeeper.constants)
+
+(def common-prefix "/home/robot-clickhouse")
+
+(def binary-name "clickhouse")
+
+(def binary-path (str common-prefix "/" binary-name))
+(def pid-file-path (str common-prefix "/clickhouse.pid"))
+
+(def data-dir (str common-prefix "/db"))
+(def logs-dir (str common-prefix "/logs"))
+(def configs-dir (str common-prefix "/config"))
+(def sub-configs-dir (str configs-dir "/config.d"))
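+
+;; Coordination state (Raft logs and snapshots) lives under the data dir;
+;; these are the paths that the corruptor nemeses target.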
configs-dir "/config.d")) +(def coordination-data-dir (str data-dir "/coordination")) +(def coordination-snapshots-dir (str coordination-data-dir "/snapshots")) +(def coordination-logs-dir (str coordination-data-dir "/logs")) + +(def stderr-file (str logs-dir "/stderr.log")) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj new file mode 100644 index 00000000000..b426a8ea90d --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -0,0 +1,50 @@ +(ns jepsen.nukeeper.counter + (:require + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defn r [_ _] {:type :invoke, :f :read}) +(defn add [_ _] {:type :invoke, :f :add, :value (rand-int 5)}) + +(defrecord CounterClient [conn nodename] + client/Client + (open! [this test node] + (assoc + (assoc this + :conn (zk-connect node 9181 30000)) + :nodename node)) + + (setup! [this test]) + + (invoke! [this test op] + (case (:f op) + :read (exec-with-retries 30 (fn [] + (assoc op + :type :ok + :value (count (zk-list conn "/"))))) + :add (try + (do + (zk-multi-create-many-seq-nodes conn "/seq-" (:value op)) + (assoc op :type :ok)) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) + + (teardown! [_ test]) + + (close! [_ test] + (zk/close conn))) + +(defn workload + "A generator, client, and checker for a set test." + [opts] + {:client (CounterClient. nil nil) + :checker (checker/counter) + :generator (->> (range) + (map (fn [x] + (->> (gen/mix [r add]))))) + :final-generator (gen/once {:type :invoke, :f :read, :value nil})}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj new file mode 100644 index 00000000000..d82d628cc95 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj @@ -0,0 +1,128 @@ +(ns jepsen.nukeeper.db + (:require [clojure.tools.logging :refer :all] + [jepsen + [control :as c] + [db :as db] + [util :as util :refer [meh]]] + [jepsen.nukeeper.constants :refer :all] + [jepsen.nukeeper.utils :refer :all] + [clojure.java.io :as io] + [jepsen.control.util :as cu] + [jepsen.os.ubuntu :as ubuntu])) + +(defn get-clickhouse-sky + [version] + (c/exec :sky :get :-d common-prefix :-N :Backbone version) + (str common-prefix "/clickhouse")) + +(defn get-clickhouse-url + [url] + (let [download-result (cu/wget! url)] + (do (c/exec :mv download-result common-prefix) + (str common-prefix "/" download-result)))) + +(defn download-clickhouse + [source] + (info "Downloading clickhouse from" source) + (cond + (clojure.string/starts-with? source "rbtorrent:") (get-clickhouse-sky source) + (clojure.string/starts-with? source "http") (get-clickhouse-url source) + :else (throw (Exception. 
(str "Don't know how to download clickhouse from" source))))) + +(defn unpack-deb + [path] + (do + (c/exec :dpkg :-x path common-prefix) + (c/exec :rm :-f path) + (c/exec :mv (str common-prefix "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-rf (str common-prefix "/usr") (str common-prefix "/etc")))) + +(defn unpack-tgz + [path] + (do + (c/exec :mkdir :-p (str common-prefix "/unpacked")) + (c/exec :tar :-zxvf path :-C (str common-prefix "/unpacked")) + (c/exec :rm :-f path) + (let [subdir (c/exec :ls (str common-prefix "/unpacked"))] + (c/exec :mv (str common-prefix "/unpacked/" subdir "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-fr (str common-prefix "/unpacked"))))) + +(defn chmod-binary + [path] + (c/exec :chmod :+x path)) + +(defn install-downloaded-clickhouse + [path] + (cond + (clojure.string/ends-with? path ".deb") (unpack-deb path) + (clojure.string/ends-with? path ".tgz") (unpack-tgz path) + (clojure.string/ends-with? path "clickhouse") (chmod-binary path) + :else (throw (Exception. (str "Don't know how to install clickhouse from path" path))))) + +(defn prepare-dirs + [] + (do + (c/exec :mkdir :-p common-prefix) + (c/exec :mkdir :-p data-dir) + (c/exec :mkdir :-p logs-dir) + (c/exec :mkdir :-p configs-dir) + (c/exec :mkdir :-p sub-configs-dir) + (c/exec :touch stderr-file) + (c/exec :chown :-R :root common-prefix))) + +(defn cluster-config + [test node config-template] + (let [nodes (:nodes test) + replacement-map {#"\{srv1\}" (get nodes 0) + #"\{srv2\}" (get nodes 1) + #"\{srv3\}" (get nodes 2) + #"\{id\}" (str (inc (.indexOf nodes node))) + #"\{quorum_reads\}" (str (boolean (:quorum test))) + #"\{snapshot_distance\}" (str (:snapshot-distance test)) + #"\{stale_log_gap\}" (str (:stale-log-gap test)) + #"\{reserved_log_items\}" (str (:reserved-log-items test))}] + (reduce #(clojure.string/replace %1 (get %2 0) (get %2 1)) config-template replacement-map))) + +(defn install-configs + [test node] + (c/exec :echo (slurp (io/resource "config.xml")) :> (str configs-dir "/config.xml")) + (c/exec :echo (slurp (io/resource "users.xml")) :> (str configs-dir "/users.xml")) + (c/exec :echo (slurp (io/resource "listen.xml")) :> (str sub-configs-dir "/listen.xml")) + (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> (str sub-configs-dir "/test_keeper_config.xml"))) + +(defn db + [version reuse-binary] + (reify db/DB + (setup! [_ test node] + (c/su + (do + (info "Preparing directories") + (prepare-dirs) + (if (or (not (cu/exists? binary-path)) (not reuse-binary)) + (do (info "Downloading clickhouse") + (install-downloaded-clickhouse (download-clickhouse version))) + (info "Binary already exsist on path" binary-path "skipping download")) + (info "Installing configs") + (install-configs test node) + (info "Starting server") + (start-clickhouse! node test) + (info "ClickHouse started")))) + + (teardown! [_ test node] + (info node "Tearing down clickhouse") + (kill-clickhouse! node test) + (c/su + (if (not reuse-binary) + (c/exec :rm :-rf binary-path)) + (c/exec :rm :-rf pid-file-path) + (c/exec :rm :-rf data-dir) + (c/exec :rm :-rf logs-dir) + (c/exec :rm :-rf configs-dir))) + + db/LogFiles + (log-files [_ test node] + (c/su + (kill-clickhouse! 
node test) + (c/cd data-dir + (c/exec :tar :czf "coordination.tar.gz" "coordination"))) + [stderr-file (str logs-dir "/clickhouse-server.log") (str data-dir "/coordination.tar.gz")]))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj new file mode 100644 index 00000000000..b9439097e85 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -0,0 +1,159 @@ +(ns jepsen.nukeeper.main + (:require [clojure.tools.logging :refer :all] + [jepsen.nukeeper.utils :refer :all] + [clojure.pprint :refer [pprint]] + [jepsen.nukeeper.set :as set] + [jepsen.nukeeper.db :refer :all] + [jepsen.nukeeper.nemesis :as custom-nemesis] + [jepsen.nukeeper.register :as register] + [jepsen.nukeeper.unique :as unique] + [jepsen.nukeeper.queue :as queue] + [jepsen.nukeeper.counter :as counter] + [jepsen.nukeeper.constants :refer :all] + [clojure.string :as str] + [jepsen + [checker :as checker] + [cli :as cli] + [client :as client] + [control :as c] + [db :as db] + [nemesis :as nemesis] + [generator :as gen] + [independent :as independent] + [tests :as tests] + [util :as util :refer [meh]]] + [jepsen.control.util :as cu] + [jepsen.os.ubuntu :as ubuntu] + [jepsen.checker.timeline :as timeline] + [clojure.java.io :as io] + [zookeeper.data :as data] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException) + (ch.qos.logback.classic Level) + (org.slf4j Logger LoggerFactory))) + +(def workloads + "A map of workload names to functions that construct workloads, given opts." + {"set" set/workload + "register" register/workload + "unique-ids" unique/workload + "counter" counter/workload + "total-queue" queue/total-workload + "linear-queue" queue/linear-workload}) + +(def cli-opts + "Additional command line options." + [["-w" "--workload NAME" "What workload should we run?" + :default "set" + :validate [workloads (cli/one-of workloads)]] + [nil "--nemesis NAME" "Which nemesis will poison our lives?" + :default "random-node-killer" + :validate [custom-nemesis/custom-nemesises (cli/one-of custom-nemesis/custom-nemesises)]] + ["-q" "--quorum" "Use quorum reads, instead of reading from any primary."] + ["-r" "--rate HZ" "Approximate number of requests per second, per thread." + :default 10 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + ["-s" "--snapshot-distance NUM" "Number of log entries to create snapshot" + :default 10000 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--stale-log-gap NUM" "Number of log entries to send snapshot instead of separate logs" + :default 1000 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--reserved-log-items NUM" "Number of log entries to keep after snapshot" + :default 1000 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--ops-per-key NUM" "Maximum number of operations on any given key." + :default 100 + :parse-fn parse-long + :validate [pos? 
"Must be a positive integer."]] + [nil, "--lightweight-run" "Subset of workloads/nemesises which is simple to validate"] + [nil, "--reuse-binary" "Use already downloaded binary if it exists, don't remove it on shutdown"] + ["-c" "--clickhouse-source URL" "URL for clickhouse deb or tgz package" + :default "https://clickhouse-builds.s3.yandex.net/21677/ef82333089156907a0979669d9374c2e18daabe5/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/clickhouse-common-static_21.4.1.6313_amd64.deb"]]) + +(defn nukeeper-test + "Given an options map from the command line runner (e.g. :nodes, :ssh, + :concurrency, ...), constructs a test map." + [opts] + (info "Test opts\n" (with-out-str (pprint opts))) + (let [quorum (boolean (:quorum opts)) + workload ((get workloads (:workload opts)) opts) + current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] + (merge tests/noop-test + opts + {:name (str "clickhouse-keeper-quorum=" quorum "-" (name (:workload opts)) "-" (name (:nemesis opts))) + :os ubuntu/os + :db (db (:clickhouse-source opts) (boolean (:reuse-binary opts))) + :pure-generators true + :client (:client workload) + :nemesis (:nemesis current-nemesis) + :checker (checker/compose + {:perf (checker/perf) + :workload (:checker workload)}) + :generator (gen/phases + (->> (:generator workload) + (gen/stagger (/ (:rate opts))) + (gen/nemesis (:generator current-nemesis)) + (gen/time-limit (:time-limit opts))) + (gen/log "Healing cluster") + (gen/nemesis (gen/once {:type :info, :f :stop})) + (gen/log "Waiting for recovery") + (gen/sleep 10) + (gen/clients (:final-generator workload)))}))) + +(def all-nemesises (keys custom-nemesis/custom-nemesises)) + +(def all-workloads (keys workloads)) + +(def lightweight-workloads ["set" "unique-ids" "counter" "total-queue"]) + +(def useful-nemesises ["random-node-killer" + "simple-partitioner" + "all-nodes-hammer-time" + ; can lead to a very rare data loss https://github.com/eBay/NuRaft/issues/185 + ;"logs-and-snapshots-corruptor" + ;"drop-data-corruptor" + "bridge-partitioner" + "blind-node-partitioner" + "blind-others-partitioner"]) + +(defn cart [colls] + (if (empty? colls) + '(()) + (for [more (cart (rest colls)) + x (first colls)] + (cons x more)))) + +(defn all-test-options + "Takes base cli options, a collection of nemeses, workloads, and a test count, + and constructs a sequence of test options." + [cli worload-nemeseis-collection] + (take (:test-count cli) + (shuffle (for [[workload nemesis] worload-nemeseis-collection] + (assoc cli + :nemesis nemesis + :workload workload + :test-count 1))))) +(defn all-tests + "Turns CLI options into a sequence of tests." + [test-fn cli] + (if (boolean (:lightweight-run cli)) + (map test-fn (all-test-options cli (cart [lightweight-workloads useful-nemesises]))) + (map test-fn (all-test-options cli (cart [all-workloads all-nemesises]))))) + +(defn -main + "Handles command line arguments. Can either run a test, or a web server for + browsing results." + [& args] + (.setLevel + (LoggerFactory/getLogger "org.apache.zookeeper") Level/OFF) + (cli/run! 
(merge (cli/single-test-cmd {:test-fn nukeeper-test + :opt-spec cli-opts}) + (cli/test-all-cmd {:tests-fn (partial all-tests nukeeper-test) + :opt-spec cli-opts}) + (cli/serve-cmd)) + args)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj new file mode 100644 index 00000000000..7d4941cdc8e --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -0,0 +1,160 @@ +(ns jepsen.nukeeper.nemesis + (:require + [clojure.tools.logging :refer :all] + [jepsen + [nemesis :as nemesis] + [control :as c] + [generator :as gen]] + [jepsen.nukeeper.constants :refer :all] + [jepsen.nukeeper.utils :refer :all])) + +(defn random-node-killer-nemesis + [] + (nemesis/node-start-stopper + rand-nth + (fn start [test node] (kill-clickhouse! node test)) + (fn stop [test node] (start-clickhouse! node test)))) + +(defn all-nodes-killer-nemesis + [] + (nemesis/node-start-stopper + identity + (fn start [test node] (kill-clickhouse! node test)) + (fn stop [test node] (start-clickhouse! node test)))) + +(defn random-node-hammer-time-nemesis + [] + (nemesis/hammer-time "clickhouse")) + +(defn all-nodes-hammer-time-nemesis + [] + (nemesis/hammer-time identity "clickhouse")) + +(defn select-last-file + [path] + (last (clojure.string/split + (c/exec :find path :-type :f :-printf "%T+ %p\n" :| :grep :-v :tmp_ :| :sort :| :awk "{print $2}") + #"\n"))) + +(defn random-file-pos + [fname] + (let [fsize (Integer/parseInt (c/exec :du :-b fname :| :cut :-f1))] + (rand-int fsize))) + +(defn corrupt-file + [fname] + (if (not (empty? fname)) + (do + (info "Corrupting" fname) + (c/exec :dd "if=/dev/zero" (str "of=" fname) "bs=1" "count=1" (str "seek=" (random-file-pos fname)) "conv=notrunc")) + (info "Nothing to corrupt"))) + +(defn corruptor-nemesis + [path corruption-op] + (reify nemesis/Nemesis + + (setup! [this test] this) + + (invoke! [this test op] + (cond (= (:f op) :corrupt) + (let [nodes (list (rand-nth (:nodes test)))] + (info "Corruption on node" nodes) + (c/on-nodes test nodes + (fn [test node] + (c/su + (kill-clickhouse! node test) + (corruption-op path) + (start-clickhouse! node test)))) + (assoc op :type :info, :value :corrupted)) + :else (do (c/on-nodes test (:nodes test) + (fn [test node] + (c/su + (start-clickhouse! node test)))) + (assoc op :type :info, :value :done)))) + + (teardown! 
[this test]))) + +(defn logs-corruption-nemesis + [] + (corruptor-nemesis coordination-logs-dir #(corrupt-file (select-last-file %1)))) + +(defn snapshots-corruption-nemesis + [] + (corruptor-nemesis coordination-snapshots-dir #(corrupt-file (select-last-file %1)))) + +(defn logs-and-snapshots-corruption-nemesis + [] + (corruptor-nemesis coordination-data-dir (fn [path] + (do + (corrupt-file (select-last-file (str path "/snapshots"))) + (corrupt-file (select-last-file (str path "/logs"))))))) +(defn drop-all-corruption-nemesis + [] + (corruptor-nemesis coordination-data-dir (fn [path] + (c/exec :rm :-fr path)))) + +(defn partition-bridge-nemesis + [] + (nemesis/partitioner nemesis/bridge)) + +(defn blind-node + [nodes] + (let [[[victim] others] (nemesis/split-one nodes)] + {victim (into #{} others)})) + +(defn blind-node-partition-nemesis + [] + (nemesis/partitioner blind-node)) + +(defn blind-others + [nodes] + (let [[[victim] others] (nemesis/split-one nodes)] + (into {} (map (fn [node] [node #{victim}])) others))) + +(defn blind-others-partition-nemesis + [] + (nemesis/partitioner blind-others)) + +(defn network-non-symmetric-nemesis + [] + (nemesis/partitioner nemesis/bridge)) + +(defn start-stop-generator + [time-corrupt time-ok] + (->> + (cycle [(gen/sleep time-ok) + {:type :info, :f :start} + (gen/sleep time-corrupt) + {:type :info, :f :stop}]))) + +(defn corruption-generator + [] + (->> + (cycle [(gen/sleep 5) + {:type :info, :f :corrupt}]))) + +(def custom-nemesises + {"random-node-killer" {:nemesis (random-node-killer-nemesis) + :generator (start-stop-generator 5 5)} + "all-nodes-killer" {:nemesis (all-nodes-killer-nemesis) + :generator (start-stop-generator 1 10)} + "simple-partitioner" {:nemesis (nemesis/partition-random-halves) + :generator (start-stop-generator 5 5)} + "random-node-hammer-time" {:nemesis (random-node-hammer-time-nemesis) + :generator (start-stop-generator 5 5)} + "all-nodes-hammer-time" {:nemesis (all-nodes-hammer-time-nemesis) + :generator (start-stop-generator 1 10)} + "logs-corruptor" {:nemesis (logs-corruption-nemesis) + :generator (corruption-generator)} + "snapshots-corruptor" {:nemesis (snapshots-corruption-nemesis) + :generator (corruption-generator)} + "logs-and-snapshots-corruptor" {:nemesis (logs-and-snapshots-corruption-nemesis) + :generator (corruption-generator)} + "drop-data-corruptor" {:nemesis (drop-all-corruption-nemesis) + :generator (corruption-generator)} + "bridge-partitioner" {:nemesis (partition-bridge-nemesis) + :generator (start-stop-generator 5 5)} + "blind-node-partitioner" {:nemesis (blind-node-partition-nemesis) + :generator (start-stop-generator 5 5)} + "blind-others-partitioner" {:nemesis (blind-others-partition-nemesis) + :generator (start-stop-generator 5 5)}}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj new file mode 100644 index 00000000000..308778983aa --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -0,0 +1,79 @@ +(ns jepsen.nukeeper.queue + (:require + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [knossos.model :as model] + [jepsen.checker.timeline :as timeline] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defn enqueue [val _ _] {:type :invoke, :f :enqueue :value val}) +(defn dequeue [_ _] {:type :invoke, :f :dequeue}) + +(defrecord 
QueueClient [conn nodename]
+  client/Client
+  (open! [this test node]
+    (assoc
+     (assoc this
+            :conn (zk-connect node 9181 30000))
+     :nodename node))
+
+  (setup! [this test])
+
+  (invoke! [this test op]
+    (case (:f op)
+      :enqueue (try
+                 (do
+                   (zk-create-if-not-exists conn (str "/" (:value op)) "")
+                   (assoc op :type :ok))
+                 (catch Exception _ (assoc op :type :info, :error :connect-error)))
+      :dequeue
+      (try
+        (let [result (zk-multi-delete-first-child conn "/")]
+          (if (not (nil? result))
+            (assoc op :type :ok :value result)
+            (assoc op :type :fail :value result)))
+        (catch Exception _ (assoc op :type :info, :error :connect-error)))
+      :drain
+      ; draining via delete takes too long, just list all nodes
+      (exec-with-retries 30 (fn []
+                              (zk-sync conn)
+                              (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/"))))))))
+
+  (teardown! [_ test])
+
+  (close! [_ test]
+    (zk/close conn)))
+
+(defn sorted-str-range
+  [n]
+  (sort (map (fn [v] (str v)) (take n (range)))))
+
+(defn total-workload
+  "A generator, client, and checker for a total-queue test."
+  [opts]
+  {:client (QueueClient. nil nil)
+   :checker (checker/compose
+             {:total-queue (checker/total-queue)
+              :timeline (timeline/html)})
+   :generator (->> (sorted-str-range 50000)
+                   (map (fn [x]
+                          (rand-nth [{:type :invoke, :f :enqueue :value x}
+                                     {:type :invoke, :f :dequeue}]))))
+   :final-generator (gen/once {:type :invoke, :f :drain, :value nil})})
+
+(defn linear-workload
+  "A generator, client, and checker for a linearizable-queue test."
+  [opts]
+  {:client (QueueClient. nil nil)
+   :checker (checker/compose
+             {:linear (checker/linearizable {:model (model/unordered-queue)
+                                             :algorithm :linear})
+              :timeline (timeline/html)})
+   :generator (->> (sorted-str-range 10000)
+                   (map (fn [x]
+                          (rand-nth [{:type :invoke, :f :enqueue :value x}
+                                     {:type :invoke, :f :dequeue}]))))})
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj
new file mode 100644
index 00000000000..98322845346
--- /dev/null
+++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj
@@ -0,0 +1,64 @@
+(ns jepsen.nukeeper.register
+  (:require [jepsen
+             [checker :as checker]
+             [client :as client]
+             [independent :as independent]
+             [generator :as gen]]
+            [jepsen.checker.timeline :as timeline]
+            [knossos.model :as model]
+            [jepsen.nukeeper.utils :refer :all]
+            [zookeeper :as zk])
+  (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)))
+
+(defn r [_ _] {:type :invoke, :f :read, :value nil})
+(defn w [_ _] {:type :invoke, :f :write, :value (rand-int 5)})
+(defn cas [_ _] {:type :invoke, :f :cas, :value [(rand-int 5) (rand-int 5)]})
+
+(defrecord RegisterClient [conn]
+  client/Client
+  (open! [this test node]
+    (assoc this :conn (zk-connect node 9181 30000)))
+
+  (setup! [this test]
+    (zk-create-range conn 300)) ; 300 nodes to be sure
+
+  (invoke! [_ test op]
+    (let [[k v] (:value op)
+          zk-k (zk-path k)]
+      (case (:f op)
+        :read (try
+                (assoc op :type :ok, :value (independent/tuple k (parse-long (:data (zk-get-str conn zk-k)))))
+                (catch Exception _ (assoc op :type :fail, :error :connect-error)))
+        :write (try
+                 (do (zk-set conn zk-k v)
+                     (assoc op :type :ok))
+                 (catch Exception _ (assoc op :type :info, :error :connect-error)))
+        :cas (try
+               (let [[old new] v]
+                 (assoc op :type (if (zk-cas conn zk-k old new)
+                                   :ok
+                                   :fail)))
+               (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version))
+               (catch Exception _ (assoc op :type :info, :error :connect-error))))))
+
+  (teardown! [this test])
+
+  (close! [_ test]
+    (zk/close conn)))
+
+(defn workload
+  "Tests linearizable reads, writes, and compare-and-set operations on
+  independent keys."
+  [opts]
+  {:client (RegisterClient. nil)
+   :checker (independent/checker
+             (checker/compose
+              {:linear (checker/linearizable {:model (model/cas-register)
+                                              :algorithm :linear})
+               :timeline (timeline/html)}))
+   :generator (independent/concurrent-generator
+               10
+               (range)
+               (fn [k]
+                 (->> (gen/mix [r w cas])
+                      (gen/limit (:ops-per-key opts)))))})
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj
new file mode 100644
index 00000000000..f9d21a8dc62
--- /dev/null
+++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj
@@ -0,0 +1,49 @@
+(ns jepsen.nukeeper.set
+  (:require
+   [clojure.tools.logging :refer :all]
+   [jepsen
+    [checker :as checker]
+    [client :as client]
+    [generator :as gen]]
+   [jepsen.nukeeper.utils :refer :all]
+   [zookeeper :as zk])
+  (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)))
+
+(defrecord SetClient [k conn nodename]
+  client/Client
+  (open! [this test node]
+    (assoc
+     (assoc this
+            :conn (zk-connect node 9181 30000))
+     :nodename node))
+
+  (setup! [this test]
+    (zk-create-if-not-exists conn k "#{}"))
+
+  (invoke! [this test op]
+    (case (:f op)
+      :read (exec-with-retries 30 (fn []
+                                    (zk-sync conn)
+                                    (assoc op
+                                           :type :ok
+                                           :value (read-string (:data (zk-get-str conn k))))))
+      :add (try
+             (do
+               (zk-add-to-set conn k (:value op))
+               (assoc op :type :ok))
+             (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version))
+             (catch Exception _ (assoc op :type :info, :error :connect-error)))))
+
+  (teardown! [_ test])
+
+  (close! [_ test]
+    (zk/close conn)))
+
+(defn workload
+  "A generator, client, and checker for a set test."
+  [opts]
+  {:client (SetClient. "/a-set" nil nil)
+   :checker (checker/set)
+   :generator (->> (range)
+                   (map (fn [x] {:type :invoke, :f :add, :value x})))
+   :final-generator (gen/once {:type :invoke, :f :read, :value nil})})
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj
new file mode 100644
index 00000000000..9dfb906bc17
--- /dev/null
+++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj
@@ -0,0 +1,42 @@
+(ns jepsen.nukeeper.unique
+  (:require
+   [clojure.tools.logging :refer :all]
+   [jepsen
+    [checker :as checker]
+    [client :as client]
+    [generator :as gen]]
+   [jepsen.nukeeper.utils :refer :all]
+   [zookeeper :as zk])
+  (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)))
+
+(defrecord UniqueClient [conn nodename]
+  client/Client
+  (open! [this test node]
+    (assoc
+     (assoc this
+            :conn (zk-connect node 9181 30000))
+     :nodename node))
+
+  (setup! [this test])
+
+  (invoke! [this test op]
+    (case (:f op)
+      :generate
+      (try
+        (let [result-path (zk-create-sequential conn "/seq-" "")]
+          (assoc op :type :ok :value (parse-and-get-counter result-path)))
+        (catch Exception _ (assoc op :type :info, :error :connect-error)))))
+
+  (teardown! [_ test])
+
+  (close! [_ test]
+    (zk/close conn)))
+
+(defn workload
+  "A generator, client, and checker for a unique-IDs test."
+  [opts]
+  {:client (UniqueClient. nil nil)
+   :checker (checker/unique-ids)
+   :generator (->>
+               (range)
+               (map (fn [_] {:type :invoke, :f :generate})))})
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj
new file mode 100644
index 00000000000..cfe9add238b
--- /dev/null
+++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj
@@ -0,0 +1,180 @@
+(ns jepsen.nukeeper.utils
+  (:require [clojure.string :as str]
+            [zookeeper.data :as data]
+            [zookeeper :as zk]
+            [zookeeper.internal :as zi]
+            [jepsen.control.util :as cu]
+            [jepsen.nukeeper.constants :refer :all]
+            [jepsen.control :as c]
+            [clojure.tools.logging :refer :all])
+  (:import (org.apache.zookeeper.data Stat)
+           (org.apache.zookeeper CreateMode
+                                 ZooKeeper)
+           (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)))
+
+(defn parse-long
+  "Parses a string to a Long. Passes through `nil` and empty strings."
+  [s]
+  (if (and s (> (count s) 0))
+    (Long/parseLong s)))
+
+(defn parse-and-get-counter
+  [path]
+  (Integer/parseInt (apply str (take-last 10 (seq (str path))))))
+
+(defn zk-range
+  []
+  (map (fn [v] (str "/" v)) (range)))
+
+(defn zk-path
+  [n]
+  (str "/" n))
+
+(defn zk-connect
+  [host port timeout]
+  (zk/connect (str host ":" port) :timeout-msec timeout))
+
+(defn zk-create-range
+  [conn n]
+  (dorun (map (fn [v] (zk/create-all conn v :persistent? true)) (take n (zk-range)))))
+
+(defn zk-set
+  ([conn path value]
+   (zk/set-data conn path (data/to-bytes (str value)) -1))
+  ([conn path value version]
+   (zk/set-data conn path (data/to-bytes (str value)) version)))
+
+(defn zk-get-str
+  [conn path]
+  (let [zk-result (zk/data conn path)]
+    {:data (data/to-string (:data zk-result))
+     :stat (:stat zk-result)}))
+
+(defn zk-list
+  [conn path]
+  (zk/children conn path))
+
+(defn zk-list-with-stat
+  [conn path]
+  (let [stat (new Stat)
+        children (seq (.getChildren conn path false stat))]
+    {:children children
+     :stat (zi/stat-to-map stat)}))
+
+(defn zk-cas
+  [conn path old-value new-value]
+  (let [current-value (zk-get-str conn path)]
+    (if (= (parse-long (:data current-value)) old-value)
+      (do (zk-set conn path new-value (:version (:stat current-value)))
+          true))))
+
+(defn zk-add-to-set
+  [conn path elem]
+  (let [current-value (zk-get-str conn path)
+        current-set (read-string (:data current-value))
+        new-set (conj current-set elem)]
+    (zk-set conn path (pr-str new-set) (:version (:stat current-value)))))
+
+(defn zk-create-if-not-exists
+  [conn path data]
+  (zk/create conn path :data (data/to-bytes (str data)) :persistent? true))
+
+(defn zk-create-sequential
+  [conn path-prefix data]
+  (zk/create conn path-prefix :data (data/to-bytes (str data)) :persistent? true :sequential? true))
+
+(defn zk-multi-create-many-seq-nodes
+  [conn path-prefix num]
+  (let [txn (.transaction conn)]
+    (loop [i 0]
+      (cond (>= i num) (.commit txn)
+            :else (do (.create txn path-prefix
+                               (data/to-bytes "")
+                               (zi/acls :open-acl-unsafe)
+                               CreateMode/PERSISTENT_SEQUENTIAL)
+                      (recur (inc i)))))))
+
+; A sync call is not implemented in zookeeper-clj, and the Java API has no
+; synchronous version of it, so emulate sync with a dummy write to the root node.
+(defn zk-sync
+  [conn]
+  (zk-set conn "/" "" -1))
+
+(defn zk-parent-path
+  [path]
+  (let [rslash-pos (str/last-index-of path "/")]
+    (if (> rslash-pos 0)
+      (subs path 0 rslash-pos)
+      "/")))
+
+(defn zk-multi-delete-first-child
+  [conn path]
+  (let [{children :children stat :stat} (zk-list-with-stat conn path)
+        txn (.transaction conn)
+        first-child (first (sort children))]
+    (if (not (nil? first-child))
+      (try
+        (do (.check txn path (:version stat))
+            (.setData txn path (data/to-bytes "") -1) ; just to exercise multi-transactions
+            (.delete txn (str path first-child) -1)
+            (.commit txn)
+            first-child)
+        (catch KeeperException$BadVersionException _ nil)
+        ; Even if we got a connection loss, the delete may actually have been
+        ; executed. This function is used for the queue model, which strictly
+        ; requires all enqueued elements to be dequeued but allows duplicates.
+        ; So even when we are not sure the delete happened, we return first-child.
+        (catch Exception _ first-child))
+      nil)))
+
+(defn clickhouse-alive?
+  [node test]
+  (info "Checking server alive on" node)
+  (try
+    (c/exec binary-path :client :--query "SELECT 1")
+    (catch Exception _ false)))
+
+(defn wait-clickhouse-alive!
+  [node test & {:keys [maxtries] :or {maxtries 30}}]
+  (loop [i 0]
+    (cond (> i maxtries) false
+          (clickhouse-alive? node test) true
+          :else (do (Thread/sleep 1000) (recur (inc i))))))
+
+(defn kill-clickhouse!
+  [node test]
+  (info "Killing server on node" node)
+  (c/su
+   (cu/stop-daemon! binary-path pid-file-path)
+   (c/exec :rm :-fr (str data-dir "/status"))))
+
+(defn start-clickhouse!
+  [node test]
+  (info "Starting server on node" node)
+  (c/su
+   (cu/start-daemon!
+    {:pidfile pid-file-path
+     :logfile stderr-file
+     :chdir data-dir}
+    binary-path
+    :server
+    :--config (str configs-dir "/config.xml")
+    :--
+    :--path (str data-dir "/")
+    :--user_files_path (str data-dir "/user_files")
+    :--top_level_domains_path (str data-dir "/top_level_domains")
+    :--logger.log (str logs-dir "/clickhouse-server.log")
+    :--logger.errorlog (str logs-dir "/clickhouse-server.err.log")
+    :--test_keeper_server.snapshot_storage_path coordination-snapshots-dir
+    :--test_keeper_server.logs_storage_path coordination-logs-dir)
+   (wait-clickhouse-alive! node test)))
+
+(defn exec-with-retries
+  [retries f & args]
+  (let [res (try {:value (apply f args)}
+                 (catch Exception e
+                   (if (zero? retries)
+                     (throw e)
+                     {:exception e})))]
+    (if (:exception res)
+      (do (Thread/sleep 1000) (recur (dec retries) f args))
+      (:value res))))
diff --git a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj
new file mode 100644
index 00000000000..db84ff33ee3
--- /dev/null
+++ b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj
@@ -0,0 +1,39 @@
+(ns jepsen.nukeeper-test
+  (:require [clojure.test :refer :all]
+            [jepsen.nukeeper.utils :refer :all]
+            [zookeeper :as zk]
+            [zookeeper.data :as data])
+  (:import (ch.qos.logback.classic Level)
+           (org.slf4j Logger LoggerFactory)))
+
+(defn multicreate
+  [conn]
+  (dorun (map (fn [v] (zk/create conn v :persistent?
true)) (take 10 (zk-range))))) + +(defn multidelete + [conn] + (dorun (map (fn [v] (zk/delete conn v)) (take 10 (zk-range))))) + +(deftest a-test + (testing "nukeeper connection" + (.setLevel + (LoggerFactory/getLogger "org.apache.zookeeper") Level/OFF) + (let [conn (zk/connect "localhost:9181" :timeout-msec 5000)] + ;(println (take 10 (zk-range))) + ;(multidelete conn) + ;(multicreate conn) + ;(zk/create-all conn "/0") + ;(zk/create conn "/0") + ;(println (zk/children conn "/")) + ;(zk/set-data conn "/0" (data/to-bytes "777") -1) + (println (zk-parent-path "/sasds/dasda/das")) + (println (zk-parent-path "/sasds")) + (zk-multi-create-many-seq-nodes conn "/a-" 5) + (println (zk/children conn "/")) + (println (zk-list-with-stat conn "/")) + (println (zk-multi-delete-first-child conn "/")) + (println (zk-list-with-stat conn "/")) + ;(Thread/sleep 5000) + ;(println "VALUE" (data/to-string (:data (zk/data conn "/0")))) + ;(is (= (data/to-string (:data (zk/data conn "/0"))) "777")) + (zk/close conn)))) diff --git a/tests/performance/arithmetic.xml b/tests/performance/arithmetic.xml index 0be61eb5823..bf5e7662e37 100644 --- a/tests/performance/arithmetic.xml +++ b/tests/performance/arithmetic.xml @@ -1,4 +1,4 @@ - + 30000000000 diff --git a/tests/performance/array_join.xml b/tests/performance/array_join.xml index ca280ce28ad..cf92b51f545 100644 --- a/tests/performance/array_join.xml +++ b/tests/performance/array_join.xml @@ -1,4 +1,4 @@ - + diff --git a/tests/performance/bounding_ratio.xml b/tests/performance/bounding_ratio.xml index e3a15f90013..e430136b624 100644 --- a/tests/performance/bounding_ratio.xml +++ b/tests/performance/bounding_ratio.xml @@ -1,4 +1,4 @@ - + SELECT boundingRatio(number, number) FROM numbers(100000000) SELECT (argMax(number, number) - argMin(number, number)) / (max(number) - min(number)) FROM numbers(100000000) diff --git a/tests/performance/codec_none.xml b/tests/performance/codec_none.xml new file mode 100644 index 00000000000..e6eb9773a66 --- /dev/null +++ b/tests/performance/codec_none.xml @@ -0,0 +1,13 @@ + + + hits_10m_single + + + CREATE TABLE hits_none (Title String CODEC(NONE)) ENGINE = MergeTree ORDER BY tuple() + INSERT INTO hits_none SELECT Title FROM test.hits + OPTIMIZE TABLE hits_none FINAL + + + + DROP TABLE hits_none + diff --git a/tests/performance/codecs_float_insert.xml b/tests/performance/codecs_float_insert.xml index a7cb5152c09..b282bcc268f 100644 --- a/tests/performance/codecs_float_insert.xml +++ b/tests/performance/codecs_float_insert.xml @@ -1,5 +1,5 @@ - + 1 diff --git a/tests/performance/codecs_int_insert.xml b/tests/performance/codecs_int_insert.xml index caefaba3725..662df80ae70 100644 --- a/tests/performance/codecs_int_insert.xml +++ b/tests/performance/codecs_int_insert.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/collations.xml b/tests/performance/collations.xml index 17b2d36b7e3..52ccede3798 100644 --- a/tests/performance/collations.xml +++ b/tests/performance/collations.xml @@ -1,4 +1,4 @@ - + diff --git a/tests/performance/conditional.xml b/tests/performance/conditional.xml index 21623f45b05..91b6cb95ff2 100644 --- a/tests/performance/conditional.xml +++ b/tests/performance/conditional.xml @@ -1,4 +1,4 @@ - + SELECT count() FROM zeros(10000000) WHERE NOT ignore(if(rand() % 2, toDateTime('2019-02-04 01:24:31'), toDate('2019-02-04'))) SELECT count() FROM zeros(10000000) WHERE NOT ignore(multiIf(rand() % 2, toDateTime('2019-02-04 01:24:31'), toDate('2019-02-04'))) SELECT count() FROM zeros(10000000) WHERE NOT 
ignore(if(rand() % 2, [toDateTime('2019-02-04 01:24:31')], [toDate('2019-02-04')])) diff --git a/tests/performance/constant_column_search.xml b/tests/performance/constant_column_search.xml index cb76fd4cefb..71d8185d818 100644 --- a/tests/performance/constant_column_search.xml +++ b/tests/performance/constant_column_search.xml @@ -1,4 +1,4 @@ - + search diff --git a/tests/performance/date_time_64.xml b/tests/performance/date_time_64.xml index 838aba34d87..fd883416a33 100644 --- a/tests/performance/date_time_64.xml +++ b/tests/performance/date_time_64.xml @@ -1,4 +1,4 @@ - + hits_100m_single diff --git a/tests/performance/date_time_long.xml b/tests/performance/date_time_long.xml index 0c3d85f9659..c2eb42d3318 100644 --- a/tests/performance/date_time_long.xml +++ b/tests/performance/date_time_long.xml @@ -1,4 +1,4 @@ - + datetime_transform diff --git a/tests/performance/direct_dictionary.xml b/tests/performance/direct_dictionary.xml index eb1b4e0da00..68b52d917dd 100644 --- a/tests/performance/direct_dictionary.xml +++ b/tests/performance/direct_dictionary.xml @@ -1,4 +1,4 @@ - + CREATE TABLE simple_direct_dictionary_test_table ( diff --git a/tests/performance/float_formatting.xml b/tests/performance/float_formatting.xml index d24ccd7664c..71d8aee3f89 100644 --- a/tests/performance/float_formatting.xml +++ b/tests/performance/float_formatting.xml @@ -3,7 +3,7 @@ is 10 times faster than toString(number % 100 + 0.5). The shorter queries are somewhat unstable, so ignore differences less than 10%. --> - + expr diff --git a/tests/performance/float_parsing.xml b/tests/performance/float_parsing.xml index 33ab8ba6f10..eb8577bd127 100644 --- a/tests/performance/float_parsing.xml +++ b/tests/performance/float_parsing.xml @@ -1,4 +1,4 @@ - + expr diff --git a/tests/performance/fuzz_bits.xml b/tests/performance/fuzz_bits.xml index 2679977cb1d..87064e520c2 100644 --- a/tests/performance/fuzz_bits.xml +++ b/tests/performance/fuzz_bits.xml @@ -1,4 +1,4 @@ - + diff --git a/tests/performance/general_purpose_hashes.xml b/tests/performance/general_purpose_hashes.xml index bd2fa9674f6..f34554360cf 100644 --- a/tests/performance/general_purpose_hashes.xml +++ b/tests/performance/general_purpose_hashes.xml @@ -1,4 +1,4 @@ - + gp_hash_func diff --git a/tests/performance/generate_table_function.xml b/tests/performance/generate_table_function.xml index bc49a7de1bd..0339a8c19e8 100644 --- a/tests/performance/generate_table_function.xml +++ b/tests/performance/generate_table_function.xml @@ -1,4 +1,4 @@ - + SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('ui64 UInt64, i64 Int64, ui32 UInt32, i32 Int32, ui16 UInt16, i16 Int16, ui8 UInt8, i8 Int8') LIMIT 1000000000); SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('ui64 UInt64, i64 Int64, ui32 UInt32, i32 Int32, ui16 UInt16, i16 Int16, ui8 UInt8, i8 Int8', 0, 10, 10) LIMIT 1000000000); SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('i Enum8(\'hello\' = 1, \'world\' = 5)', 0, 10, 10) LIMIT 1000000000); diff --git a/tests/performance/group_by_sundy_li.xml b/tests/performance/group_by_sundy_li.xml index c49712a8519..aebc305335c 100644 --- a/tests/performance/group_by_sundy_li.xml +++ b/tests/performance/group_by_sundy_li.xml @@ -1,4 +1,4 @@ - + 8 diff --git a/tests/performance/if_array_string.xml b/tests/performance/if_array_string.xml index 445b3c8c55a..773509e1c4b 100644 --- a/tests/performance/if_array_string.xml +++ b/tests/performance/if_array_string.xml @@ -1,4 +1,4 @@ - + SELECT count() FROM zeros(10000000) WHERE NOT 
ignore(rand() % 2 ? ['Hello', 'World'] : ['a', 'b', 'c'])
     SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? materialize(['Hello', 'World']) : ['a', 'b', 'c'])
     SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? ['Hello', 'World'] : materialize(['a', 'b', 'c']))
diff --git a/tests/performance/int_parsing.xml b/tests/performance/int_parsing.xml
index 3b8620e46c3..32f904331ce 100644
--- a/tests/performance/int_parsing.xml
+++ b/tests/performance/int_parsing.xml
@@ -1,4 +1,4 @@
-
+
     hits_100m_single
     hits_10m_single
diff --git a/tests/performance/jit_small_requests.xml b/tests/performance/jit_small_requests.xml
index c9abec0926b..d8f917fb9af 100644
--- a/tests/performance/jit_small_requests.xml
+++ b/tests/performance/jit_small_requests.xml
@@ -1,4 +1,4 @@
-
+
     WITH bitXor(number, 0x4CF2D2BAAE6DA887) AS x0,
diff --git a/tests/performance/joins_in_memory.xml b/tests/performance/joins_in_memory.xml
index bac7679930f..fac6f2659c6 100644
--- a/tests/performance/joins_in_memory.xml
+++ b/tests/performance/joins_in_memory.xml
@@ -1,4 +1,4 @@
-
+
     CREATE TABLE ints (i64 Int64, i32 Int32, i16 Int16, i8 Int8) ENGINE = Memory
     INSERT INTO ints SELECT number AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000)
diff --git a/tests/performance/joins_in_memory_pmj.xml b/tests/performance/joins_in_memory_pmj.xml
index 5dd4395513d..87d1c0df14c 100644
--- a/tests/performance/joins_in_memory_pmj.xml
+++ b/tests/performance/joins_in_memory_pmj.xml
@@ -1,4 +1,4 @@
-
+
     CREATE TABLE ints (i64 Int64, i32 Int32, i16 Int16, i8 Int8) ENGINE = Memory
diff --git a/tests/performance/logical_functions_medium.xml b/tests/performance/logical_functions_medium.xml
index be474894b54..19572191532 100644
--- a/tests/performance/logical_functions_medium.xml
+++ b/tests/performance/logical_functions_medium.xml
@@ -1,4 +1,4 @@
-
+
     1
diff --git a/tests/performance/logical_functions_small.xml b/tests/performance/logical_functions_small.xml
index 3d70ef6811d..d5f6a7b99cb 100644
--- a/tests/performance/logical_functions_small.xml
+++ b/tests/performance/logical_functions_small.xml
@@ -1,4 +1,4 @@
-
+
     1
diff --git a/tests/performance/math.xml b/tests/performance/math.xml
index 006e33548c9..35250351683 100644
--- a/tests/performance/math.xml
+++ b/tests/performance/math.xml
@@ -1,4 +1,4 @@
-
+
     func_slow
diff --git a/tests/performance/optimized_select_final.xml b/tests/performance/optimized_select_final.xml
index 2c8254d2b88..d70fccc1330 100644
--- a/tests/performance/optimized_select_final.xml
+++ b/tests/performance/optimized_select_final.xml
@@ -1,4 +1,4 @@
-
+
     1
diff --git a/tests/performance/optimized_select_final_one_part.xml b/tests/performance/optimized_select_final_one_part.xml
index 92c8eed859a..63541313ac9 100644
--- a/tests/performance/optimized_select_final_one_part.xml
+++ b/tests/performance/optimized_select_final_one_part.xml
@@ -1,4 +1,4 @@
-
+
     1
diff --git a/tests/performance/or_null_default.xml b/tests/performance/or_null_default.xml
index 6fed0cce4d6..009719f66a5 100644
--- a/tests/performance/or_null_default.xml
+++ b/tests/performance/or_null_default.xml
@@ -1,4 +1,4 @@
-
+
     SELECT sumOrNull(number) FROM numbers(100000000)
     SELECT sumOrDefault(toNullable(number)) FROM numbers(100000000)
     SELECT sumOrNull(number) FROM numbers(10000000) GROUP BY number % 1024
diff --git a/tests/performance/parse_engine_file.xml b/tests/performance/parse_engine_file.xml
index 2459ed084cd..d49670b36b5 100644
--- a/tests/performance/parse_engine_file.xml
+++ b/tests/performance/parse_engine_file.xml
@@ -1,4 +1,4 @@
-
+
     test.hits
diff --git a/tests/performance/random_string.xml b/tests/performance/random_string.xml
index 1a740ae077a..79f12373f1c 100644
--- a/tests/performance/random_string.xml
+++ b/tests/performance/random_string.xml
@@ -1,4 +1,4 @@
-
+
     SELECT count() FROM zeros(100000000) WHERE NOT ignore(randomString(10))
     SELECT count() FROM zeros(100000000) WHERE NOT ignore(randomString(100))
     SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomString(1000))
diff --git a/tests/performance/sum.xml b/tests/performance/sum.xml
index 32c194dab6f..9bee2a580c3 100644
--- a/tests/performance/sum.xml
+++ b/tests/performance/sum.xml
@@ -1,4 +1,4 @@
-
+
     SELECT sum(number) FROM numbers(100000000)
     SELECT sum(toUInt32(number)) FROM numbers(100000000)
     SELECT sum(toUInt16(number)) FROM numbers(100000000)
diff --git a/tests/performance/sum_map.xml b/tests/performance/sum_map.xml
index bc9f9be2a18..b732c150220 100644
--- a/tests/performance/sum_map.xml
+++ b/tests/performance/sum_map.xml
@@ -1,4 +1,4 @@
-
+
     1
diff --git a/tests/performance/synthetic_hardware_benchmark.xml b/tests/performance/synthetic_hardware_benchmark.xml
index 4b94f73a21d..ffcf30db5cb 100644
--- a/tests/performance/synthetic_hardware_benchmark.xml
+++ b/tests/performance/synthetic_hardware_benchmark.xml
@@ -1,4 +1,4 @@
-
+
     30000000000
diff --git a/tests/performance/url_hits.xml b/tests/performance/url_hits.xml
index a699ef6ba97..1813b2a72cb 100644
--- a/tests/performance/url_hits.xml
+++ b/tests/performance/url_hits.xml
@@ -1,4 +1,4 @@
-
+
     hits_100m_single
     hits_10m_single
diff --git a/tests/performance/visit_param_extract_raw.xml b/tests/performance/visit_param_extract_raw.xml
index 67faeb1f743..358dcc9cc0e 100644
--- a/tests/performance/visit_param_extract_raw.xml
+++ b/tests/performance/visit_param_extract_raw.xml
@@ -1,4 +1,4 @@
-
+
     param
diff --git a/tests/performance/window_functions.xml b/tests/performance/window_functions.xml
index 622e349d060..6be3d59e2b0 100644
--- a/tests/performance/window_functions.xml
+++ b/tests/performance/window_functions.xml
@@ -110,4 +110,46 @@ format Null
+
+
+    select leadInFrame(number) over w
+    from
+        (select number, intDiv(number, 1111) p, mod(number, 111) o
+        from numbers(10000000)) t
+    window w as (partition by p order by o
+        rows between unbounded preceding and unbounded following)
+    format Null
+
+
+
+
+    select any(number) over w
+    from
+        (select number, intDiv(number, 1111) p, mod(number, 111) o
+        from numbers(10000000)) t
+    window w as (partition by p order by o
+        rows between 1 following and 1 following)
+    format Null
+
+
+
+    select leadInFrame(number, number) over w
+    from
+        (select number, intDiv(number, 1111) p, mod(number, 111) o
+        from numbers(10000000)) t
+    window w as (partition by p order by o
+        rows between unbounded preceding and unbounded following)
+    format Null
+
+
+
+    select leadInFrame(number, number, number) over w
+    from
+        (select number, intDiv(number, 1111) p, mod(number, 111) o
+        from numbers(10000000)) t
+    window w as (partition by p order by o
+        rows between unbounded preceding and unbounded following)
+    format Null
+
diff --git a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference
index f7eb44d66e0..4521d575ff3 100644
--- a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference
+++ b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference
@@ -4,3 +4,7 @@
 0
 0
 0
+0
+0
+0
+0
diff --git a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql
index afcbc78cfd5..0e7fa55dbae 100644
--- a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql
+++ b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql
@@ -3,6 +3,8 @@ SET allow_simdjson=1;
 SELECT JSONLength('"HX-=');
 SELECT JSONLength('[9]\0\x42\xD3\x36\xE3');
 SELECT JSONLength(unhex('5B30000E06D7AA5D'));
+SELECT JSONLength('{"success"test:"123"}');
+SELECT isValidJSON('{"success"test:"123"}');
 
 SET allow_simdjson=0;
 
@@ -10,3 +12,5 @@ SET allow_simdjson=0;
 SELECT JSONLength('"HX-=');
 SELECT JSONLength('[9]\0\x42\xD3\x36\xE3');
 SELECT JSONLength(unhex('5B30000E06D7AA5D'));
+SELECT JSONLength('{"success"test:"123"}');
+SELECT isValidJSON('{"success"test:"123"}');
diff --git a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh
index 1e61c8d64f3..fe6246e02f6 100755
--- a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh
+++ b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh
@@ -74,7 +74,7 @@ timeout $TIMEOUT bash -c thread5 2> /dev/null &

 wait

-$CLICKHOUSE_CLIENT -n -q "
-    DROP TABLE alter_table;
-    DROP TABLE alter_table2
-"
+$CLICKHOUSE_CLIENT -n -q "DROP TABLE alter_table;" &
+$CLICKHOUSE_CLIENT -n -q "DROP TABLE alter_table2;" &
+
+wait
diff --git a/tests/queries/0_stateless/01053_ssd_dictionary.sql b/tests/queries/0_stateless/01053_ssd_dictionary.sql
index a23ae7e5e96..23a369cc8a6 100644
--- a/tests/queries/0_stateless/01053_ssd_dictionary.sql
+++ b/tests/queries/0_stateless/01053_ssd_dictionary.sql
@@ -76,7 +76,7 @@ CREATE DICTIONARY 01053_db.ssd_dict
 PRIMARY KEY id
 SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_for_dict' PASSWORD '' DB '01053_db'))
 LIFETIME(MIN 1000 MAX 2000)
-LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096 MAX_STORED_KEYS 1000000));
+LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096));

 SELECT 'UPDATE DICTIONARY';
 -- 118
@@ -142,7 +142,7 @@ CREATE DICTIONARY 01053_db.ssd_dict
 PRIMARY KEY id
 SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_for_dict' PASSWORD '' DB '01053_db'))
 LIFETIME(MIN 1000 MAX 2000)
-LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 1024 MAX_STORED_KEYS 10));
+LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 1024));

 SELECT 'UPDATE DICTIONARY (MT)';
 -- 118
diff --git a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql
index 50b34c4b18f..cd3e52c9691 100644
--- a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql
+++ b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql
@@ -98,7 +98,7 @@ CREATE DICTIONARY 01280_db.ssd_dict
 PRIMARY KEY k1, k2
 SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_for_dict' PASSWORD '' DB '01280_db'))
 LIFETIME(MIN 1000 MAX 2000)
-LAYOUT(COMPLEX_KEY_SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096 MAX_STORED_KEYS 1000000));
+LAYOUT(COMPLEX_KEY_SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096));

 SELECT 'UPDATE DICTIONARY';
 -- 118
diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference
index e31f8476326..14e5889a811 100644
--- a/tests/queries/0_stateless/01591_window_functions.reference
+++ b/tests/queries/0_stateless/01591_window_functions.reference
@@ -1002,6 +1002,32 @@ from numbers(5);
 1 3
 2 4
 3 \N
+-- variants of lag/lead that respect the frame
+select number, p, pp,
+    lagInFrame(number, number - pp, number * 11) over w as lag,
+    leadInFrame(number, number - pp, number * 11) over w as lead
+from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16))
+window w as (partition by p order by number
+    rows between unbounded preceding and unbounded following)
+order by number
+settings max_block_size = 3;
+;
+0 0 0 0 0
+1 0 0 0 2
+2 0 0 0 4
+3 0 0 0 33
+4 0 0 0 44
+5 1 5 5 5
+6 1 5 5 7
+7 1 5 5 9
+8 1 5 5 88
+9 1 5 5 99
+10 2 10 10 10
+11 2 10 10 12
+12 2 10 10 14
+13 2 10 10 143
+14 2 10 10 154
+15 3 15 15 15
 -- case-insensitive SQL-standard synonyms for any and anyLast
 select
     number,
@@ -1026,3 +1052,16 @@ order by number
 select count() over () from numbers(4) where number < 2;
 2
 2
+-- floating point RANGE frame
+select
+    count(*) over (order by (toFloat32(number) as f32) range 5. preceding),
+    count(*) over (order by (toFloat64(number) as f64) range 5. preceding)
+from numbers(7)
+;
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+6 6
diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql
index 7a2d2522038..30847e09246 100644
--- a/tests/queries/0_stateless/01591_window_functions.sql
+++ b/tests/queries/0_stateless/01591_window_functions.sql
@@ -347,6 +347,17 @@ select
 over (order by number rows between 1 following and 1 following)
 from numbers(5);

+-- variants of lag/lead that respect the frame
+select number, p, pp,
+    lagInFrame(number, number - pp, number * 11) over w as lag,
+    leadInFrame(number, number - pp, number * 11) over w as lead
+from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16))
+window w as (partition by p order by number
+    rows between unbounded preceding and unbounded following)
+order by number
+settings max_block_size = 3;
+;
+
 -- case-insensitive SQL-standard synonyms for any and anyLast
 select
     number,
@@ -360,3 +371,10 @@ order by number
 -- In this case, we had a problem with PartialSortingTransform returning zero-row
 -- chunks for input chunks w/o columns.
 select count() over () from numbers(4) where number < 2;
+
+-- floating point RANGE frame
+select
+    count(*) over (order by (toFloat32(number) as f32) range 5. preceding),
+    count(*) over (order by (toFloat64(number) as f64) range 5. preceding)
+from numbers(7)
+;
diff --git a/tests/queries/0_stateless/01601_custom_tld.reference b/tests/queries/0_stateless/01601_custom_tld.reference
index 98b99778396..e056505f273 100644
--- a/tests/queries/0_stateless/01601_custom_tld.reference
+++ b/tests/queries/0_stateless/01601_custom_tld.reference
@@ -1,11 +1,24 @@
-no-tld
+-- no-tld
+
+foo.there-is-no-such-domain
+foo.there-is-no-such-domain
 foo.there-is-no-such-domain
 foo.there-is-no-such-domain
 foo
-generic
+-- generic
 kernel
 kernel.biz.ss
-difference
+-- difference
 biz.ss
 kernel.biz.ss
+-- 3+level
+xx.blogspot.co.at
+blogspot
+xx.blogspot.co.at
+blogspot
+-- url
+foobar.com
+foobar.com
+foobar.com
+xx.blogspot.co.at
diff --git a/tests/queries/0_stateless/01601_custom_tld.sql b/tests/queries/0_stateless/01601_custom_tld.sql
index 6d68299c07d..688dd419858 100644
--- a/tests/queries/0_stateless/01601_custom_tld.sql
+++ b/tests/queries/0_stateless/01601_custom_tld.sql
@@ -1,16 +1,31 @@
-select 'no-tld';
-select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
+select '-- no-tld';
 -- even if there is no TLD, 2-nd level by default anyway
 -- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real)
+select cutToFirstSignificantSubdomain('there-is-no-such-domain');
+select cutToFirstSignificantSubdomain('foo.there-is-no-such-domain');
+select cutToFirstSignificantSubdomain('bar.foo.there-is-no-such-domain');
+select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list');
 select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list');
 select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
 select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');

-select 'generic';
-select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
+select '-- generic';
+select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel
 select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss

-select 'difference';
+select '-- difference';
 -- biz.ss is not in the default TLD list, hence:
 select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss
 select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss
+
+select '-- 3+level';
+select cutToFirstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
+select firstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
+select cutToFirstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at
+select firstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- blogspot
+
+select '-- url';
+select cutToFirstSignificantSubdomainCustom('http://foobar.com', 'public_suffix_list');
+select cutToFirstSignificantSubdomainCustom('http://foobar.com/foo', 'public_suffix_list');
+select cutToFirstSignificantSubdomainCustom('http://bar.foobar.com/foo', 'public_suffix_list');
+select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at', 'public_suffix_list');
diff --git a/tests/queries/0_stateless/01649_with_alias_key_condition.sql b/tests/queries/0_stateless/01649_with_alias_key_condition.sql
index b813e6ee84f..0a796f8512e 100644
--- a/tests/queries/0_stateless/01649_with_alias_key_condition.sql
+++ b/tests/queries/0_stateless/01649_with_alias_key_condition.sql
@@ -6,6 +6,6 @@ insert into alias_key_condition values (1, 2), (3, 4);

 set force_primary_key = 1;

-with i as k select * from alias_key_condition where k = 3;
+with i as k select * from alias_key_condition where k = (select i from alias_key_condition where i = 3);

 drop table if exists alias_key_condition;
diff --git a/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql
index ee2cde963d7..f200ead341b 100644
--- a/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql
+++ b/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql
@@ -40,7 +40,7 @@ SELECT dictGetOrDefault('01681_database_for_cache_dictionary.cache_dictionary_si
 SELECT 'dictHas';
 SELECT dictHas('01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4;
 SELECT 'select all values as input stream';
-SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes;
+SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes ORDER BY id;

 DROP DICTIONARY 01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes;
 DROP TABLE 01681_database_for_cache_dictionary.simple_key_simple_attributes_source_table;
@@ -84,7 +84,7 @@ SELECT dictGetOrDefault('01681_database_for_cache_dictionary.cache_dictionary_si
 SELECT 'dictHas';
 SELECT dictHas('01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4;
 SELECT 'select all values as input stream';
-SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes;
+SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes ORDER BY id;

 DROP DICTIONARY 01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes;
 DROP TABLE 01681_database_for_cache_dictionary.simple_key_complex_attributes_source_table;
diff --git a/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql
index 65c56090c47..4cc83412457 100644
--- a/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql
+++ b/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql
@@ -42,7 +42,7 @@ SELECT dictGetOrDefault('01682_database_for_cache_dictionary.cache_dictionary_co
 SELECT 'dictHas';
 SELECT dictHas('01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4;
 SELECT 'select all values as input stream';
-SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes;
+SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes ORDER BY id;

 DROP DICTIONARY 01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes;
 DROP TABLE 01682_database_for_cache_dictionary.complex_key_simple_attributes_source_table;
@@ -89,7 +89,7 @@ SELECT dictGetOrDefault('01682_database_for_cache_dictionary.cache_dictionary_co
 SELECT 'dictHas';
 SELECT dictHas('01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4;
 SELECT 'select all values as input stream';
-SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes;
+SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes ORDER BY id;

 DROP DICTIONARY 01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes;
 DROP TABLE 01682_database_for_cache_dictionary.complex_key_complex_attributes_source_table;
diff --git a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql
index 3b327257fc4..9dbad1289f1 100644
--- a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql
+++ b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql
@@ -40,7 +40,7 @@ SELECT dictGetOrDefault('01684_database_for_cache_dictionary.cache_dictionary_si
 SELECT 'dictHas';
 SELECT dictHas('01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4;
 SELECT 'select all values as input stream';
-SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes;
+SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes ORDER BY id;

 DROP DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes;
 DROP TABLE 01684_database_for_cache_dictionary.simple_key_simple_attributes_source_table;
@@ -84,7 +84,7 @@ SELECT dictGetOrDefault('01684_database_for_cache_dictionary.cache_dictionary_si
 SELECT 'dictHas';
 SELECT dictHas('01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4;
 SELECT 'select all values as input stream';
-SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes;
+SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes ORDER BY id;

 DROP DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes;
 DROP TABLE 01684_database_for_cache_dictionary.simple_key_complex_attributes_source_table;
diff --git a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql
index 1757b136d3e..03a7e1d80df 100644
--- a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql
+++ b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql
@@ -42,7 +42,7 @@ SELECT dictGetOrDefault('01685_database_for_cache_dictionary.cache_dictionary_co
 SELECT 'dictHas';
 SELECT dictHas('01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4;
 SELECT 'select all values as input stream';
-SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes;
+SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes ORDER BY id;

 DROP DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes;
 DROP TABLE 01685_database_for_cache_dictionary.complex_key_simple_attributes_source_table;
@@ -89,10 +89,10 @@ SELECT dictGetOrDefault('01685_database_for_cache_dictionary.cache_dictionary_co
 SELECT 'dictHas';
 SELECT dictHas('01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4;
 SELECT 'select all values as input stream';
-SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes;
+SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes ORDER BY id;

 DROP DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes;
 DROP TABLE 01685_database_for_cache_dictionary.complex_key_complex_attributes_source_table;

 DROP DATABASE 01685_database_for_cache_dictionary;
-
+
diff --git a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference
new file mode 100644
index 00000000000..95479cf37ba
--- /dev/null
+++ b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference
@@ -0,0 +1,28 @@
+SELECT
+    x,
+    y,
+    z
+FROM prewhere_move_select_final
+PREWHERE y > 100
+SELECT
+    x,
+    y,
+    z
+FROM prewhere_move_select_final
+FINAL
+PREWHERE y > 100
+SELECT
+    x,
+    y,
+    z
+FROM prewhere_move_select_final
+FINAL
+WHERE z > 400
+SELECT
+    x,
+    y,
+    z
+FROM prewhere_move_select_final
+FINAL
+PREWHERE y > 100
+WHERE (y > 100) AND (z > 400)
diff --git a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql
new file mode 100644
index 00000000000..a3a882c461a
--- /dev/null
+++ b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql
@@ -0,0 +1,15 @@
+DROP TABLE IF EXISTS prewhere_move_select_final;
+CREATE TABLE prewhere_move_select_final (x Int, y Int, z Int) ENGINE = ReplacingMergeTree() ORDER BY (x, y);
+INSERT INTO prewhere_move_select_final SELECT number, number * 2, number * 3 FROM numbers(1000);
+
+-- order key can be pushed down with final
+EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final WHERE y > 100;
+EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100;
+
+-- cannot be pushed down
+EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE z > 400;
+
+-- only y can be pushed down
+EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100 and z > 400;
+
+DROP TABLE prewhere_move_select_final;
diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect
new file mode 100755
index 00000000000..65b9bde235b
--- /dev/null
+++ b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect
@@ -0,0 +1,25 @@
+#!/usr/bin/expect -f
+
+log_user 0
+set timeout 5
+match_max 100000
+# The default timeout action is to do nothing; change it to fail
+expect_after {
+    timeout {
+        exit 2
+    }
+}
+
+set basedir [file dirname $argv0]
+spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT"
+expect ":) "
+
+# regression for heap-buffer-overflow issue (under ASAN)
+send -- "/**"
+expect "/**"
+# just in case, send a few more bytes
+send -- "foobar"
+expect "/**foobar"
+
+send -- "\3\4"
+expect eof
diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.reference b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/01767_timezoneOf.reference b/tests/queries/0_stateless/01767_timezoneOf.reference
new file mode 100644
index 00000000000..0a8a8c32d4e
--- /dev/null
+++ b/tests/queries/0_stateless/01767_timezoneOf.reference
@@ -0,0 +1 @@
+Asia/Tehran Asia/Tehran Asia/Tehran Africa/Accra Pacific/Pitcairn
diff --git a/tests/queries/0_stateless/01767_timezoneOf.sh b/tests/queries/0_stateless/01767_timezoneOf.sh
new file mode 100755
index 00000000000..9dee051ee3f
--- /dev/null
+++ b/tests/queries/0_stateless/01767_timezoneOf.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+TZ=Asia/Tehran $CLICKHOUSE_LOCAL --query "SELECT timezone(), timezoneOf(now()), timeZone(), timeZoneOf(toTimezone(toNullable(now()), 'Africa/Accra')), timeZoneOf(toTimeZone(now64(3), 'Pacific/Pitcairn'))"
diff --git a/tests/queries/0_stateless/01770_add_months_ubsan.reference b/tests/queries/0_stateless/01770_add_months_ubsan.reference
new file mode 100644
index 00000000000..573541ac970
--- /dev/null
+++ b/tests/queries/0_stateless/01770_add_months_ubsan.reference
@@ -0,0 +1 @@
+0
diff --git a/tests/queries/0_stateless/01770_add_months_ubsan.sql b/tests/queries/0_stateless/01770_add_months_ubsan.sql
new file mode 100644
index 00000000000..039434ff9bc
--- /dev/null
+++ b/tests/queries/0_stateless/01770_add_months_ubsan.sql
@@ -0,0 +1,2 @@
+-- Result does not make sense but UBSan report should not be triggered.
+SELECT ignore(now() + INTERVAL 9223372036854775807 MONTH);
diff --git a/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference
new file mode 100644
index 00000000000..1cea52ec1c2
--- /dev/null
+++ b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference
@@ -0,0 +1,2 @@
+2000-01-02 03:04:05 2001-02-03 04:05:06
+2000-01-02 03:04:05 2001-02-03 04:05:06
diff --git a/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql
new file mode 100644
index 00000000000..5a1f809b03b
--- /dev/null
+++ b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql
@@ -0,0 +1,9 @@
+DROP TABLE IF EXISTS test;
+CREATE TABLE test (time DateTime64(3)) ENGINE = MergeTree ORDER BY tuple() PARTITION BY toStartOfInterval(time, INTERVAL 2 YEAR);
+
+INSERT INTO test VALUES ('2000-01-02 03:04:05.123'), ('2001-02-03 04:05:06.789');
+
+SELECT min_time, max_time FROM system.parts WHERE table = 'test' AND database = currentDatabase();
+SELECT min_time, max_time FROM system.parts_columns WHERE table = 'test' AND database = currentDatabase();
+
+DROP TABLE test;
diff --git a/tests/queries/0_stateless/01774_bar_with_illegal_value.reference b/tests/queries/0_stateless/01774_bar_with_illegal_value.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/01774_bar_with_illegal_value.sql b/tests/queries/0_stateless/01774_bar_with_illegal_value.sql
new file mode 100644
index 00000000000..60c7f303c13
--- /dev/null
+++ b/tests/queries/0_stateless/01774_bar_with_illegal_value.sql
@@ -0,0 +1 @@
+SELECT greatCircleAngle(1048575, 257, -9223372036854775808, 1048576) - NULL, bar(7, -inf, 1024); -- { serverError 36 }
diff --git a/tests/queries/0_stateless/01774_tuple_null_in.reference b/tests/queries/0_stateless/01774_tuple_null_in.reference
new file mode 100644
index 00000000000..aa47d0d46d4
--- /dev/null
+++ b/tests/queries/0_stateless/01774_tuple_null_in.reference
@@ -0,0 +1,2 @@
+0
+0
diff --git a/tests/queries/0_stateless/01774_tuple_null_in.sql b/tests/queries/0_stateless/01774_tuple_null_in.sql
new file mode 100644
index 00000000000..a9cc39e8840
--- /dev/null
+++ b/tests/queries/0_stateless/01774_tuple_null_in.sql
@@ -0,0 +1,2 @@
+SELECT (NULL, NULL) = (8, 0) OR (NULL, NULL) = (3, 2) OR (NULL, NULL) = (0, 0) OR (NULL, NULL) = (3, 1);
+SELECT (NULL, NULL) IN ((NULL, 0), (3, 1), (3, 2), (8, 0), (NULL, NULL));
diff --git a/tests/queries/0_stateless/01776_decrypt_aead_size_check.reference b/tests/queries/0_stateless/01776_decrypt_aead_size_check.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/01776_decrypt_aead_size_check.sql b/tests/queries/0_stateless/01776_decrypt_aead_size_check.sql
new file mode 100644
index 00000000000..8730ed0eda2
--- /dev/null
+++ b/tests/queries/0_stateless/01776_decrypt_aead_size_check.sql
@@ -0,0 +1 @@
+SELECT decrypt('aes-128-gcm', 'text', 'key', 'IV'); -- { serverError 36 }
diff --git a/tests/queries/0_stateless/01777_map_populate_series_ubsan.reference b/tests/queries/0_stateless/01777_map_populate_series_ubsan.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/01777_map_populate_series_ubsan.sql b/tests/queries/0_stateless/01777_map_populate_series_ubsan.sql
new file mode 100644
index 00000000000..5a8c182425a
--- /dev/null
+++ b/tests/queries/0_stateless/01777_map_populate_series_ubsan.sql
@@ -0,0 +1,2 @@
+-- Should correctly throw an exception about overflow:
+SELECT mapPopulateSeries([-9223372036854775808, toUInt32(2)], [toUInt32(1023), -1]); -- { serverError 128 }
diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt
index 0135fc6437a..1b333a6baec 100644
--- a/tests/queries/0_stateless/arcadia_skip_list.txt
+++ b/tests/queries/0_stateless/arcadia_skip_list.txt
@@ -212,6 +212,7 @@
 01017_uniqCombined_memory_usage
 01747_join_view_filter_dictionary
 01748_dictionary_table_dot
+01755_client_highlight_multi_line_comment_regression
 00950_dict_get
 01683_flat_dictionary
 01681_cache_dictionary_simple_key
diff --git a/tests/queries/1_stateful/00162_mmap_compression_none.reference b/tests/queries/1_stateful/00162_mmap_compression_none.reference
new file mode 100644
index 00000000000..3495cc537c1
--- /dev/null
+++ b/tests/queries/1_stateful/00162_mmap_compression_none.reference
@@ -0,0 +1 @@
+687074654
diff --git a/tests/queries/1_stateful/00162_mmap_compression_none.sql b/tests/queries/1_stateful/00162_mmap_compression_none.sql
new file mode 100644
index 00000000000..2178644214a
--- /dev/null
+++ b/tests/queries/1_stateful/00162_mmap_compression_none.sql
@@ -0,0 +1,8 @@
+DROP TABLE IF EXISTS hits_none;
+CREATE TABLE hits_none (Title String CODEC(NONE)) ENGINE = MergeTree ORDER BY tuple();
+INSERT INTO hits_none SELECT Title FROM test.hits;
+
+SET min_bytes_to_use_mmap_io = 1;
+SELECT sum(length(Title)) FROM hits_none;
+
+DROP TABLE hits_none;
diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json
index 981cf69d676..77d4a9b8499 100644
--- a/tests/queries/skip_list.json
+++ b/tests/queries/skip_list.json
@@ -95,7 +95,8 @@
         "01370_client_autocomplete_word_break_characters",
         "01676_clickhouse_client_autocomplete",
         "01193_metadata_loading",
-        "01455_time_zones"
+        "01455_time_zones",
+        "01755_client_highlight_multi_line_comment_regression"
     ],
     "release-build": [
     ],
@@ -582,6 +583,7 @@
         "00980_zookeeper_merge_tree_alter_settings",
         "00988_constraints_replication_zookeeper",
"00989_parallel_parts_loading", + "00992_system_parts_race_condition_zookeeper_long", "00993_system_parts_race_condition_drop_zookeeper", "01012_show_tables_limit", "01013_sync_replica_timeout_zookeeper", diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index d534fd8fd4f..d3e1c2acd69 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -21,6 +21,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (corrector_utf8) add_subdirectory (zookeeper-cli) add_subdirectory (zookeeper-test) + add_subdirectory (nukeeper-data-dumper) add_subdirectory (zookeeper-dump-tree) add_subdirectory (zookeeper-remove-by-list) add_subdirectory (zookeeper-create-entry-to-download-part) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 231d22b50da..799492cdd90 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,5 +1,7 @@ +v21.3.4.25-lts 2021-03-28 v21.3.3.14-lts 2021-03-19 v21.3.2.5-lts 2021-03-12 +v21.2.7.11-stable 2021-03-28 v21.2.6.1-stable 2021-03-15 v21.2.5.5-stable 2021-03-02 v21.2.4.6-stable 2021-02-20 diff --git a/utils/nukeeper-data-dumper/CMakeLists.txt b/utils/nukeeper-data-dumper/CMakeLists.txt new file mode 100644 index 00000000000..bab1137bf4d --- /dev/null +++ b/utils/nukeeper-data-dumper/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(nukeeper-data-dumper main.cpp) +target_link_libraries(nukeeper-data-dumper PRIVATE dbms) diff --git a/utils/nukeeper-data-dumper/main.cpp b/utils/nukeeper-data-dumper/main.cpp new file mode 100644 index 00000000000..c80aeb473e2 --- /dev/null +++ b/utils/nukeeper-data-dumper/main.cpp @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include // Y_IGNORE +#include +#include +#include + +using namespace Coordination; +using namespace DB; + +void dumpMachine(std::shared_ptr machine) +{ + auto & storage = machine->getStorage(); + std::queue keys; + keys.push("/"); + + while (!keys.empty()) + { + auto key = keys.front(); + keys.pop(); + std::cout << key << "\n"; + auto value = storage.container.getValue(key); + std::cout << "\tStat: {version: " << value.stat.version << + ", mtime: " << value.stat.mtime << + ", emphemeralOwner: " << value.stat.ephemeralOwner << + ", czxid: " << value.stat.czxid << + ", mzxid: " << value.stat.mzxid << + ", numChildren: " << value.stat.numChildren << + ", dataLength: " << value.stat.dataLength << + "}" << std::endl; + std::cout << "\tData: " << storage.container.getValue(key).data << std::endl; + + for (const auto & child : value.children) + { + if (key == "/") + keys.push(key + child); + else + keys.push(key + "/" + child); + } + } + std::cout << std::flush; +} + +int main(int argc, char *argv[]) +{ + if (argc != 3) + { + std::cerr << "usage: " << argv[0] << " snapshotpath logpath" << std::endl; + return 3; + } + else + { + Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); + Poco::Logger::root().setChannel(channel); + Poco::Logger::root().setLevel("trace"); + } + auto * logger = &Poco::Logger::get("nukeeper-dumper"); + ResponsesQueue queue; + SnapshotsQueue snapshots_queue{1}; + CoordinationSettingsPtr settings = std::make_shared(); + auto state_machine = std::make_shared(queue, snapshots_queue, argv[1], settings); + state_machine->init(); + size_t last_commited_index = state_machine->last_commit_index(); + + LOG_INFO(logger, "Last committed index: {}", last_commited_index); + + DB::NuKeeperLogStore changelog(argv[2], 10000000, true); + changelog.init(last_commited_index, 
10000000000UL); /// collect all logs + if (changelog.size() == 0) + LOG_INFO(logger, "Changelog empty"); + else + LOG_INFO(logger, "Last changelog entry {}", changelog.next_slot() - 1); + + for (size_t i = last_commited_index + 1; i < changelog.next_slot(); ++i) + { + if (changelog.entry_at(i)->get_val_type() == nuraft::log_val_type::app_log) + state_machine->commit(i, changelog.entry_at(i)->get_buf()); + } + + dumpMachine(state_machine); + + return 0; +} diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html index 92da6328f0f..a57930b279d 100644 --- a/website/benchmark/hardware/index.html +++ b/website/benchmark/hardware/index.html @@ -75,6 +75,7 @@ Results for Raspberry Pi and Digital Ocean CPU-optimized are from Fritz Wijay Results for Digitalocean (Storage-intesinve VMs) + (CPU/GP) are from Yiğit Konur and Metehan Çetinkaya of seo.do.
 Results for 2x AMD EPYC 7F72 3.2 Ghz (Total 96 Cores, IBM Cloud's Bare Metal Service) from Yiğit Konur and Metehan Çetinkaya of seo.do.
 Results for 2x AMD EPYC 7742 (128 physical cores, 1 TB DDR4-3200 RAM) from Yedige Davletgaliyev and Nikita Zhavoronkov of blockchair.com.
+Results for ASUS A15 (Ryzen laptop) are from Kimmo Linna.

diff --git a/website/benchmark/hardware/results/asus_a15.json b/website/benchmark/hardware/results/asus_a15.json
new file mode 100644
index 00000000000..983dbde8681
--- /dev/null
+++ b/website/benchmark/hardware/results/asus_a15.json
@@ -0,0 +1,54 @@
+[
+    {
+        "system": "Asus A15",
+        "system_full": "Asus A15 (16 × AMD Ryzen 7 4800H, 16 GiB RAM)",
+        "time": "2021-03-23 00:00:00",
+        "kind": "laptop",
+        "result":
+        [
+[0.004, 0.003, 0.003],
+[0.019, 0.013, 0.012],
+[0.053, 0.041, 0.037],
+[0.106, 0.057, 0.056],
+[0.158, 0.115, 0.110],
+[0.324, 0.266, 0.262],
+[0.027, 0.024, 0.026],
+[0.017, 0.016, 0.017],
+[0.644, 0.589, 0.582],
+[0.733, 0.679, 0.679],
+[0.233, 0.201, 0.197],
+[0.276, 0.235, 0.236],
+[1.025, 0.962, 0.962],
+[1.342, 1.270, 1.264],
+[1.170, 1.129, 1.124],
+[1.375, 1.346, 1.351],
+[3.271, 3.210, 3.242],
+[1.960, 1.898, 1.907],
+[5.997, 5.965, 5.983],
+[0.106, 0.065, 0.055],
+[1.264, 0.990, 0.989],
+[1.555, 1.241, 1.239],
+[3.798, 3.307, 3.280],
+[1.949, 1.022, 0.995],
+[0.393, 0.292, 0.292],
+[0.307, 0.254, 0.255],
+[0.378, 0.297, 0.290],
+[1.632, 1.399, 1.386],
+[2.111, 1.909, 1.900],
+[3.349, 3.352, 3.357],
+[0.892, 0.824, 0.816],
+[1.505, 1.392, 1.378],
+[9.105, 8.951, 8.914],
+[5.195, 4.975, 4.919],
+[5.150, 5.021, 4.955],
+[1.756, 1.743, 1.749],
+[0.161, 0.154, 0.158],
+[0.108, 0.058, 0.055],
+[0.101, 0.102, 0.052],
+[0.365, 0.309, 0.334],
+[0.050, 0.023, 0.023],
+[0.037, 0.019, 0.015],
+[0.023, 0.013, 0.018]
+        ]
+    }
+]