Merge branch 'master' into alter-name-collision

2024-11-21 15:12:02 +00:00 · 2020-11-03 13:16:34 +03:00 · 2020-11-03 13:16:34 +03:00 · 6e32e17a7d
commit 6e32e17a7d
parent e6d8ab2270 268c80520f
20 changed files with 102 additions and 33 deletions
--- a/docker/test/fuzzer/run-fuzzer.sh
+++ b/docker/test/fuzzer/run-fuzzer.sh
@ -164,7 +164,7 @@ case "$stage" in
        # Lost connection to the server. This probably means that the server died
        # with abort.
        echo "failure" > status.txt
-        if ! grep -ao "Received signal.*\|Logical error.*\|Assertion.*failed" server.log > description.txt
+        if ! grep -ao "Received signal.*\|Logical error.*\|Assertion.*failed\|Failed assertion.*" server.log > description.txt
        then
            echo "Lost connection to server. See the logs" > description.txt
        fi
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@ -88,7 +88,7 @@ For a description of parameters, see the [CREATE query description](../../../sql

    -   `index_granularity` — Maximum number of data rows between the marks of an index. Default value: 8192. See [Data Storage](#mergetree-data-storage).
    -   `index_granularity_bytes` — Maximum size of data granules in bytes. Default value: 10Mb. To restrict the granule size only by number of rows, set to 0 (not recommended). See [Data Storage](#mergetree-data-storage).
-    -   `min_index_granularity_bytes` — Min allowed size of data granules in bytes. Default value: 1024b. To provide safeguard against accidentally creating tables with very low index_granularity_bytes. See [Data Storage](#mergetree-data-storage).
+    -   `min_index_granularity_bytes` — Minimum allowed size of data granules in bytes. Default value: 1024b. Provides a safeguard against accidentally creating tables with a very low `index_granularity_bytes`. See [Data Storage](#mergetree-data-storage).
    -   `enable_mixed_granularity_parts` — Enables or disables transitioning to control the granule size with the `index_granularity_bytes` setting. Before version 19.11, there was only the `index_granularity` setting for restricting granule size. The `index_granularity_bytes` setting improves ClickHouse performance when selecting data from tables with big rows (tens and hundreds of megabytes). If you have tables with big rows, you can enable this setting for the tables to improve the efficiency of `SELECT` queries.
    -   `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information, see the [setting description](../../../operations/server-configuration-parameters/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in “Server configuration parameters”.
    -   `min_merge_bytes_to_use_direct_io` — The minimum data volume for merge operation that is required for using direct I/O access to the storage disk. When merging data parts, ClickHouse calculates the total storage volume of all the data to be merged. If the volume exceeds `min_merge_bytes_to_use_direct_io` bytes, ClickHouse reads and writes the data to the storage disk using the direct I/O interface (`O_DIRECT` option). If `min_merge_bytes_to_use_direct_io = 0`, then direct I/O is disabled. Default value: `10 * 1024 * 1024 * 1024` bytes.
--- a/docs/en/interfaces/http.md
+++ b/docs/en/interfaces/http.md
@ -79,7 +79,7 @@ By default, data is returned in TabSeparated format (for more information, see t

 You use the FORMAT clause of the query to request any other format.

-Also, you can use the ‘default_format’ URL parameter or ‘X-ClickHouse-Format’ header to specify a default format other than TabSeparated.
+Also, you can use the ‘default_format’ URL parameter or the ‘X-ClickHouse-Format’ header to specify a default format other than TabSeparated.

 ``` bash
 $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @-
@ -170,7 +170,7 @@ $ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gz
 !!! note "Note"
    Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly.

-You can use the ‘database’ URL parameter or ‘X-ClickHouse-Database’ header to specify the default database.
+You can use the ‘database’ URL parameter or the ‘X-ClickHouse-Database’ header to specify the default database.

 ``` bash
 $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @-
--- a/docs/en/operations/utilities/clickhouse-local.md
+++ b/docs/en/operations/utilities/clickhouse-local.md
@ -16,7 +16,7 @@ By default `clickhouse-local` does not have access to data on the same host, but
 !!! warning "Warning"
    It is not recommended to load production server configuration into `clickhouse-local` because data can be damaged in case of human error.

-For temporary data an unique temporary data directory is created by default. If you want to override this behavior the data directory can be explicitly specified with the `-- --path` option.
+For temporary data, a unique temporary data directory is created by default. If you want to override this behavior, the data directory can be explicitly specified with the `-- --path` option.

 ## Usage {#usage}

--- a/docs/en/sql-reference/statements/create/view.md
+++ b/docs/en/sql-reference/statements/create/view.md
@ -15,7 +15,7 @@ Syntax:
 CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] AS SELECT ...
 ```

-Normal views don’t store any data, they just perform a read from another table on each access. In other words, a normal view is nothing more than a saved query. When reading from a view, this saved query is used as a subquery in the [FROM](../../../sql-reference/statements/select/from.md) clause.
+Normal views don’t store any data. They just perform a read from another table on each access. In other words, a normal view is nothing more than a saved query. When reading from a view, this saved query is used as a subquery in the [FROM](../../../sql-reference/statements/select/from.md) clause.

 As an example, assume you’ve created a view:

--- a/docs/ru/engines/table-engines/integrations/kafka.md
+++ b/docs/ru/engines/table-engines/integrations/kafka.md
@ -159,6 +159,22 @@ Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format

 В документе [librdkafka configuration reference](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md) можно увидеть список возможных опций конфигурации. Используйте подчеркивание (`_`) вместо точки в конфигурации ClickHouse. Например, `check.crcs=true` будет соответствовать `<check_crcs>true</check_crcs>`.

+### Поддержка Kerberos {#kafka-kerberos-support}
+
+Чтобы начать работу с Kafka с поддержкой Kerberos, добавьте дочерний элемент `security_protocol` со значением `sasl_plaintext`. Этого будет достаточно, если получен тикет на получение тикета (ticket-granting ticket) Kerberos и он кэшируется средствами ОС.
+ClickHouse может поддерживать учетные данные Kerberos с помощью файла keytab. Рассмотрим дочерние элементы `sasl_kerberos_service_name`, `sasl_kerberos_keytab`, `sasl_kerberos_principal` и `sasl_kerberos_kinit_cmd`.
+
+Пример:
+
+``` xml
+  <!-- Kerberos-aware Kafka -->
+  <kafka>
+    <security_protocol>SASL_PLAINTEXT</security_protocol>
+    <sasl_kerberos_keytab>/home/kafkauser/kafkauser.keytab</sasl_kerberos_keytab>
+    <sasl_kerberos_principal>kafkauser/kafkahost@EXAMPLE.COM</sasl_kerberos_principal>
+  </kafka>
+```
+
 ## Виртуальные столбцы {#virtualnye-stolbtsy}

 -   `_topic` — топик Kafka.
--- a/docs/ru/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/ru/engines/table-engines/mergetree-family/mergetree.md
@ -79,6 +79,7 @@ ORDER BY expr

    - `index_granularity` — максимальное количество строк данных между засечками индекса. По умолчанию — 8192. Смотрите [Хранение данных](#mergetree-data-storage).
    - `index_granularity_bytes` — максимальный размер гранул данных в байтах. По умолчанию — 10Mb. Чтобы ограничить размер гранул только количеством строк, установите значение 0 (не рекомендовано). Смотрите [Хранение данных](#mergetree-data-storage).
+    - `min_index_granularity_bytes` — минимально допустимый размер гранул данных в байтах. Значение по умолчанию — 1024b. Обеспечивает защиту от случайного создания таблиц с очень низким значением `index_granularity_bytes`. Смотрите [Хранение данных](#mergetree-data-storage).
    - `enable_mixed_granularity_parts` — включает или выключает переход к ограничению размера гранул с помощью настройки `index_granularity_bytes`. Настройка `index_granularity_bytes` улучшает производительность ClickHouse при выборке данных из таблиц с большими (десятки и сотни мегабайтов) строками. Если у вас есть таблицы с большими строками, можно включить эту настройку, чтобы повысить эффективность запросов `SELECT`.
    - `use_minimalistic_part_header_in_zookeeper` — Способ хранения заголовков кусков данных в ZooKeeper. Если  `use_minimalistic_part_header_in_zookeeper = 1`, то ZooKeeper хранит меньше данных. Подробнее читайте в [описании настройки](../../../operations/server-configuration-parameters/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) в разделе "Конфигурационные параметры сервера".
    - `min_merge_bytes_to_use_direct_io` — минимальный объём данных при слиянии, необходимый для прямого (небуферизованного) чтения/записи (direct I/O) на диск. При слиянии частей данных ClickHouse вычисляет общий объём хранения всех данных, подлежащих слиянию. Если общий объём хранения всех данных для чтения превышает `min_bytes_to_use_direct_io` байт, тогда ClickHouse  использует флаг `O_DIRECT` при чтении данных с диска. Если `min_merge_bytes_to_use_direct_io = 0`, тогда прямой ввод-вывод отключен. Значение по умолчанию: `10 * 1024 * 1024 * 1024` байтов.
--- a/docs/ru/interfaces/http.md
+++ b/docs/ru/interfaces/http.md
@ -76,8 +76,11 @@ ECT 1
 ```

 По умолчанию, данные возвращаются в формате TabSeparated (подробнее смотри раздел «Форматы»).
+
 Можно попросить любой другой формат - с помощью секции FORMAT запроса.

+Кроме того, вы можете использовать параметр URL-адреса `default_format` или заголовок `X-ClickHouse-Format`, чтобы указать формат по умолчанию, отличный от `TabSeparated`.
+
 ``` bash
 $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @-
 ┏━━━┓
@ -168,7 +171,7 @@ $ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gz
 !!! note "Примечание"
    Некоторые HTTP-клиенты могут по умолчанию распаковывать данные (`gzip` и `deflate`) с сервера в фоновом режиме и вы можете получить распакованные данные, даже если правильно используете настройки сжатия.

-В параметре URL database может быть указана БД по умолчанию.
+Вы можете использовать параметр URL `database` или заголовок `X-ClickHouse-Database`, чтобы указать БД по умолчанию.

 ``` bash
 $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @-
--- a/docs/ru/operations/utilities/clickhouse-local.md
+++ b/docs/ru/operations/utilities/clickhouse-local.md
@ -14,6 +14,8 @@ toc_title: clickhouse-local
 !!! warning "Warning"
    Мы не рекомендуем подключать серверную конфигурацию к `clickhouse-local`, поскольку данные можно легко повредить неосторожными действиями.

+Для временных данных по умолчанию создается уникальный временный каталог. Если вы хотите изменить это поведение, каталог данных можно явно указать с помощью опции `-- --path`.
+
 ## Вызов программы {#vyzov-programmy}

 Основной формат вызова:
@ -39,25 +41,51 @@ $ clickhouse-local --structure "table_structure" --input-format "format_of_incom
 ## Примеры вызова {#primery-vyzova}

 ``` bash
-$ echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table"
+$ echo -e "1,2\n3,4" | clickhouse-local --structure "a Int64, b Int64" \
+    --input-format "CSV" --query "SELECT * FROM table"
 Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec.
-1 2
-3 4
+1   2
+3   4
 ```

 Вызов выше эквивалентен следующему:

 ``` bash
-$ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table"
+$ echo -e "1,2\n3,4" | clickhouse-local --query "
+    CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin);
+    SELECT a, b FROM table;
+    DROP TABLE table"
 Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec.
-1 2
-3 4
+1   2
+3   4
+```
+
+
+Необязательно использовать ключи `stdin` или `--file`. Вы можете открывать любое количество файлов с помощью [табличной функции `file`](../../sql-reference/table-functions/file.md):
+
+``` bash
+$ echo 1 | tee 1.tsv
+1
+
+$ echo 2 | tee 2.tsv
+2
+
+$ clickhouse-local --query "
+    select * from file('1.tsv', TSV, 'a int') t1
+    cross join file('2.tsv', TSV, 'b int') t2"
+1	2
 ```

 А теперь давайте выведем на экран объём оперативной памяти, занимаемой пользователями (Unix):

 ``` bash
-$ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty"
+$ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \
+    | clickhouse-local --structure "user String, mem Float64" \
+        --query "SELECT user, round(sum(mem), 2) as memTotal
+            FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty"
+```
+
+``` text
 Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec.
 ┏━━━━━━━━━━┳━━━━━━━━━━┓
 ┃ user     ┃ memTotal ┃
--- a/docs/ru/sql-reference/statements/create/view.md
+++ b/docs/ru/sql-reference/statements/create/view.md
@ -13,7 +13,7 @@ toc_title: "\u041f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d\u
 CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] AS SELECT ...
 ```

-Normal views don’t store any data, they just perform a read from another table on each access. In other words, a normal view is nothing more than a saved query. When reading from a view, this saved query is used as a subquery in the [FROM](../../../sql-reference/statements/select/from.md) clause.
+Обычные представления не хранят никаких данных, они выполняют чтение данных из другой таблицы при каждом доступе. Другими словами, обычное представление — это не что иное, как сохраненный запрос. При чтении данных из представления этот сохраненный запрос используется как подзапрос в секции [FROM](../../../sql-reference/statements/select/from.md).

 Для примера, пусть вы создали представление:

--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@ -512,6 +512,7 @@ namespace ErrorCodes
    extern const int NO_ROW_DELIMITER = 546;
    extern const int INVALID_RAID_TYPE = 547;
    extern const int UNKNOWN_VOLUME = 548;
+    extern const int DATA_TYPE_CANNOT_BE_USED_IN_KEY = 549;

    extern const int KEEPER_EXCEPTION = 999;
    extern const int POCO_EXCEPTION = 1000;
--- a/src/Common/HashTable/HashTable.h
+++ b/src/Common/HashTable/HashTable.h
@ -4,6 +4,7 @@

 #include <math.h>

+#include <new>
 #include <utility>

 #include <boost/noncopyable.hpp>
@ -314,8 +315,8 @@ public:
        zeroValue()->~Cell();
    }

-    Cell * zeroValue()             { return reinterpret_cast<Cell*>(&zero_value_storage); }
-    const Cell * zeroValue() const { return reinterpret_cast<const Cell*>(&zero_value_storage); }
+    Cell * zeroValue()             { return std::launder(reinterpret_cast<Cell*>(&zero_value_storage)); }
+    const Cell * zeroValue() const { return std::launder(reinterpret_cast<const Cell*>(&zero_value_storage)); }
 };

 template <typename Cell>
--- a/src/Common/HashTable/StringHashTable.h
+++ b/src/Common/HashTable/StringHashTable.h
@ -3,8 +3,10 @@
 #include <Common/HashTable/HashMap.h>
 #include <Common/HashTable/HashTable.h>

+#include <new>
 #include <variant>

+
 using StringKey8 = UInt64;
 using StringKey16 = DB::UInt128;
 struct StringKey24
@ -106,8 +108,8 @@ public:
            zeroValue()->~Cell();
    }

-    Cell * zeroValue() { return reinterpret_cast<Cell *>(&zero_value_storage); }
-    const Cell * zeroValue() const { return reinterpret_cast<const Cell *>(&zero_value_storage); }
+    Cell * zeroValue() { return std::launder(reinterpret_cast<Cell *>(&zero_value_storage)); }
+    const Cell * zeroValue() const { return std::launder(reinterpret_cast<const Cell *>(&zero_value_storage)); }

    using LookupResult = Cell *;
    using ConstLookupResult = const Cell *;
--- a/src/Core/Field.h
+++ b/src/Core/Field.h
@ -767,9 +767,10 @@ T & Field::get()
 #ifndef NDEBUG
    // Disregard signedness when converting between int64 types.
    constexpr Field::Types::Which target = TypeToEnum<NearestFieldType<ValueType>>::value;
-    assert(target == which
-           || (isInt64FieldType(target) && isInt64FieldType(which))
-           || target == Field::Types::Decimal64 /* DateTime64 fields */);
+    if (target != which
+           && (!isInt64FieldType(target) || !isInt64FieldType(which))
+        && target != Field::Types::Decimal64 /* DateTime64 fields */)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Field get from type {} to type {}", Types::toString(which), Types::toString(target));
 #endif

    ValueType * MAY_ALIAS ptr = reinterpret_cast<ValueType *>(&storage);
--- a/src/Storages/KeyDescription.cpp
+++ b/src/Storages/KeyDescription.cpp
@ -6,6 +6,7 @@
 #include <Interpreters/ExpressionAnalyzer.h>
 #include <Interpreters/TreeRewriter.h>
 #include <Storages/extractKeyExpressionList.h>
+#include <Common/quoteString.h>


 namespace DB
@ -14,6 +15,7 @@ namespace DB
 namespace ErrorCodes
 {
    extern const int LOGICAL_ERROR;
+    extern const int DATA_TYPE_CANNOT_BE_USED_IN_KEY;
 }

 KeyDescription::KeyDescription(const KeyDescription & other)
@ -115,7 +117,13 @@ KeyDescription KeyDescription::getSortingKeyFromAST(
    }

    for (size_t i = 0; i < result.sample_block.columns(); ++i)
+    {
        result.data_types.emplace_back(result.sample_block.getByPosition(i).type);
+        if (!result.data_types.back()->isComparable())
+            throw Exception(ErrorCodes::DATA_TYPE_CANNOT_BE_USED_IN_KEY,
+                            "Column {} with type {} is not allowed in key expression, it's not comparable",
+                            backQuote(result.sample_block.getByPosition(i).name), result.data_types.back()->getName());
+    }

    return result;
 }
--- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp
@ -201,7 +201,7 @@ void MergeTreeDataPartWriterOnDisk::calculateAndSerializePrimaryIndex(const Bloc
    {
        index_types = primary_index_block.getDataTypes();
        index_columns.resize(primary_columns_num);
-        last_index_row.resize(primary_columns_num);
+        last_block_index_columns.resize(primary_columns_num);
        for (size_t i = 0; i < primary_columns_num; ++i)
            index_columns[i] = primary_index_block.getByPosition(i).column->cloneEmpty();
    }
@ -236,10 +236,7 @@ void MergeTreeDataPartWriterOnDisk::calculateAndSerializePrimaryIndex(const Bloc

    /// store last index row to write final mark at the end of column
    for (size_t j = 0; j < primary_columns_num; ++j)
-    {
-        const IColumn & primary_column = *primary_index_block.getByPosition(j).column.get();
-        primary_column.get(rows - 1, last_index_row[j]);
-    }
+        last_block_index_columns[j] = primary_index_block.getByPosition(j).column;
 }

 void MergeTreeDataPartWriterOnDisk::calculateAndSerializeSkipIndices(const Block & skip_indexes_block)
@ -325,11 +322,12 @@ void MergeTreeDataPartWriterOnDisk::finishPrimaryIndexSerialization(
        {
            for (size_t j = 0; j < index_columns.size(); ++j)
            {
-                index_columns[j]->insert(last_index_row[j]);
-                index_types[j]->serializeBinary(last_index_row[j], *index_stream);
+                const auto & column = *last_block_index_columns[j];
+                size_t last_row_number = column.size() - 1;
+                index_columns[j]->insertFrom(column, last_row_number);
+                index_types[j]->serializeBinary(column, last_row_number, *index_stream);
            }
-
-            last_index_row.clear();
+            last_block_index_columns.clear();
        }

        index_stream->next();
--- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h
+++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h
@ -111,9 +111,9 @@ protected:
    std::unique_ptr<WriteBufferFromFileBase> index_file_stream;
    std::unique_ptr<HashingWriteBuffer> index_stream;
    DataTypes index_types;
-    /// Index columns values from the last row from the last block
+    /// Index columns from the last block
    /// It's written to index file in the `writeSuffixAndFinalizePart` method
-    Row last_index_row;
+    Columns last_block_index_columns;

    bool data_written = false;
    bool primary_index_initialized = false;
--- a/tests/queries/0_stateless/01548_uncomparable_columns_in_keys.reference
+++ b/tests/queries/0_stateless/01548_uncomparable_columns_in_keys.reference
--- a/tests/queries/0_stateless/01548_uncomparable_columns_in_keys.sql
+++ b/tests/queries/0_stateless/01548_uncomparable_columns_in_keys.sql
@ -0,0 +1,9 @@
+DROP TABLE IF EXISTS uncomparable_keys;
+-- AggregateFunction columns are not comparable, so each key clause below is expected to fail with error 549 (DATA_TYPE_CANNOT_BE_USED_IN_KEY).
+CREATE TABLE uncomparable_keys (id UInt64, key AggregateFunction(max, UInt64)) ENGINE MergeTree ORDER BY key; --{serverError 549}
+
+CREATE TABLE uncomparable_keys (id UInt64, key AggregateFunction(max, UInt64)) ENGINE MergeTree PARTITION BY key; --{serverError 549}
+
+CREATE TABLE uncomparable_keys (id UInt64, key AggregateFunction(max, UInt64)) ENGINE MergeTree ORDER BY (key) SAMPLE BY key; --{serverError 549}
+
+DROP TABLE IF EXISTS uncomparable_keys;
--- a/utils/list-versions/version_date.tsv
+++ b/utils/list-versions/version_date.tsv
@ -28,6 +28,7 @@ v20.4.5.36-stable	2020-06-10
 v20.4.4.18-stable	2020-05-26
 v20.4.3.16-stable	2020-05-23
 v20.4.2.9-stable	2020-05-12
+v20.3.21.2-lts	2020-11-02
 v20.3.20.6-lts	2020-10-09
 v20.3.19.4-lts	2020-09-18
 v20.3.18.10-lts	2020-09-08