Merge branch 'ClickHouse:master' into zh-docs-patch-20220622

2024-11-29 02:52:13 +00:00 · 2022-06-23 12:57:16 +08:00 · 2022-06-23 12:57:16 +08:00 · 22f5f7e2d6
commit 22f5f7e2d6
parent 81becebfe3 f2866630b5
7 changed files with 54 additions and 22 deletions
--- a/docker/test/integration/hive_server/Dockerfile
+++ b/docker/test/integration/hive_server/Dockerfile
@ -43,8 +43,5 @@ COPY demo_data.txt /
 ENV PATH=/apache-hive-2.3.9-bin/bin:/hadoop-3.1.0/bin:/hadoop-3.1.0/sbin:$PATH
 RUN service ssh start && sed s/HOSTNAME/$HOSTNAME/ /hadoop-3.1.0/etc/hadoop/core-site.xml.template > /hadoop-3.1.0/etc/hadoop/core-site.xml && hdfs namenode -format
 RUN apt install -y python3 python3-pip
 RUN pip3 install flask requests
 COPY http_api_server.py /
 COPY start.sh /
--- a/docker/test/integration/hive_server/start.sh
+++ b/docker/test/integration/hive_server/start.sh
@ -7,5 +7,5 @@ mysql -u root -e "GRANT ALL  ON * . * TO 'test'@'localhost'"
 schematool -initSchema -dbType mysql
 #nohup hiveserver2 &
 nohup hive --service metastore &
-bash /prepare_hive_data.sh
+
-python3 http_api_server.py
+while true; do sleep 60; done
--- a/docker/test/integration/runner/compose/docker_compose_hive.yml
+++ b/docker/test/integration/runner/compose/docker_compose_hive.yml
@ -1,7 +1,7 @@
 version: '2.3'
 services:
    hdfs1:
-        image: lgboustc/hive_test:v1.0
+        image: lgboustc/hive_test:v2.0
        hostname: hivetest
        restart: always
        entrypoint: bash /start.sh
--- a/docs/en/sql-reference/statements/create/table.md
+++ b/docs/en/sql-reference/statements/create/table.md
@ -230,12 +230,21 @@ ClickHouse supports general purpose codecs and specialized codecs.
 ### General Purpose Codecs
-Codecs:
+#### NONE
-   `NONE` — No compression.
+`NONE` — No compression.
-   `LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression.
+
-   `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` applies the default level. Possible levels: \[1, 12\]. Recommended level range: \[4, 9\].
+#### LZ4
-   `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: \[1, 22\]. Default value: 1.
+
 `LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression.
 #### LZ4HC
 `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` applies the default level. Possible levels: \[1, 12\]. Recommended level range: \[4, 9\].
 #### ZSTD
 `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: \[1, 22\]. Default value: 1.
 High compression levels are useful for asymmetric scenarios, like compress once, decompress repeatedly. Higher levels mean better compression and higher CPU usage.
@ -243,13 +252,25 @@ High compression levels are useful for asymmetric scenarios, like compress once,
 These codecs are designed to make compression more effective by using specific features of data. Some of these codecs do not compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation.
-Specialized codecs:
+#### Delta
-   `Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1.
+`Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1.
-   `DoubleDelta` — Calculates delta of deltas and writes it in compact binary form. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
+
-   `Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
+#### DoubleDelta
-   `FPC` - Repeatedly predicts the next floating point value in the sequence using the better of two predictors, then XORs the actual with the predicted value, and leading-zero compresses the result. Similar to Gorilla, this is efficient when storing a series of floating point values that change slowly. For 64-bit values (double), FPC is faster than Gorilla, for 32-bit values your mileage may vary. For a detailed description of the algorithm see [High Throughput Compression of Double-Precision Floating-Point Data](https://userweb.cs.txstate.edu/~burtscher/papers/dcc07a.pdf).
+
-   `T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` and `DateTime`). At each step of its algorithm, codec takes a block of 64 values, puts them into 64x64 bit matrix, transposes it, crops the unused bits of values and returns the rest as a sequence. Unused bits are the bits, that do not differ between maximum and minimum values in the whole data part for which the compression is used.
+`DoubleDelta` — Calculates delta of deltas and writes it in compact binary form. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
 #### Gorilla
 `Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
 #### FPC
 `FPC` - Repeatedly predicts the next floating point value in the sequence using the better of two predictors, then XORs the actual with the predicted value, and leading-zero compresses the result. Similar to Gorilla, this is efficient when storing a series of floating point values that change slowly. For 64-bit values (double), FPC is faster than Gorilla, for 32-bit values your mileage may vary. For a detailed description of the algorithm see [High Throughput Compression of Double-Precision Floating-Point Data](https://userweb.cs.txstate.edu/~burtscher/papers/dcc07a.pdf).
 #### T64
 `T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` and `DateTime`). At each step of its algorithm, codec takes a block of 64 values, puts them into 64x64 bit matrix, transposes it, crops the unused bits of values and returns the rest as a sequence. Unused bits are the bits, that do not differ between maximum and minimum values in the whole data part for which the compression is used.
 `DoubleDelta` and `Gorilla` codecs are used in Gorilla TSDB as the components of its compressing algorithm. Gorilla approach is effective in scenarios when there is a sequence of slowly changing values with their timestamps. Timestamps are effectively compressed by the `DoubleDelta` codec, and values are effectively compressed by the `Gorilla` codec. For example, to get an effectively stored table, you can create it in the following configuration:
@ -268,14 +289,20 @@ These codecs don't actually compress data, but instead encrypt data on disk. The
 Encryption codecs:
 -   `CODEC('AES-128-GCM-SIV')` — Encrypts data with AES-128 in [RFC 8452](https://tools.ietf.org/html/rfc8452) GCM-SIV mode. 
-   `CODEC('AES-256-GCM-SIV')` — Encrypts data with AES-256 in GCM-SIV mode. 
+#### AES_128_GCM_SIV
 `CODEC('AES-128-GCM-SIV')` — Encrypts data with AES-128 in [RFC 8452](https://tools.ietf.org/html/rfc8452) GCM-SIV mode. 
 #### AES-256-GCM-SIV
 `CODEC('AES-256-GCM-SIV')` — Encrypts data with AES-256 in GCM-SIV mode. 
 These codecs use a fixed nonce and encryption is therefore deterministic. This makes it compatible with deduplicating engines such as [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md) but has a weakness: when the same data block is encrypted twice, the resulting ciphertext will be exactly the same so an adversary who can read the disk can see this equivalence (although only the equivalence, without getting its content).
 :::warning    
-Most engines including the "*MergeTree" family create index files on disk without applying codecs. This means plaintext will appear on disk if an encrypted column is indexed.
+Most engines including the "\*MergeTree" family create index files on disk without applying codecs. This means plaintext will appear on disk if an encrypted column is indexed.
 :::
 :::warning    
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@ -2396,7 +2396,7 @@ class ClickHouseCluster:
                logging.debug("Setup hive")
                subprocess_check_call(self.base_hive_cmd + common_opts)
                self.up_called = True
-                time.sleep(300)
+                time.sleep(30)
            if self.with_minio and self.base_minio_cmd:
                # Copy minio certificates to minio/certs
--- a/tests/integration/test_hive_query/data/prepare_hive_data.sh
+++ b/tests/integration/test_hive_query/data/prepare_hive_data.sh
--- a/tests/integration/test_hive_query/test.py
+++ b/tests/integration/test_hive_query/test.py
@ -19,12 +19,20 @@ def started_cluster():
        cluster.add_instance(
            "h0_0_0",
            main_configs=["configs/config.xml"],
-            extra_configs=["configs/hdfs-site.xml"],
+            extra_configs=["configs/hdfs-site.xml", "data/prepare_hive_data.sh"],
            with_hive=True,
        )
        logging.info("Starting cluster ...")
        cluster.start()
        cluster.copy_file_to_container(
            "roottesthivequery_hdfs1_1",
            "/ClickHouse/tests/integration/test_hive_query/data/prepare_hive_data.sh",
            "/prepare_hive_data.sh",
        )
        cluster.exec_in_container(
            "roottesthivequery_hdfs1_1", ["bash", "-c", "bash /prepare_hive_data.sh"]
        )
        yield cluster
    finally:
        cluster.shutdown()