From 633c3c8287b30e56676cdaf57a04c8942fef5179 Mon Sep 17 00:00:00 2001
From: lgbo-ustc
Date: Tue, 21 Jun 2022 15:10:48 +0800
Subject: [PATCH 1/6] update hive test

---
 docker/test/integration/hive_server/Dockerfile | 3 ---
 docker/test/integration/hive_server/start.sh | 4 ++--
 .../integration/test_hive_query/data}/prepare_hive_data.sh | 0
 tests/integration/test_hive_query/test.py | 4 +++-
 4 files changed, 5 insertions(+), 6 deletions(-)
 rename {docker/test/integration/hive_server => tests/integration/test_hive_query/data}/prepare_hive_data.sh (100%)

diff --git a/docker/test/integration/hive_server/Dockerfile b/docker/test/integration/hive_server/Dockerfile
index 391f9a5e22f..b06a0dcc830 100644
--- a/docker/test/integration/hive_server/Dockerfile
+++ b/docker/test/integration/hive_server/Dockerfile
@@ -43,8 +43,5 @@ COPY demo_data.txt /
 ENV PATH=/apache-hive-2.3.9-bin/bin:/hadoop-3.1.0/bin:/hadoop-3.1.0/sbin:$PATH
 RUN service ssh start && sed s/HOSTNAME/$HOSTNAME/ /hadoop-3.1.0/etc/hadoop/core-site.xml.template > /hadoop-3.1.0/etc/hadoop/core-site.xml && hdfs namenode -format
-RUN apt install -y python3 python3-pip
-RUN pip3 install flask requests
-COPY http_api_server.py /
 COPY start.sh /

diff --git a/docker/test/integration/hive_server/start.sh b/docker/test/integration/hive_server/start.sh
index 4224b8126e6..efeda37048d 100755
--- a/docker/test/integration/hive_server/start.sh
+++ b/docker/test/integration/hive_server/start.sh
@@ -7,5 +7,5 @@ mysql -u root -e "GRANT ALL ON * . * TO 'test'@'localhost'"
 schematool -initSchema -dbType mysql
 #nohup hiveserver2 &
 nohup hive --service metastore &
-bash /prepare_hive_data.sh
-python3 http_api_server.py
+
+while true; do sleep 60; done
diff --git a/docker/test/integration/hive_server/prepare_hive_data.sh b/tests/integration/test_hive_query/data/prepare_hive_data.sh
similarity index 100%
rename from docker/test/integration/hive_server/prepare_hive_data.sh
rename to tests/integration/test_hive_query/data/prepare_hive_data.sh
diff --git a/tests/integration/test_hive_query/test.py b/tests/integration/test_hive_query/test.py
index fd4d91d6f78..e1b32024fbb 100644
--- a/tests/integration/test_hive_query/test.py
+++ b/tests/integration/test_hive_query/test.py
@@ -19,12 +19,14 @@ def started_cluster():
         cluster.add_instance(
             "h0_0_0",
             main_configs=["configs/config.xml"],
-            extra_configs=["configs/hdfs-site.xml"],
+            extra_configs=["configs/hdfs-site.xml", "data/prepare_hive_data.sh"],
             with_hive=True,
         )
         logging.info("Starting cluster ...")
         cluster.start()
+        cluster.copy_file_to_container("roottesthivequery_hdfs1_1", "/etc/clickhouse-server/extra_conf.d/prepare_hive_data.sh", "/prepare_hive_data.sh")
+        cluster.exec_in_container("roottesthivequery_hdfs1_1", ["bash", "-c", "bash /prepare_hive_data.sh"])
         yield cluster
     finally:
         cluster.shutdown()

From cdd7a549954436b1bcf7667bd6da246d8d551bb9 Mon Sep 17 00:00:00 2001
From: lgbo-ustc
Date: Tue, 21 Jun 2022 15:59:54 +0800
Subject: [PATCH 2/6] improve stability for hive integration test

---
 docker/test/integration/runner/compose/docker_compose_hive.yml | 2 +-
 docker/test/integration/runner/dockerd-entrypoint.sh | 2 +-
 tests/integration/helpers/cluster.py | 2 +-
 tests/integration/test_hive_query/test.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docker/test/integration/runner/compose/docker_compose_hive.yml b/docker/test/integration/runner/compose/docker_compose_hive.yml
index 44f23655d2a..459e8481d0b 100644
--- a/docker/test/integration/runner/compose/docker_compose_hive.yml
+++ b/docker/test/integration/runner/compose/docker_compose_hive.yml
@@ -1,7 +1,7 @@
 version: '2.3'
 services:
   hdfs1:
-    image: lgboustc/hive_test:v1.0
+    image: lgboustc/hive_test:v2.0
     hostname: hivetest
     restart: always
     entrypoint: bash /start.sh
diff --git a/docker/test/integration/runner/dockerd-entrypoint.sh b/docker/test/integration/runner/dockerd-entrypoint.sh
index 0cb25d12a9f..bc64b04eba9 100755
--- a/docker/test/integration/runner/dockerd-entrypoint.sh
+++ b/docker/test/integration/runner/dockerd-entrypoint.sh
@@ -12,7 +12,7 @@ echo '{
     "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
 }' | dd of=/etc/docker/daemon.json 2>/dev/null
-dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2375 --default-address-pool base=172.17.0.0/12,size=24 &>/ClickHouse/tests/integration/dockerd.log &
+dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2375 --default-address-pool base=172.16.0.1/12,size=24 &>/ClickHouse/tests/integration/dockerd.log &
 set +e
 reties=0
diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py
index f8ad9213e5b..e6f6bbe05d1 100644
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@@ -2396,7 +2396,7 @@ class ClickHouseCluster:
                 logging.debug("Setup hive")
                 subprocess_check_call(self.base_hive_cmd + common_opts)
                 self.up_called = True
-                time.sleep(300)
+                time.sleep(30)
             if self.with_minio and self.base_minio_cmd:
                 # Copy minio certificates to minio/certs
diff --git a/tests/integration/test_hive_query/test.py b/tests/integration/test_hive_query/test.py
index e1b32024fbb..6a0b1246ba0 100644
--- a/tests/integration/test_hive_query/test.py
+++ b/tests/integration/test_hive_query/test.py
@@ -25,7 +25,7 @@ def started_cluster():
         logging.info("Starting cluster ...")
         cluster.start()
-        cluster.copy_file_to_container("roottesthivequery_hdfs1_1", "/etc/clickhouse-server/extra_conf.d/prepare_hive_data.sh", "/prepare_hive_data.sh")
+        cluster.copy_file_to_container("roottesthivequery_hdfs1_1", "/ClickHouse/tests/integration/test_hive_query/data/prepare_hive_data.sh", "/prepare_hive_data.sh")
         cluster.exec_in_container("roottesthivequery_hdfs1_1", ["bash", "-c", "bash /prepare_hive_data.sh"])
         yield cluster
     finally:
         cluster.shutdown()

From eddb6443f2d75a964d4501d79b9ce5a756a6c768 Mon Sep 17 00:00:00 2001
From: lgbo-ustc
Date: Tue, 21 Jun 2022 16:25:24 +0800
Subject: [PATCH 3/6] rollback dockerd-entrypoint.sh

---
 docker/test/integration/runner/dockerd-entrypoint.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/test/integration/runner/dockerd-entrypoint.sh b/docker/test/integration/runner/dockerd-entrypoint.sh
index bc64b04eba9..0cb25d12a9f 100755
--- a/docker/test/integration/runner/dockerd-entrypoint.sh
+++ b/docker/test/integration/runner/dockerd-entrypoint.sh
@@ -12,7 +12,7 @@ echo '{
     "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
 }' | dd of=/etc/docker/daemon.json 2>/dev/null
-dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2375 --default-address-pool base=172.16.0.1/12,size=24 &>/ClickHouse/tests/integration/dockerd.log &
+dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2375 --default-address-pool base=172.17.0.0/12,size=24 &>/ClickHouse/tests/integration/dockerd.log &
 set +e
 reties=0

From 63937682da07c6fe9d1e144ce532a8be30f1a4ba Mon Sep 17 00:00:00 2001
From: lgbo-ustc
Date: Wed, 22 Jun 2022 09:39:05 +0800
Subject: [PATCH 4/6] fixed code style

---
 tests/integration/test_hive_query/test.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_hive_query/test.py b/tests/integration/test_hive_query/test.py
index e1b32024fbb..e12dfc2f312 100644
--- a/tests/integration/test_hive_query/test.py
+++ b/tests/integration/test_hive_query/test.py
@@ -25,8 +25,14 @@ def started_cluster():
         logging.info("Starting cluster ...")
         cluster.start()
-        cluster.copy_file_to_container("roottesthivequery_hdfs1_1", "/etc/clickhouse-server/extra_conf.d/prepare_hive_data.sh", "/prepare_hive_data.sh")
-        cluster.exec_in_container("roottesthivequery_hdfs1_1", ["bash", "-c", "bash /prepare_hive_data.sh"])
+        cluster.copy_file_to_container(
+            "roottesthivequery_hdfs1_1",
+            "/etc/clickhouse-server/extra_conf.d/prepare_hive_data.sh",
+            "/prepare_hive_data.sh",
+        )
+        cluster.exec_in_container(
+            "roottesthivequery_hdfs1_1", ["bash", "-c", "bash /prepare_hive_data.sh"]
+        )
         yield cluster
     finally:
         cluster.shutdown()

From 123d57484ff2f0bf758b927731d95473da68834a Mon Sep 17 00:00:00 2001
From: lgbo-ustc
Date: Wed, 22 Jun 2022 17:15:47 +0800
Subject: [PATCH 5/6] fix a path error

---
 tests/integration/test_hive_query/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_hive_query/test.py b/tests/integration/test_hive_query/test.py
index dcf12576bfd..a28269f3521 100644
--- a/tests/integration/test_hive_query/test.py
+++ b/tests/integration/test_hive_query/test.py
@@ -27,7 +27,7 @@ def started_cluster():
         cluster.start()
         cluster.copy_file_to_container(
             "roottesthivequery_hdfs1_1",
-            "/etc/clickhouse-server/extra_conf.d/prepare_hive_data.sh",
+            "/ClickHouse/tests/integration/test_hive_query/data/prepare_hive_data.sh",
             "/prepare_hive_data.sh",
         )
         cluster.exec_in_container(

From aafe91a8cab166a0f379199db6a339e44a20828a Mon Sep 17 00:00:00 2001
From: DanRoscigno
Date: Wed, 22 Jun 2022 14:48:18 -0400
Subject: [PATCH 6/6] add H3 tags for Algolia search

---
 .../sql-reference/statements/create/table.md | 55 ++++++++++++++-----
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md
index bdf6c02c737..2cf57cc2243 100644
--- a/docs/en/sql-reference/statements/create/table.md
+++ b/docs/en/sql-reference/statements/create/table.md
@@ -230,12 +230,21 @@ ClickHouse supports general purpose codecs and specialized codecs.
 ### General Purpose Codecs
-Codecs:
+#### NONE
-- `NONE` — No compression.
-- `LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression.
-- `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` applies the default level. Possible levels: \[1, 12\]. Recommended level range: \[4, 9\].
-- `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: \[1, 22\]. Default value: 1.
+`NONE` — No compression.
+
+#### LZ4
+
+`LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression.
+
+#### LZ4HC
+
+`LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` applies the default level. Possible levels: \[1, 12\]. Recommended level range: \[4, 9\].
+
+#### ZSTD
+
+`ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: \[1, 22\]. Default value: 1.
 High compression levels are useful for asymmetric scenarios, like compress once, decompress repeatedly. Higher levels mean better compression and higher CPU usage.
@@ -243,13 +252,25 @@ These codecs are designed to make compression more effective by using specific features of data. Some of these codecs do not compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation.
-Specialized codecs:
+#### Delta
-- `Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1.
-- `DoubleDelta` — Calculates delta of deltas and writes it in compact binary form. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
-- `Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
-- `FPC` - Repeatedly predicts the next floating point value in the sequence using the better of two predictors, then XORs the actual with the predicted value, and leading-zero compresses the result. Similar to Gorilla, this is efficient when storing a series of floating point values that change slowly. For 64-bit values (double), FPC is faster than Gorilla, for 32-bit values your mileage may vary. For a detailed description of the algorithm see [High Throughput Compression of Double-Precision Floating-Point Data](https://userweb.cs.txstate.edu/~burtscher/papers/dcc07a.pdf).
-- `T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` and `DateTime`). At each step of its algorithm, codec takes a block of 64 values, puts them into 64x64 bit matrix, transposes it, crops the unused bits of values and returns the rest as a sequence. Unused bits are the bits, that do not differ between maximum and minimum values in the whole data part for which the compression is used.
+`Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1.
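(Editor's illustration, not part of the patch: specialized codecs such as `Delta` are declared per column and are usually chained with a general-purpose codec that compresses the prepared deltas. A minimal sketch with a hypothetical table and column names:)

``` sql
-- Hypothetical example: Delta replaces neighboring values with their differences,
-- then LZ4 compresses the resulting small deltas.
CREATE TABLE codec_example
(
    `dt` Date,
    `event_id` UInt64 CODEC(Delta, LZ4)
)
ENGINE = MergeTree
ORDER BY dt;
```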
+
+#### DoubleDelta
+
+`DoubleDelta` — Calculates delta of deltas and writes it in compact binary form. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
+
+#### Gorilla
+
+`Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
+
+#### FPC
+
+`FPC` — Repeatedly predicts the next floating point value in the sequence using the better of two predictors, then XORs the actual with the predicted value, and leading-zero compresses the result. Similar to Gorilla, this is efficient when storing a series of floating point values that change slowly. For 64-bit values (double), FPC is faster than Gorilla, for 32-bit values your mileage may vary. For a detailed description of the algorithm see [High Throughput Compression of Double-Precision Floating-Point Data](https://userweb.cs.txstate.edu/~burtscher/papers/dcc07a.pdf).
+
+#### T64
+
+`T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` and `DateTime`). At each step of its algorithm, the codec takes a block of 64 values, puts them into a 64x64 bit matrix, transposes it, crops the unused bits of values and returns the rest as a sequence. Unused bits are the bits that do not differ between the maximum and minimum values in the whole data part for which the compression is used.
 
 `DoubleDelta` and `Gorilla` codecs are used in Gorilla TSDB as the components of its compressing algorithm. Gorilla approach is effective in scenarios when there is a sequence of slowly changing values with their timestamps. Timestamps are effectively compressed by the `DoubleDelta` codec, and values are effectively compressed by the `Gorilla` codec. For example, to get an effectively stored table, you can create it in the following configuration:
@@ -268,14 +289,20 @@ These codecs don't actually compress data, but instead encrypt data on disk. The
 
 Encryption codecs:
 
-- `CODEC('AES-128-GCM-SIV')` — Encrypts data with AES-128 in [RFC 8452](https://tools.ietf.org/html/rfc8452) GCM-SIV mode.
-- `CODEC('AES-256-GCM-SIV')` — Encrypts data with AES-256 in GCM-SIV mode.
+#### AES_128_GCM_SIV
+
+`CODEC('AES-128-GCM-SIV')` — Encrypts data with AES-128 in [RFC 8452](https://tools.ietf.org/html/rfc8452) GCM-SIV mode.
+
+
+#### AES_256_GCM_SIV
+
+`CODEC('AES-256-GCM-SIV')` — Encrypts data with AES-256 in GCM-SIV mode.
 
 These codecs use a fixed nonce and encryption is therefore deterministic.
 This makes it compatible with deduplicating engines such as [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md) but has a weakness: when the same data block is encrypted twice, the resulting ciphertext will be exactly the same so an adversary who can read the disk can see this equivalence (although only the equivalence, without getting its content).
 
 :::warning
-Most engines including the "*MergeTree" family create index files on disk without applying codecs. This means plaintext will appear on disk if an encrypted column is indexed.
+Most engines including the "\*MergeTree" family create index files on disk without applying codecs. This means plaintext will appear on disk if an encrypted column is indexed.
 :::
 
 :::warning