diff --git a/cmake/find_hyperscan.cmake b/cmake/find_hyperscan.cmake index 826ee555d53..a3e0b6bc9bc 100644 --- a/cmake/find_hyperscan.cmake +++ b/cmake/find_hyperscan.cmake @@ -1,7 +1,33 @@ if (HAVE_SSSE3) + option (ENABLE_HYPERSCAN "Enable hyperscan" ON) +endif () + +if (ENABLE_HYPERSCAN) + +option (USE_INTERNAL_HYPERSCAN_LIBRARY "Set to FALSE to use system hyperscan instead of the bundled" ${NOT_UNBUNDLED}) + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/hyperscan/CMakeLists.txt") + if (USE_INTERNAL_HYPERSCAN_LIBRARY) + message (WARNING "submodule contrib/hyperscan is missing. to fix try run: \n git submodule update --init --recursive") + endif () + set (MISSING_INTERNAL_HYPERSCAN_LIBRARY 1) + set (USE_INTERNAL_HYPERSCAN_LIBRARY 0) +endif () + +if (NOT USE_INTERNAL_HYPERSCAN_LIBRARY) + find_library (HYPERSCAN_LIBRARY hs) + find_path (HYPERSCAN_INCLUDE_DIR NAMES hs/hs.h hs.h PATHS ${HYPERSCAN_INCLUDE_PATHS}) +endif () + +if (HYPERSCAN_LIBRARY AND HYPERSCAN_INCLUDE_DIR) + set (USE_HYPERSCAN 1) +elseif (NOT MISSING_INTERNAL_HYPERSCAN_LIBRARY) set (HYPERSCAN_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/hyperscan/src) set (HYPERSCAN_LIBRARY hs) set (USE_HYPERSCAN 1) set (USE_INTERNAL_HYPERSCAN_LIBRARY 1) - message (STATUS "Using hyperscan: ${HYPERSCAN_INCLUDE_DIR} " : ${HYPERSCAN_LIBRARY}) endif() + +message (STATUS "Using hyperscan=${USE_HYPERSCAN}: ${HYPERSCAN_INCLUDE_DIR} : ${HYPERSCAN_LIBRARY}") + +endif () diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index dee9b3765de..03daaf8907b 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -305,6 +305,6 @@ if (USE_BASE64) add_subdirectory (base64-cmake) endif() -if (USE_HYPERSCAN) +if (USE_INTERNAL_HYPERSCAN_LIBRARY) add_subdirectory (hyperscan) endif() diff --git a/contrib/boost b/contrib/boost index 32abf16beb7..471ea208abb 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 32abf16beb7bb8b243a4d100ccdd6acb271738c4 +Subproject commit 
471ea208abb92a5cba7d3a08a819bb728f27e95f diff --git a/dbms/src/Common/Volnitsky.h b/dbms/src/Common/Volnitsky.h index d8fc42245bf..bce37e655cd 100644 --- a/dbms/src/Common/Volnitsky.h +++ b/dbms/src/Common/Volnitsky.h @@ -516,7 +516,7 @@ public: template void searchFirstPosition(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const CountCharsCallback & count_chars_callback, ResultType & ans) { - auto callback = [this, &count_chars_callback](const UInt8 * haystack, const UInt8 * haystack_end) -> size_t + auto callback = [this, &count_chars_callback](const UInt8 * haystack, const UInt8 * haystack_end) -> UInt64 { return this->searchOneFirstPosition(haystack, haystack_end, count_chars_callback); }; @@ -676,11 +676,11 @@ private: } template - inline size_t searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & callback) const + inline UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & callback) const { const size_t fallback_size = fallback_needles.size(); - size_t ans = std::numeric_limits::max(); + UInt64 ans = std::numeric_limits::max(); for (size_t i = 0; i < fallback_size; ++i) if (auto pos = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); pos != haystack_end) @@ -705,7 +705,7 @@ private: } } } - if (ans == std::numeric_limits::max()) + if (ans == std::numeric_limits::max()) return 0; return ans; } diff --git a/dbms/src/Common/config.h.in b/dbms/src/Common/config.h.in index 0b31466d522..c323afe369e 100644 --- a/dbms/src/Common/config.h.in +++ b/dbms/src/Common/config.h.in @@ -24,6 +24,7 @@ #cmakedefine01 USE_CPUINFO #cmakedefine01 USE_BROTLI #cmakedefine01 USE_SSL +#cmakedefine01 USE_HYPERSCAN #cmakedefine01 CLICKHOUSE_SPLIT_BINARY #cmakedefine01 LLVM_HAS_RTTI diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 0af2d3d7007..7c22afc9020 
100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -1,8 +1,6 @@ -#include +#include "FunctionsStringSearch.h" #include -#include - #include #include #include @@ -11,12 +9,16 @@ #include #include #include - #include #include -#ifdef __SSSE3__ -# include +#include +#if USE_HYPERSCAN +# if __has_include() +# include +# else +# include +# endif #endif #if USE_RE2_ST @@ -617,7 +619,7 @@ struct MultiMatchAnyImpl { (void)FindAny; (void)FindAnyIndex; -#ifdef __SSSE3__ +#if USE_HYPERSCAN using ScratchPtr = std::unique_ptr>; const auto & hyperscan_regex = MultiRegexps::get(needles); @@ -670,7 +672,7 @@ struct MultiMatchAnyImpl res[i] = j + 1; } } -#endif // __SSSE3__ +#endif // USE_HYPERSCAN } }; diff --git a/dbms/src/Functions/Regexps.h b/dbms/src/Functions/Regexps.h index f5ad738425c..f6a37f94ddc 100644 --- a/dbms/src/Functions/Regexps.h +++ b/dbms/src/Functions/Regexps.h @@ -5,13 +5,17 @@ #include #include #include - #include #include #include -#ifdef __SSSE3__ -# include +#include +#if USE_HYPERSCAN +# if __has_include() +# include +# else +# include +# endif #endif namespace ProfileEvents @@ -63,7 +67,7 @@ namespace Regexps } } -#ifdef __SSSE3__ +#if USE_HYPERSCAN namespace MultiRegexps { @@ -139,6 +143,6 @@ namespace MultiRegexps } } -#endif // __SSSE3__ +#endif // USE_HYPERSCAN } diff --git a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index e2e4b397d0b..97358ac02c9 100644 --- a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -56,6 +56,7 @@ const char * auto_config_build[] "USE_PROTOBUF", "@USE_PROTOBUF@", "USE_BROTLI", "@USE_BROTLI@", "USE_SSL", "@USE_SSL@", + "USE_HYPERSCAN", "@USE_HYPERSCAN@", nullptr, nullptr }; diff --git a/dbms/tests/clickhouse-test b/dbms/tests/clickhouse-test index b17f408f959..242dc17de0b 100755 
--- a/dbms/tests/clickhouse-test +++ b/dbms/tests/clickhouse-test @@ -404,6 +404,8 @@ def main(args): def find_binary(name): + if os.path.exists(name) and os.access(name, os.X_OK): + return True paths = os.environ.get("PATH").split(':') for path in paths: if os.access(os.path.join(path, name), os.X_OK): @@ -416,7 +418,7 @@ if __name__ == '__main__': parser=ArgumentParser(description='ClickHouse functional tests') parser.add_argument('-q', '--queries', help='Path to queries dir') parser.add_argument('--tmp', help='Path to tmp dir') - parser.add_argument('-b', '--binary', default='clickhouse', help='Main clickhouse binary') + parser.add_argument('-b', '--binary', default='clickhouse', help='Path to clickhouse binary or name of binary in PATH') parser.add_argument('-c', '--client', help='Client program') parser.add_argument('--extract_from_config', help='extract-from-config program') parser.add_argument('--configclient', help='Client config (if you use not default ports)') diff --git a/dbms/tests/clickhouse-test-server b/dbms/tests/clickhouse-test-server index b324a270473..80a1db4a153 100755 --- a/dbms/tests/clickhouse-test-server +++ b/dbms/tests/clickhouse-test-server @@ -9,18 +9,18 @@ ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. 
&& pwd) DATA_DIR=${DATA_DIR:=`mktemp -d /tmp/clickhouse.test..XXXXX`} DATA_DIR_PATTERN=${DATA_DIR_PATTERN:=/tmp/clickhouse} # path from config file, will be replaced to temporary LOG_DIR=${LOG_DIR:=$DATA_DIR/log} -export CLICKHOUSE_BINARY=${CLICKHOUSE_BINARY:="clickhouse"} -( [ -x "$ROOT_DIR/dbms/programs/${CLICKHOUSE_BINARY}-server" ] || [ -x "$ROOT_DIR/dbms/programs/${CLICKHOUSE_BINARY}" ] ) && BUILD_DIR=${BUILD_DIR:=$ROOT_DIR} # Build without separate build dir +export CLICKHOUSE_BINARY_NAME=${CLICKHOUSE_BINARY_NAME:="clickhouse"} +( [ -x "$ROOT_DIR/dbms/programs/${CLICKHOUSE_BINARY_NAME}-server" ] || [ -x "$ROOT_DIR/dbms/programs/${CLICKHOUSE_BINARY_NAME}" ] ) && BUILD_DIR=${BUILD_DIR:=$ROOT_DIR} # Build without separate build dir [ -d "$ROOT_DIR/build${BUILD_TYPE}" ] && BUILD_DIR=${BUILD_DIR:=$ROOT_DIR/build${BUILD_TYPE}} BUILD_DIR=${BUILD_DIR:=$ROOT_DIR} -[ -x ${CLICKHOUSE_BINARY}-server" ] && [ -x ${CLICKHOUSE_BINARY}-client" ] && BIN_DIR= # Allow run in /usr/bin -( [ -x "$BUILD_DIR/dbms/programs/${CLICKHOUSE_BINARY}" ] || [ -x "$BUILD_DIR/dbms/programs/${CLICKHOUSE_BINARY}-server" ] ) && BIN_DIR=${BIN_DIR:=$BUILD_DIR/dbms/programs/} -[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}-server" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY}-server} -[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY} server} -[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}-client" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY}-client} -[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY} client} -[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}-extract-from-config" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY}-extract-from-config} -[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY} extract-from-config} +[ -x "${CLICKHOUSE_BINARY_NAME}-server" ] && [ -x 
"${CLICKHOUSE_BINARY_NAME}-client" ] && BIN_DIR= # Allow run in /usr/bin +( [ -x "$BUILD_DIR/dbms/programs/${CLICKHOUSE_BINARY_NAME}" ] || [ -x "$BUILD_DIR/dbms/programs/${CLICKHOUSE_BINARY_NAME}-server" ] ) && BIN_DIR=${BIN_DIR:=$BUILD_DIR/dbms/programs/} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-server" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-server} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME} server} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-client" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-client} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME} client} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-extract-from-config" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-extract-from-config} +[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME} extract-from-config} [ -f "$CUR_DIR/server-test.xml" ] && CONFIG_DIR=${CONFIG_DIR=$CUR_DIR}/ CONFIG_CLIENT_DIR=${CONFIG_CLIENT_DIR=$CONFIG_DIR} @@ -131,7 +131,7 @@ else TEST_DICT=${TEST_DICT=1} CLICKHOUSE_CLIENT_QUERY="${CLICKHOUSE_CLIENT} --config ${CLICKHOUSE_CONFIG_CLIENT} --port $CLICKHOUSE_PORT_TCP -m -n -q" $CLICKHOUSE_CLIENT_QUERY 'SELECT * from system.build_options; SELECT * FROM system.clusters;' - CLICKHOUSE_TEST="env ${TEST_DIR}clickhouse-test --binary ${BIN_DIR}${CLICKHOUSE_BINARY} --configclient $CLICKHOUSE_CONFIG_CLIENT --configserver $CLICKHOUSE_CONFIG --tmp $DATA_DIR/tmp --queries $QUERIES_DIR $TEST_OPT0 $TEST_OPT" + CLICKHOUSE_TEST="env ${TEST_DIR}clickhouse-test --binary ${BIN_DIR}${CLICKHOUSE_BINARY_NAME} --configclient $CLICKHOUSE_CONFIG_CLIENT --configserver $CLICKHOUSE_CONFIG --tmp $DATA_DIR/tmp --queries $QUERIES_DIR $TEST_OPT0 $TEST_OPT" 
CLICKHOUSE_PERFORMANCE_TEST="${BIN_DIR}clickhouse-performance-test --port $CLICKHOUSE_PORT_TCP --recursive $CUR_DIR/performance --skip-tags=long" if [ "${TEST_RUN_STRESS}" ]; then # Running test in parallel will fail some results (tests can create/fill/drop same tables) diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 8dece39249f..a72a2e4a06e 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -128,16 +128,29 @@ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- For successful requests that don't return a data table, an empty response body is returned. -You can use the internal ClickHouse compression format when transmitting data. The compressed data has a non-standard format, and you will need to use the special clickhouse-compressor program to work with it (it is installed with the clickhouse-client package). +You can use the internal ClickHouse compression format when transmitting data. The compressed data has a non-standard format, and you will need to use the special `clickhouse-compressor` program to work with it (it is installed with the `clickhouse-client` package). To increase the efficiency of the data insertion, you may disable the server-side checksum verification with the [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. -If you specified 'compress=1' in the URL, the server will compress the data it sends you. -If you specified 'decompress=1' in the URL, the server will decompress the same data that you pass in the POST method. +If you specify `compress=1` in the URL, the server compresses the data it sends you. +If you specify `decompress=1` in the URL, the server decompresses the same data that you pass in the `POST` method. -It is also possible to use the standard gzip-based HTTP compression. 
To send a POST request compressed using gzip, append the request header `Content-Encoding: gzip`. -In order for ClickHouse to compress the response using gzip, you must append `Accept-Encoding: gzip` to the request headers, and enable the ClickHouse setting `enable_http_compression`. +It is also possible to use the standard `gzip`-based [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). To send a `POST` request compressed using `gzip`, append the request header `Content-Encoding: gzip`. +In order for ClickHouse to compress the response using `gzip`, you must append `Accept-Encoding: gzip` to the request headers, and enable the ClickHouse [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting. You can configure the compression level of the data with the [http_zlib_compression_level](../operations/settings/settings.md#settings-http_zlib_compression_level) setting. You can use this to reduce network traffic when transmitting a large amount of data, or for creating dumps that are immediately compressed. +Examples of sending the data with compression: + +```bash +# Receiving compressed data from the server: +curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' + +# Sending compressed data to the server: +echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' +``` + +!!! note "Note" + Some HTTP clients can decompress data (`gzip` and `deflate`) from the server by default and you may get the decompressed data even if you use the compression settings correctly. + You can use the 'database' URL parameter to specify the default database. 
```bash diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 3e9ecef9d17..00c5d476771 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -79,6 +79,41 @@ Enable or disable fsync when writing .sql files. Enabled by default. It makes sense to disable it if the server has millions of tiny table chunks that are constantly being created and destroyed. +## enable_http_compression {#settings-enable_http_compression} + +Enables/disables compression of the data in the response to an HTTP request. + +For more information, read the [HTTP interface description](../../interfaces/http.md). + +Possible values: + +- 0 — The functionality is disabled. +- 1 — The functionality is enabled. + +Default value: 0. + +## http_zlib_compression_level {#settings-http_zlib_compression_level} + +Sets the level of the compression of the data in the response to an HTTP request if [enable_http_compression = 1](#settings-enable_http_compression). + +Possible values: numbers from 1 to 9. + +Default value: 3. + + +## http_native_compression_disable_checksumming_on_decompress {#settings-http_native_compression_disable_checksumming_on_decompress} + +Enables/disables the verification of the checksum when uncompressing the HTTP POST data from the client. Used only for ClickHouse native format of compression (neither `gzip` nor `deflate`). + +For more information, read the [HTTP interface description](../../interfaces/http.md). + +Possible values: + +- 0 — The functionality is disabled. +- 1 — The functionality is enabled. + +Default value: 0. + ## input_format_allow_errors_num Sets the maximum number of acceptable errors when reading from text formats (CSV, TSV, etc.). 
diff --git a/docs/en/operations/table_engines/mergetree.md b/docs/en/operations/table_engines/mergetree.md index 9846e9fd8e4..ee08913dcd1 100644 --- a/docs/en/operations/table_engines/mergetree.md +++ b/docs/en/operations/table_engines/mergetree.md @@ -191,9 +191,7 @@ added dimensions. In this case it makes sense to leave only a few columns in the primary key that will provide efficient range scans and add the remaining dimension columns to the sorting key tuple. -[ALTER of the sorting key](../../query_language/alter.md) is a -lightweight operation because when a new column is simultaneously added to the table and to the sorting key -data parts need not be changed (they remain sorted by the new sorting key expression). +[ALTER of the sorting key](../../query_language/alter.md) is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts don't need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the just added column, the data at the moment of table modification is sorted by both the old and the new sorting key. ### Use of Indexes and Partitions in Queries diff --git a/docs/ru/operations/table_engines/mergetree.md b/docs/ru/operations/table_engines/mergetree.md index 9182d23a4c8..aeb29c270e1 100644 --- a/docs/ru/operations/table_engines/mergetree.md +++ b/docs/ru/operations/table_engines/mergetree.md @@ -189,7 +189,7 @@ ClickHouse не требует уникального первичного кл В этом сценарии имеет смысл оставить в первичном ключе всего несколько столбцов, которые обеспечат эффективную фильтрацию по индексу, а остальные столбцы-измерения добавить в выражение ключа сортировки. 
-[ALTER ключа сортировки](../../query_language/alter.md) — легкая операция, так как при одновременном добавлении нового столбца в таблицу и ключ сортировки не нужно изменять +[ALTER ключа сортировки](../../query_language/alter.md) — легкая операция, так как при одновременном добавлении нового столбца в таблицу и в ключ сортировки, не нужно изменять данные кусков (они остаются упорядоченными и по новому выражению ключа). ### Использование индексов и партиций в запросах diff --git a/utils/build/build_debian_unbundled.sh b/utils/build/build_debian_unbundled.sh index 0d9ae74f169..41c951c4bae 100755 --- a/utils/build/build_debian_unbundled.sh +++ b/utils/build/build_debian_unbundled.sh @@ -22,5 +22,5 @@ env TEST_RUN=1 \ `# Use all possible contrib libs from system` \ `# psmisc - killall` \ `# gdb - symbol test in pbuilder` \ - EXTRAPACKAGES="psmisc libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-thread-dev libboost-regex-dev zlib1g-dev liblz4-dev libdouble-conversion-dev libsparsehash-dev librdkafka-dev libpoco-dev unixodbc-dev libsparsehash-dev libgoogle-perftools-dev libzstd-dev libre2-dev libunwind-dev googletest libcctz-dev libcapnp-dev libjemalloc-dev libssl-dev libunwind-dev libgsasl7-dev libxml2-dev libbrotli-dev $EXTRAPACKAGES" \ + EXTRAPACKAGES="psmisc libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-thread-dev libboost-regex-dev zlib1g-dev liblz4-dev libdouble-conversion-dev libsparsehash-dev librdkafka-dev libpoco-dev unixodbc-dev libsparsehash-dev libgoogle-perftools-dev libzstd-dev libre2-dev libunwind-dev googletest libcctz-dev libcapnp-dev libjemalloc-dev libssl-dev libunwind-dev libgsasl7-dev libxml2-dev libbrotli-dev libhyperscan-dev $EXTRAPACKAGES" \ pdebuild --configfile $ROOT_DIR/debian/.pbuilderrc $PDEBUILD_OPT