diff --git a/cmake/clang_tidy.cmake b/cmake/clang_tidy.cmake
index 200282234ca..57295682487 100644
--- a/cmake/clang_tidy.cmake
+++ b/cmake/clang_tidy.cmake
@@ -3,10 +3,20 @@
 option (ENABLE_CLANG_TIDY "Use clang-tidy static analyzer" OFF)
 
 if (ENABLE_CLANG_TIDY)
-    find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12")
+    find_program (CLANG_TIDY_CACHE_PATH NAMES "clang-tidy-cache")
+    if (CLANG_TIDY_CACHE_PATH)
+        find_program (_CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12")
+
+        # Why do we use ';' here?
+        # It's a cmake black magic: https://cmake.org/cmake/help/latest/prop_tgt/LANG_CLANG_TIDY.html#prop_tgt:%3CLANG%3E_CLANG_TIDY
+        # The CLANG_TIDY_PATH is passed to CMAKE_CXX_CLANG_TIDY, which follows CXX_CLANG_TIDY syntax.
+        set (CLANG_TIDY_PATH "${CLANG_TIDY_CACHE_PATH};${_CLANG_TIDY_PATH}" CACHE STRING "A combined command to run clang-tidy with caching wrapper")
+    else ()
+        find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12")
+    endif ()
 
     if (CLANG_TIDY_PATH)
-        message(STATUS
+        message (STATUS
             "Using clang-tidy: ${CLANG_TIDY_PATH}. The checks will be run during build process. See the .clang-tidy file at the root directory to configure the checks.")
@@ -15,11 +25,15 @@ if (ENABLE_CLANG_TIDY)
 
     # clang-tidy requires assertions to guide the analysis
     # Note that NDEBUG is set implicitly by CMake for non-debug builds
-    set(COMPILER_FLAGS "${COMPILER_FLAGS} -UNDEBUG")
+    set (COMPILER_FLAGS "${COMPILER_FLAGS} -UNDEBUG")
 
-    # The variable CMAKE_CXX_CLANG_TIDY will be set inside src and base directories with non third-party code.
+    # The variable CMAKE_CXX_CLANG_TIDY will be set inside the following directories with non third-party code.
+    # - base
+    # - programs
+    # - src
+    # - utils
     # set (CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_PATH}")
 else ()
-    message(${RECONFIGURE_MESSAGE_LEVEL} "clang-tidy is not found")
+    message (${RECONFIGURE_MESSAGE_LEVEL} "clang-tidy is not found")
 endif ()
 endif ()
diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile
index 77afc3e924b..06c3c0d80f0 100644
--- a/docker/packager/binary/Dockerfile
+++ b/docker/packager/binary/Dockerfile
@@ -91,6 +91,9 @@ ENV PATH="$PATH:/usr/local/go/bin"
 ENV GOPATH=/workdir/go
 ENV GOCACHE=/workdir/
 
+RUN curl https://raw.githubusercontent.com/matus-chochlik/ctcache/7fd516e91c17779cbc6fc18bd119313d9532dd90/clang-tidy-cache -Lo /usr/bin/clang-tidy-cache \
+    && chmod +x /usr/bin/clang-tidy-cache
+
 RUN mkdir /workdir && chmod 777 /workdir
 WORKDIR /workdir
diff --git a/docker/packager/packager b/docker/packager/packager
index 83629dc7408..7f6bd8818fb 100755
--- a/docker/packager/packager
+++ b/docker/packager/packager
@@ -258,6 +258,10 @@ def parse_env_variables(
     if clang_tidy:
         # 15G is not enough for tidy build
         cache_maxsize = "25G"
+
+        # `CTCACHE_DIR` has the same purpose as the `CCACHE_DIR` above.
+        # It's there to have the clang-tidy cache embedded into our standard `CCACHE_DIR`
+        result.append("CTCACHE_DIR=/ccache/clang-tidy-cache")
     result.append(f"CCACHE_MAXSIZE={cache_maxsize}")
 
     if distcc_hosts:
@@ -282,9 +286,7 @@ def parse_env_variables(
         cmake_flags.append("-DENABLE_TESTS=1")
 
     if shared_libraries:
-        cmake_flags.append(
-            "-DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1"
-        )
+        cmake_flags.append("-DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1")
         # We can't always build utils because it requires too much space, but
         # we have to build them at least in some way in CI. The shared library
         # build is probably the least heavy disk-wise.
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index dfcef4ae200..7f7c14817ba 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -2939,7 +2939,7 @@ Possible values:
 - 0 — Projection optimization disabled.
 - 1 — Projection optimization enabled.
 
-Default value: `0`.
+Default value: `1`.
 
 ## force_optimize_projection {#force-optimize-projection}
diff --git a/docs/en/operations/troubleshooting.md b/docs/en/operations/troubleshooting.md
index 6a1ca3176ad..ad92e773ea3 100644
--- a/docs/en/operations/troubleshooting.md
+++ b/docs/en/operations/troubleshooting.md
@@ -28,18 +28,34 @@ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D7
 sudo apt-get update
 ```
 
-### You Get the Unsupported Architecture Warning with Apt-get {#you-get-the-unsupported-architecture-warning-with-apt-get}
+### You Get Different Warnings with `apt-get update` {#you-get-different-warnings-with-apt-get-update}
 
-- The completed warning message is as follows:
+- The completed warning messages look like one of the following:
 
 ```
 N: Skipping acquire of configured file 'main/binary-i386/Packages' as repository 'https://packages.clickhouse.com/deb stable InRelease' doesn't support architecture 'i386'
 ```
 
+```
+E: Failed to fetch https://packages.clickhouse.com/deb/dists/stable/main/binary-amd64/Packages.gz  File has unexpected size (30451 != 28154). Mirror sync in progress?
+```
+
+```
+E: Repository 'https://packages.clickhouse.com/deb stable InRelease' changed its 'Origin' value from 'Artifactory' to 'ClickHouse'
+E: Repository 'https://packages.clickhouse.com/deb stable InRelease' changed its 'Label' value from 'Artifactory' to 'ClickHouse'
+N: Repository 'https://packages.clickhouse.com/deb stable InRelease' changed its 'Suite' value from 'stable' to ''
+N: This must be accepted explicitly before updates for this repository can be applied. See apt-secure(8) manpage for details.
+```
+
+```
+Err:11 https://packages.clickhouse.com/deb stable InRelease
+  400  Bad Request [IP: 172.66.40.249 443]
+```
+
 To resolve the above issue, please use the following script:
 
 ```bash
-sudo rm /var/lib/apt/lists/packages.clickhouse.com_* /var/lib/dpkg/arch
+sudo rm /var/lib/apt/lists/packages.clickhouse.com_* /var/lib/dpkg/arch /var/lib/apt/lists/partial/packages.clickhouse.com_*
 sudo apt-get clean
 sudo apt-get autoclean
 ```
diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md
index a8ba4843279..e0418a81f14 100644
--- a/docs/en/sql-reference/functions/string-functions.md
+++ b/docs/en/sql-reference/functions/string-functions.md
@@ -571,13 +571,13 @@ Similar to base58Decode, but returns an empty string in case of error.
 
 ## base64Encode(s)
 
-Encodes ‘s’ string into base64
+Encodes ‘s’ FixedString or String into base64.
 
 Alias: `TO_BASE64`.
 
 ## base64Decode(s)
 
-Decode base64-encoded string ‘s’ into original string. In case of failure raises an exception.
+Decode base64-encoded FixedString or String ‘s’ into original string. In case of failure raises an exception.
 
 Alias: `FROM_BASE64`.
diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md
index adf2a07b732..d1f0e44f6b4 100644
--- a/docs/en/sql-reference/functions/string-replace-functions.md
+++ b/docs/en/sql-reference/functions/string-replace-functions.md
@@ -6,28 +6,29 @@ sidebar_label: For Replacing in Strings
 
 # Functions for Searching and Replacing in Strings
 
-:::note    
+:::note
 Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately.
 :::
 
 ## replaceOne(haystack, pattern, replacement)
 
-Replaces the first occurrence, if it exists, of the ‘pattern’ substring in ‘haystack’ with the ‘replacement’ substring.
-Hereafter, ‘pattern’ and ‘replacement’ must be constants.
+Replaces the first occurrence of the substring ‘pattern’ (if it exists) in ‘haystack’ by the ‘replacement’ string.
+‘pattern’ and ‘replacement’ must be constants.
 
 ## replaceAll(haystack, pattern, replacement), replace(haystack, pattern, replacement)
 
-Replaces all occurrences of the ‘pattern’ substring in ‘haystack’ with the ‘replacement’ substring.
+Replaces all occurrences of the substring ‘pattern’ in ‘haystack’ by the ‘replacement’ string.
 
 ## replaceRegexpOne(haystack, pattern, replacement)
 
-Replacement using the ‘pattern’ regular expression. A re2 regular expression.
-Replaces only the first occurrence, if it exists.
-A pattern can be specified as ‘replacement’. This pattern can include substitutions `\0-\9`.
-The substitution `\0` includes the entire regular expression. Substitutions `\1-\9` correspond to the subpattern numbers.To use the `\` character in a template, escape it using `\`.
-Also keep in mind that a string literal requires an extra escape.
+Replaces the first occurrence of the substring matching the regular expression ‘pattern’ in ‘haystack‘ by the ‘replacement‘ string.
+‘pattern‘ must be a constant [re2 regular expression](https://github.com/google/re2/wiki/Syntax).
+‘replacement’ must be a plain constant string or a constant string containing substitutions `\0-\9`.
+Substitutions `\1-\9` correspond to the 1st to 9th capturing group (submatch), substitution `\0` corresponds to the entire match.
+To use a verbatim `\` character in the ‘pattern‘ or ‘replacement‘ string, escape it using `\`.
+Also keep in mind that string literals require extra escaping.
 
-Example 1. Converting the date to American format:
+Example 1. Converting ISO dates to American format:
 
 ``` sql
 SELECT DISTINCT
@@ -62,7 +63,7 @@ SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0')
 
 ## replaceRegexpAll(haystack, pattern, replacement)
 
-This does the same thing, but replaces all the occurrences. Example:
+Like ‘replaceRegexpOne‘, but replaces all occurrences of the pattern. Example:
 
 ``` sql
 SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res
diff --git a/packages/clickhouse-client.yaml b/packages/clickhouse-client.yaml
index 459a09ee0b8..d4fd9300208 100644
--- a/packages/clickhouse-client.yaml
+++ b/packages/clickhouse-client.yaml
@@ -37,7 +37,7 @@ deb:
 contents:
 - src: root/etc/clickhouse-client/config.xml
   dst: /etc/clickhouse-client/config.xml
-  type: config
+  type: config|noreplace
 - src: root/usr/bin/clickhouse-benchmark
   dst: /usr/bin/clickhouse-benchmark
 - src: root/usr/bin/clickhouse-compressor
diff --git a/packages/clickhouse-keeper.yaml b/packages/clickhouse-keeper.yaml
index 8f319c97b65..f2095dda02a 100644
--- a/packages/clickhouse-keeper.yaml
+++ b/packages/clickhouse-keeper.yaml
@@ -29,7 +29,7 @@ deb:
 contents:
 - src: root/etc/clickhouse-keeper/keeper_config.xml
   dst: /etc/clickhouse-keeper/keeper_config.xml
-  type: config
+  type: config|noreplace
 - src: root/usr/bin/clickhouse-keeper
   dst: /usr/bin/clickhouse-keeper
 # docs
diff --git a/packages/clickhouse-server.yaml b/packages/clickhouse-server.yaml
index b0778e6bf72..fe59828ca43 100644
--- a/packages/clickhouse-server.yaml
+++ b/packages/clickhouse-server.yaml
@@ -44,10 +44,10 @@ deb:
 contents:
 - src: root/etc/clickhouse-server/config.xml
   dst: /etc/clickhouse-server/config.xml
-  type: config
+  type: config|noreplace
 - src: root/etc/clickhouse-server/users.xml
   dst: /etc/clickhouse-server/users.xml
-  type: config
+  type: config|noreplace
 - src: clickhouse-server.init
   dst: /etc/init.d/clickhouse-server
 - src: clickhouse-server.service
diff --git a/src/AggregateFunctions/AggregateFunctionAnalysisOfVariance.cpp b/src/AggregateFunctions/AggregateFunctionAnalysisOfVariance.cpp
index ffb651b3288..9ef2d295828 100644
--- a/src/AggregateFunctions/AggregateFunctionAnalysisOfVariance.cpp
+++ b/src/AggregateFunctions/AggregateFunctionAnalysisOfVariance.cpp
@@ -18,8 +18,10 @@ AggregateFunctionPtr createAggregateFunctionAnalysisOfVariance(const std::string
     assertNoParameters(name, parameters);
     assertBinary(name, arguments);
 
-    if (!isNumber(arguments[0]) || !isNumber(arguments[1]))
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} only supports numerical types", name);
+    if (!isNumber(arguments[0]))
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} only supports numerical argument types", name);
+    if (!WhichDataType(arguments[1]).isNativeUInt())
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument of aggregate function {} should be a native unsigned integer", name);
 
     return std::make_shared<AggregateFunctionAnalysisOfVariance>(arguments, parameters);
 }
diff --git a/src/AggregateFunctions/AggregateFunctionAnalysisOfVariance.h b/src/AggregateFunctions/AggregateFunctionAnalysisOfVariance.h
index efb6426a96c..e891fb191f6 100644
--- a/src/AggregateFunctions/AggregateFunctionAnalysisOfVariance.h
+++ b/src/AggregateFunctions/AggregateFunctionAnalysisOfVariance.h
@@ -77,7 +77,7 @@ public:
     void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
     {
         auto f_stat = data(place).getFStatistic();
-        if (std::isinf(f_stat) || isNaN(f_stat))
+        if (std::isinf(f_stat) || isNaN(f_stat) || f_stat < 0)
             throw Exception("F statistic is not defined or infinite for these arguments", ErrorCodes::BAD_ARGUMENTS);
 
         auto p_value = data(place).getPValue(f_stat);
diff --git a/src/AggregateFunctions/Moments.h b/src/AggregateFunctions/Moments.h
index 16279cb93a4..2dfd5bc46d6 100644
--- a/src/AggregateFunctions/Moments.h
+++ b/src/AggregateFunctions/Moments.h
@@ -482,6 +482,8 @@ struct ZTestMoments
 template <typename T>
 struct AnalysisOfVarianceMoments
 {
+    constexpr static size_t MAX_GROUPS_NUMBER = 1024 * 1024;
+
     /// Sums of values within a group
     std::vector<T> xs1{};
     /// Sums of squared values within a group
@@ -494,6 +496,10 @@ struct AnalysisOfVarianceMoments
         if (xs1.size() >= possible_size)
             return;
 
+        if (possible_size > MAX_GROUPS_NUMBER)
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Too many groups for analysis of variance (should be no more than {}, got {})",
+                MAX_GROUPS_NUMBER, possible_size);
+
         xs1.resize(possible_size, 0.0);
         xs2.resize(possible_size, 0.0);
         ns.resize(possible_size, 0);
diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp
index 12038a8a30c..f7f7643a6e3 100644
--- a/src/Backups/BackupIO_S3.cpp
+++ b/src/Backups/BackupIO_S3.cpp
@@ -126,6 +126,7 @@ BackupWriterS3::BackupWriterS3(
     , max_single_read_retries(context_->getSettingsRef().s3_max_single_read_retries)
     , read_settings(context_->getReadSettings())
     , rw_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).rw_settings)
+    , log(&Poco::Logger::get("BackupWriterS3"))
 {
     rw_settings.updateFromSettingsIfEmpty(context_->getSettingsRef());
 }
@@ -146,9 +147,12 @@ void BackupWriterS3::copyObjectImpl(
     const String & src_key,
     const String & dst_bucket,
     const String & dst_key,
-    std::optional<Aws::S3::Model::HeadObjectResult> head,
-    std::optional<ObjectAttributes> metadata) const
+    const Aws::S3::Model::HeadObjectResult & head,
+    const std::optional<ObjectAttributes> & metadata) const
 {
+    size_t size = head.GetContentLength();
+    LOG_TRACE(log, "Copying {} bytes using single-operation copy", size);
+
     Aws::S3::Model::CopyObjectRequest request;
     request.SetCopySource(src_bucket + "/" + src_key);
     request.SetBucket(dst_bucket);
@@ -186,13 +190,11 @@ void BackupWriterS3::copyObjectMultipartImpl(
     const String & src_key,
     const String & dst_bucket,
     const String & dst_key,
-    std::optional<Aws::S3::Model::HeadObjectResult> head,
-    std::optional<ObjectAttributes> metadata) const
+    const Aws::S3::Model::HeadObjectResult & head,
+    const std::optional<ObjectAttributes> & metadata) const
 {
-    if (!head)
-        head = requestObjectHeadData(src_bucket, src_key).GetResult();
-
-    size_t size = head->GetContentLength();
+    size_t size = head.GetContentLength();
+    LOG_TRACE(log, "Copying {} bytes using multipart upload copy", size);
 
     String multipart_upload_id;
 
@@ -213,16 +215,20 @@ void BackupWriterS3::copyObjectMultipartImpl(
 
     std::vector<String> part_tags;
 
+    size_t position = 0;
     size_t upload_part_size = rw_settings.min_upload_part_size;
-    for (size_t position = 0, part_number = 1; position < size; ++part_number, position += upload_part_size)
+
+    for (size_t part_number = 1; position < size; ++part_number)
     {
+        size_t next_position = std::min(position + upload_part_size, size);
+
         Aws::S3::Model::UploadPartCopyRequest part_request;
         part_request.SetCopySource(src_bucket + "/" + src_key);
         part_request.SetBucket(dst_bucket);
         part_request.SetKey(dst_key);
         part_request.SetUploadId(multipart_upload_id);
         part_request.SetPartNumber(static_cast<int>(part_number));
-        part_request.SetCopySourceRange(fmt::format("bytes={}-{}", position, std::min(size, position + upload_part_size) - 1));
+        part_request.SetCopySourceRange(fmt::format("bytes={}-{}", position, next_position - 1));
 
         auto outcome = client->UploadPartCopy(part_request);
         if (!outcome.IsSuccess())
@@ -239,6 +245,14 @@ void BackupWriterS3::copyObjectMultipartImpl(
 
         auto etag = outcome.GetResult().GetCopyPartResult().GetETag();
         part_tags.push_back(etag);
+
+        position = next_position;
+
+        if (part_number % rw_settings.upload_part_size_multiply_parts_count_threshold == 0)
+        {
+            upload_part_size *= rw_settings.upload_part_size_multiply_factor;
+            upload_part_size = std::min(upload_part_size, rw_settings.max_upload_part_size);
+        }
     }
 
     {
@@ -280,15 +294,14 @@ void BackupWriterS3::copyFileNative(DiskPtr from_disk, const String & file_name_
         auto file_path = fs::path(s3_uri.key) / file_name_to;
         auto head = requestObjectHeadData(source_bucket, objects[0].absolute_path).GetResult();
-        static constexpr int64_t multipart_upload_threashold = 5UL * 1024 * 1024 * 1024;
 
-        if (head.GetContentLength() >= multipart_upload_threashold)
+        if (static_cast<size_t>(head.GetContentLength()) < rw_settings.max_single_operation_copy_size)
         {
-            copyObjectMultipartImpl(
+            copyObjectImpl(
                 source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head);
         }
         else
         {
-            copyObjectImpl(
+            copyObjectMultipartImpl(
                 source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head);
         }
     }
diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h
index 471ddcc06e6..b52de23e262 100644
--- a/src/Backups/BackupIO_S3.h
+++ b/src/Backups/BackupIO_S3.h
@@ -61,7 +61,6 @@ public:
     void copyFileNative(DiskPtr from_disk, const String & file_name_from, const String & file_name_to) override;
 
 private:
-
     Aws::S3::Model::HeadObjectOutcome requestObjectHeadData(const std::string & bucket_from, const std::string & key) const;
 
     void copyObjectImpl(
@@ -69,22 +68,23 @@ private:
         const String & src_key,
         const String & dst_bucket,
         const String & dst_key,
-        std::optional<Aws::S3::Model::HeadObjectResult> head = std::nullopt,
-        std::optional<ObjectAttributes> metadata = std::nullopt) const;
+        const Aws::S3::Model::HeadObjectResult & head,
+        const std::optional<ObjectAttributes> & metadata = std::nullopt) const;
 
     void copyObjectMultipartImpl(
         const String & src_bucket,
         const String & src_key,
         const String & dst_bucket,
         const String & dst_key,
-        std::optional<Aws::S3::Model::HeadObjectResult> head = std::nullopt,
-        std::optional<ObjectAttributes> metadata = std::nullopt) const;
+        const Aws::S3::Model::HeadObjectResult & head,
+        const std::optional<ObjectAttributes> & metadata = std::nullopt) const;
 
     S3::URI s3_uri;
     std::shared_ptr<Aws::S3::S3Client> client;
     UInt64 max_single_read_retries;
     ReadSettings read_settings;
     S3Settings::ReadWriteSettings rw_settings;
+    Poco::Logger * log;
 };
 
 }
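Editor's note on the copy-path change above: the backup writer now dispatches on object size — a single atomic `CopyObject` while the object fits in `max_single_operation_copy_size`, otherwise a ranged multipart upload copy with a growing part size. The following is a standalone illustrative sketch of that dispatch, not the ClickHouse/AWS SDK code; all names and the printed output are made up:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Sketch: pick single-operation vs. multipart copy by object size.
void copyObject(uint64_t size, uint64_t max_single_operation_copy_size, uint64_t part_size)
{
    if (size < max_single_operation_copy_size)
    {
        std::printf("single-operation copy of %llu bytes\n", static_cast<unsigned long long>(size));
        return;
    }

    // Multipart path: each part copies the half-open range [position, next_position).
    uint64_t position = 0;
    for (int part_number = 1; position < size; ++part_number)
    {
        uint64_t next_position = std::min(position + part_size, size);
        // S3 CopySourceRange uses inclusive byte ranges, hence next_position - 1.
        std::printf("part %d: bytes=%llu-%llu\n", part_number,
                    static_cast<unsigned long long>(position),
                    static_cast<unsigned long long>(next_position - 1));
        position = next_position;
    }
}

int main()
{
    copyObject(3, 5, 2);  // below the threshold: one single-operation copy
    copyObject(10, 5, 4); // at/above the threshold: parts 0-3, 4-7, 8-9
}
```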
diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index 2f801e496fa..e30a6bb6aaf 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -12,6 +12,7 @@
     M(FailedQuery, "Number of failed queries.") \
     M(FailedSelectQuery, "Same as FailedQuery, but only for SELECT queries.") \
     M(FailedInsertQuery, "Same as FailedQuery, but only for INSERT queries.") \
+    M(FailedAsyncInsertQuery, "Number of failed ASYNC INSERT queries.") \
     M(QueryTimeMicroseconds, "Total time of all queries.") \
     M(SelectQueryTimeMicroseconds, "Total time of SELECT queries.") \
    M(InsertQueryTimeMicroseconds, "Total time of INSERT queries.") \
diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h
index d7e4ea3486d..ef340a33149 100644
--- a/src/Functions/FunctionBase64Conversion.h
+++ b/src/Functions/FunctionBase64Conversion.h
@@ -2,21 +2,19 @@
 #include "config.h"
 
 #if USE_BASE64
-#    include <Columns/ColumnConst.h>
-#    include <Common/MemorySanitizer.h>
+#    include <Columns/ColumnFixedString.h>
 #    include <Columns/ColumnString.h>
 #    include <DataTypes/DataTypeString.h>
-#    include <Functions/FunctionFactory.h>
 #    include <Functions/FunctionHelpers.h>
-#    include <Functions/GatherUtils/Algorithms.h>
-#    include <IO/WriteHelpers.h>
+#    include <base/types.h>
+#    include <span>
 #    include <turbob64.h>
+#    include <Common/MemorySanitizer.h>
+#    include <IO/WriteHelpers.h>
 
 namespace DB
 {
-using namespace GatherUtils;
-
 namespace ErrorCodes
 {
     extern const int BAD_ARGUMENTS;
@@ -25,33 +23,86 @@ namespace ErrorCodes
     extern const int INCORRECT_DATA;
 }
 
+namespace Detail
+{
+    inline size_t base64Decode(const std::span<const UInt8> src, UInt8 * dst)
+    {
+#    if defined(__aarch64__)
+        return tb64sdec(reinterpret_cast<const unsigned char *>(src.data()), src.size(), reinterpret_cast<unsigned char *>(dst));
+#    else
+        return _tb64d(reinterpret_cast<const unsigned char *>(src.data()), src.size(), reinterpret_cast<unsigned char *>(dst));
+#    endif
+    }
+}
+
 struct Base64Encode
 {
     static constexpr auto name = "base64Encode";
 
-    static size_t getBufferSize(size_t string_length, size_t string_count)
+    static size_t getBufferSize(const size_t string_length, const size_t string_count)
     {
         return ((string_length - string_count) / 3 + string_count) * 4 + string_count;
     }
+
+    static size_t performCoding(const std::span<const UInt8> src, UInt8 * dst)
+    {
+        /*
+         * Some bug in sse arm64 implementation?
+         * `base64Encode(repeat('a', 46))` returns wrong padding character
+         */
+#    if defined(__aarch64__)
+        return tb64senc(reinterpret_cast<const unsigned char *>(src.data()), src.size(), reinterpret_cast<unsigned char *>(dst));
+#    else
+        return _tb64e(reinterpret_cast<const unsigned char *>(src.data()), src.size(), reinterpret_cast<unsigned char *>(dst));
+#    endif
+    }
 };
 
 struct Base64Decode
 {
     static constexpr auto name = "base64Decode";
 
-    static size_t getBufferSize(size_t string_length, size_t string_count)
+    static size_t getBufferSize(const size_t string_length, const size_t string_count)
     {
         return ((string_length - string_count) / 4 + string_count) * 3 + string_count;
     }
+
+    static size_t performCoding(const std::span<const UInt8> src, UInt8 * dst)
+    {
+        const auto outlen = Detail::base64Decode(src, dst);
+        if (src.size() > 0 && !outlen)
+            throw Exception(
+                ErrorCodes::INCORRECT_DATA,
+                "Failed to {} input '{}'",
+                name,
+                String(reinterpret_cast<const char *>(src.data()), src.size()));
+
+        return outlen;
+    }
 };
 
 struct TryBase64Decode
 {
     static constexpr auto name = "tryBase64Decode";
 
-    static size_t getBufferSize(size_t string_length, size_t string_count)
+    static size_t getBufferSize(const size_t string_length, const size_t string_count)
     {
         return Base64Decode::getBufferSize(string_length, string_count);
     }
+
+    static size_t performCoding(const std::span<const UInt8> src, UInt8 * dst)
+    {
+        if (src.empty())
+            return 0;
+
+        const auto outlen = Detail::base64Decode(src, dst);
+        // during decoding character array can be partially polluted
+        // if fail, revert back and clean
+        if (!outlen)
+            *dst = 0;
+
+        return outlen;
+    }
 };
 
 template <typename Func>
@@ -71,99 +122,60 @@ public:
         if (arguments.size() != 1)
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong number of arguments for function {}: 1 expected.", getName());
 
-        if (!WhichDataType(arguments[0].type).isString())
+        if (!WhichDataType(arguments[0].type).isStringOrFixedString())
             throw Exception(
                 ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
-                "Illegal type {} of 1st argument of function {}. Must be String.",
-                arguments[0].type->getName(), getName());
+                "Illegal type {} of 1st argument of function {}. Must be FixedString or String.",
+                arguments[0].type->getName(),
+                getName());
 
         return std::make_shared<DataTypeString>();
     }
 
-    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, const size_t input_rows_count) const override
     {
-        const ColumnPtr column_string = arguments[0].column;
-        const ColumnString * input = checkAndGetColumn<ColumnString>(column_string.get());
+        const auto & input_column = arguments[0].column;
+        if (const auto * src_column_as_fixed_string = checkAndGetColumn<ColumnFixedString>(*input_column))
+            return execute(*src_column_as_fixed_string, input_rows_count);
+        else if (const auto * src_column_as_string = checkAndGetColumn<ColumnString>(*input_column))
+            return execute(*src_column_as_string, input_rows_count);
 
-        if (!input)
-            throw Exception(
-                ErrorCodes::ILLEGAL_COLUMN,
-                "Illegal column {} of first argument of function {}, must be of type String",
-                arguments[0].column->getName(), getName());
+        throw Exception(
+            ErrorCodes::ILLEGAL_COLUMN,
+            "Illegal column {} of first argument of function {}, must be of type FixedString or String.",
+            input_column->getName(),
+            getName());
+    }
 
+private:
+    static ColumnPtr execute(const ColumnString & src_column, const size_t src_row_count)
+    {
         auto dst_column = ColumnString::create();
-        auto & dst_data = dst_column->getChars();
+        auto & dst_chars = dst_column->getChars();
         auto & dst_offsets = dst_column->getOffsets();
 
-        size_t reserve = Func::getBufferSize(input->getChars().size(), input->size());
-        dst_data.resize(reserve);
-        dst_offsets.resize(input_rows_count);
+        const auto reserve = Func::getBufferSize(src_column.byteSize(), src_column.size());
+        dst_chars.resize(reserve);
+        dst_offsets.resize(src_row_count);
 
-        const ColumnString::Offsets & src_offsets = input->getOffsets();
+        const auto & src_chars = src_column.getChars();
+        const auto & src_offsets = src_column.getOffsets();
 
-        const auto * source = input->getChars().data();
-        auto * dst = dst_data.data();
+        auto * dst = dst_chars.data();
         auto * dst_pos = dst;
+        const auto * src = src_chars.data();
 
         size_t src_offset_prev = 0;
-
-        for (size_t row = 0; row < input_rows_count; ++row)
+        for (size_t row = 0; row < src_row_count; ++row)
         {
-            size_t srclen = src_offsets[row] - src_offset_prev - 1;
-            size_t outlen = 0;
-
-            if constexpr (std::is_same_v<Func, Base64Encode>)
-            {
-                /*
-                 * Some bug in sse arm64 implementation?
-                 * `base64Encode(repeat('a', 46))` returns wrong padding character
-                 */
-#if defined(__aarch64__)
-                outlen = tb64senc(reinterpret_cast<const unsigned char *>(source), srclen, reinterpret_cast<unsigned char *>(dst_pos));
-#else
-                outlen = _tb64e(reinterpret_cast<const unsigned char *>(source), srclen, reinterpret_cast<unsigned char *>(dst_pos));
-#endif
-            }
-            else if constexpr (std::is_same_v<Func, Base64Decode>)
-            {
-                if (srclen > 0)
-                {
-#if defined(__aarch64__)
-                    outlen = tb64sdec(reinterpret_cast<const unsigned char *>(source), srclen, reinterpret_cast<unsigned char *>(dst_pos));
-#else
-                    outlen = _tb64d(reinterpret_cast<const unsigned char *>(source), srclen, reinterpret_cast<unsigned char *>(dst_pos));
-#endif
-
-                    if (!outlen)
-                        throw Exception(
-                            ErrorCodes::INCORRECT_DATA,
-                            "Failed to {} input '{}'",
-                            getName(), String(reinterpret_cast<const char *>(source), srclen));
-                }
-            }
-            else
-            {
-                if (srclen > 0)
-                {
-                    // during decoding character array can be partially polluted
-                    // if fail, revert back and clean
-                    auto * savepoint = dst_pos;
-                    outlen = _tb64d(reinterpret_cast<const unsigned char *>(source), srclen, reinterpret_cast<unsigned char *>(dst_pos));
-                    if (!outlen)
-                    {
-                        outlen = 0;
-                        dst_pos = savepoint; //-V1048
-                        // clean the symbol
-                        dst_pos[0] = 0;
-                    }
-                }
-            }
+            const size_t src_length = src_offsets[row] - src_offset_prev - 1;
+            const auto outlen = Func::performCoding({src, src_length}, dst_pos);
 
             /// Base64 library is using AVX-512 with some shuffle operations.
             /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle.
             __msan_unpoison(dst_pos, outlen);
 
-            source += srclen + 1;
+            src += src_length + 1;
             dst_pos += outlen;
             *dst_pos = '\0';
             dst_pos += 1;
@@ -172,8 +184,44 @@ public:
             src_offset_prev = src_offsets[row];
         }
 
-        dst_data.resize(dst_pos - dst);
+        dst_chars.resize(dst_pos - dst);
+        return dst_column;
+    }
 
+    static ColumnPtr execute(const ColumnFixedString & src_column, const size_t src_row_count)
+    {
+        auto dst_column = ColumnString::create();
+        auto & dst_chars = dst_column->getChars();
+        auto & dst_offsets = dst_column->getOffsets();
+
+        const auto reserve = Func::getBufferSize(src_column.byteSize(), src_column.size());
+        dst_chars.resize(reserve);
+        dst_offsets.resize(src_row_count);
+
+        const auto & src_chars = src_column.getChars();
+        const auto & src_n = src_column.getN();
+
+        auto * dst = dst_chars.data();
+        auto * dst_pos = dst;
+        const auto * src = src_chars.data();
+
+        for (size_t row = 0; row < src_row_count; ++row)
+        {
+            const auto outlen = Func::performCoding({src, src_n}, dst_pos);
+
+            /// Base64 library is using AVX-512 with some shuffle operations.
+            /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle.
+            __msan_unpoison(dst_pos, outlen);
+
+            src += src_n;
+            dst_pos += outlen;
+            *dst_pos = '\0';
+            dst_pos += 1;
+
+            dst_offsets[row] = dst_pos - dst;
+        }
+
+        dst_chars.resize(dst_pos - dst);
         return dst_column;
     }
 };
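Editor's note on the `getBufferSize` formulas above: they deliberately over-reserve, because base64 expands every 3 input bytes to 4 output bytes (rounded up per row by padding) and each result row gets a trailing zero byte. A minimal sketch of the encode-side arithmetic, assuming — as with ClickHouse String columns — that the input length already counts one terminating zero byte per row (the `- string_count` term); the function name here is illustrative:

```cpp
#include <cassert>
#include <cstddef>

// Upper bound on the bytes needed to base64-encode `string_count` rows whose
// concatenated storage (payload + one zero byte per row) is `string_length`.
size_t encodeBufferSize(size_t string_length, size_t string_count)
{
    return ((string_length - string_count) / 3 + string_count) * 4 + string_count;
}

int main()
{
    // One row "abcd" is stored as 5 bytes (4 payload + 1 zero byte); its base64
    // form "YWJjZA==" needs 8 bytes plus a zero byte, and the bound reserves 9.
    assert(encodeBufferSize(5, 1) >= 9);
}
```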
diff --git a/src/Functions/FunctionStringReplace.h b/src/Functions/FunctionStringReplace.h
index 09aa5586929..f90eac2e7f3 100644
--- a/src/Functions/FunctionStringReplace.h
+++ b/src/Functions/FunctionStringReplace.h
@@ -38,18 +38,21 @@ public:
     {
         if (!isStringOrFixedString(arguments[0]))
             throw Exception(
-                "Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
-                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Illegal type {} of first argument of function {}",
+                arguments[0]->getName(), getName());
 
         if (!isStringOrFixedString(arguments[1]))
             throw Exception(
-                "Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
-                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Illegal type {} of second argument of function {}",
+                arguments[1]->getName(), getName());
 
         if (!isStringOrFixedString(arguments[2]))
             throw Exception(
-                "Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(),
-                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Illegal type {} of third argument of function {}",
+                arguments[2]->getName(), getName());
 
         return std::make_shared<DataTypeString>();
     }
@@ -61,7 +64,10 @@ public:
         const ColumnPtr column_replacement = arguments[2].column;
 
         if (!isColumnConst(*column_needle) || !isColumnConst(*column_replacement))
-            throw Exception("2nd and 3rd arguments of function " + getName() + " must be constants.", ErrorCodes::ILLEGAL_COLUMN);
+            throw Exception(
+                ErrorCodes::ILLEGAL_COLUMN,
+                "2nd and 3rd arguments of function {} must be constants.",
+                getName());
 
         const IColumn * c1 = arguments[1].column.get();
         const IColumn * c2 = arguments[2].column.get();
@@ -71,7 +77,9 @@ public:
         String replacement = c2_const->getValue<String>();
 
         if (needle.empty())
-            throw Exception("Length of the second argument of function replace must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
+            throw Exception(
+                ErrorCodes::ARGUMENT_OUT_OF_BOUND,
+                "Length of the second argument of function replace must be greater than 0.");
 
         if (const ColumnString * col = checkAndGetColumn<ColumnString>(column_src.get()))
         {
@@ -87,8 +95,9 @@ public:
         }
         else
             throw Exception(
-                "Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName(),
-                ErrorCodes::ILLEGAL_COLUMN);
+                ErrorCodes::ILLEGAL_COLUMN,
+                "Illegal column {} of first argument of function {}",
+                arguments[0].column->getName(), getName());
     }
 };
diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h
index 8e355405093..18b69606359 100644
--- a/src/Functions/MultiMatchAllIndicesImpl.h
+++ b/src/Functions/MultiMatchAllIndicesImpl.h
@@ -91,7 +91,7 @@ struct MultiMatchAllIndicesImpl
         hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch);
 
         if (err != HS_SUCCESS)
-            throw Exception("Could not clone scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
+            throw Exception("Could not clone scratch space for vectorscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
 
         MultiRegexps::ScratchPtr smart_scratch(scratch);
 
@@ -203,7 +203,7 @@ struct MultiMatchAllIndicesImpl
         hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch);
 
         if (err != HS_SUCCESS)
-            throw Exception("Could not clone scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
+            throw Exception("Could not clone scratch space for vectorscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
 
         MultiRegexps::ScratchPtr smart_scratch(scratch);
diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h
index c1ff83d04fe..0380e8d1750 100644
--- a/src/Functions/Regexps.h
+++ b/src/Functions/Regexps.h
@@ -38,6 +38,7 @@ namespace ErrorCodes
 
 namespace Regexps
 {
+
 using Regexp = OptimizedRegularExpressionSingleThreaded;
 using RegexpPtr = std::shared_ptr<Regexp>;
 
@@ -112,11 +113,11 @@ struct HyperscanDeleter
 };
 
 /// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception.
-using CompilerError = std::unique_ptr<hs_compile_error_t, HyperscanDeleter<hs_compile_error_t *, &hs_free_compile_error>>;
+using CompilerErrorPtr = std::unique_ptr<hs_compile_error_t, HyperscanDeleter<hs_compile_error_t *, &hs_free_compile_error>>;
 using ScratchPtr = std::unique_ptr<hs_scratch_t, HyperscanDeleter<hs_scratch_t *, &hs_free_scratch>>;
 using DataBasePtr = std::unique_ptr<hs_database_t, HyperscanDeleter<hs_database_t *, &hs_free_database>>;
 
-/// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher.
+/// Database is immutable/thread-safe across multiple threads. Scratch is not but we can copy it whenever we use it in the searcher.
 class Regexps
 {
 public:
@@ -154,7 +155,7 @@ private:
 
 using DeferredConstructedRegexpsPtr = std::shared_ptr<DeferredConstructedRegexps>;
 
-template <bool WithEditDistance>
+template <bool with_edit_distance>
 inline Regexps constructRegexps(const std::vector<String> & str_patterns, [[maybe_unused]] std::optional<UInt32> edit_distance)
 {
     /// Common pointers
@@ -168,7 +169,7 @@ inline Regexps constructRegexps(const std::vector<String> & str_patterns, [[maybe_unused]] std::optional<UInt32> edit_distance)
     patterns.reserve(str_patterns.size());
     flags.reserve(str_patterns.size());
 
-    if constexpr (WithEditDistance)
+    if constexpr (with_edit_distance)
     {
         ext_exprs.reserve(str_patterns.size());
         ext_exprs_ptrs.reserve(str_patterns.size());
@@ -186,7 +187,7 @@ inline Regexps constructRegexps(const std::vector<String> & str_patterns, [[maybe_unused]] std::optional<UInt32> edit_distance)
          * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag
          */
         flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8);
-        if constexpr (WithEditDistance)
+        if constexpr (with_edit_distance)
         {
             /// Hyperscan currently does not support UTF8 matching with edit distance.
             flags.back() &= ~HS_FLAG_UTF8;
@@ -211,7 +212,7 @@ inline Regexps constructRegexps(const std::vector<String> & str_patterns, [[maybe_unused]] std::optional<UInt32> edit_distance)
     }
 
     hs_error_t err;
-    if constexpr (!WithEditDistance)
+    if constexpr (!with_edit_distance)
         err = hs_compile_multi(
             patterns.data(),
             flags.data(),
@@ -236,7 +237,7 @@ inline Regexps constructRegexps(const std::vector<String> & str_patterns, [[maybe_unused]] std::optional<UInt32> edit_distance)
     if (err != HS_SUCCESS)
     {
         /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
-        CompilerError error(compile_error);
+        CompilerErrorPtr error(compile_error);
 
         if (error->expression < 0)
             throw Exception(ErrorCodes::LOGICAL_ERROR, String(error->message));
@@ -253,7 +254,7 @@ inline Regexps constructRegexps(const std::vector<String> & str_patterns, [[maybe_unused]] std::optional<UInt32> edit_distance)
 
     /// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
     if (err != HS_SUCCESS)
-        throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not allocate scratch space for hyperscan");
+        throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not allocate scratch space for vectorscan");
 
     return {db, scratch};
 }
@@ -288,9 +289,9 @@ struct GlobalCacheTable
     }
 };
 
-/// If WithEditDistance is False, edit_distance must be nullopt. Also, we use templates here because each instantiation of function template
+/// If with_edit_distance is False, edit_distance must be nullopt. Also, we use templates here because each instantiation of function template
 /// has its own copy of local static variables which must not be the same for different hyperscan compilations.
-template <bool WithEditDistance>
+template <bool with_edit_distance>
 inline DeferredConstructedRegexpsPtr getOrSet(const std::vector<std::string_view> & patterns, std::optional<UInt32> edit_distance)
 {
     static GlobalCacheTable pool; /// Different variables for different pattern parameters, thread-safe in C++11
@@ -320,7 +321,7 @@ inline DeferredConstructedRegexpsPtr getOrSet(const std::vector<std::string_view> & patterns, std::optional<UInt32> edit_distance)
             auto deferred_constructed_regexps = std::make_shared<DeferredConstructedRegexps>(
                 [str_patterns, edit_distance]()
                 {
-                    return constructRegexps<WithEditDistance>(str_patterns, edit_distance);
+                    return constructRegexps<with_edit_distance>(str_patterns, edit_distance);
                 });
             bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
         }
@@ -331,7 +332,7 @@ inline DeferredConstructedRegexpsPtr getOrSet(const std::vector<std::string_view> & patterns, std::optional<UInt32> edit_distance)
             auto deferred_constructed_regexps = std::make_shared<DeferredConstructedRegexps>(
                 [str_patterns, edit_distance]()
                 {
-                    return constructRegexps<WithEditDistance>(str_patterns, edit_distance);
+                    return constructRegexps<with_edit_distance>(str_patterns, edit_distance);
                 });
             bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
         }
diff --git a/src/Functions/ReplaceRegexpImpl.h b/src/Functions/ReplaceRegexpImpl.h
index a1d17ce9da1..88d7a40d2dd 100644
--- a/src/Functions/ReplaceRegexpImpl.h
+++ b/src/Functions/ReplaceRegexpImpl.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <base/types.h>
-#include <base/defines.h>
 #include <Columns/ColumnString.h>
 #include <IO/WriteHelpers.h>
 
@@ -17,131 +16,130 @@ namespace ErrorCodes
     extern const int BAD_ARGUMENTS;
 }
 
+struct ReplaceRegexpTraits
+{
+    enum class Replace
+    {
+        First,
+        All
+    };
+};
 
 /** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants.
-  * 'replacement' could contain substitutions, for example: '\2-\3-\1'
+  * 'replacement' can contain substitutions, for example: '\2-\3-\1'
   */
-template <bool replace_one = false>
+template <ReplaceRegexpTraits::Replace replace>
 struct ReplaceRegexpImpl
 {
-    /// Sequence of instructions, describing how to get resulting string.
     struct Instruction
     {
-        /// If not negative - perform substitution of n-th subpattern from the regexp match.
+        /// If not negative, perform substitution of n-th subpattern from the regexp match.
         int substitution_num = -1;
-        /// Otherwise - paste this string verbatim.
-        std::string literal;
+        /// Otherwise, paste this literal string verbatim.
+        String literal;
 
-        Instruction(int substitution_num_) : substitution_num(substitution_num_) {} /// NOLINT
-        Instruction(std::string literal_) : literal(std::move(literal_)) {} /// NOLINT
+        explicit Instruction(int substitution_num_) : substitution_num(substitution_num_) {}
+        explicit Instruction(String literal_) : literal(std::move(literal_)) {}
     };
 
+    /// Decomposes the replacement string into a sequence of substitutions and literals.
+    /// E.g. "abc\1de\2fg\1\2" --> inst("abc"), inst(1), inst("de"), inst(2), inst("fg"), inst(1), inst(2)
     using Instructions = std::vector<Instruction>;
 
-    static const size_t max_captures = 10;
-
-    static Instructions createInstructions(const std::string & s, int num_captures)
+    static constexpr int max_captures = 10;
+
+    static Instructions createInstructions(std::string_view replacement, int num_captures)
     {
         Instructions instructions;
 
-        String now;
-        for (size_t i = 0; i < s.size(); ++i)
+        String literals;
+        for (size_t i = 0; i < replacement.size(); ++i)
         {
-            if (s[i] == '\\' && i + 1 < s.size())
+            if (replacement[i] == '\\' && i + 1 < replacement.size())
             {
-                if (isNumericASCII(s[i + 1])) /// Substitution
+                if (isNumericASCII(replacement[i + 1])) /// Substitution
                 {
-                    if (!now.empty())
+                    if (!literals.empty())
                     {
-                        instructions.emplace_back(now);
-                        now = "";
+                        instructions.emplace_back(literals);
+                        literals = "";
                     }
-                    instructions.emplace_back(s[i + 1] - '0');
+                    instructions.emplace_back(replacement[i + 1] - '0');
                 }
                 else
-                    now += s[i + 1]; /// Escaping
+                    literals += replacement[i + 1]; /// Escaping
                 ++i;
             }
             else
-                now += s[i]; /// Plain character
+                literals += replacement[i]; /// Plain character
         }
 
-        if (!now.empty())
-        {
-            instructions.emplace_back(now);
-            now = "";
-        }
+        if (!literals.empty())
+            instructions.emplace_back(literals);
 
-        for (const auto & it : instructions)
-            if (it.substitution_num >= num_captures)
-                throw Exception(ErrorCodes::BAD_ARGUMENTS,
-                    "Invalid replace instruction in replacement string. Id: {}, but regexp has only {} subpatterns",
-                    it.substitution_num, num_captures - 1);
+        for (const auto & instr : instructions)
+            if (instr.substitution_num >= num_captures)
+                throw Exception(
+                    ErrorCodes::BAD_ARGUMENTS,
+                    "Id {} in replacement string is an invalid substitution, regexp has only {} capturing groups",
+                    instr.substitution_num, num_captures - 1);
 
         return instructions;
     }
 
     static void processString(
-        const re2_st::StringPiece & input,
+        const char * haystack_data,
+        size_t haystack_length,
         ColumnString::Chars & res_data,
         ColumnString::Offset & res_offset,
-        re2_st::RE2 & searcher,
+        const re2_st::RE2 & searcher,
         int num_captures,
         const Instructions & instructions)
     {
+        re2_st::StringPiece haystack(haystack_data, haystack_length);
         re2_st::StringPiece matches[max_captures];
 
         size_t copy_pos = 0;
         size_t match_pos = 0;
 
-        while (match_pos < static_cast<size_t>(input.length()))
+        while (match_pos < haystack_length)
        {
             /// If no more replacements possible for current string
             bool can_finish_current_string = false;
 
-            if (searcher.Match(input, match_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures))
+            if (searcher.Match(haystack, match_pos, haystack_length, re2_st::RE2::Anchor::UNANCHORED, matches, num_captures))
             {
-                const auto & match = matches[0];
-                size_t bytes_to_copy = (match.data() - input.data()) - copy_pos;
+                const auto & match = matches[0]; /// Complete match (\0)
+                size_t bytes_to_copy = (match.data() - haystack.data()) - copy_pos;
 
-                /// Copy prefix before matched regexp without modification
+                /// Copy prefix before current match without modification
                 res_data.resize(res_data.size() + bytes_to_copy);
-                memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + copy_pos, bytes_to_copy);
+                memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], haystack.data() + copy_pos, bytes_to_copy);
                 res_offset += bytes_to_copy;
                 copy_pos += bytes_to_copy + match.length();
                 match_pos = copy_pos;
 
-                /// Do substitution instructions
-                for (const auto & it : instructions)
+                /// Substitute inside current match using instructions
+                for (const auto & instr : instructions)
                 {
-                    if (it.substitution_num >= 0)
-                    {
-                        const auto & substitution = matches[it.substitution_num];
-
-                        res_data.resize(res_data.size() + substitution.length());
-                        memcpy(&res_data[res_offset], substitution.data(), substitution.length());
-                        res_offset += substitution.length();
-                    }
+                    std::string_view replacement;
+                    if (instr.substitution_num >= 0)
+                        replacement = std::string_view(matches[instr.substitution_num].data(), matches[instr.substitution_num].size());
                     else
-                    {
-                        const auto & literal = it.literal;
-
-                        res_data.resize(res_data.size() + literal.size());
-                        memcpy(&res_data[res_offset], literal.data(), literal.size());
-                        res_offset += literal.size();
-                    }
+                        replacement = instr.literal;
+                    res_data.resize(res_data.size() + replacement.size());
+                    memcpy(&res_data[res_offset], replacement.data(), replacement.size());
+                    res_offset += replacement.size();
                 }
 
-                if (replace_one)
+                if constexpr (replace == ReplaceRegexpTraits::Replace::First)
                     can_finish_current_string = true;
 
-                if (match.length() == 0)
+                if (match.empty())
                 {
                     /// Step one character to avoid infinite loop
                     ++match_pos;
-                    if (match_pos >= static_cast<size_t>(input.length()))
+                    if (match_pos >= haystack_length)
                         can_finish_current_string = true;
                 }
             }
@@ -151,10 +149,10 @@ struct ReplaceRegexpImpl
             /// If ready, append suffix after match to end of string.
             if (can_finish_current_string)
             {
-                res_data.resize(res_data.size() + input.length() - copy_pos);
-                memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + copy_pos, input.length() - copy_pos);
-                res_offset += input.length() - copy_pos;
-                copy_pos = input.length();
+                res_data.resize(res_data.size() + haystack_length - copy_pos);
+                memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], haystack.data() + copy_pos, haystack_length - copy_pos);
+                res_offset += haystack_length - copy_pos;
+                copy_pos = haystack_length;
                 match_pos = copy_pos;
             }
         }
@@ -164,12 +162,11 @@ struct ReplaceRegexpImpl
         ++res_offset;
     }
 
-
     static void vector(
         const ColumnString::Chars & data,
         const ColumnString::Offsets & offsets,
-        const std::string & needle,
-        const std::string & replacement,
+        const String & needle,
+        const String & replacement,
         ColumnString::Chars & res_data,
         ColumnString::Offsets & res_offsets)
     {
@@ -178,11 +175,19 @@ struct ReplaceRegexpImpl
         size_t size = offsets.size();
         res_offsets.resize(size);
 
-        typename re2_st::RE2::Options regexp_options;
-        /// Never write error messages to stderr. It's ignorant to do it from library code.
+        re2_st::RE2::Options regexp_options;
+        /// Don't write error messages to stderr.
         regexp_options.set_log_errors(false);
+
         re2_st::RE2 searcher(needle, regexp_options);
-        int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures));
+
+        if (!searcher.ok())
+            throw Exception(
+                ErrorCodes::BAD_ARGUMENTS,
+                "The pattern argument is not a valid re2 pattern: {}",
+                searcher.error());
+
+        int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures);
 
         Instructions instructions = createInstructions(replacement, num_captures);
 
@@ -190,9 +195,10 @@ struct ReplaceRegexpImpl
         for (size_t i = 0; i < size; ++i)
         {
             size_t from = i > 0 ? offsets[i - 1] : 0;
-            re2_st::StringPiece input(reinterpret_cast<const char *>(data.data() + from), offsets[i] - from - 1);
+            const char * haystack_data = reinterpret_cast<const char *>(data.data() + from);
+            const size_t haystack_length = static_cast<size_t>(offsets[i] - from - 1);
 
-            processString(input, res_data, res_offset, searcher, num_captures, instructions);
+            processString(haystack_data, haystack_length, res_data, res_offset, searcher, num_captures, instructions);
             res_offsets[i] = res_offset;
         }
     }
@@ -200,8 +206,8 @@ struct ReplaceRegexpImpl
     static void vectorFixed(
         const ColumnString::Chars & data,
         size_t n,
-        const std::string & needle,
-        const std::string & replacement,
+        const String & needle,
+        const String & replacement,
         ColumnString::Chars & res_data,
         ColumnString::Offsets & res_offsets)
     {
@@ -210,20 +216,29 @@ struct ReplaceRegexpImpl
         res_data.reserve(data.size());
         res_offsets.resize(size);
 
-        typename re2_st::RE2::Options regexp_options;
-        /// Never write error messages to stderr. It's ignorant to do it from library code.
+        re2_st::RE2::Options regexp_options;
+        /// Don't write error messages to stderr.
         regexp_options.set_log_errors(false);
+
         re2_st::RE2 searcher(needle, regexp_options);
-        int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures));
+
+        if (!searcher.ok())
+            throw Exception(
+                ErrorCodes::BAD_ARGUMENTS,
+                "The pattern argument is not a valid re2 pattern: {}",
+                searcher.error());
+
+        int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, max_captures);
 
         Instructions instructions = createInstructions(replacement, num_captures);
 
         for (size_t i = 0; i < size; ++i)
         {
             size_t from = i * n;
-            re2_st::StringPiece input(reinterpret_cast<const char *>(data.data() + from), n);
+            const char * haystack_data = reinterpret_cast<const char *>(data.data() + from);
+            const size_t haystack_length = n;
 
-            processString(input, res_data, res_offset, searcher, num_captures, instructions);
+            processString(haystack_data, haystack_length, res_data, res_offset, searcher, num_captures, instructions);
             res_offsets[i] = res_offset;
         }
     }
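Editor's note to make the instruction decomposition above concrete: a self-contained sketch of the same parsing idea, simplified and without the capture-count validation of the real `createInstructions` (names mirror the diff but this is not the ClickHouse code):

```cpp
#include <cassert>
#include <cctype>
#include <string>
#include <vector>

struct Instruction
{
    int substitution_num = -1; // >= 0: paste capture group N; otherwise paste `literal`
    std::string literal;
};

// Split a replacement string like "abc\1de\2" into literal and substitution steps.
std::vector<Instruction> createInstructions(const std::string & replacement)
{
    std::vector<Instruction> result;
    std::string literals;
    for (size_t i = 0; i < replacement.size(); ++i)
    {
        if (replacement[i] == '\\' && i + 1 < replacement.size())
        {
            if (std::isdigit(static_cast<unsigned char>(replacement[i + 1]))) // substitution \0-\9
            {
                if (!literals.empty())
                {
                    result.push_back({-1, literals});
                    literals.clear();
                }
                result.push_back({replacement[i + 1] - '0', {}});
            }
            else
                literals += replacement[i + 1]; // escaped character, e.g. "\\"
            ++i;
        }
        else
            literals += replacement[i]; // plain character
    }
    if (!literals.empty())
        result.push_back({-1, literals});
    return result;
}

int main()
{
    auto instructions = createInstructions("abc\\1de\\2");
    assert(instructions.size() == 4); // inst("abc"), inst(1), inst("de"), inst(2)
}
```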
diff --git a/src/Functions/ReplaceStringImpl.h b/src/Functions/ReplaceStringImpl.h
index ab0e53d3c45..1a9ec49c58c 100644
--- a/src/Functions/ReplaceStringImpl.h
+++ b/src/Functions/ReplaceStringImpl.h
@@ -8,9 +8,17 @@
 namespace DB
 {
 
+struct ReplaceStringTraits
+{
+    enum class Replace
+    {
+        First,
+        All
+    };
+};
+
 /** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants.
   */
-template <bool replace_one = false>
+template <ReplaceStringTraits::Replace replace>
 struct ReplaceStringImpl
 {
     static void vector(
@@ -66,7 +74,7 @@ struct ReplaceStringImpl
                 memcpy(&res_data[res_offset], replacement.data(), replacement.size());
                 res_offset += replacement.size();
                 pos = match + needle.size();
-                if (replace_one)
+                if constexpr (replace == ReplaceStringTraits::Replace::First)
                     can_finish_current_string = true;
             }
             else
@@ -155,7 +163,7 @@ struct ReplaceStringImpl
                 memcpy(&res_data[res_offset], replacement.data(), replacement.size());
                 res_offset += replacement.size();
                 pos = match + needle.size();
-                if (replace_one || pos == begin + n * (i + 1))
+                if (replace == ReplaceStringTraits::Replace::First || pos == begin + n * (i + 1))
                     can_finish_current_string = true;
             }
             else
diff --git a/src/Functions/base64Decode.cpp b/src/Functions/base64Decode.cpp
index f6943233d44..4060aafe1a3 100644
--- a/src/Functions/base64Decode.cpp
+++ b/src/Functions/base64Decode.cpp
@@ -1,8 +1,7 @@
 #include <Functions/FunctionFactory.h>
+
 #if USE_BASE64
 #include <Functions/FunctionBase64Conversion.h>
-#include <DataTypes/DataTypeString.h>
-
 namespace DB
 {
@@ -15,4 +14,5 @@ REGISTER_FUNCTION(Base64Decode)
     factory.registerAlias("FROM_BASE64", "base64Decode", FunctionFactory::CaseInsensitive);
 }
 }
+
 #endif
diff --git a/src/Functions/base64Encode.cpp b/src/Functions/base64Encode.cpp
index fc06935e0a1..773db7e09d9 100644
--- a/src/Functions/base64Encode.cpp
+++ b/src/Functions/base64Encode.cpp
@@ -1,10 +1,7 @@
-#include <DataTypes/DataTypeString.h>
 #include <Functions/FunctionFactory.h>
 
-#include "config.h"
-
 #if USE_BASE64
-#    include <Functions/FunctionBase64Conversion.h>
+#include <Functions/FunctionBase64Conversion.h>
 
 namespace DB
 {
@@ -17,4 +14,5 @@ REGISTER_FUNCTION(Base64Encode)
     factory.registerAlias("TO_BASE64", "base64Encode", FunctionFactory::CaseInsensitive);
 }
 }
+
 #endif
diff --git a/src/Functions/replaceAll.cpp b/src/Functions/replaceAll.cpp
index 7c5cd82ca5d..d85d192d199 100644
--- a/src/Functions/replaceAll.cpp
+++ b/src/Functions/replaceAll.cpp
@@ -13,7 +13,7 @@ struct NameReplaceAll
     static constexpr auto name = "replaceAll";
 };
 
-using FunctionReplaceAll = FunctionStringReplace<ReplaceStringImpl<false>, NameReplaceAll>;
+using FunctionReplaceAll = FunctionStringReplace<ReplaceStringImpl<ReplaceStringTraits::Replace::All>, NameReplaceAll>;
 
 }
diff --git a/src/Functions/replaceOne.cpp b/src/Functions/replaceOne.cpp
index c0c21dbf51f..6557339537e 100644
--- a/src/Functions/replaceOne.cpp
+++ b/src/Functions/replaceOne.cpp
@@ -13,7 +13,7 @@ struct NameReplaceOne
     static constexpr auto name = "replaceOne";
 };
 
-using FunctionReplaceOne = FunctionStringReplace<ReplaceStringImpl<true>, NameReplaceOne>;
+using FunctionReplaceOne = FunctionStringReplace<ReplaceStringImpl<ReplaceStringTraits::Replace::First>, NameReplaceOne>;
 
 }
diff --git a/src/Functions/replaceRegexpAll.cpp b/src/Functions/replaceRegexpAll.cpp
index 0250b4a5ba6..4eaf46c05d4 100644
--- a/src/Functions/replaceRegexpAll.cpp
+++ b/src/Functions/replaceRegexpAll.cpp
@@ -13,7 +13,7 @@ struct NameReplaceRegexpAll
     static constexpr auto name = "replaceRegexpAll";
 };
 
-using FunctionReplaceRegexpAll = FunctionStringReplace<ReplaceRegexpImpl<false>, NameReplaceRegexpAll>;
+using FunctionReplaceRegexpAll = FunctionStringReplace<ReplaceRegexpImpl<ReplaceRegexpTraits::Replace::All>, NameReplaceRegexpAll>;
 
 }
diff --git a/src/Functions/replaceRegexpOne.cpp b/src/Functions/replaceRegexpOne.cpp
index b40992b73fc..60e29213a9a 100644
--- a/src/Functions/replaceRegexpOne.cpp
+++ b/src/Functions/replaceRegexpOne.cpp
@@ -13,7 +13,7 @@ struct NameReplaceRegexpOne
     static constexpr auto name = "replaceRegexpOne";
 };
 
-using FunctionReplaceRegexpOne = FunctionStringReplace<ReplaceRegexpImpl<true>, NameReplaceRegexpOne>;
+using FunctionReplaceRegexpOne = FunctionStringReplace<ReplaceRegexpImpl<ReplaceRegexpTraits::Replace::First>, NameReplaceRegexpOne>;
 
 }
diff --git a/src/Functions/tryBase64Decode.cpp b/src/Functions/tryBase64Decode.cpp
index 1102c7a3418..bd452b8357b 100644
--- a/src/Functions/tryBase64Decode.cpp
+++ b/src/Functions/tryBase64Decode.cpp
@@ -1,7 +1,7 @@
 #include <Functions/FunctionFactory.h>
+
 #if USE_BASE64
 #include <Functions/FunctionBase64Conversion.h>
-#include <DataTypes/DataTypeString.h>
 
 namespace DB
 {
@@ -10,4 +10,5 @@ REGISTER_FUNCTION(TryBase64Decode)
     factory.registerFunction<FunctionBase64Conversion<TryBase64Decode>>();
 }
 }
+
 #endif
diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h
index de2b5654ae5..b60fdee1184 100644
--- a/src/IO/ReadWriteBufferFromHTTP.h
+++ b/src/IO/ReadWriteBufferFromHTTP.h
@@ -528,16 +528,17 @@ namespace detail
 
         auto on_retriable_error = [&]()
         {
-                retry_with_range_header = true;
-                impl.reset();
-                auto http_session = session->getSession();
-                http_session->reset();
-                sleepForMilliseconds(milliseconds_to_wait);
+            retry_with_range_header = true;
+            impl.reset();
+            auto http_session = session->getSession();
+            http_session->reset();
+            sleepForMilliseconds(milliseconds_to_wait);
         };
 
         for (size_t i = 0; i < settings.http_max_tries; ++i)
         {
             exception = nullptr;
+            initialization_error = InitializeError::NONE;
 
             try
             {
diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp
index f823015bd7d..9ed2c41fd01 100644
--- a/src/IO/WriteBufferFromS3.cpp
+++ b/src/IO/WriteBufferFromS3.cpp
@@ -123,7 +123,10 @@ void WriteBufferFromS3::nextImpl()
 void WriteBufferFromS3::allocateBuffer()
 {
     if (total_parts_uploaded != 0 && total_parts_uploaded % s3_settings.upload_part_size_multiply_parts_count_threshold == 0)
+    {
         upload_part_size *= s3_settings.upload_part_size_multiply_factor;
+        upload_part_size = std::min(upload_part_size, s3_settings.max_upload_part_size);
+    }
 
     temporary_buffer = Aws::MakeShared<Aws::StringStream>("temporary buffer");
     temporary_buffer->exceptions(std::ios::badbit);
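Editor's note on the adaptive part sizing that `WriteBufferFromS3` and `BackupWriterS3` now share: every `threshold` uploaded parts the part size is multiplied by `factor`, and the new `max_upload_part_size` setting caps the growth. A toy simulation of that schedule — all values below are made up, not ClickHouse defaults:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
    const size_t min_part_size = 16 * 1024 * 1024;           // assume 16 MiB starting part size
    const size_t max_part_size = 5ULL * 1024 * 1024 * 1024;  // assume a 5 GiB cap
    const size_t factor = 2;
    const size_t threshold = 500;

    size_t part_size = min_part_size;
    for (size_t parts_uploaded = 1; parts_uploaded <= 2000; ++parts_uploaded)
    {
        if (parts_uploaded % threshold == 0)
        {
            // Grow exponentially, but never beyond the configured maximum.
            part_size = std::min(part_size * factor, max_part_size);
            std::printf("after %zu parts: part size %zu bytes\n", parts_uploaded, part_size);
        }
    }
}
```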
context->getTCPPortSecure().value_or(DBMS_DEFAULT_SECURE_PORT) : context->getTCPPort(), /* treat_local_as_remote= */ false, - /* treat_local_port_as_remote= */ context->getApplicationType() == Context::ApplicationType::LOCAL, + /* treat_local_port_as_remote= */ false, /// should be set only for clickhouse-local, but cluster discovery is not used there /* secure= */ secure); return cluster; } diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 26b9b843567..41c7c28a6fa 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -658,7 +658,9 @@ void HashJoin::initRightBlockStructure(Block & saved_block_sample) /// Save non key columns for (auto & column : sample_block_with_columns_to_add) { - if (!saved_block_sample.findByName(column.name)) + if (auto * col = saved_block_sample.findByName(column.name)) + *col = column; + else saved_block_sample.insert(column); } } diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index 65e9bb1ab8c..68e15d10f52 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -5,13 +5,23 @@ #include #include #include - +#include #include namespace DB { +namespace +{ + /// An object up to 5 GB can be copied in a single atomic operation. + constexpr UInt64 DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE = 5_GiB; + + /// The maximum size of an uploaded part. + constexpr UInt64 DEFAULT_MAX_UPLOAD_PART_SIZE = 5_GiB; +} + + void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings) { std::lock_guard lock(mutex); @@ -50,9 +60,11 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U S3Settings::ReadWriteSettings rw_settings; rw_settings.max_single_read_retries = get_uint_for_key(key, "max_single_read_retries", true, settings.s3_max_single_read_retries); rw_settings.min_upload_part_size = get_uint_for_key(key, "min_upload_part_size", true, settings.s3_min_upload_part_size); + rw_settings.max_upload_part_size = get_uint_for_key(key, "max_upload_part_size", true, DEFAULT_MAX_UPLOAD_PART_SIZE); rw_settings.upload_part_size_multiply_factor = get_uint_for_key(key, "upload_part_size_multiply_factor", true, settings.s3_upload_part_size_multiply_factor); rw_settings.upload_part_size_multiply_parts_count_threshold = get_uint_for_key(key, "upload_part_size_multiply_parts_count_threshold", true, settings.s3_upload_part_size_multiply_parts_count_threshold); rw_settings.max_single_part_upload_size = get_uint_for_key(key, "max_single_part_upload_size", true, settings.s3_max_single_part_upload_size); + rw_settings.max_single_operation_copy_size = get_uint_for_key(key, "max_single_operation_copy_size", true, DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE); rw_settings.max_connections = get_uint_for_key(key, "max_connections", true, settings.s3_max_connections); rw_settings.check_objects_after_upload = get_bool_for_key(key, "check_objects_after_upload", true, false); @@ -95,12 +107,16 @@ void S3Settings::ReadWriteSettings::updateFromSettingsIfEmpty(const Settings & s max_single_read_retries = settings.s3_max_single_read_retries; if (!min_upload_part_size) min_upload_part_size = settings.s3_min_upload_part_size; + if (!max_upload_part_size) + max_upload_part_size = DEFAULT_MAX_UPLOAD_PART_SIZE; if (!upload_part_size_multiply_factor) upload_part_size_multiply_factor = settings.s3_upload_part_size_multiply_factor; if (!upload_part_size_multiply_parts_count_threshold) 
diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp
index 65e9bb1ab8c..68e15d10f52 100644
--- a/src/Storages/StorageS3Settings.cpp
+++ b/src/Storages/StorageS3Settings.cpp
@@ -5,13 +5,23 @@
 #include
 #include
 #include
-
+#include
 #include

 namespace DB
 {

+namespace
+{
+    /// An object up to 5 GB can be copied in a single atomic operation.
+    constexpr UInt64 DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE = 5_GiB;
+
+    /// The maximum size of an uploaded part.
+    constexpr UInt64 DEFAULT_MAX_UPLOAD_PART_SIZE = 5_GiB;
+}
+
+
 void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings)
 {
     std::lock_guard lock(mutex);
@@ -50,9 +60,11 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U
             S3Settings::ReadWriteSettings rw_settings;
             rw_settings.max_single_read_retries = get_uint_for_key(key, "max_single_read_retries", true, settings.s3_max_single_read_retries);
             rw_settings.min_upload_part_size = get_uint_for_key(key, "min_upload_part_size", true, settings.s3_min_upload_part_size);
+            rw_settings.max_upload_part_size = get_uint_for_key(key, "max_upload_part_size", true, DEFAULT_MAX_UPLOAD_PART_SIZE);
             rw_settings.upload_part_size_multiply_factor = get_uint_for_key(key, "upload_part_size_multiply_factor", true, settings.s3_upload_part_size_multiply_factor);
             rw_settings.upload_part_size_multiply_parts_count_threshold = get_uint_for_key(key, "upload_part_size_multiply_parts_count_threshold", true, settings.s3_upload_part_size_multiply_parts_count_threshold);
             rw_settings.max_single_part_upload_size = get_uint_for_key(key, "max_single_part_upload_size", true, settings.s3_max_single_part_upload_size);
+            rw_settings.max_single_operation_copy_size = get_uint_for_key(key, "max_single_operation_copy_size", true, DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE);
             rw_settings.max_connections = get_uint_for_key(key, "max_connections", true, settings.s3_max_connections);
             rw_settings.check_objects_after_upload = get_bool_for_key(key, "check_objects_after_upload", true, false);
@@ -95,12 +107,16 @@ void S3Settings::ReadWriteSettings::updateFromSettingsIfEmpty(const Settings & s
         max_single_read_retries = settings.s3_max_single_read_retries;
     if (!min_upload_part_size)
         min_upload_part_size = settings.s3_min_upload_part_size;
+    if (!max_upload_part_size)
+        max_upload_part_size = DEFAULT_MAX_UPLOAD_PART_SIZE;
     if (!upload_part_size_multiply_factor)
        upload_part_size_multiply_factor = settings.s3_upload_part_size_multiply_factor;
     if (!upload_part_size_multiply_parts_count_threshold)
        upload_part_size_multiply_parts_count_threshold = settings.s3_upload_part_size_multiply_parts_count_threshold;
     if (!max_single_part_upload_size)
         max_single_part_upload_size = settings.s3_max_single_part_upload_size;
+    if (!max_single_operation_copy_size)
+        max_single_operation_copy_size = DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE;
     if (!max_connections)
         max_connections = settings.s3_max_connections;
     if (!max_unexpected_write_error_retries)
diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h
index 2da4a1d7590..bd90ba569d8 100644
--- a/src/Storages/StorageS3Settings.h
+++ b/src/Storages/StorageS3Settings.h
@@ -27,9 +27,11 @@ struct S3Settings
     {
         size_t max_single_read_retries = 0;
         size_t min_upload_part_size = 0;
+        size_t max_upload_part_size = 0;
         size_t upload_part_size_multiply_factor = 0;
         size_t upload_part_size_multiply_parts_count_threshold = 0;
         size_t max_single_part_upload_size = 0;
+        size_t max_single_operation_copy_size = 0;
         size_t max_connections = 0;
         bool check_objects_after_upload = false;
         size_t max_unexpected_write_error_retries = 0;
@@ -41,9 +43,11 @@ struct S3Settings
     {
         return max_single_read_retries == other.max_single_read_retries
             && min_upload_part_size == other.min_upload_part_size
+            && max_upload_part_size == other.max_upload_part_size
             && upload_part_size_multiply_factor == other.upload_part_size_multiply_factor
             && upload_part_size_multiply_parts_count_threshold == other.upload_part_size_multiply_parts_count_threshold
             && max_single_part_upload_size == other.max_single_part_upload_size
+            && max_single_operation_copy_size == other.max_single_operation_copy_size
             && max_connections == other.max_connections
             && check_objects_after_upload == other.check_objects_after_upload
             && max_unexpected_write_error_retries == other.max_unexpected_write_error_retries;
diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp
index c38b4313359..0f01dc4288c 100644
--- a/src/Storages/StorageURL.cpp
+++ b/src/Storages/StorageURL.cpp
@@ -280,7 +280,7 @@ namespace
                     timeouts,
                     credentials,
                     settings.max_http_get_redirects,
-                    DBMS_DEFAULT_BUFFER_SIZE,
+                    settings.max_read_buffer_size,
                     read_settings,
                     headers,
                     ReadWriteBufferFromHTTP::Range{0, std::nullopt},
@@ -341,7 +341,7 @@ namespace
                 timeouts,
                 credentials,
                 settings.max_http_get_redirects,
-                DBMS_DEFAULT_BUFFER_SIZE,
+                settings.max_read_buffer_size,
                 read_settings,
                 headers,
                 &context->getRemoteHostFilter(),
@@ -378,7 +378,7 @@ namespace
                 timeouts,
                 credentials,
                 settings.max_http_get_redirects,
-                DBMS_DEFAULT_BUFFER_SIZE,
+                settings.max_read_buffer_size,
                 read_settings,
                 headers,
                 ReadWriteBufferFromHTTP::Range{},
@@ -863,6 +863,8 @@ std::optional IStorageURLBase::getLastModificationTime(
     const Poco::Net::HTTPBasicCredentials & credentials,
     const ContextPtr & context)
 {
+    auto settings = context->getSettingsRef();
+
     try
     {
         ReadWriteBufferFromHTTP buf(
@@ -871,8 +873,8 @@ std::optional IStorageURLBase::getLastModificationTime(
             {},
             ConnectionTimeouts::getHTTPTimeouts(context),
             credentials,
-            context->getSettingsRef().max_http_get_redirects,
-            DBMS_DEFAULT_BUFFER_SIZE,
+            settings.max_http_get_redirects,
+            settings.max_read_buffer_size,
             context->getReadSettings(),
             headers,
             ReadWriteBufferFromHTTP::Range{},
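The pair of StorageS3Settings hunks follows one pattern: a per-endpoint value of 0 means "unset" and gets back-filled, either from the query-level setting or from the new 5 GiB compile-time caps (the S3 limits for one uploaded part and for one single-operation CopyObject). A small Python sketch of that fallback pattern; the field list is abbreviated and the dict representation is an assumption:

```python
DEFAULT_MAX_UPLOAD_PART_SIZE = 5 * 2**30            # 5 GiB: S3 cap for one uploaded part
DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE = 5 * 2**30  # 5 GiB: S3 cap for one CopyObject call

def update_from_settings_if_empty(rw: dict, settings: dict) -> dict:
    """Sketch of S3Settings::ReadWriteSettings::updateFromSettingsIfEmpty():
    0 (or absence) means the endpoint did not configure the value."""
    fallbacks = {
        "min_upload_part_size": settings["s3_min_upload_part_size"],
        "max_upload_part_size": DEFAULT_MAX_UPLOAD_PART_SIZE,
        "max_single_operation_copy_size": DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE,
    }
    for key, fallback in fallbacks.items():
        if not rw.get(key):  # 0 or missing counts as "empty"
            rw[key] = fallback
    return rw

rw = update_from_settings_if_empty(
    {"min_upload_part_size": 0}, {"s3_min_upload_part_size": 16 * 2**20}
)
assert rw["min_upload_part_size"] == 16 * 2**20
assert rw["max_upload_part_size"] == 5 * 2**30
```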
diff --git a/tests/ci/bugfix_validate_check.py b/tests/ci/bugfix_validate_check.py
index 9ecada816b6..e5f37f2940b 100644
--- a/tests/ci/bugfix_validate_check.py
+++ b/tests/ci/bugfix_validate_check.py
@@ -3,6 +3,7 @@
 import argparse
 import csv
 import itertools
+import logging
 import os

 from github import Github
@@ -37,6 +38,8 @@ def process_result(file_path):
     state, report_url, description = post_commit_status_from_file(file_path)
     prefix = os.path.basename(os.path.dirname(file_path))
     is_ok = state == "success"
+    if is_ok and report_url == "null":
+        return is_ok, None

     status = f'OK: Bug reproduced (Report'
     if not is_ok:
@@ -51,15 +54,23 @@ def process_all_results(file_paths):
     for status_path in file_paths:
         is_ok, test_results = process_result(status_path)
         any_ok = any_ok or is_ok
-        all_results.extend(test_results)
+        if test_results is not None:
+            all_results.extend(test_results)
+
     return any_ok, all_results


 def main(args):
+    logging.basicConfig(level=logging.INFO)
+
     check_name_with_group = "Bugfix validate check"

     is_ok, test_results = process_all_results(args.status)

+    if not test_results:
+        logging.info("No results to upload")
+        return
+
     pr_info = PRInfo()
     report_url = upload_results(
         S3Helper(),
diff --git a/tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt b/tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt
index c0dcf4a4dde..e607f1a9f39 100644
--- a/tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt
+++ b/tests/ci/cancel_and_rerun_workflow_lambda/requirements.txt
@@ -1,3 +1,3 @@
 requests
 PyJWT
-cryptography
+cryptography==37.0.4
diff --git a/tests/ci/metrics_lambda/requirements.txt b/tests/ci/metrics_lambda/requirements.txt
index c0dcf4a4dde..e607f1a9f39 100644
--- a/tests/ci/metrics_lambda/requirements.txt
+++ b/tests/ci/metrics_lambda/requirements.txt
@@ -1,3 +1,3 @@
 requests
 PyJWT
-cryptography
+cryptography==37.0.4
diff --git a/tests/ci/termination_lambda/requirements.txt b/tests/ci/termination_lambda/requirements.txt
index c0dcf4a4dde..e607f1a9f39 100644
--- a/tests/ci/termination_lambda/requirements.txt
+++ b/tests/ci/termination_lambda/requirements.txt
@@ -1,3 +1,3 @@
 requests
 PyJWT
-cryptography
+cryptography==37.0.4
diff --git a/tests/ci/token_lambda/requirements.txt b/tests/ci/token_lambda/requirements.txt
index c0dcf4a4dde..e607f1a9f39 100644
--- a/tests/ci/token_lambda/requirements.txt
+++ b/tests/ci/token_lambda/requirements.txt
@@ -1,3 +1,3 @@
 requests
 PyJWT
-cryptography
+cryptography==37.0.4
diff --git a/tests/ci/workflow_approve_rerun_lambda/requirements.txt b/tests/ci/workflow_approve_rerun_lambda/requirements.txt
index c0dcf4a4dde..e607f1a9f39 100644
--- a/tests/ci/workflow_approve_rerun_lambda/requirements.txt
+++ b/tests/ci/workflow_approve_rerun_lambda/requirements.txt
@@ -1,3 +1,3 @@
 requests
 PyJWT
-cryptography
+cryptography==37.0.4
diff --git a/tests/integration/test_backup_restore_s3/configs/s3_settings.xml b/tests/integration/test_backup_restore_s3/configs/s3_settings.xml
new file mode 100644
index 00000000000..2aef4db55c8
--- /dev/null
+++ b/tests/integration/test_backup_restore_s3/configs/s3_settings.xml
@@ -0,0 +1,12 @@
+
+
+
+            http://minio1:9001/root/data/backups/multipart_upload_copy/
+
+            1
+            5242880
+            3
+            2
+
+
+
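The XML element names in the config above were lost in formatting; judging by the settings introduced in StorageS3Settings, a plausible reading is that it sets max_single_operation_copy_size = 5242880 (5 MiB) plus small part-size multipliers for this endpoint. Under that assumed reading, any backup noticeably larger than 5 MiB must take the multipart-upload-copy path, which is what the new test below relies on. A back-of-the-envelope check; the bytes-per-row figure is purely an assumption:

```python
# Assumed interpretation of the test config: a single-operation CopyObject is
# only allowed up to max_single_operation_copy_size bytes; anything larger must
# use multipart upload copy.
max_single_operation_copy_size = 5242880  # 5 MiB, value seen in the config

# check_backup_and_restore(..., size=1000000) inserts 1M generateRandom rows;
# even at a few dozen bytes per row the backed-up data exceeds 5 MiB, so the
# log must say "multipart upload copy" rather than "single-operation copy".
approx_bytes_per_row = 50  # illustrative assumption only
assert 1_000_000 * approx_bytes_per_row > max_single_operation_copy_size
```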
diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py
index 617c14d6736..7ddb1459ab9 100644
--- a/tests/integration/test_backup_restore_s3/test.py
+++ b/tests/integration/test_backup_restore_s3/test.py
@@ -4,7 +4,11 @@ from helpers.cluster import ClickHouseCluster
 cluster = ClickHouseCluster(__file__)
 node = cluster.add_instance(
     "node",
-    main_configs=["configs/disk_s3.xml", "configs/named_collection_s3_backups.xml"],
+    main_configs=[
+        "configs/disk_s3.xml",
+        "configs/named_collection_s3_backups.xml",
+        "configs/s3_settings.xml",
+    ],
     with_minio=True,
 )

@@ -27,17 +31,17 @@ def new_backup_name():
     return f"backup{backup_id_counter}"


-def check_backup_and_restore(storage_policy, backup_destination):
+def check_backup_and_restore(storage_policy, backup_destination, size=1000):
     node.query(
         f"""
     DROP TABLE IF EXISTS data NO DELAY;
     CREATE TABLE data (key Int, value String, array Array(String)) Engine=MergeTree() ORDER BY tuple() SETTINGS storage_policy='{storage_policy}';
-    INSERT INTO data SELECT * FROM generateRandom('key Int, value String, array Array(String)') LIMIT 1000;
+    INSERT INTO data SELECT * FROM generateRandom('key Int, value String, array Array(String)') LIMIT {size};
     BACKUP TABLE data TO {backup_destination};
     RESTORE TABLE data AS data_restored FROM {backup_destination};
     SELECT throwIf(
-        (SELECT groupArray(tuple(*)) FROM data) !=
-        (SELECT groupArray(tuple(*)) FROM data_restored),
+        (SELECT count(), sum(sipHash64(*)) FROM data) !=
+        (SELECT count(), sum(sipHash64(*)) FROM data_restored),
         'Data does not matched after BACKUP/RESTORE'
     );
     DROP TABLE data NO DELAY;
@@ -106,9 +110,10 @@ def test_backup_to_s3_native_copy():
     )
     check_backup_and_restore(storage_policy, backup_destination)
     assert node.contains_in_log("using native copy")
+    assert node.contains_in_log("single-operation copy")


-def test_backup_to_s3_other_bucket_native_copy():
+def test_backup_to_s3_native_copy_other_bucket():
     storage_policy = "policy_s3_other_bucket"
     backup_name = new_backup_name()
     backup_destination = (
@@ -116,3 +121,13 @@
     )
     check_backup_and_restore(storage_policy, backup_destination)
     assert node.contains_in_log("using native copy")
+    assert node.contains_in_log("single-operation copy")
+
+
+def test_backup_to_s3_native_copy_multipart_upload():
+    storage_policy = "policy_s3"
+    backup_name = new_backup_name()
+    backup_destination = f"S3('http://minio1:9001/root/data/backups/multipart_upload_copy/{backup_name}', 'minio', 'minio123')"
+    check_backup_and_restore(storage_policy, backup_destination, size=1000000)
+    assert node.contains_in_log("using native copy")
+    assert node.contains_in_log("multipart upload copy")
diff --git a/tests/integration/test_failed_async_inserts/__init__.py b/tests/integration/test_failed_async_inserts/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/integration/test_failed_async_inserts/configs/config.xml b/tests/integration/test_failed_async_inserts/configs/config.xml
new file mode 100644
index 00000000000..038c0792b44
--- /dev/null
+++ b/tests/integration/test_failed_async_inserts/configs/config.xml
@@ -0,0 +1,3 @@
+
+    1000
+
diff --git a/tests/integration/test_failed_async_inserts/test.py b/tests/integration/test_failed_async_inserts/test.py
new file mode 100644
index 00000000000..6d66ac97006
--- /dev/null
+++ b/tests/integration/test_failed_async_inserts/test.py
@@ -0,0 +1,54 @@
+import logging
+from time import sleep
+
+import pytest
+from helpers.cluster import ClickHouseCluster
+
+cluster = ClickHouseCluster(__file__)
+node = cluster.add_instance(
+    "node", main_configs=["configs/config.xml"], with_zookeeper=True
+)
+
+
+@pytest.fixture(scope="module")
+def started_cluster():
+    try:
+        cluster.start()
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+def test_failed_async_inserts(started_cluster):
+    node = started_cluster.instances["node"]
+
+    node.query(
+        "CREATE TABLE async_insert_30_10_2022 (id UInt32, s String) ENGINE = Memory"
+    )
+    node.query(
+        "INSERT INTO async_insert_30_10_2022 SETTINGS async_insert = 1 VALUES ()",
+        ignore_error=True,
+    )
+    node.query(
+        "INSERT INTO async_insert_30_10_2022 SETTINGS async_insert = 1 VALUES ([1,2,3], 1)",
+        ignore_error=True,
+    )
+    node.query(
+        'INSERT INTO async_insert_30_10_2022 SETTINGS async_insert = 1 FORMAT JSONEachRow {"id" : 1} {"x"}',
+        ignore_error=True,
+    )
+    node.query(
+        "INSERT INTO async_insert_30_10_2022 SETTINGS async_insert = 1 VALUES (throwIf(4),'')",
+        ignore_error=True,
+    )
+
+    select_query = (
+        "SELECT value FROM system.events WHERE event == 'FailedAsyncInsertQuery'"
+    )
+
+    assert node.query(select_query) == "4\n"
+
+    node.query("DROP TABLE IF EXISTS async_insert_30_10_2022 NO DELAY")
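One caveat for the test above: ProfileEvents counters in system.events are cumulative over the server's lifetime, so the bare `== "4\n"` assertion only holds on a freshly started node. A more defensive variant reads the counter as a delta; a sketch, where `run_query` is a hypothetical callable returning query output as text:

```python
def failed_async_inserts(run_query):
    """Return the current value of the FailedAsyncInsertQuery counter."""
    out = run_query(
        "SELECT value FROM system.events WHERE event = 'FailedAsyncInsertQuery'"
    ).strip()
    return int(out) if out else 0  # the row is absent until the first failure

before = failed_async_inserts(node.query)  # `node` as in the test above
# ... issue the four malformed async inserts ...
assert failed_async_inserts(node.query) - before == 4
```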
VALUES ()", + ignore_error=True, + ) + node.query( + "INSERT INTO async_insert_30_10_2022 SETTINGS async_insert = 1 VALUES ([1,2,3], 1)", + ignore_error=True, + ) + node.query( + 'INSERT INTO async_insert_30_10_2022 SETTINGS async_insert = 1 FORMAT JSONEachRow {"id" : 1} {"x"}', + ignore_error=True, + ) + node.query( + "INSERT INTO async_insert_30_10_2022 SETTINGS async_insert = 1 VALUES (throwIf(4),'')", + ignore_error=True, + ) + + select_query = ( + "SELECT value FROM system.events WHERE event == 'FailedAsyncInsertQuery'" + ) + + assert node.query(select_query) == "4\n" + + node.query("DROP TABLE IF EXISTS async_insert_30_10_2022 NO DELAY") diff --git a/tests/queries/0_stateless/00732_base64_functions.reference b/tests/queries/0_stateless/00732_base64_functions.reference index b22ae4e7e24..5dc1ba03b89 100644 --- a/tests/queries/0_stateless/00732_base64_functions.reference +++ b/tests/queries/0_stateless/00732_base64_functions.reference @@ -14,3 +14,5 @@ fooba foobar 1 1 +Zm9v +foo diff --git a/tests/queries/0_stateless/00732_base64_functions.sql b/tests/queries/0_stateless/00732_base64_functions.sql index 4ed86e20913..adba0cdebbd 100644 --- a/tests/queries/0_stateless/00732_base64_functions.sql +++ b/tests/queries/0_stateless/00732_base64_functions.sql @@ -14,3 +14,6 @@ SELECT base64Decode(val, 'excess argument') FROM (select arrayJoin(['', 'Zg==', SELECT tryBase64Decode('Zm9vYmF=Zm9v', 'excess argument'); -- { serverError 42 } SELECT base64Decode('Zm9vYmF=Zm9v'); -- { serverError 117 } + +select base64Encode(toFixedString('foo', 3)); +select base64Decode(toFixedString('Zm9v', 4)); diff --git a/tests/queries/0_stateless/02233_HTTP_ranged.python b/tests/queries/0_stateless/02233_HTTP_ranged.python index e0198210c16..e74d494edf5 100644 --- a/tests/queries/0_stateless/02233_HTTP_ranged.python +++ b/tests/queries/0_stateless/02233_HTTP_ranged.python @@ -120,8 +120,9 @@ class HttpProcessor(BaseHTTPRequestHandler): allow_range = False range_used = False get_call_num = 0 + responses_to_get = [] - def send_head(self): + def send_head(self, from_get = False): if self.headers["Range"] and HttpProcessor.allow_range: try: self.range = parse_byte_range(self.headers["Range"]) @@ -145,7 +146,14 @@ class HttpProcessor(BaseHTTPRequestHandler): self.send_error(416, "Requested Range Not Satisfiable") return None - self.send_response(206 if HttpProcessor.allow_range else 200) + retry_range_request = first != 0 and from_get is True and len(HttpProcessor.responses_to_get) > 0 + if retry_range_request: + code = HttpProcessor.responses_to_get.pop() + if code not in HttpProcessor.responses: + self.send_response(int(code)) + else: + self.send_response(206 if HttpProcessor.allow_range else 200) + self.send_header("Content-type", "application/json") if HttpProcessor.allow_range: @@ -169,7 +177,7 @@ class HttpProcessor(BaseHTTPRequestHandler): self.send_head() def do_GET(self): - result = self.send_head() + result = self.send_head(True) if result == None: return @@ -211,26 +219,36 @@ def start_server(): ##################################################################### -def test_select(download_buffer_size): +def test_select(settings): global HTTP_SERVER_URL_STR - query = f"SELECT * FROM url('{HTTP_SERVER_URL_STR}','JSONAsString') SETTINGS max_download_buffer_size={download_buffer_size};" + query = f"SELECT * FROM url('{HTTP_SERVER_URL_STR}','JSONAsString') SETTINGS {','.join((k+'='+repr(v) for k, v in settings.items()))};" check_answers(query, EXPECTED_ANSWER) -def run_test(allow_range, 
diff --git a/tests/queries/0_stateless/02233_HTTP_ranged.python b/tests/queries/0_stateless/02233_HTTP_ranged.python
index e0198210c16..e74d494edf5 100644
--- a/tests/queries/0_stateless/02233_HTTP_ranged.python
+++ b/tests/queries/0_stateless/02233_HTTP_ranged.python
@@ -120,8 +120,9 @@ class HttpProcessor(BaseHTTPRequestHandler):
     allow_range = False
     range_used = False
     get_call_num = 0
+    responses_to_get = []

-    def send_head(self):
+    def send_head(self, from_get=False):
         if self.headers["Range"] and HttpProcessor.allow_range:
             try:
                 self.range = parse_byte_range(self.headers["Range"])
@@ -145,7 +146,14 @@ class HttpProcessor(BaseHTTPRequestHandler):
             self.send_error(416, "Requested Range Not Satisfiable")
             return None

-        self.send_response(206 if HttpProcessor.allow_range else 200)
+        retry_range_request = first != 0 and from_get is True and len(HttpProcessor.responses_to_get) > 0
+        if retry_range_request:
+            code = HttpProcessor.responses_to_get.pop()
+            if code not in HttpProcessor.responses:
+                self.send_response(int(code))
+        else:
+            self.send_response(206 if HttpProcessor.allow_range else 200)
+
         self.send_header("Content-type", "application/json")

         if HttpProcessor.allow_range:
@@ -169,7 +177,7 @@ class HttpProcessor(BaseHTTPRequestHandler):
         self.send_head()

     def do_GET(self):
-        result = self.send_head()
+        result = self.send_head(True)
         if result == None:
             return

@@ -211,26 +219,36 @@ def start_server():

 #####################################################################


-def test_select(download_buffer_size):
+def test_select(settings):
     global HTTP_SERVER_URL_STR
-    query = f"SELECT * FROM url('{HTTP_SERVER_URL_STR}','JSONAsString') SETTINGS max_download_buffer_size={download_buffer_size};"
+    query = f"SELECT * FROM url('{HTTP_SERVER_URL_STR}','JSONAsString') SETTINGS {','.join((k+'='+repr(v) for k, v in settings.items()))};"
     check_answers(query, EXPECTED_ANSWER)


-def run_test(allow_range, download_buffer_size=20):
+def run_test(allow_range, settings, check_retries=False):
     HttpProcessor.range_used = False
     HttpProcessor.get_call_num = 0
     HttpProcessor.allow_range = allow_range
+    retries_num = 0
+    if check_retries:
+        HttpProcessor.responses_to_get = ["500", "200", "206"]
+        retries_num = len(HttpProcessor.responses_to_get)

     t, httpd = start_server()
     t.start()
-    test_select(download_buffer_size)
+    test_select(settings)

+    download_buffer_size = settings["max_download_buffer_size"]
     expected_get_call_num = (PAYLOAD_LEN - 1) // download_buffer_size + 1

     if allow_range:
         if not HttpProcessor.range_used:
             raise Exception("HTTP Range was not used when supported")

+        if check_retries and len(HttpProcessor.responses_to_get) > 0:
+            raise Exception("Expected to get http response 500, which had to be retried, but 200 ok returned and then retried")
+
+        if retries_num > 0:
+            expected_get_call_num += retries_num - 1
+
         if expected_get_call_num != HttpProcessor.get_call_num:
             raise Exception(
                 f"Invalid amount of GET calls with Range. Expected {expected_get_call_num}, actual {HttpProcessor.get_call_num}"
@@ -245,9 +263,23 @@


 def main():
-    run_test(allow_range=False)
-    run_test(allow_range=True, download_buffer_size=20)
-    run_test(allow_range=True, download_buffer_size=10)
+    settings = {"max_download_buffer_size": 20}
+
+    # Test Accept-Ranges=False
+    run_test(allow_range=False, settings=settings)
+    # Test Accept-Ranges=True, parallel download is used
+    run_test(allow_range=True, settings=settings)
+
+    # Test Accept-Ranges=True, parallel download is used
+    settings = {"max_download_buffer_size": 10}
+    run_test(allow_range=True, settings=settings)
+
+    # Test Accept-Ranges=True, parallel download is not used:
+    # the first GET request gets a 500 response,
+    # the second GET request gets a 200 OK response,
+    # the third GET request (the retry) gets a 206 response.
+    settings["max_download_threads"] = 2
+    run_test(allow_range=True, settings=settings, check_retries=True)


 if __name__ == "__main__":
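The retry accounting in run_test deserves a worked example: each queued response consumes one GET, but the final 206 also delivers data, so only retries_num - 1 calls land on top of the ranged-download baseline. A sketch with assumed numbers (PAYLOAD_LEN here is illustrative, not the test's actual payload length):

```python
PAYLOAD_LEN = 183            # hypothetical payload length in bytes
download_buffer_size = 10    # as in the third run above
retries_num = 3              # the queued ["500", "200", "206"] responses

# Ranged GETs needed without any retries: one per buffer-sized chunk.
base_calls = (PAYLOAD_LEN - 1) // download_buffer_size + 1
# The 500 and the bogus 200 each burn a call; the closing 206 is not extra.
expected = base_calls + retries_num - 1
print(base_calls, expected)  # 19 21
```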
+ settings["max_download_threads"] = 2 + run_test(allow_range=True, settings=settings, check_retries=True) if __name__ == "__main__": diff --git a/tests/queries/0_stateless/02233_HTTP_ranged.reference b/tests/queries/0_stateless/02233_HTTP_ranged.reference index 17f0fff172a..6164e96afc5 100644 --- a/tests/queries/0_stateless/02233_HTTP_ranged.reference +++ b/tests/queries/0_stateless/02233_HTTP_ranged.reference @@ -1,3 +1,4 @@ PASSED PASSED PASSED +PASSED diff --git a/tests/queries/0_stateless/02475_analysis_of_variance.reference b/tests/queries/0_stateless/02475_analysis_of_variance.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02475_analysis_of_variance.sql b/tests/queries/0_stateless/02475_analysis_of_variance.sql new file mode 100644 index 00000000000..86996f784ea --- /dev/null +++ b/tests/queries/0_stateless/02475_analysis_of_variance.sql @@ -0,0 +1,10 @@ + +SELECT analysisOfVariance(number, number % 2) FROM numbers(10) FORMAT Null; +SELECT analysisOfVariance(number :: Decimal32(5), number % 2) FROM numbers(10) FORMAT Null; +SELECT analysisOfVariance(number :: Decimal256(5), number % 2) FROM numbers(10) FORMAT Null; + +SELECT analysisOfVariance(1.11, -20); -- { serverError BAD_ARGUMENTS } +SELECT analysisOfVariance(1.11, 20 :: UInt128); -- { serverError BAD_ARGUMENTS } +SELECT analysisOfVariance(1.11, 9000000000000000); -- { serverError BAD_ARGUMENTS } + +SELECT analysisOfVariance(number, number % 2), analysisOfVariance(100000000000000000000., number % 65535) FROM numbers(1048575); -- { serverError BAD_ARGUMENTS } diff --git a/tests/queries/0_stateless/02475_join_bug_42832.reference b/tests/queries/0_stateless/02475_join_bug_42832.reference new file mode 100644 index 00000000000..e5310261d0a --- /dev/null +++ b/tests/queries/0_stateless/02475_join_bug_42832.reference @@ -0,0 +1,2 @@ +4 6 +4 4 diff --git a/tests/queries/0_stateless/02475_join_bug_42832.sql b/tests/queries/0_stateless/02475_join_bug_42832.sql new file mode 100644 index 00000000000..e383949fb22 --- /dev/null +++ b/tests/queries/0_stateless/02475_join_bug_42832.sql @@ -0,0 +1,16 @@ +DROP TABLE IF EXISTS tab1; +DROP TABLE IF EXISTS tab2; + +SET allow_suspicious_low_cardinality_types = 1; + +CREATE TABLE tab1 (a1 Int32, b1 Int32, val UInt64) ENGINE = MergeTree ORDER BY a1; +CREATE TABLE tab2 (a2 LowCardinality(Int32), b2 Int32) ENGINE = MergeTree ORDER BY a2; + +INSERT INTO tab1 SELECT number, number, 1 from numbers(4); +INSERT INTO tab2 SELECT number + 2, number + 2 from numbers(4); + +SELECT sum(val), count(val) FROM tab1 FULL OUTER JOIN tab2 ON b1 - 2 = a2 OR a1 = b2 SETTINGS join_use_nulls = 0; +SELECT sum(val), count(val) FROM tab1 FULL OUTER JOIN tab2 ON b1 - 2 = a2 OR a1 = b2 SETTINGS join_use_nulls = 1; + +DROP TABLE IF EXISTS tab1; +DROP TABLE IF EXISTS tab2;