From 14141c9071bd75a118fe0fac89a78459788602ff Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Thu, 13 Oct 2022 15:28:25 -0700 Subject: [PATCH 1/6] Add support for FixedString input to base64 functions --- .../functions/string-functions.md | 4 +- src/Functions/FunctionBase64Conversion.h | 210 +++++++++++------- src/Functions/base64Decode.cpp | 4 +- src/Functions/base64Encode.cpp | 6 +- src/Functions/tryBase64Decode.cpp | 3 +- 5 files changed, 137 insertions(+), 90 deletions(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index a8ba4843279..e0418a81f14 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -571,13 +571,13 @@ Similar to base58Decode, but returns an empty string in case of error. ## base64Encode(s) -Encodes ‘s’ string into base64 +Encodes ‘s’ FixedString or String into base64. Alias: `TO_BASE64`. ## base64Decode(s) -Decode base64-encoded string ‘s’ into original string. In case of failure raises an exception. +Decode base64-encoded FixedString or String ‘s’ into original string. In case of failure raises an exception. Alias: `FROM_BASE64`. diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index d7e4ea3486d..ef340a33149 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -2,21 +2,19 @@ #include "config.h" #if USE_BASE64 -# include -# include +# include # include # include -# include # include -# include -# include +# include +# include # include +# include +# include namespace DB { -using namespace GatherUtils; - namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -25,33 +23,86 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } +namespace Detail +{ + inline size_t base64Decode(const std::span src, UInt8 * dst) + { +# if defined(__aarch64__) + return tb64sdec(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst)); +# else + return _tb64d(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst)); +# endif + } +} + struct Base64Encode { static constexpr auto name = "base64Encode"; - static size_t getBufferSize(size_t string_length, size_t string_count) + + static size_t getBufferSize(const size_t string_length, const size_t string_count) { return ((string_length - string_count) / 3 + string_count) * 4 + string_count; } + + static size_t performCoding(const std::span src, UInt8 * dst) + { + /* + * Some bug in sse arm64 implementation? + * `base64Encode(repeat('a', 46))` returns wrong padding character + */ +# if defined(__aarch64__) + return tb64senc(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst)); +# else + return _tb64e(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst)); +# endif + } }; struct Base64Decode { static constexpr auto name = "base64Decode"; - static size_t getBufferSize(size_t string_length, size_t string_count) + static size_t getBufferSize(const size_t string_length, const size_t string_count) { return ((string_length - string_count) / 4 + string_count) * 3 + string_count; } + + static size_t performCoding(const std::span src, UInt8 * dst) + { + const auto outlen = Detail::base64Decode(src, dst); + if (src.size() > 0 && !outlen) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Failed to {} input '{}'", + name, + String(reinterpret_cast(src.data()), src.size())); + + return outlen; + } }; struct TryBase64Decode { static constexpr auto name = "tryBase64Decode"; - static size_t getBufferSize(size_t string_length, size_t string_count) + static size_t getBufferSize(const size_t string_length, const size_t string_count) { return Base64Decode::getBufferSize(string_length, string_count); } + + static size_t performCoding(const std::span src, UInt8 * dst) + { + if (src.empty()) + return 0; + + const auto outlen = Detail::base64Decode(src, dst); + // during decoding character array can be partially polluted + // if fail, revert back and clean + if (!outlen) + *dst = 0; + + return outlen; + } }; template @@ -71,99 +122,60 @@ public: if (arguments.size() != 1) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong number of arguments for function {}: 1 expected.", getName()); - if (!WhichDataType(arguments[0].type).isString()) + if (!WhichDataType(arguments[0].type).isStringOrFixedString()) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of 1st argument of function {}. Must be String.", - arguments[0].type->getName(), getName()); + "Illegal type {} of 1st argument of function {}. Must be FixedString or String.", + arguments[0].type->getName(), + getName()); return std::make_shared(); } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, const size_t input_rows_count) const override { - const ColumnPtr column_string = arguments[0].column; - const ColumnString * input = checkAndGetColumn(column_string.get()); + const auto & input_column = arguments[0].column; + if (const auto * src_column_as_fixed_string = checkAndGetColumn(*input_column)) + return execute(*src_column_as_fixed_string, input_rows_count); + else if (const auto * src_column_as_string = checkAndGetColumn(*input_column)) + return execute(*src_column_as_string, input_rows_count); - if (!input) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {} of first argument of function {}, must be of type String", - arguments[0].column->getName(), getName()); + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}, must be of type FixedString or String.", + input_column->getName(), + getName()); + } +private: + static ColumnPtr execute(const ColumnString & src_column, const size_t src_row_count) + { auto dst_column = ColumnString::create(); - auto & dst_data = dst_column->getChars(); + auto & dst_chars = dst_column->getChars(); auto & dst_offsets = dst_column->getOffsets(); - size_t reserve = Func::getBufferSize(input->getChars().size(), input->size()); - dst_data.resize(reserve); - dst_offsets.resize(input_rows_count); + const auto reserve = Func::getBufferSize(src_column.byteSize(), src_column.size()); + dst_chars.resize(reserve); + dst_offsets.resize(src_row_count); - const ColumnString::Offsets & src_offsets = input->getOffsets(); + const auto & src_chars = src_column.getChars(); + const auto & src_offsets = src_column.getOffsets(); - const auto * source = input->getChars().data(); - auto * dst = dst_data.data(); + auto * dst = dst_chars.data(); auto * dst_pos = dst; + const auto * src = src_chars.data(); size_t src_offset_prev = 0; - - for (size_t row = 0; row < input_rows_count; ++row) + for (size_t row = 0; row < src_row_count; ++row) { - size_t srclen = src_offsets[row] - src_offset_prev - 1; - size_t outlen = 0; - - if constexpr (std::is_same_v) - { - /* - * Some bug in sse arm64 implementation? - * `base64Encode(repeat('a', 46))` returns wrong padding character - */ -#if defined(__aarch64__) - outlen = tb64senc(reinterpret_cast(source), srclen, reinterpret_cast(dst_pos)); -#else - outlen = _tb64e(reinterpret_cast(source), srclen, reinterpret_cast(dst_pos)); -#endif - } - else if constexpr (std::is_same_v) - { - if (srclen > 0) - { -#if defined(__aarch64__) - outlen = tb64sdec(reinterpret_cast(source), srclen, reinterpret_cast(dst_pos)); -#else - outlen = _tb64d(reinterpret_cast(source), srclen, reinterpret_cast(dst_pos)); -#endif - - if (!outlen) - throw Exception( - ErrorCodes::INCORRECT_DATA, - "Failed to {} input '{}'", - getName(), String(reinterpret_cast(source), srclen)); - } - } - else - { - if (srclen > 0) - { - // during decoding character array can be partially polluted - // if fail, revert back and clean - auto * savepoint = dst_pos; - outlen = _tb64d(reinterpret_cast(source), srclen, reinterpret_cast(dst_pos)); - if (!outlen) - { - outlen = 0; - dst_pos = savepoint; //-V1048 - // clean the symbol - dst_pos[0] = 0; - } - } - } + const size_t src_length = src_offsets[row] - src_offset_prev - 1; + const auto outlen = Func::performCoding({src, src_length}, dst_pos); /// Base64 library is using AVX-512 with some shuffle operations. /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. __msan_unpoison(dst_pos, outlen); - source += srclen + 1; + src += src_length + 1; dst_pos += outlen; *dst_pos = '\0'; dst_pos += 1; @@ -172,8 +184,44 @@ public: src_offset_prev = src_offsets[row]; } - dst_data.resize(dst_pos - dst); + dst_chars.resize(dst_pos - dst); + return dst_column; + } + static ColumnPtr execute(const ColumnFixedString & src_column, const size_t src_row_count) + { + auto dst_column = ColumnString::create(); + auto & dst_chars = dst_column->getChars(); + auto & dst_offsets = dst_column->getOffsets(); + + const auto reserve = Func::getBufferSize(src_column.byteSize(), src_column.size()); + dst_chars.resize(reserve); + dst_offsets.resize(src_row_count); + + const auto & src_chars = src_column.getChars(); + const auto & src_n = src_column.getN(); + + auto * dst = dst_chars.data(); + auto * dst_pos = dst; + const auto * src = src_chars.data(); + + for (size_t row = 0; row < src_row_count; ++row) + { + const auto outlen = Func::performCoding({src, src_n}, dst_pos); + + /// Base64 library is using AVX-512 with some shuffle operations. + /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. + __msan_unpoison(dst_pos, outlen); + + src += src_n; + dst_pos += outlen; + *dst_pos = '\0'; + dst_pos += 1; + + dst_offsets[row] = dst_pos - dst; + } + + dst_chars.resize(dst_pos - dst); return dst_column; } }; diff --git a/src/Functions/base64Decode.cpp b/src/Functions/base64Decode.cpp index f6943233d44..4060aafe1a3 100644 --- a/src/Functions/base64Decode.cpp +++ b/src/Functions/base64Decode.cpp @@ -1,8 +1,7 @@ #include + #if USE_BASE64 #include -#include - namespace DB { @@ -15,4 +14,5 @@ REGISTER_FUNCTION(Base64Decode) factory.registerAlias("FROM_BASE64", "base64Decode", FunctionFactory::CaseInsensitive); } } + #endif diff --git a/src/Functions/base64Encode.cpp b/src/Functions/base64Encode.cpp index fc06935e0a1..773db7e09d9 100644 --- a/src/Functions/base64Encode.cpp +++ b/src/Functions/base64Encode.cpp @@ -1,10 +1,7 @@ -#include #include -#include "config.h" - #if USE_BASE64 -# include +#include namespace DB { @@ -17,4 +14,5 @@ REGISTER_FUNCTION(Base64Encode) factory.registerAlias("TO_BASE64", "base64Encode", FunctionFactory::CaseInsensitive); } } + #endif diff --git a/src/Functions/tryBase64Decode.cpp b/src/Functions/tryBase64Decode.cpp index 1102c7a3418..bd452b8357b 100644 --- a/src/Functions/tryBase64Decode.cpp +++ b/src/Functions/tryBase64Decode.cpp @@ -1,7 +1,7 @@ #include + #if USE_BASE64 #include -#include namespace DB { @@ -10,4 +10,5 @@ REGISTER_FUNCTION(TryBase64Decode) factory.registerFunction>(); } } + #endif From 431e1aa3faf43b7cc1625df8d02615f195f20af8 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Fri, 14 Oct 2022 11:11:08 -0700 Subject: [PATCH 2/6] Implement functional tests --- tests/queries/0_stateless/00732_base64_functions.reference | 2 ++ tests/queries/0_stateless/00732_base64_functions.sql | 3 +++ 2 files changed, 5 insertions(+) diff --git a/tests/queries/0_stateless/00732_base64_functions.reference b/tests/queries/0_stateless/00732_base64_functions.reference index b22ae4e7e24..5dc1ba03b89 100644 --- a/tests/queries/0_stateless/00732_base64_functions.reference +++ b/tests/queries/0_stateless/00732_base64_functions.reference @@ -14,3 +14,5 @@ fooba foobar 1 1 +Zm9v +foo diff --git a/tests/queries/0_stateless/00732_base64_functions.sql b/tests/queries/0_stateless/00732_base64_functions.sql index 4ed86e20913..adba0cdebbd 100644 --- a/tests/queries/0_stateless/00732_base64_functions.sql +++ b/tests/queries/0_stateless/00732_base64_functions.sql @@ -14,3 +14,6 @@ SELECT base64Decode(val, 'excess argument') FROM (select arrayJoin(['', 'Zg==', SELECT tryBase64Decode('Zm9vYmF=Zm9v', 'excess argument'); -- { serverError 42 } SELECT base64Decode('Zm9vYmF=Zm9v'); -- { serverError 117 } + +select base64Encode(toFixedString('foo', 3)); +select base64Decode(toFixedString('Zm9v', 4)); From 52a34cecdad16baf2cac3b29283c748c375b0688 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 2 Nov 2022 20:28:30 +0100 Subject: [PATCH 3/6] Reformat the cmake a bit --- cmake/clang_tidy.cmake | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cmake/clang_tidy.cmake b/cmake/clang_tidy.cmake index 200282234ca..c1bf4404d34 100644 --- a/cmake/clang_tidy.cmake +++ b/cmake/clang_tidy.cmake @@ -6,7 +6,7 @@ if (ENABLE_CLANG_TIDY) find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12") if (CLANG_TIDY_PATH) - message(STATUS + message (STATUS "Using clang-tidy: ${CLANG_TIDY_PATH}. The checks will be run during build process. See the .clang-tidy file at the root directory to configure the checks.") @@ -15,11 +15,15 @@ if (ENABLE_CLANG_TIDY) # clang-tidy requires assertions to guide the analysis # Note that NDEBUG is set implicitly by CMake for non-debug builds - set(COMPILER_FLAGS "${COMPILER_FLAGS} -UNDEBUG") + set (COMPILER_FLAGS "${COMPILER_FLAGS} -UNDEBUG") - # The variable CMAKE_CXX_CLANG_TIDY will be set inside src and base directories with non third-party code. + # The variable CMAKE_CXX_CLANG_TIDY will be set inside the following directories with non third-party code. + # - base + # - programs + # - src + # - utils # set (CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_PATH}") else () - message(${RECONFIGURE_MESSAGE_LEVEL} "clang-tidy is not found") + message (${RECONFIGURE_MESSAGE_LEVEL} "clang-tidy is not found") endif () endif () From 150ceb7d37464ebb2c2f61c4a96c36cfced80928 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 2 Nov 2022 20:29:13 +0100 Subject: [PATCH 4/6] Use matus-chochlik/ctcache for clang-tidy --- cmake/clang_tidy.cmake | 12 +++++++++++- docker/packager/binary/Dockerfile | 3 +++ docker/packager/packager | 8 +++++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/cmake/clang_tidy.cmake b/cmake/clang_tidy.cmake index c1bf4404d34..57295682487 100644 --- a/cmake/clang_tidy.cmake +++ b/cmake/clang_tidy.cmake @@ -3,7 +3,17 @@ option (ENABLE_CLANG_TIDY "Use clang-tidy static analyzer" OFF) if (ENABLE_CLANG_TIDY) - find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12") + find_program (CLANG_TIDY_CACHE_PATH NAMES "clang-tidy-cache") + if (CLANG_TIDY_CACHE_PATH) + find_program (_CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12") + + # Why do we use ';' here? + # It's a cmake black magic: https://cmake.org/cmake/help/latest/prop_tgt/LANG_CLANG_TIDY.html#prop_tgt:%3CLANG%3E_CLANG_TIDY + # The CLANG_TIDY_PATH is passed to CMAKE_CXX_CLANG_TIDY, which follows CXX_CLANG_TIDY syntax. + set (CLANG_TIDY_PATH "${CLANG_TIDY_CACHE_PATH};${_CLANG_TIDY_PATH}" CACHE STRING "A combined command to run clang-tidy with caching wrapper") + else () + find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12") + endif () if (CLANG_TIDY_PATH) message (STATUS diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 77afc3e924b..06c3c0d80f0 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -91,6 +91,9 @@ ENV PATH="$PATH:/usr/local/go/bin" ENV GOPATH=/workdir/go ENV GOCACHE=/workdir/ +RUN curl https://raw.githubusercontent.com/matus-chochlik/ctcache/7fd516e91c17779cbc6fc18bd119313d9532dd90/clang-tidy-cache -Lo /usr/bin/clang-tidy-cache \ + && chmod +x /usr/bin/clang-tidy-cache + RUN mkdir /workdir && chmod 777 /workdir WORKDIR /workdir diff --git a/docker/packager/packager b/docker/packager/packager index 83629dc7408..7f6bd8818fb 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -258,6 +258,10 @@ def parse_env_variables( if clang_tidy: # 15G is not enough for tidy build cache_maxsize = "25G" + + # `CTCACHE_DIR` has the same purpose as the `CCACHE_DIR` above. + # It's there to have the clang-tidy cache embedded into our standard `CCACHE_DIR` + result.append("CTCACHE_DIR=/ccache/clang-tidy-cache") result.append(f"CCACHE_MAXSIZE={cache_maxsize}") if distcc_hosts: @@ -282,9 +286,7 @@ def parse_env_variables( cmake_flags.append("-DENABLE_TESTS=1") if shared_libraries: - cmake_flags.append( - "-DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1" - ) + cmake_flags.append("-DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1") # We can't always build utils because it requires too much space, but # we have to build them at least in some way in CI. The shared library # build is probably the least heavy disk-wise. From b7fb500d5b7f5b140c2692207de9d25eadfe7883 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 1 Nov 2022 18:24:13 +0000 Subject: [PATCH 5/6] Fix incorrect saved_block_sample with multiple disjuncts in hash join --- src/Interpreters/HashJoin.cpp | 4 +++- .../0_stateless/02475_join_bug_42832.reference | 2 ++ .../queries/0_stateless/02475_join_bug_42832.sql | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02475_join_bug_42832.reference create mode 100644 tests/queries/0_stateless/02475_join_bug_42832.sql diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 26b9b843567..41c7c28a6fa 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -658,7 +658,9 @@ void HashJoin::initRightBlockStructure(Block & saved_block_sample) /// Save non key columns for (auto & column : sample_block_with_columns_to_add) { - if (!saved_block_sample.findByName(column.name)) + if (auto * col = saved_block_sample.findByName(column.name)) + *col = column; + else saved_block_sample.insert(column); } } diff --git a/tests/queries/0_stateless/02475_join_bug_42832.reference b/tests/queries/0_stateless/02475_join_bug_42832.reference new file mode 100644 index 00000000000..e5310261d0a --- /dev/null +++ b/tests/queries/0_stateless/02475_join_bug_42832.reference @@ -0,0 +1,2 @@ +4 6 +4 4 diff --git a/tests/queries/0_stateless/02475_join_bug_42832.sql b/tests/queries/0_stateless/02475_join_bug_42832.sql new file mode 100644 index 00000000000..e383949fb22 --- /dev/null +++ b/tests/queries/0_stateless/02475_join_bug_42832.sql @@ -0,0 +1,16 @@ +DROP TABLE IF EXISTS tab1; +DROP TABLE IF EXISTS tab2; + +SET allow_suspicious_low_cardinality_types = 1; + +CREATE TABLE tab1 (a1 Int32, b1 Int32, val UInt64) ENGINE = MergeTree ORDER BY a1; +CREATE TABLE tab2 (a2 LowCardinality(Int32), b2 Int32) ENGINE = MergeTree ORDER BY a2; + +INSERT INTO tab1 SELECT number, number, 1 from numbers(4); +INSERT INTO tab2 SELECT number + 2, number + 2 from numbers(4); + +SELECT sum(val), count(val) FROM tab1 FULL OUTER JOIN tab2 ON b1 - 2 = a2 OR a1 = b2 SETTINGS join_use_nulls = 0; +SELECT sum(val), count(val) FROM tab1 FULL OUTER JOIN tab2 ON b1 - 2 = a2 OR a1 = b2 SETTINGS join_use_nulls = 1; + +DROP TABLE IF EXISTS tab1; +DROP TABLE IF EXISTS tab2; From 5f184359a1dfa98336a10fa9f2641b1254e165b4 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 3 Nov 2022 12:44:07 +0000 Subject: [PATCH 6/6] Do not show status for Bugfix validate check in non bugfix PRs --- tests/ci/bugfix_validate_check.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/ci/bugfix_validate_check.py b/tests/ci/bugfix_validate_check.py index 9ecada816b6..e5f37f2940b 100644 --- a/tests/ci/bugfix_validate_check.py +++ b/tests/ci/bugfix_validate_check.py @@ -3,6 +3,7 @@ import argparse import csv import itertools +import logging import os from github import Github @@ -37,6 +38,8 @@ def process_result(file_path): state, report_url, description = post_commit_status_from_file(file_path) prefix = os.path.basename(os.path.dirname(file_path)) is_ok = state == "success" + if is_ok and report_url == "null": + return is_ok, None status = f'OK: Bug reproduced (Report' if not is_ok: @@ -51,15 +54,23 @@ def process_all_results(file_paths): for status_path in file_paths: is_ok, test_results = process_result(status_path) any_ok = any_ok or is_ok - all_results.extend(test_results) + if test_results is not None: + all_results.extend(test_results) + return any_ok, all_results def main(args): + logging.basicConfig(level=logging.INFO) + check_name_with_group = "Bugfix validate check" is_ok, test_results = process_all_results(args.status) + if not test_results: + logging.info("No results to upload") + return + pr_info = PRInfo() report_url = upload_results( S3Helper(),