From 06d2c82c22d07cc32db51728f558c200a8acd493 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Sat, 29 Feb 2020 22:39:18 +0300
Subject: [PATCH] Optimize bit operations on FixedString when one of the arguments is constant #9091

---
 dbms/src/Functions/FunctionBinaryArithmetic.h | 61 +++++++++++++++++--
 .../01090_fixed_string_bit_ops.reference | 4 ++
 .../01090_fixed_string_bit_ops.sql | 5 ++
 3 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.reference
 create mode 100644 dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.sql

diff --git a/dbms/src/Functions/FunctionBinaryArithmetic.h b/dbms/src/Functions/FunctionBinaryArithmetic.h
index 0ab0e77f4eb..fdd36240bfe 100644
--- a/dbms/src/Functions/FunctionBinaryArithmetic.h
+++ b/dbms/src/Functions/FunctionBinaryArithmetic.h
@@ -95,16 +95,65 @@ struct FixedStringOperationImpl
             c[i] = Op::template apply(a[i], b[i]);
     }
 
-    static void NO_INLINE vector_constant(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
+    template /// NOTE(review): the template parameter list is missing in this copy of the patch; the body reads an `inverted` flag that is not declared in the visible code.
+    static void NO_INLINE vector_constant_impl(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
     {
-        for (size_t i = 0; i < size; ++i)
-            c[i] = Op::template apply(a[i], b[i % N]);
+        /// Computes c[i] = Op(a[i], b[i % N]) for i < size (operand order chosen by `inverted`) while avoiding integer division in the inner loop.
+        /// NOTE(review): assumes N > 0 (see i % N and 16 % N below) and that a and c have 15 bytes of slack past size - confirm at call sites.
+        /// Create a pattern of repeated values of b with at least 16 bytes,
+        /// so we can read 16 bytes of this repeated pattern starting from any offset inside b.
+        ///
+        /// Example:
+        ///
+        ///  N = 6
+        ///  ------
+        /// [abcdefabcdefabcdefabc]
+        ///       ^^^^^^^^^^^^^^^^
+        ///      16 bytes starting from the last offset inside b.
+
+        const size_t b_repeated_size = N + 15;
+        UInt8 b_repeated[b_repeated_size]; /// NOTE(review): runtime-sized local array (compiler extension, not standard C++); its size grows with N.
+        for (size_t i = 0; i < b_repeated_size; ++i)
+            b_repeated[i] = b[i % N]; /// The division here runs only N + 15 times, not once per output byte.
+
+        size_t b_offset = 0;
+        size_t b_increment = 16 % N; /// How far the position inside the pattern advances after each 16-byte step.
+
+        /// Example:
+        ///
+        /// At first iteration we copy 16 bytes at offset 0 from b_repeated:
+        /// [abcdefabcdefabcdefabc]
+        ///  ^^^^^^^^^^^^^^^^
+        /// At second iteration we copy 16 bytes at offset 4 = 16 % 6 from b_repeated:
+        /// [abcdefabcdefabcdefabc]
+        ///      ^^^^^^^^^^^^^^^^
+        /// At third iteration we copy 16 bytes at offset 2 = (16 * 2) % 6 from b_repeated:
+        /// [abcdefabcdefabcdefabc]
+        ///    ^^^^^^^^^^^^^^^^
+
+        /// Process 16 bytes per step; the last step may touch up to 15 bytes past size in a and c. PaddedPODArray allows overflow for 15 bytes.
+        for (size_t i = 0; i < size; i += 16)
+        {
+            /// Fixed trip count of 16 so that the loop can be vectorized into two SIMD mov; `inverted` only selects the operand order passed to Op.
+            for (size_t j = 0; j < 16; ++j)
+                c[i + j] = inverted
+                    ? Op::template apply(a[i + j], b_repeated[b_offset + j])
+                    : Op::template apply(b_repeated[b_offset + j], a[i + j]);
+
+            b_offset += b_increment;
+            if (b_offset >= N) /// Both b_offset and b_increment are below N, so one subtraction is enough. This condition is easily predictable.
+                b_offset -= N;
+        }
     }
 
-    static void NO_INLINE constant_vector(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
+    static void vector_constant(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
     {
-        for (size_t i = 0; i < size; ++i)
-            c[i] = Op::template apply(a[i % N], b[i]);
+        vector_constant_impl(a, b, c, size, N); /// Here a is the full-length data and b is the N-byte value that gets repeated.
+    }
+
+    static void constant_vector(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
+    {
+        vector_constant_impl(b, a, c, size, N); /// Pointers are swapped: here a is the N-byte value that gets repeated and b is the full-length data.
     }
 };
 
diff --git a/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.reference b/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.reference
new file mode 100644
index 00000000000..5c8bb6fac36
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.reference
@@ -0,0 +1,4 @@
+aca
+acagac
+aca
+acagac
diff --git a/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.sql b/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.sql
new file mode 100644
index 00000000000..72ef1c3746c
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.sql
@@ -0,0 +1,5 @@
+SELECT DISTINCT bitXor(materialize(toFixedString('abc', 3)), toFixedString('\x00\x01\x02', 3)) FROM numbers(10);
+SELECT DISTINCT bitXor(materialize(toFixedString('abcdef', 6)), toFixedString('\x00\x01\x02\x03\x04\x05', 6)) FROM numbers(10);
+
+SELECT DISTINCT bitXor(toFixedString('\x00\x01\x02', 3), materialize(toFixedString('abc', 3))) FROM numbers(10);
+SELECT DISTINCT bitXor(toFixedString('\x00\x01\x02\x03\x04\x05', 6), materialize(toFixedString('abcdef', 6))) FROM numbers(10);