Merge pull request #9454 from ClickHouse/fixed-string-bit-ops-vector-constant-optimization

Optimize bit operations on FixedString when one of the arguments is constant
2024-11-21 23:21:59 +00:00 · 2020-03-02 15:18:30 +03:00 · 2020-03-02 15:18:30 +03:00 · d15f11afc4
commit d15f11afc4
parent 020122d995 06d2c82c22
3 changed files with 64 additions and 6 deletions
--- a/dbms/src/Functions/FunctionBinaryArithmetic.h
+++ b/dbms/src/Functions/FunctionBinaryArithmetic.h
@ -95,16 +95,65 @@ struct FixedStringOperationImpl
            c[i] = Op::template apply<UInt8>(a[i], b[i]);
    }

-    static void NO_INLINE vector_constant(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
+    template <bool inverted>
+    static void NO_INLINE vector_constant_impl(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
    {
-        for (size_t i = 0; i < size; ++i)
-            c[i] = Op::template apply<UInt8>(a[i], b[i % N]);
+        /// Applies Op byte by byte between the buffer `a` and the N-byte pattern `b` repeated cyclically, writing the results to `c`.
+        /// The complications below are needed to avoid integer division (`i % N`) in the inner loop.
+        /// Create a pattern of repeated values of b with at least 16 bytes,
+        /// so we can read 16 bytes of this repeated pattern starting from any offset inside b.
+        ///
+        /// Example:
+        ///
+        ///  N = 6
+        ///  ------
+        /// [abcdefabcdefabcdefabc]
+        ///       ^^^^^^^^^^^^^^^^
+        ///      16 bytes starting from the last offset inside b.
+        /// NOTE(review): b_repeated is a variable-length array (its size depends on the runtime value N), which is a compiler extension rather than standard C++.
+        const size_t b_repeated_size = N + 15;
+        UInt8 b_repeated[b_repeated_size];
+        for (size_t i = 0; i < b_repeated_size; ++i)
+            b_repeated[i] = b[i % N];
+
+        size_t b_offset = 0;
+        size_t b_increment = 16 % N; /// How far the offset inside the pattern advances per 16-byte step; N must be non-zero.
+
+        /// Example:
+        ///
+        /// At the first iteration we copy 16 bytes at offset 0 from b_repeated:
+        /// [abcdefabcdefabcdefabc]
+        ///  ^^^^^^^^^^^^^^^^
+        /// At the second iteration we copy 16 bytes at offset 4 = 16 % 6 from b_repeated:
+        /// [abcdefabcdefabcdefabc]
+        ///      ^^^^^^^^^^^^^^^^
+        /// At the third iteration we copy 16 bytes at offset 2 = (16 * 2) % 6 from b_repeated:
+        /// [abcdefabcdefabcdefabc]
+        ///    ^^^^^^^^^^^^^^^^
+
+        /// The last iteration may read `a` and write `c` up to 15 bytes past `size`. PaddedPODArray allows overflow for 15 bytes (assumes both buffers are backed by PaddedPODArray - confirm at call sites).
+        for (size_t i = 0; i < size; i += 16)
+        {
+            /// This loop has a fixed trip count of 16; it is formed in a way to be vectorized into two SIMD mov.
+            for (size_t j = 0; j < 16; ++j)
+                c[i + j] = inverted
+                    ? Op::template apply<UInt8>(a[i + j], b_repeated[b_offset + j]) /// inverted: Op(buffer, pattern)
+                    : Op::template apply<UInt8>(b_repeated[b_offset + j], a[i + j]); /// not inverted: Op(pattern, buffer)
+            /// NOTE(review) on the ternary above: with inverted == false this evaluates apply(pattern, a), whereas the scalar loop it replaces evaluated apply(a, pattern); the results match only if Op is commutative - confirm for every Op used.
+            b_offset += b_increment;
+            if (b_offset >= N) /// Wrap around: b_offset < N and b_increment < N, so one subtraction is enough. This condition is easily predictable.
+                b_offset -= N;
+        }
    }

-    static void NO_INLINE constant_vector(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
+    static void vector_constant(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
    {
-        for (size_t i = 0; i < size; ++i)
-            c[i] = Op::template apply<UInt8>(a[i % N], b[i]);
+        vector_constant_impl<false>(a, b, c, size, N); /// `a` is the full buffer, `b` the N-byte constant. NOTE(review): the <false> branch evaluates apply(b, a), not apply(a, b) - fine only if Op is commutative, confirm.
+    }
+
+    static void constant_vector(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
+    {
+        vector_constant_impl<true>(b, a, c, size, N); /// Here `a` is the N-byte constant and `b` the full buffer, hence the swapped arguments. NOTE(review): the <true> branch evaluates apply(b, a), not apply(a, b) - fine only if Op is commutative, confirm.
    }
 };

--- a/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.reference
+++ b/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.reference
@ -0,0 +1,4 @@
+aca
+acagac
+aca
+acagac
--- a/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.sql
+++ b/dbms/tests/queries/0_stateless/01090_fixed_string_bit_ops.sql
@ -0,0 +1,5 @@
+SELECT DISTINCT bitXor(materialize(toFixedString('abc', 3)), toFixedString('\x00\x01\x02', 3)) FROM numbers(10);
+SELECT DISTINCT bitXor(materialize(toFixedString('abcdef', 6)), toFixedString('\x00\x01\x02\x03\x04\x05', 6)) FROM numbers(10);
+-- The same two checks with the argument order swapped: the plain toFixedString value first, the materialize(...) argument second.
+SELECT DISTINCT bitXor(toFixedString('\x00\x01\x02', 3), materialize(toFixedString('abc', 3))) FROM numbers(10);
+SELECT DISTINCT bitXor(toFixedString('\x00\x01\x02\x03\x04\x05', 6), materialize(toFixedString('abcdef', 6))) FROM numbers(10);