Optimize bit operations on FixedString when one of the arguments is constant #9091

This commit is contained in:
Alexey Milovidov 2020-02-29 22:39:18 +03:00
parent 2fc799c1c1
commit 06d2c82c22
3 changed files with 64 additions and 6 deletions

View File

@ -95,16 +95,65 @@ struct FixedStringOperationImpl
c[i] = Op::template apply<UInt8>(a[i], b[i]);
}
static void NO_INLINE vector_constant(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
template <bool inverted>
static void NO_INLINE vector_constant_impl(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
{
for (size_t i = 0; i < size; ++i)
c[i] = Op::template apply<UInt8>(a[i], b[i % N]);
/// These complications are needed to avoid integer division in inner loop.
/// Create a pattern of repeated values of b with at least 16 bytes,
/// so we can read 16 bytes of this repeated pattern starting from any offset inside b.
///
/// Example:
///
/// N = 6
/// ------
/// [abcdefabcdefabcdefabc]
/// ^^^^^^^^^^^^^^^^
/// 16 bytes starting from the last offset inside b.
const size_t b_repeated_size = N + 15;
UInt8 b_repeated[b_repeated_size];
for (size_t i = 0; i < b_repeated_size; ++i)
b_repeated[i] = b[i % N];
size_t b_offset = 0;
size_t b_increment = 16 % N;
/// Example:
///
/// At first iteration we copy 16 bytes at offset 0 from b_repeated:
/// [abcdefabcdefabcdefabc]
/// ^^^^^^^^^^^^^^^^
/// At second iteration we copy 16 bytes at offset 4 = 16 % 6 from b_repeated:
/// [abcdefabcdefabcdefabc]
/// ^^^^^^^^^^^^^^^^
/// At third iteration we copy 16 bytes at offset 2 = (16 * 2) % 6 from b_repeated:
/// [abcdefabcdefabcdefabc]
/// ^^^^^^^^^^^^^^^^
/// PaddedPODArray allows overflow for 15 bytes.
for (size_t i = 0; i < size; i += 16)
{
/// This loop is formed in a way to be vectorized into two SIMD mov.
for (size_t j = 0; j < 16; ++j)
c[i + j] = inverted
? Op::template apply<UInt8>(a[i + j], b_repeated[b_offset + j])
: Op::template apply<UInt8>(b_repeated[b_offset + j], a[i + j]);
b_offset += b_increment;
if (b_offset >= N) /// This condition is easily predictable.
b_offset -= N;
}
}
static void NO_INLINE constant_vector(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
static void vector_constant(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
{
for (size_t i = 0; i < size; ++i)
c[i] = Op::template apply<UInt8>(a[i % N], b[i]);
vector_constant_impl<false>(a, b, c, size, N);
}
static void constant_vector(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
{
vector_constant_impl<true>(b, a, c, size, N);
}
};

View File

@ -0,0 +1,4 @@
aca
acagac
aca
acagac

View File

@ -0,0 +1,5 @@
-- bitXor of a FixedString column with a FixedString constant.
-- materialize() turns one argument into a full column, so exactly one side stays constant.
-- DISTINCT collapses the 10 identical rows into a single result line.

-- Column on the left, constant on the right, N = 3: expected 'aca'.
SELECT DISTINCT
    bitXor(
        materialize(toFixedString('abc', 3)),
        toFixedString('\x00\x01\x02', 3))
FROM numbers(10);

-- Column on the left, constant on the right, N = 6: expected 'acagac'.
SELECT DISTINCT
    bitXor(
        materialize(toFixedString('abcdef', 6)),
        toFixedString('\x00\x01\x02\x03\x04\x05', 6))
FROM numbers(10);

-- Constant on the left, column on the right, N = 3: expected 'aca'.
SELECT DISTINCT
    bitXor(
        toFixedString('\x00\x01\x02', 3),
        materialize(toFixedString('abc', 3)))
FROM numbers(10);

-- Constant on the left, column on the right, N = 6: expected 'acagac'.
SELECT DISTINCT
    bitXor(
        toFixedString('\x00\x01\x02\x03\x04\x05', 6),
        materialize(toFixedString('abcdef', 6)))
FROM numbers(10);