mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 09:02:00 +00:00
Merge pull request #9454 from ClickHouse/fixed-string-bit-ops-vector-constant-optimization
Optimize bit operations on FixedString when one of the arguments is constant
This commit is contained in:
commit
d15f11afc4
@ -95,16 +95,65 @@ struct FixedStringOperationImpl
|
||||
c[i] = Op::template apply<UInt8>(a[i], b[i]);
|
||||
}
|
||||
|
||||
static void NO_INLINE vector_constant(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
|
||||
template <bool inverted>
|
||||
static void NO_INLINE vector_constant_impl(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
|
||||
{
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
c[i] = Op::template apply<UInt8>(a[i], b[i % N]);
|
||||
/// These complications are needed to avoid integer division in inner loop.
|
||||
|
||||
/// Create a pattern of repeated values of b with at least 16 bytes,
|
||||
/// so we can read 16 bytes of this repeated pattern starting from any offset inside b.
|
||||
///
|
||||
/// Example:
|
||||
///
|
||||
/// N = 6
|
||||
/// ------
|
||||
/// [abcdefabcdefabcdefabc]
|
||||
/// ^^^^^^^^^^^^^^^^
|
||||
/// 16 bytes starting from the last offset inside b.
|
||||
|
||||
const size_t b_repeated_size = N + 15;
|
||||
UInt8 b_repeated[b_repeated_size];
|
||||
for (size_t i = 0; i < b_repeated_size; ++i)
|
||||
b_repeated[i] = b[i % N];
|
||||
|
||||
size_t b_offset = 0;
|
||||
size_t b_increment = 16 % N;
|
||||
|
||||
/// Example:
|
||||
///
|
||||
/// At first iteration we copy 16 bytes at offset 0 from b_repeated:
|
||||
/// [abcdefabcdefabcdefabc]
|
||||
/// ^^^^^^^^^^^^^^^^
|
||||
/// At second iteration we copy 16 bytes at offset 4 = 16 % 6 from b_repeated:
|
||||
/// [abcdefabcdefabcdefabc]
|
||||
/// ^^^^^^^^^^^^^^^^
|
||||
/// At third iteration we copy 16 bytes at offset 2 = (16 * 2) % 6 from b_repeated:
|
||||
/// [abcdefabcdefabcdefabc]
|
||||
/// ^^^^^^^^^^^^^^^^
|
||||
|
||||
/// PaddedPODArray allows overflow for 15 bytes.
|
||||
for (size_t i = 0; i < size; i += 16)
|
||||
{
|
||||
/// This loop is formed in a way to be vectorized into two SIMD mov.
|
||||
for (size_t j = 0; j < 16; ++j)
|
||||
c[i + j] = inverted
|
||||
? Op::template apply<UInt8>(a[i + j], b_repeated[b_offset + j])
|
||||
: Op::template apply<UInt8>(b_repeated[b_offset + j], a[i + j]);
|
||||
|
||||
b_offset += b_increment;
|
||||
if (b_offset >= N) /// This condition is easily predictable.
|
||||
b_offset -= N;
|
||||
}
|
||||
}
|
||||
|
||||
static void NO_INLINE constant_vector(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
|
||||
static void vector_constant(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
|
||||
{
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
c[i] = Op::template apply<UInt8>(a[i % N], b[i]);
|
||||
vector_constant_impl<false>(a, b, c, size, N);
|
||||
}
|
||||
|
||||
static void constant_vector(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt8 * __restrict c, size_t size, size_t N)
|
||||
{
|
||||
vector_constant_impl<true>(b, a, c, size, N);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -0,0 +1,4 @@
|
||||
aca
|
||||
acagac
|
||||
aca
|
||||
acagac
|
@ -0,0 +1,5 @@
|
||||
-- First argument wrapped in materialize(), second argument a plain FixedString literal.
-- Presumably this exercises the column-with-constant code path; confirm against the function dispatch.
-- N = 3: 'abc' XOR 0x00 0x01 0x02 gives 'aca' on every row, so DISTINCT returns a single row.
SELECT DISTINCT bitXor(materialize(toFixedString('abc', 3)), toFixedString('\x00\x01\x02', 3)) FROM numbers(10);
|
||||
-- N = 6: 'abcdef' XOR 0x00..0x05 gives 'acagac'. 6 does not divide 16, so the constant wraps inside a 16-byte group.
SELECT DISTINCT bitXor(materialize(toFixedString('abcdef', 6)), toFixedString('\x00\x01\x02\x03\x04\x05', 6)) FROM numbers(10);
|
||||
|
||||
-- Same two cases with the argument order swapped: plain literal first, materialize() second.
-- XOR is commutative, so the expected results are again 'aca' and 'acagac'.
SELECT DISTINCT bitXor(toFixedString('\x00\x01\x02', 3), materialize(toFixedString('abc', 3))) FROM numbers(10);
|
||||
SELECT DISTINCT bitXor(toFixedString('\x00\x01\x02\x03\x04\x05', 6), materialize(toFixedString('abcdef', 6))) FROM numbers(10);
|
Loading…
Reference in New Issue
Block a user