From d560db65f666fae90ee591693dc60e6852d0ba97 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Tue, 12 Jun 2018 06:32:48 +0300
Subject: [PATCH] Fixed weird error #1890

---
 dbms/src/IO/LZ4_decompress_faster.cpp | 112 +++++++++++++++++++++++++-
 1 file changed, 108 insertions(+), 4 deletions(-)

diff --git a/dbms/src/IO/LZ4_decompress_faster.cpp b/dbms/src/IO/LZ4_decompress_faster.cpp
index 25316fee5a6..adddb075fa7 100644
--- a/dbms/src/IO/LZ4_decompress_faster.cpp
+++ b/dbms/src/IO/LZ4_decompress_faster.cpp
@@ -52,6 +52,8 @@ inline void wildCopy8(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
 inline void copyOverlap8(UInt8 * op, const UInt8 *& match, const size_t offset)
 {
     /// 4 % n.
+    /// Or if 4 % n is zero, we use n.
+    /// It gives an equivalent result, but is more CPU friendly for an unknown reason.
     static constexpr int shift1[] = { 0, 1, 2, 1, 4, 4, 4, 4 };
 
     /// 8 % n - 4 % n
@@ -67,11 +69,89 @@ inline void copyOverlap8(UInt8 * op, const UInt8 *& match, const size_t offset)
     match += shift2[offset];
 }
 
+
+/** We use 'xmm' (128bit SSE) registers here to shuffle 16 bytes.
+  *
+  * It is possible to use 'mm' (64bit MMX) registers to shuffle just 8 bytes as we need.
+  *
+  * There is a corresponding version of the 'pshufb' instruction that operates on 'mm' registers
+  *  (it operates on MMX registers although it is available in SSSE3),
+  *  and the compiler library has the corresponding intrinsic: '_mm_shuffle_pi8'.
+  *
+  * It can be done like this:
+  *
+  *  unalignedStore(op, _mm_shuffle_pi8(
+  *      unalignedLoad<__m64>(match),
+  *      unalignedLoad<__m64>(masks + 8 * offset)));
+  *
+  * This is perfectly correct and this code has the same or even better performance.
+  *
+  * But if we write the code this way, it will lead to
+  *  extremely weird and extremely non-obvious
+  *  effects in completely unrelated parts of the code.
+  *
+  * Because using MMX registers alters the mode of operation of the x87 FPU,
+  *  and operations with the FPU then become broken.
+  *
+  * Example 1.
+  * Compile this code without optimizations:
+
+    #include <vector>
+    #include <unordered_set>
+    #include <iostream>
+    #include <tmmintrin.h>
+
+    int main(int, char **)
+    {
+        [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});
+
+        std::vector<int> vec;
+        std::unordered_set<int> set(vec.begin(), vec.end());
+
+        std::cerr << set.size() << "\n";
+        return 0;
+    }
+
+    $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug1.cpp && ./a.out
+    terminate called after throwing an instance of 'std::bad_alloc'
+      what():  std::bad_alloc
+
+    Also reproduced with clang. But only with libstdc++, not with libc++.
+
+  * Example 2.
+
+    #include <iostream>
+    #include <tmmintrin.h>
+
+    int main(int, char **)
+    {
+        double max_fill = 1;
+
+        std::cerr << (long double)max_fill << "\n";
+        [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});
+        std::cerr << (long double)max_fill << "\n";
+
+        return 0;
+    }
+
+    $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug2.cpp && ./a.out
+    1
+    -nan
+
+  * Explanation:
+  *
+  * https://stackoverflow.com/questions/33692969/assembler-mmx-errors
+  * https://software.intel.com/en-us/node/524274
+  *
+  * Actually it's possible to use the 'emms' instruction after the decompression routine.
+  * But it's easier to just use 'xmm' registers and avoid using 'mm' registers.
+  */
 inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
 {
 #ifdef __SSSE3__
 
-    static constexpr UInt8 __attribute__((__aligned__(8))) masks[] =
+    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
     {
         0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
         0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
@@ -81,11 +161,13 @@ inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t o
         0, 1, 2, 3, 4, 0, 1, 2,
         0, 1, 2, 3, 4, 5, 0, 1,
         0, 1, 2, 3, 4, 5, 6, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, /* this row is not used: padding to allow reading 16 bytes starting from the previous row */
     };
 
-    unalignedStore(op, _mm_shuffle_pi8(
-        unalignedLoad<__m64>(match),
-        unalignedLoad<__m64>(masks + 8 * offset)));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(op),
+        _mm_shuffle_epi8(
+            _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)),
+            _mm_load_si128(reinterpret_cast<const __m128i *>(masks + 8 * offset))));
 
     match += masks[offset];
 
@@ -225,6 +307,17 @@ void NO_INLINE decompressImpl(
 
         UInt8 * copy_end = op + length;
 
+        /// input: Hello, world
+        ///        ^-ip
+        /// output: xyz
+        ///            ^-op ^-copy_end
+        /// output: xyzHello, w
+        ///                 ^- excess bytes copied due to "wildCopy"
+        /// input: Hello, world
+        ///             ^-ip
+        /// output: xyzHello, w
+        ///                 ^-op (we will overwrite the excess bytes on the next iteration)
+
         wildCopy<copy_amount>(op, ip, copy_end);    /// Here we can write up to copy_amount - 1 bytes after buffer.
 
         ip += length;
@@ -256,6 +349,17 @@ void NO_INLINE decompressImpl(
 
         if (unlikely(offset < copy_amount))
         {
+            /// output: Hello
+            ///              ^-op
+            ///         ^-match; offset = 5
+            ///
+            /// output: Hello
+            ///         [------] - copy_amount bytes
+            ///              [------] - copy them here
+            ///
+            /// output: HelloHelloHel
+            ///            ^-match   ^-op
+
             copyOverlap<copy_amount>(op, match, offset);
         }
         else
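
As a standalone illustration of the change, here is a minimal sketch of the xmm-based overlap copy, runnable outside of ClickHouse. The file name 'overlap_demo.cpp', the helper name and the test buffer are illustrative assumptions, not part of the commit. One deliberate difference: the mask row is fetched with _mm_loadu_si128 rather than _mm_load_si128, because with 8-byte rows an odd offset yields an address that is only 8-byte aligned.

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <tmmintrin.h>

    using UInt8 = std::uint8_t;

    /// The same shuffle table as in the patch; the extra zero row keeps a
    /// 16-byte load starting at any of the 8 real rows inside the array.
    static constexpr UInt8 masks[] =
    {
        0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0: shift amounts, not a mask */
        0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1,
        0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 0,
        0, 0, 0, 0, 0, 0, 0, 0, /* padding */
    };

    /// Copy 8 bytes from 'match' to 'op' for an overlapping match (offset < 8)
    /// with a single pshufb on xmm registers; no MMX state is touched.
    /// 16 bytes are stored, but only the first 8 are meaningful: a real
    /// decompressor overwrites the rest on the next copy.
    static void copyOverlap8ShuffleDemo(UInt8 * op, const UInt8 *& match, size_t offset)
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(op),
            _mm_shuffle_epi8(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)),
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(masks + 8 * offset))));

        match += masks[offset];    /// advance so that the effective offset grows
    }

    int main()
    {
        char buf[32] = "Hello";    /// output so far; match points at 'H', offset = 5

        const UInt8 * match = reinterpret_cast<const UInt8 *>(buf);
        UInt8 * op = reinterpret_cast<UInt8 *>(buf) + 5;

        copyOverlap8ShuffleDemo(op, match, 5);

        std::cout.write(buf, 13) << "\n";    /// HelloHelloHel, as in the diagram
        return 0;
    }

    $ g++ -O2 -mssse3 -std=c++17 overlap_demo.cpp && ./a.out
    HelloHelloHel

For offset = 5 the low 8 shuffled bytes are match[0..4] followed by match[0..2], and match then advances by masks[5] = 3, so the effective offset doubles to 10 for the next copy.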
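
The comment's closing remark, that one could instead execute 'emms' after the decompression routine, can be sketched as well. 'mmx_emms.cpp' is a hypothetical file name; _mm_empty() is the standard intrinsic that emits the 'emms' instruction. This is the road the patch deliberately does not take.

    #include <iostream>
    #include <tmmintrin.h>

    int main(int, char **)
    {
        double max_fill = 1;

        std::cerr << (long double)max_fill << "\n";

        [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});
        _mm_empty();    /// 'emms': clear the MMX state so the x87 FPU works again

        std::cerr << (long double)max_fill << "\n";    /// prints 1 again, not -nan
        return 0;
    }

    $ g++ -g -O0 -mssse3 -std=c++17 mmx_emms.cpp && ./a.out
    1
    1

Every exit path from a routine that touches 'mm' registers would need such a call, which is why staying on 'xmm' registers entirely is the simpler fix.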