2018-12-28 18:15:26 +00:00
|
|
|
#include "LZ4_decompress_faster.h"
|
|
|
|
|
2018-01-15 05:54:28 +00:00
|
|
|
#include <string.h>
|
|
|
|
#include <iostream>
|
|
|
|
#include <Core/Defines.h>
|
2018-01-16 01:59:51 +00:00
|
|
|
#include <Common/Stopwatch.h>
|
2020-03-19 10:38:34 +00:00
|
|
|
#include <common/types.h>
|
2018-01-15 05:54:28 +00:00
|
|
|
#include <common/unaligned.h>
|
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
2018-01-15 05:54:28 +00:00
|
|
|
#include <emmintrin.h>
|
|
|
|
#endif
|
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSSE3__
|
2018-01-15 21:00:26 +00:00
|
|
|
#include <tmmintrin.h>
|
|
|
|
#endif
|
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __aarch64__
|
2018-06-14 21:13:13 +00:00
|
|
|
#include <arm_neon.h>
|
|
|
|
#endif
|
2018-01-15 05:54:28 +00:00
|
|
|
|
|
|
|
namespace LZ4
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
2018-01-15 21:00:26 +00:00
|
|
|
/// Copies a fixed-size chunk of N bytes from src to dst.
/// Only the primary template is declared here; explicit specializations for N = 8, 16 and 32 are defined further down in this file.
template <size_t N> [[maybe_unused]] void copy(UInt8 * dst, const UInt8 * src);
|
|
|
|
/// Copies from src to dst in steps of N bytes until the write position reaches dst_end.
/// The specializations defined further down in this file (N = 8, 16, 32) always copy at least one step,
/// so the last step may write past dst_end.
template <size_t N> [[maybe_unused]] void wildCopy(UInt8 * dst, const UInt8 * src, UInt8 * dst_end);
|
2020-03-08 22:08:39 +00:00
|
|
|
/// Writes the first N bytes of a match whose source lies less than N bytes behind the destination (offset < N)
/// and updates `match` (passed by reference) for the copies that follow.
/// USE_SHUFFLE selects the shuffle-based implementation.
/// Specializations exist in this file for <8, false>, <8, true>, <16, false>, <16, true> and <32, false>.
template <size_t N, bool USE_SHUFFLE> [[maybe_unused]] void copyOverlap(UInt8 * op, const UInt8 *& match, size_t offset);
|
2018-01-15 05:54:28 +00:00
|
|
|
|
|
|
|
|
|
|
|
/// Copy one fixed 8-byte chunk from src to dst.
/// Implemented with memcpy, so the two 8-byte ranges must not overlap.
inline void copy8(UInt8 * dst, const UInt8 * src)
{
    constexpr size_t chunk_size = 8;
    memcpy(dst, src, chunk_size);
}
|
|
|
|
|
2020-03-08 22:08:39 +00:00
|
|
|
/// Copy 8-byte chunks from src to dst until the write position reaches dst_end.
/// At least one chunk is always copied, and the final chunk may extend past dst_end.
inline void wildCopy8(UInt8 * dst, const UInt8 * src, const UInt8 * dst_end)
{
    constexpr size_t step = 8;
    size_t copied = 0;

    /// With clang, unrolling this loop degrades performance by more than 10%.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy8(dst + copied, src + copied);
        copied += step;
    } while (dst + copied < dst_end);
}
|
|
|
|
|
2020-03-08 22:08:39 +00:00
|
|
|
/// Write 8 bytes at op from a match that starts `offset` bytes behind it (used when offset < 8),
/// so that the short pattern repeats itself. `match` is advanced by table-driven amounts along the way
/// and is left positioned for the caller's subsequent copies.
inline void copyOverlap8(UInt8 * op, const UInt8 *& match, size_t offset)
{
    /// Advance applied after the first 4 bytes: 4 % n, where n is the offset.
    /// Where 4 % n is zero, n itself is used instead: the result is equivalent,
    /// but it turned out to be more CPU friendly (the reason is unknown).
    static constexpr int advance_after_4[] = { 0, 1, 2, 1, 4, 4, 4, 4 };

    /// Advance applied after the next 4 bytes: 8 % n - 4 % n.
    static constexpr int advance_after_8[] = { 0, 0, 0, 1, 0, -1, -2, -3 };

    /// The first 4 bytes are moved one by one in ascending order, because source and destination may overlap.
    for (size_t i = 0; i < 4; ++i)
        op[i] = match[i];

    match += advance_after_4[offset];
    memcpy(op + 4, match, 4);
    match += advance_after_8[offset];
}
|
|
|
|
|
2018-06-12 03:32:48 +00:00
|
|
|
|
2019-01-23 14:18:19 +00:00
|
|
|
#if defined(__x86_64__) || defined(__PPC__)
|
2018-06-14 21:13:13 +00:00
|
|
|
|
2018-06-12 03:32:48 +00:00
|
|
|
/** We use 'xmm' (128bit SSE) registers here to shuffle 16 bytes.
|
|
|
|
*
|
|
|
|
* It is possible to use 'mm' (64bit MMX) registers to shuffle just 8 bytes as we need.
|
|
|
|
*
|
|
|
|
* There is corresponding version of 'pshufb' instruction that operates on 'mm' registers,
|
|
|
|
* (it operates on MMX registers although it is available in SSSE3)
|
|
|
|
* and compiler library has the corresponding intrinsic: '_mm_shuffle_pi8'.
|
|
|
|
*
|
|
|
|
* It can be done like this:
|
|
|
|
*
|
|
|
|
* unalignedStore(op, _mm_shuffle_pi8(
|
|
|
|
* unalignedLoad<__m64>(match),
|
|
|
|
* unalignedLoad<__m64>(masks + 8 * offset)));
|
|
|
|
*
|
|
|
|
* This is perfectly correct and this code has the same or even better performance.
|
|
|
|
*
|
|
|
|
* But if we write code this way, it will lead to
|
|
|
|
* extremely weird and extremely non obvious
|
|
|
|
* effects in completely unrelated parts of code.
|
|
|
|
*
|
|
|
|
* Because using MMX registers alters the mode of operation of x87 FPU,
|
|
|
|
* and then operations with FPU become broken.
|
|
|
|
*
|
|
|
|
* Example 1.
|
|
|
|
* Compile this code without optimizations:
|
|
|
|
*
|
|
|
|
#include <vector>
|
|
|
|
#include <unordered_set>
|
|
|
|
#include <iostream>
|
|
|
|
#include <tmmintrin.h>
|
|
|
|
|
|
|
|
int main(int, char **)
|
|
|
|
{
|
|
|
|
[[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});
|
|
|
|
|
|
|
|
std::vector<int> vec;
|
|
|
|
std::unordered_set<int> set(vec.begin(), vec.end());
|
|
|
|
|
|
|
|
std::cerr << set.size() << "\n";
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
$ g++ -g -O0 -mssse3 -std=c++17 mmx_bug1.cpp && ./a.out
|
|
|
|
terminate called after throwing an instance of 'std::bad_alloc'
|
|
|
|
what(): std::bad_alloc
|
|
|
|
|
|
|
|
Also reproduced with clang. But only with libstdc++, not with libc++.
|
|
|
|
|
|
|
|
* Example 2.
|
|
|
|
|
|
|
|
#include <math.h>
|
|
|
|
#include <iostream>
|
|
|
|
#include <tmmintrin.h>
|
|
|
|
|
|
|
|
int main(int, char **)
|
|
|
|
{
|
|
|
|
double max_fill = 1;
|
|
|
|
|
|
|
|
std::cerr << (long double)max_fill << "\n";
|
|
|
|
[[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});
|
|
|
|
std::cerr << (long double)max_fill << "\n";
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
$ g++ -g -O0 -mssse3 -std=c++17 mmx_bug2.cpp && ./a.out
|
|
|
|
1
|
|
|
|
-nan
|
|
|
|
|
|
|
|
* Explanation:
|
|
|
|
*
|
|
|
|
* https://stackoverflow.com/questions/33692969/assembler-mmx-errors
|
|
|
|
* https://software.intel.com/en-us/node/524274
|
|
|
|
*
|
|
|
|
* Actually it's possible to use 'emms' instruction after decompression routine.
|
|
|
|
* But it's easier to just use 'xmm' registers and avoid using 'mm' registers.
|
|
|
|
*/
|
2018-01-15 21:00:26 +00:00
|
|
|
/// Shuffle-based counterpart of copyOverlap8.
/// When SSSE3 is available (and the build is not under MemorySanitizer) the repeated pattern is produced
/// by a single byte shuffle; note that a full 16 bytes are loaded from match and 16 bytes are stored at op.
/// In every other configuration the call is forwarded to copyOverlap8.
inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
#if !defined(__SSSE3__) || defined(MEMORY_SANITIZER)

    copyOverlap8(op, match, offset);

#else

    /// Row number `offset` (8 bytes per row) holds the shuffle indices for that offset.
    /// Row 0 is special: masks[offset] is the amount by which `match` is advanced afterwards.
    static constexpr UInt8 __attribute__((__aligned__(8))) masks[] =
    {
        0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1,
        0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 0,
        0, 0, 0, 0, 0, 0, 0, 0, /* this row is not used: padding to allow read 16 bytes starting at previous row */
    };

    const __m128i pattern = _mm_loadu_si128(reinterpret_cast<const __m128i *>(match));
    const __m128i indices = _mm_loadu_si128(reinterpret_cast<const __m128i *>(masks + 8 * offset));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(op), _mm_shuffle_epi8(pattern, indices));

    match += masks[offset];

#endif
}
|
|
|
|
|
2018-06-14 21:13:13 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef __aarch64__
|
|
|
|
|
|
|
|
/// AArch64 (NEON) version of the shuffle-based 8-byte overlapping copy:
/// a table lookup (vtbl1_u8) picks bytes of the 8-byte pattern at match according to the row for `offset`.
inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// Row number `offset` (8 bytes per row) holds the lookup indices for that offset.
    /// Row 0 is special: masks[offset] is the amount by which `match` is advanced afterwards.
    static constexpr UInt8 __attribute__((__aligned__(8))) masks[] =
    {
        0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1,
        0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 0,
    };

    const uint8x8_t pattern = unalignedLoad<uint8x8_t>(match);
    const uint8x8_t indices = unalignedLoad<uint8x8_t>(masks + 8 * offset);
    unalignedStore<uint8x8_t>(op, vtbl1_u8(pattern, indices));

    match += masks[offset];
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
2018-08-10 04:02:56 +00:00
|
|
|
/// Specializations that bind the generic template names to the 8-byte helpers.

template <>
inline void copy<8>(UInt8 * dst, const UInt8 * src)
{
    copy8(dst, src);
}

template <>
inline void wildCopy<8>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    wildCopy8(dst, src, dst_end);
}

/// Plain (table-driven) overlapping copy.
template <>
inline void copyOverlap<8, false>(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    copyOverlap8(op, match, offset);
}

/// Shuffle-based overlapping copy.
template <>
inline void copyOverlap<8, true>(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    copyOverlap8Shuffle(op, match, offset);
}
|
2018-01-15 05:54:28 +00:00
|
|
|
|
|
|
|
|
|
|
|
/// Copy one fixed 16-byte chunk from src to dst:
/// a single unaligned 128-bit load/store pair when SSE2 is available, memcpy otherwise.
inline void copy16(UInt8 * dst, const UInt8 * src)
{
#if defined(__SSE2__)
    const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), chunk);
#else
    memcpy(dst, src, 16);
#endif
}
|
|
|
|
|
2020-03-08 22:08:39 +00:00
|
|
|
/// Copy 16-byte chunks from src to dst until the write position reaches dst_end.
/// At least one chunk is always copied, and the final chunk may extend past dst_end.
inline void wildCopy16(UInt8 * dst, const UInt8 * src, const UInt8 * dst_end)
{
    constexpr size_t step = 16;
    size_t copied = 0;

    /// With clang, unrolling this loop degrades performance by more than 10%.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy16(dst + copied, src + copied);
        copied += step;
    } while (dst + copied < dst_end);
}
|
|
|
|
|
|
|
|
/// Write 16 bytes at op from a match that starts `offset` bytes behind it (used when offset < 16),
/// so that the short pattern repeats itself. The copy proceeds in pieces of 4, 4 and 8 bytes;
/// after each piece `match` is moved by a table-driven amount (n below denotes the offset).
inline void copyOverlap16(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    static constexpr int advance_after_4[]
        = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int advance_after_8[]
        = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4 };

    /// 16 % n - 8 % n
    static constexpr int advance_after_16[]
        = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7 };

    /// The first 4 bytes are moved one by one in ascending order, because source and destination may overlap.
    for (size_t i = 0; i < 4; ++i)
        op[i] = match[i];
    match += advance_after_4[offset];

    memcpy(op + 4, match, 4);
    match += advance_after_8[offset];

    memcpy(op + 8, match, 8);
    match += advance_after_16[offset];
}
|
|
|
|
|
2018-06-14 21:13:13 +00:00
|
|
|
|
2019-01-23 14:18:19 +00:00
|
|
|
#if defined(__x86_64__) || defined(__PPC__)
|
2018-06-14 21:13:13 +00:00
|
|
|
|
2018-01-15 21:00:26 +00:00
|
|
|
/// Shuffle-based counterpart of copyOverlap16.
/// When SSSE3 is available (and the build is not under MemorySanitizer) the 16 output bytes are produced
/// by one byte shuffle of the 16 bytes at match; otherwise the call is forwarded to copyOverlap16.
inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
#if !defined(__SSSE3__) || defined(MEMORY_SANITIZER)

    copyOverlap16(op, match, offset);

#else

    /// Row number `offset` (16 bytes per row, 16-byte aligned) holds the shuffle indices for that offset.
    /// Row 0 is special: masks[offset] is the amount by which `match` is advanced afterwards.
    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
    {
        0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, /* offset = 0, not used as mask, but for shift amount instead */
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* offset = 1 */
        0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
        0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
        0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
        0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
    };

    /// The table is 16-byte aligned, so each row can be fetched with an aligned load.
    const __m128i * const mask_rows = reinterpret_cast<const __m128i *>(masks);

    const __m128i pattern = _mm_loadu_si128(reinterpret_cast<const __m128i *>(match));
    const __m128i indices = _mm_load_si128(mask_rows + offset);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(op), _mm_shuffle_epi8(pattern, indices));

    match += masks[offset];

#endif
}
|
|
|
|
|
2018-06-14 21:13:13 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef __aarch64__
|
|
|
|
|
|
|
|
/// AArch64 (NEON) version of the shuffle-based 16-byte overlapping copy.
/// The output is produced in two 8-byte halves, each by a table lookup (vtbl2_u8) into the 16 bytes at match.
inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// Row number `offset` (16 bytes per row) holds the lookup indices for that offset.
    /// Row 0 is special: masks[offset] is the amount by which `match` is advanced afterwards.
    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
    {
        0,  1,  2,  1,  4,  1,  4,  2,  8,  7,  6,  5,  4,  3,  2,  1, /* offset = 0, not used as mask, but for shift amount instead */
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* offset = 1 */
        0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
        0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  0,
        0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,
        0,  1,  2,  3,  4,  5,  0,  1,  2,  3,  4,  5,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  0,  1,  2,  3,  4,  5,  6,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  1,  2,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  1,
        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0,
    };

    const UInt8 * const row = masks + 16 * offset;

    /// The pattern is deliberately re-loaded from match for each half, in this order:
    /// the store of the first half happens between the two loads, exactly as in a straight-line version.
    const uint8x8_t low_half = vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(row));
    unalignedStore<uint8x8_t>(op, low_half);

    const uint8x8_t high_half = vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(row + 8));
    unalignedStore<uint8x8_t>(op + 8, high_half);

    match += masks[offset];
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
2018-08-10 04:02:56 +00:00
|
|
|
/// Specializations that bind the generic template names to the 16-byte helpers.

template <>
inline void copy<16>(UInt8 * dst, const UInt8 * src)
{
    copy16(dst, src);
}

template <>
inline void wildCopy<16>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    wildCopy16(dst, src, dst_end);
}

/// Plain (table-driven) overlapping copy.
template <>
inline void copyOverlap<16, false>(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    copyOverlap16(op, match, offset);
}

/// Shuffle-based overlapping copy.
template <>
inline void copyOverlap<16, true>(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    copyOverlap16Shuffle(op, match, offset);
}
|
2018-01-15 05:54:28 +00:00
|
|
|
|
|
|
|
|
2019-04-21 17:31:15 +00:00
|
|
|
/// Copy one fixed 32-byte chunk from src to dst as two consecutive 16-byte halves
/// (the first half is stored before the second half is loaded).
inline void copy32(UInt8 * dst, const UInt8 * src)
{
    /// AVX was used here previously, but mixing it with SSE instructions caused a big slowdown.
#if defined(__SSE2__)
    const __m128i first_half = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), first_half);

    const __m128i second_half = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + 16), second_half);
#else
    constexpr size_t half = 16;
    memcpy(dst, src, half);
    memcpy(dst + half, src + half, half);
#endif
}
|
|
|
|
|
2020-03-08 22:08:39 +00:00
|
|
|
/// Copy 32-byte chunks from src to dst until the write position reaches dst_end.
/// At least one chunk is always copied, and the final chunk may extend past dst_end.
inline void wildCopy32(UInt8 * dst, const UInt8 * src, const UInt8 * dst_end)
{
    constexpr size_t step = 32;
    size_t copied = 0;

    /// With clang, unrolling this loop degrades performance by more than 10%.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy32(dst + copied, src + copied);
        copied += step;
    } while (dst + copied < dst_end);
}
|
|
|
|
|
2019-04-25 18:34:32 +00:00
|
|
|
/// Write 32 bytes at op from a match that starts `offset` bytes behind it (used when offset < 32),
/// so that the short pattern repeats itself. The copy proceeds in pieces of 4, 4, 8 and 16 bytes;
/// after each piece `match` is moved by a table-driven amount (n below denotes the offset).
inline void copyOverlap32(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    static constexpr int advance_after_4[]
        = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int advance_after_8[]
        = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 16 % n - 8 % n
    static constexpr int advance_after_16[]
        = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };

    /// 32 % n - 16 % n
    static constexpr int advance_after_32[]
        = { 0, 0, 0, 1, 0, 1, -2, 2, 0, -2, -4, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15 };

    /// The first 4 bytes are moved one by one in ascending order, because source and destination may overlap.
    for (size_t i = 0; i < 4; ++i)
        op[i] = match[i];
    match += advance_after_4[offset];

    memcpy(op + 4, match, 4);
    match += advance_after_8[offset];

    memcpy(op + 8, match, 8);
    match += advance_after_16[offset];

    memcpy(op + 16, match, 16);
    match += advance_after_32[offset];
}
|
2018-01-15 05:54:28 +00:00
|
|
|
|
2019-04-29 21:36:53 +00:00
|
|
|
|
2019-04-29 19:30:14 +00:00
|
|
|
/// Specializations that bind the generic template names to the 32-byte helpers.
/// Only the plain (non-shuffle) overlapping copy is provided for 32 bytes.

template <>
inline void copy<32>(UInt8 * dst, const UInt8 * src)
{
    copy32(dst, src);
}

template <>
inline void wildCopy<32>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    wildCopy32(dst, src, dst_end);
}

template <>
inline void copyOverlap<32, false>(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    copyOverlap32(op, match, offset);
}
|
2018-01-15 21:52:24 +00:00
|
|
|
|
2018-01-15 05:54:28 +00:00
|
|
|
|
2019-04-25 18:34:32 +00:00
|
|
|
/// See also https://stackoverflow.com/a/30669632
|
2018-01-15 05:54:28 +00:00
|
|
|
|
2018-01-15 21:00:26 +00:00
|
|
|
template <size_t copy_amount, bool use_shuffle>
|
2021-08-03 12:24:16 +00:00
|
|
|
bool NO_INLINE decompressImpl(
|
2018-01-15 05:54:28 +00:00
|
|
|
const char * const source,
|
|
|
|
char * const dest,
|
2021-08-03 12:24:16 +00:00
|
|
|
size_t source_size,
|
2018-01-15 05:54:28 +00:00
|
|
|
size_t dest_size)
|
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
|
|
|
|
UInt8 * op = reinterpret_cast<UInt8 *>(dest);
|
2021-08-03 12:24:16 +00:00
|
|
|
const UInt8 * const input_end = ip + source_size;
|
|
|
|
UInt8 * const output_begin = op;
|
2018-01-15 05:54:28 +00:00
|
|
|
UInt8 * const output_end = op + dest_size;
|
|
|
|
|
2019-04-29 21:44:17 +00:00
|
|
|
/// Unrolling with clang is doing >10% performance degrade.
|
2019-04-25 21:53:22 +00:00
|
|
|
#if defined(__clang__)
|
|
|
|
#pragma nounroll
|
|
|
|
#endif
|
2020-03-08 21:29:00 +00:00
|
|
|
while (true)
|
2018-01-15 05:54:28 +00:00
|
|
|
{
|
|
|
|
size_t length;
|
|
|
|
|
|
|
|
auto continue_read_length = [&]
|
|
|
|
{
|
|
|
|
unsigned s;
|
|
|
|
do
|
|
|
|
{
|
|
|
|
s = *ip++;
|
|
|
|
length += s;
|
|
|
|
} while (unlikely(s == 255));
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Get literal length.
|
|
|
|
|
|
|
|
const unsigned token = *ip++;
|
|
|
|
length = token >> 4;
|
|
|
|
if (length == 0x0F)
|
|
|
|
continue_read_length();
|
|
|
|
|
|
|
|
/// Copy literals.
|
|
|
|
|
|
|
|
UInt8 * copy_end = op + length;
|
|
|
|
|
2018-06-12 03:32:48 +00:00
|
|
|
/// input: Hello, world
|
|
|
|
/// ^-ip
|
|
|
|
/// output: xyz
|
|
|
|
/// ^-op ^-copy_end
|
|
|
|
/// output: xyzHello, w
|
|
|
|
/// ^- excessive copied bytes due to "wildCopy"
|
|
|
|
/// input: Hello, world
|
|
|
|
/// ^-ip
|
|
|
|
/// output: xyzHello, w
|
|
|
|
/// ^-op (we will overwrite excessive bytes on next iteration)
|
|
|
|
|
2021-08-03 12:24:16 +00:00
|
|
|
{
|
|
|
|
auto * target = std::min(copy_end, output_end);
|
|
|
|
wildCopy<copy_amount>(op, ip, target); /// Here we can write up to copy_amount - 1 bytes after buffer.
|
|
|
|
|
|
|
|
if (target == output_end)
|
|
|
|
return true;
|
|
|
|
}
|
2018-01-15 05:54:28 +00:00
|
|
|
|
|
|
|
ip += length;
|
|
|
|
op = copy_end;
|
|
|
|
|
2021-08-03 12:24:16 +00:00
|
|
|
if (unlikely(ip > input_end))
|
|
|
|
return false;
|
2018-01-15 05:54:28 +00:00
|
|
|
|
|
|
|
/// Get match offset.
|
|
|
|
|
|
|
|
size_t offset = unalignedLoad<UInt16>(ip);
|
|
|
|
ip += 2;
|
|
|
|
const UInt8 * match = op - offset;
|
|
|
|
|
2021-08-03 12:24:16 +00:00
|
|
|
if (unlikely(match < output_begin))
|
|
|
|
return false;
|
|
|
|
|
2018-01-15 05:54:28 +00:00
|
|
|
/// Get match length.
|
|
|
|
|
|
|
|
length = token & 0x0F;
|
|
|
|
if (length == 0x0F)
|
|
|
|
continue_read_length();
|
|
|
|
length += 4;
|
|
|
|
|
|
|
|
/// Copy match within block, that produce overlapping pattern. Match may replicate itself.
|
|
|
|
|
|
|
|
copy_end = op + length;
|
|
|
|
|
2018-01-15 21:52:24 +00:00
|
|
|
/** Here we can write up to copy_amount - 1 - 4 * 2 bytes after buffer.
|
|
|
|
* The worst case when offset = 1 and length = 4
|
|
|
|
*/
|
|
|
|
|
2018-01-15 05:54:28 +00:00
|
|
|
if (unlikely(offset < copy_amount))
|
|
|
|
{
|
2018-06-12 03:32:48 +00:00
|
|
|
/// output: Hello
|
|
|
|
/// ^-op
|
|
|
|
/// ^-match; offset = 5
|
|
|
|
///
|
|
|
|
/// output: Hello
|
|
|
|
/// [------] - copy_amount bytes
|
|
|
|
/// [------] - copy them here
|
|
|
|
///
|
|
|
|
/// output: HelloHelloHel
|
|
|
|
/// ^-match ^-op
|
|
|
|
|
2018-01-15 21:00:26 +00:00
|
|
|
copyOverlap<copy_amount, use_shuffle>(op, match, offset);
|
2018-01-15 05:54:28 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
copy<copy_amount>(op, match);
|
|
|
|
match += copy_amount;
|
|
|
|
}
|
|
|
|
|
|
|
|
op += copy_amount;
|
|
|
|
|
2018-01-15 21:52:24 +00:00
|
|
|
copy<copy_amount>(op, match); /// copy_amount + copy_amount - 1 - 4 * 2 bytes after buffer.
|
2018-01-15 05:54:28 +00:00
|
|
|
if (length > copy_amount * 2)
|
2021-08-03 12:24:16 +00:00
|
|
|
{
|
|
|
|
auto * target = std::min(copy_end, output_end);
|
|
|
|
wildCopy<copy_amount>(op + copy_amount, match + copy_amount, target);
|
|
|
|
}
|
2018-01-15 05:54:28 +00:00
|
|
|
|
|
|
|
op = copy_end;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-08-03 12:24:16 +00:00
|
|
|
bool decompress(
|
2018-01-15 05:54:28 +00:00
|
|
|
const char * const source,
|
|
|
|
char * const dest,
|
|
|
|
size_t source_size,
|
2018-01-16 01:59:51 +00:00
|
|
|
size_t dest_size,
|
2018-06-14 20:05:52 +00:00
|
|
|
PerformanceStatistics & statistics [[maybe_unused]])
|
2018-01-15 05:54:28 +00:00
|
|
|
{
|
2018-01-16 01:59:51 +00:00
|
|
|
if (source_size == 0 || dest_size == 0)
|
2021-08-03 12:24:16 +00:00
|
|
|
return true;
|
2018-01-16 01:59:51 +00:00
|
|
|
|
|
|
|
/// Don't run timer if the block is too small.
|
|
|
|
if (dest_size >= 32768)
|
|
|
|
{
|
|
|
|
size_t best_variant = statistics.select();
|
|
|
|
|
|
|
|
/// Run the selected method and measure time.
|
|
|
|
|
|
|
|
Stopwatch watch;
|
2021-08-03 12:24:16 +00:00
|
|
|
bool success = true;
|
2018-01-16 01:59:51 +00:00
|
|
|
if (best_variant == 0)
|
2021-08-03 12:24:16 +00:00
|
|
|
success = decompressImpl<16, true>(source, dest, source_size, dest_size);
|
2018-01-16 01:59:51 +00:00
|
|
|
if (best_variant == 1)
|
2021-08-03 12:24:16 +00:00
|
|
|
success = decompressImpl<16, false>(source, dest, source_size, dest_size);
|
2018-06-16 04:48:37 +00:00
|
|
|
if (best_variant == 2)
|
2021-08-03 12:24:16 +00:00
|
|
|
success = decompressImpl<8, true>(source, dest, source_size, dest_size);
|
2019-04-25 21:53:22 +00:00
|
|
|
if (best_variant == 3)
|
2021-08-03 12:24:16 +00:00
|
|
|
success = decompressImpl<32, false>(source, dest, source_size, dest_size);
|
2018-01-16 01:59:51 +00:00
|
|
|
|
|
|
|
watch.stop();
|
|
|
|
|
|
|
|
/// Update performance statistics.
|
|
|
|
|
|
|
|
statistics.data[best_variant].update(watch.elapsedSeconds(), dest_size);
|
2021-08-03 12:24:16 +00:00
|
|
|
|
|
|
|
return success;
|
2018-01-16 01:59:51 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2021-08-03 12:24:16 +00:00
|
|
|
return decompressImpl<8, false>(source, dest, source_size, dest_size);
|
2018-01-16 01:59:51 +00:00
|
|
|
}
|
2018-01-15 05:54:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2018-01-16 01:59:51 +00:00
|
|
|
void StreamStatistics::literal(size_t length)
|
2018-01-15 05:54:28 +00:00
|
|
|
{
|
|
|
|
++num_tokens;
|
|
|
|
sum_literal_lengths += length;
|
|
|
|
}
|
|
|
|
|
2018-01-16 01:59:51 +00:00
|
|
|
void StreamStatistics::match(size_t length, size_t offset)
|
2018-01-15 05:54:28 +00:00
|
|
|
{
|
|
|
|
++num_tokens;
|
|
|
|
sum_match_lengths += length;
|
|
|
|
sum_match_offsets += offset;
|
|
|
|
count_match_offset_less_8 += offset < 8;
|
|
|
|
count_match_offset_less_16 += offset < 16;
|
|
|
|
count_match_replicate_itself += offset < length;
|
|
|
|
}
|
|
|
|
|
2018-01-16 01:59:51 +00:00
|
|
|
void StreamStatistics::print() const
|
2018-01-15 05:54:28 +00:00
|
|
|
{
|
|
|
|
std::cerr
|
|
|
|
<< "Num tokens: " << num_tokens
|
|
|
|
<< ", Avg literal length: " << double(sum_literal_lengths) / num_tokens
|
|
|
|
<< ", Avg match length: " << double(sum_match_lengths) / num_tokens
|
|
|
|
<< ", Avg match offset: " << double(sum_match_offsets) / num_tokens
|
|
|
|
<< ", Offset < 8 ratio: " << double(count_match_offset_less_8) / num_tokens
|
|
|
|
<< ", Offset < 16 ratio: " << double(count_match_offset_less_16) / num_tokens
|
|
|
|
<< ", Match replicate itself: " << double(count_match_replicate_itself) / num_tokens
|
|
|
|
<< "\n";
|
|
|
|
}
|
|
|
|
|
2018-01-16 01:59:51 +00:00
|
|
|
void statistics(
|
2018-01-15 05:54:28 +00:00
|
|
|
const char * const source,
|
|
|
|
char * const dest,
|
|
|
|
size_t dest_size,
|
2018-01-16 01:59:51 +00:00
|
|
|
StreamStatistics & stat)
|
2018-01-15 05:54:28 +00:00
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
|
|
|
|
UInt8 * op = reinterpret_cast<UInt8 *>(dest);
|
2018-01-15 05:54:28 +00:00
|
|
|
UInt8 * const output_end = op + dest_size;
|
2020-03-08 21:29:00 +00:00
|
|
|
while (true)
|
2018-01-15 05:54:28 +00:00
|
|
|
{
|
|
|
|
size_t length;
|
|
|
|
|
|
|
|
auto continue_read_length = [&]
|
|
|
|
{
|
|
|
|
unsigned s;
|
|
|
|
do
|
|
|
|
{
|
|
|
|
s = *ip++;
|
|
|
|
length += s;
|
|
|
|
} while (unlikely(s == 255));
|
|
|
|
};
|
|
|
|
|
|
|
|
auto token = *ip++;
|
|
|
|
length = token >> 4;
|
|
|
|
if (length == 0x0F)
|
|
|
|
continue_read_length();
|
|
|
|
|
|
|
|
stat.literal(length);
|
|
|
|
|
|
|
|
ip += length;
|
|
|
|
op += length;
|
|
|
|
|
|
|
|
if (op > output_end)
|
2018-01-16 01:59:51 +00:00
|
|
|
return;
|
2018-01-15 05:54:28 +00:00
|
|
|
|
|
|
|
size_t offset = unalignedLoad<UInt16>(ip);
|
|
|
|
ip += 2;
|
|
|
|
|
|
|
|
length = token & 0x0F;
|
|
|
|
if (length == 0x0F)
|
|
|
|
continue_read_length();
|
|
|
|
length += 4;
|
|
|
|
|
|
|
|
stat.match(length, offset);
|
|
|
|
|
|
|
|
op += length;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|