Remove AVX2 to figure out where is the illegal intruction

Enable AVX2 - int32
This commit is contained in:
Youenn Lebras 2021-09-09 11:21:32 +02:00 committed by youenn lebras
parent 62487fe2fc
commit a810ce5dcb
No known key found for this signature in database
GPG Key ID: E1DF98A69CABD2A5

View File

@ -12,7 +12,6 @@
#if defined(__AVX2__)
#include <immintrin.h>
#endif
namespace DB::GatherUtils
{
@ -236,240 +235,6 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll<ArraySearchType::All, NumericArrayS
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
}
// TODO: Discuss about
// raise an error : "error: no viable conversion from 'const NumericArraySlice<unsigned int>' to 'const NumericArraySlice<int>'"
// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types.
// AVX2 UInt specialization
// template <>
// inline ALWAYS_INLINE bool sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<unsigned>, NumericArraySlice<unsigned>, sliceEqualElements<unsigned,unsigned> >(
// const NumericArraySlice<unsigned> & second, const NumericArraySlice<unsigned> & first, const UInt8 * first_null_map, const UInt8 * second_null_map)
// {
// return sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<int>, NumericArraySlice<int>, sliceEqualElements<int,int> > (
// static_cast<const NumericArraySlice<int> &>(second), static_cast<const NumericArraySlice<int> &>(first), second_null_map, first_null_map);
// }
// AVX2 Int64 specialization
template <>
inline ALWAYS_INLINE bool sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<Int64>, NumericArraySlice<Int64>, sliceEqualElements<Int64,Int64> >(
const NumericArraySlice<Int64> & first, const NumericArraySlice<Int64> & second, const UInt8 * first_null_map, const UInt8 * second_null_map)
{
if (second.size == 0)
return true;
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
return false;
const bool has_first_null_map = first_null_map != nullptr;
const bool has_second_null_map = second_null_map != nullptr;
size_t j = 0;
short has_mask = 1;
const Int64 full = -1, none = 0;
const __m256i ones = _mm256_set1_epi64x(full);
const __m256i zeros = _mm256_setzero_si256();
if (second.size > 3 && first.size > 3)
{
for (; j < second.size - 3 && has_mask; j += 4)
{
has_mask = 0;
const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(second.data + j));
// bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise;
__m256i bitmask = has_second_null_map ?
_mm256_set_epi64x(
(second_null_map[j + 3])? full : none,
(second_null_map[j + 2])? full : none,
(second_null_map[j + 1])? full : none,
(second_null_map[j]) ? full : none)
: zeros;
unsigned i = 0;
for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4)
{
const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(first.data + i));
const __m256i first_nm_mask = has_first_null_map?
_mm256_set_m128i(
_mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast<const __m128i *>(first_null_map + i + 2))),
_mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast<const __m128i *>(first_null_map + i))))
: zeros;
bitmask =
_mm256_or_si256(
_mm256_or_si256(
_mm256_or_si256(
_mm256_andnot_si256(
first_nm_mask,
_mm256_cmpeq_epi64(f_data, s_data)),
_mm256_andnot_si256(
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)),
_mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))),
_mm256_or_si256(
_mm256_andnot_si256(
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)),
_mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))),
_mm256_andnot_si256(
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)),
_mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))),
bitmask);
}
if (i < first.size)
{
for (; i < first.size && !has_mask; ++i)
{
if (has_first_null_map && first_null_map[i])
continue;
__m256i v_i = _mm256_set1_epi64x(first.data[i]);
bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i));
has_mask = _mm256_testc_si256(bitmask, ones);
}
}
}
}
if (!has_mask && second.size > 3)
return false;
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
}
// AVX2 Int16_t specialization
template <>
inline ALWAYS_INLINE bool sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<int16_t>, NumericArraySlice<int16_t>, sliceEqualElements<int16_t,int16_t> >(
const NumericArraySlice<int16_t> & first, const NumericArraySlice<int16_t> & second, const UInt8 * first_null_map, const UInt8 * second_null_map)
{
if (second.size == 0)
return true;
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
return false;
const bool has_first_null_map = first_null_map != nullptr;
const bool has_second_null_map = second_null_map != nullptr;
size_t j = 0;
short has_mask = 1;
const int16_t full = -1, none = 0;
const __m256i ones = _mm256_set1_epi16(full);
const __m256i zeros = _mm256_setzero_si256();
if (second.size > 15 && first.size > 15)
{
for (; j < second.size - 15 && has_mask; j += 16)
{
has_mask = 0;
const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(second.data + j));
__m256i bitmask = has_second_null_map ?
_mm256_set_epi16(
(second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none,
(second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none,
(second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none,
(second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none,
(second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none,
(second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none,
(second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none,
(second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none)
: zeros;
unsigned i = 0;
for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16)
{
const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(first.data + i));
const __m256i first_nm_mask = has_first_null_map?
_mm256_set_m128i(
_mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i *>(first_null_map + i + 8))),
_mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i *>(first_null_map + i))))
: zeros;
bitmask =
_mm256_or_si256(
_mm256_or_si256(
_mm256_or_si256(
_mm256_or_si256(
_mm256_or_si256(
_mm256_andnot_si256(
first_nm_mask,
_mm256_cmpeq_epi16(f_data, s_data)),
_mm256_andnot_si256(
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))),
_mm256_or_si256(
_mm256_andnot_si256(
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))),
_mm256_andnot_si256(
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)))))
),
_mm256_or_si256(
_mm256_or_si256(
_mm256_andnot_si256(
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))),
_mm256_andnot_si256(
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))),
_mm256_or_si256(
_mm256_andnot_si256(
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))),
_mm256_andnot_si256(
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18))))))
),
_mm256_or_si256(
_mm256_or_si256(
_mm256_or_si256(
_mm256_andnot_si256(
_mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1),
_mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))),
_mm256_andnot_si256(
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))),
_mm256_or_si256(
_mm256_andnot_si256(
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))),
_mm256_andnot_si256(
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)))))
),
_mm256_or_si256(
_mm256_or_si256(
_mm256_andnot_si256(
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))),
_mm256_andnot_si256(
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))),
_mm256_or_si256(
_mm256_andnot_si256(
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))),
_mm256_andnot_si256(
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)),
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2))))))
)
),
bitmask);
}
if (i < first.size)
{
for (; i < first.size && !has_mask; ++i)
{
if (has_first_null_map && first_null_map[i])
continue;
__m256i v_i = _mm256_set1_epi16(first.data[i]);
bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i));
has_mask = _mm256_testc_si256(bitmask, ones);
}
}
}
}
if (!has_mask && second.size > 15)
return false;
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
}
#elif defined(__SSE4_2__)
// SSE4.2 Int specialization