#pragma once

#include <cstddef>
#include <cstdint>

#if defined(__SSE2__)
    #include <emmintrin.h>
#endif
#if defined(__SSE4_2__)
    #include <nmmintrin.h>
#endif


/** find_first_symbols<c1, c2, ...>(begin, end):
  *
  * Searches the memory range [begin, end) for the first character that belongs to the set 'c1, c2, ...'.
  * It is similar to 'strpbrk', 'strcspn' (and to 'strchr', 'memchr' in the case of a single symbol and '\0'),
  *  but with the following differences:
  * - works with any memory range, including ranges that contain zero bytes;
  * - doesn't require a terminating zero byte: the end of the memory range is passed explicitly;
  * - if nothing is found, returns a pointer to end instead of NULL;
  * - the maximum number of symbols to search for is 16.
  *
  * Uses SSE 2 for a small number of symbols and SSE 4.2 for a large number of symbols,
  *  which gives more than 2x performance advantage over a trivial loop
  *  when parsing a tab-separated dump with (probably escaped) string fields.
  * When parsing a tab-separated dump with short strings, there is no performance degradation over a trivial loop.
  *
  * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on the CPU model.
  *
  * find_last_symbols_or_null<c1, c2, ...>(begin, end):
  *
  * Searches the memory range [begin, end) for the last character that belongs to the set 'c1, c2, ...'.
  * If there is no such character, returns nullptr.
  */

namespace detail
{

/// Scalar membership test: true if x equals the single symbol s0.
template <char s0>
inline bool is_in(char x)
{
    return x == s0;
}

/// Scalar membership test: true if x equals any of the listed symbols.
template <char s0, char s1, char... tail>
inline bool is_in(char x)
{
    return x == s0 || is_in<s1, tail...>(x);
}

#if defined(__SSE2__)
/// Vector membership test: every byte of the result is 0xFF where the corresponding input byte equals s0, else 0.
template <char s0>
inline __m128i mm_is_in(__m128i bytes)
{
    return _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0));
}

/// Vector membership test for several symbols: per-byte OR of the single-symbol comparisons.
template <char s0, char s1, char... tail>
inline __m128i mm_is_in(__m128i bytes)
{
    const __m128i eq_first = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0));
    const __m128i eq_rest = mm_is_in<s1, tail...>(bytes);
    return _mm_or_si128(eq_first, eq_rest);
}
#endif


/// Forward search. Processes 16 bytes at a time with SSE 2 (when available), then the remainder byte by byte.
/// Returns a pointer to the first matching byte, or end if there is none.
template <char... symbols>
inline const char * find_first_symbols_sse2(const char * begin, const char * end)
{
#if defined(__SSE2__)
    /// Compare the remaining length rather than 'begin + 15 < end',
    ///  so that no pointer beyond the end of the range is ever formed.
    for (; end - begin >= 16; begin += 16)
    {
        const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(begin));
        const __m128i eq = mm_is_in<symbols...>(bytes);

        /// Bit i of the mask is set if byte i of the chunk matched.
        const uint16_t bit_mask = static_cast<uint16_t>(_mm_movemask_epi8(eq));
        if (bit_mask)
            return begin + __builtin_ctz(bit_mask);
    }
#endif

    for (; begin < end; ++begin)
        if (is_in<symbols...>(*begin))
            return begin;

    return end;
}


/// Backward search. Processes 16 bytes at a time with SSE 2 (when available), then the remainder byte by byte.
/// Returns a pointer to the last matching byte, or nullptr if there is none.
template <char... symbols>
inline const char * find_last_symbols_or_null_sse2(const char * begin, const char * end)
{
#if defined(__SSE2__)
    /// Compare the remaining length rather than 'end - 16 >= begin',
    ///  so that no pointer before the beginning of the range is ever formed.
    for (; end - begin >= 16; end -= 16)
    {
        const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(end - 16));
        const __m128i eq = mm_is_in<symbols...>(bytes);

        const uint16_t bit_mask = static_cast<uint16_t>(_mm_movemask_epi8(eq));
        if (bit_mask)
            return end - 1 - (__builtin_clz(bit_mask) - 16);    /// minus 16 because __builtin_clz works with the mask as uint32.
    }
#endif

    /// Decrement before dereferencing, so that 'end' never goes below 'begin'.
    while (end > begin)
    {
        --end;
        if (is_in<symbols...>(*end))
            return end;
    }

    return nullptr;
}


/// Forward search for up to 16 symbols. Uses the SSE 4.2 string comparison instruction (when available)
///  for full 16-byte chunks, then checks the remainder byte by byte.
/// num_chars is the number of meaningful symbols among c01..c16; the rest are padding and are ignored.
/// Returns a pointer to the first matching byte, or end if there is none.
template <std::size_t num_chars,
    char c01,     char c02 = 0, char c03 = 0, char c04 = 0,
    char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0,
    char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0,
    char c13 = 0, char c14 = 0, char c15 = 0, char c16 = 0>
inline const char * find_first_symbols_sse42_impl(const char * begin, const char * end)
{
#if defined(__SSE4_2__)
    /// A local constant instead of a macro: a '#define MODE' in a header could clash with user code.
    constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;
    constexpr int set_size = static_cast<int>(num_chars);

    const __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, c16);

    for (; end - begin >= 16; begin += 16)
    {
        const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(begin));

        if (_mm_cmpestrc(set, set_size, bytes, 16, mode))
            return begin + _mm_cmpestri(set, set_size, bytes, 16, mode);
    }
#endif

    for (; begin < end; ++begin)
        if (   (num_chars >= 1 && *begin == c01)
            || (num_chars >= 2 && *begin == c02)
            || (num_chars >= 3 && *begin == c03)
            || (num_chars >= 4 && *begin == c04)
            || (num_chars >= 5 && *begin == c05)
            || (num_chars >= 6 && *begin == c06)
            || (num_chars >= 7 && *begin == c07)
            || (num_chars >= 8 && *begin == c08)
            || (num_chars >= 9 && *begin == c09)
            || (num_chars >= 10 && *begin == c10)
            || (num_chars >= 11 && *begin == c11)
            || (num_chars >= 12 && *begin == c12)
            || (num_chars >= 13 && *begin == c13)
            || (num_chars >= 14 && *begin == c14)
            || (num_chars >= 15 && *begin == c15)
            || (num_chars >= 16 && *begin == c16))
            return begin;

    return end;
}


/// Adapter: passes the number of symbols and the symbols themselves to the SSE 4.2 implementation.
template <char... symbols>
inline const char * find_first_symbols_sse42(const char * begin, const char * end)
{
    static_assert(sizeof...(symbols) >= 1 && sizeof...(symbols) <= 16,
        "find_first_symbols_sse42 supports from 1 to 16 symbols");

    return find_first_symbols_sse42_impl<sizeof...(symbols), symbols...>(begin, end);
}

/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth doing.

/// Chooses the implementation: SSE 4.2 for five or more symbols (when compiled with SSE 4.2), otherwise SSE 2.
template <char... symbols>
inline const char * find_first_symbols_dispatch(const char * begin, const char * end)
{
#if defined(__SSE4_2__)
    if (sizeof...(symbols) >= 5)
        return find_first_symbols_sse42<symbols...>(begin, end);
    else
#endif
        return find_first_symbols_sse2<symbols...>(begin, end);
}

}


/// Returns a pointer to the first byte in [begin, end) that equals one of 'symbols', or end if there is none.
template <char... symbols>
inline const char * find_first_symbols(const char * begin, const char * end)
{
    return detail::find_first_symbols_dispatch<symbols...>(begin, end);
}

/// Returning non const result for non const arguments.
/// It is convenient when you are using this function to iterate through non-const buffer.
template <char... symbols>
inline char * find_first_symbols(char * begin, char * end)
{
    return const_cast<char *>(detail::find_first_symbols_dispatch<symbols...>(begin, end));
}

/// Returns a pointer to the last byte in [begin, end) that equals one of 'symbols', or nullptr if there is none.
template <char... symbols>
inline const char * find_last_symbols_or_null(const char * begin, const char * end)
{
    return detail::find_last_symbols_or_null_sse2<symbols...>(begin, end);
}

/// Non-const overload of find_last_symbols_or_null, for searching in a mutable buffer.
template <char... symbols>
inline char * find_last_symbols_or_null(char * begin, char * end)
{
    return const_cast<char *>(detail::find_last_symbols_or_null_sse2<symbols...>(begin, end));
}