mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 00:30:49 +00:00
Move isAllASCII from UTF8Helpers to StringUtils
This commit is contained in:
parent
2909e6451b
commit
17ce449076
@ -1,4 +1,10 @@
|
||||
#include "StringUtils.h"
|
||||
#include <Common/StringUtils.h>
|
||||
|
||||
#include <Common/TargetSpecific.h>
|
||||
|
||||
#if USE_MULTITARGET_CODE
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
namespace impl
|
||||
@ -15,3 +21,67 @@ bool endsWith(const std::string & s, const char * suffix, size_t suffix_size)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Scalar implementation of isAllASCII; the dispatcher refers to it as
/// TargetSpecific::Default::isAllASCII and uses it as the fallback.
/// Returns true when no byte in data[0 .. size) has its high bit (0x80) set.
/// For size == 0 the loop does not run and the result is true.
DECLARE_DEFAULT_CODE(
|
||||
bool isAllASCII(const UInt8 * data, size_t size)
|
||||
{
|
||||
/// Accumulate the bitwise OR of all bytes instead of testing each one.
UInt8 mask = 0;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
mask |= data[i];
|
||||
|
||||
/// The high bit of the accumulator is set iff at least one byte was >= 0x80.
return !(mask & 0x80);
|
||||
})
|
||||
|
||||
/// 128-bit SIMD implementation of isAllASCII; the dispatcher refers to it as
/// TargetSpecific::SSE42::isAllASCII.
/// Returns true when no byte in data[0 .. size) has its high bit (0x80) set.
DECLARE_SSE42_SPECIFIC_CODE(
|
||||
/// Copied from https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h
|
||||
bool isAllASCII(const UInt8 * data, size_t size)
|
||||
{
|
||||
/// Running bitwise OR of every full 16-byte chunk seen so far.
__m128i masks = _mm_setzero_si128();
|
||||
|
||||
size_t i = 0;
|
||||
/// Main loop: consume 16 bytes per iteration while a full chunk remains.
for (; i + 16 <= size; i += 16)
|
||||
{
|
||||
/// Unaligned load, so `data` needs no particular alignment.
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + i));
|
||||
masks = _mm_or_si128(masks, bytes);
|
||||
}
|
||||
/// Gather the most significant bit of each of the 16 accumulated bytes into an int.
int mask = _mm_movemask_epi8(masks);
|
||||
|
||||
/// Tail: OR together the remaining (fewer than 16) bytes one at a time.
UInt8 tail_mask = 0;
|
||||
for (; i < size; i++)
|
||||
tail_mask |= data[i];
|
||||
|
||||
/// Merge the high bit of the tail into the vector result; any set bit means non-ASCII.
mask |= (tail_mask & 0x80);
|
||||
return !mask;
|
||||
})
|
||||
|
||||
/// 256-bit SIMD implementation of isAllASCII; the dispatcher refers to it as
/// TargetSpecific::AVX2::isAllASCII.
/// Returns true when no byte in data[0 .. size) has its high bit (0x80) set.
DECLARE_AVX2_SPECIFIC_CODE(
|
||||
bool isAllASCII(const UInt8 * data, size_t size)
|
||||
{
|
||||
/// Running bitwise OR of every full 32-byte chunk seen so far.
__m256i masks = _mm256_setzero_si256();
|
||||
|
||||
size_t i = 0;
|
||||
/// Main loop: consume 32 bytes per iteration while a full chunk remains.
for (; i + 32 <= size; i += 32)
|
||||
{
|
||||
/// Unaligned load, so `data` needs no particular alignment.
__m256i bytes = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(data + i));
|
||||
masks = _mm256_or_si256(masks, bytes);
|
||||
}
|
||||
/// Gather the most significant bit of each of the 32 accumulated bytes into an int.
int mask = _mm256_movemask_epi8(masks);
|
||||
|
||||
/// Tail: OR together the remaining (fewer than 32) bytes one at a time.
UInt8 tail_mask = 0;
|
||||
for (; i < size; i++)
|
||||
tail_mask |= data[i];
|
||||
|
||||
/// Merge the high bit of the tail into the vector result; any set bit means non-ASCII.
mask |= (tail_mask & 0x80);
|
||||
return !mask;
|
||||
})
|
||||
|
||||
/// Entry point: returns true when every byte in data[0 .. size) is ASCII.
/// Forwards to one of the TargetSpecific implementations. When
/// USE_MULTITARGET_CODE is enabled, AVX2 is tried first, then SSE42,
/// each guarded by isArchSupported(); otherwise, or if neither is
/// reported as supported, the Default implementation is used.
bool isAllASCII(const UInt8 * data, size_t size)
|
||||
{
|
||||
#if USE_MULTITARGET_CODE
|
||||
/// Prefer the widest vector implementation first.
if (isArchSupported(DB::TargetArch::AVX2))
|
||||
return TargetSpecific::AVX2::isAllASCII(data, size);
|
||||
if (isArchSupported(DB::TargetArch::SSE42))
|
||||
return TargetSpecific::SSE42::isAllASCII(data, size);
|
||||
#endif
|
||||
/// Fallback used when no specialised variant was selected above.
return TargetSpecific::Default::isAllASCII(data, size);
|
||||
}
|
||||
|
@ -7,6 +7,8 @@
|
||||
#include <cstdint>
|
||||
#include <type_traits>
|
||||
|
||||
#include <base/types.h>
|
||||
|
||||
|
||||
namespace impl
|
||||
{
|
||||
@ -315,6 +317,9 @@ inline void trim(std::string & str, char c = ' ')
|
||||
trimLeft(str, c);
|
||||
}
|
||||
|
||||
/// Returns true if every byte in data[0 .. size) is ASCII (high bit clear); an empty range yields true.
|
||||
bool isAllASCII(const UInt8 * data, size_t size);
|
||||
|
||||
constexpr bool containsGlobs(const std::string & str)
|
||||
{
|
||||
return str.find_first_of("*?{") != std::string::npos;
|
||||
|
@ -1,14 +1,9 @@
|
||||
#include <Common/StringUtils.h>
|
||||
#include <Common/TargetSpecific.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <Common/StringUtils.h>
|
||||
|
||||
#include <widechar_width.h>
|
||||
#include <bit>
|
||||
|
||||
#if USE_MULTITARGET_CODE
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace UTF8
|
||||
@ -208,7 +203,6 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l
|
||||
|
||||
}
|
||||
|
||||
|
||||
size_t computeWidth(const UInt8 * data, size_t size, size_t prefix) noexcept
|
||||
{
|
||||
return computeWidthImpl<Width>(data, size, prefix, 0);
|
||||
@ -219,71 +213,5 @@ size_t computeBytesBeforeWidth(const UInt8 * data, size_t size, size_t prefix, s
|
||||
return computeWidthImpl<BytesBeforeLimit>(data, size, prefix, limit);
|
||||
}
|
||||
|
||||
|
||||
/// Scalar implementation of isAllASCII; the dispatcher refers to it as
/// TargetSpecific::Default::isAllASCII and uses it as the fallback.
/// Returns true when no byte in data[0 .. size) has its high bit (0x80) set.
/// For size == 0 the loop does not run and the result is true.
DECLARE_DEFAULT_CODE(
|
||||
bool isAllASCII(const UInt8 * data, size_t size)
|
||||
{
|
||||
/// Accumulate the bitwise OR of all bytes instead of testing each one.
UInt8 mask = 0;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
mask |= data[i];
|
||||
|
||||
/// The high bit of the accumulator is set iff at least one byte was >= 0x80.
return !(mask & 0x80);
|
||||
})
|
||||
|
||||
/// 128-bit SIMD implementation of isAllASCII; the dispatcher refers to it as
/// TargetSpecific::SSE42::isAllASCII.
/// Returns true when no byte in data[0 .. size) has its high bit (0x80) set.
DECLARE_SSE42_SPECIFIC_CODE(
|
||||
/// Copied from https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h
|
||||
bool isAllASCII(const UInt8 * data, size_t size)
|
||||
{
|
||||
/// Running bitwise OR of every full 16-byte chunk seen so far.
__m128i masks = _mm_setzero_si128();
|
||||
|
||||
size_t i = 0;
|
||||
/// Main loop: consume 16 bytes per iteration while a full chunk remains.
for (; i + 16 <= size; i += 16)
|
||||
{
|
||||
/// Unaligned load, so `data` needs no particular alignment.
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + i));
|
||||
masks = _mm_or_si128(masks, bytes);
|
||||
}
|
||||
/// Gather the most significant bit of each of the 16 accumulated bytes into an int.
int mask = _mm_movemask_epi8(masks);
|
||||
|
||||
/// Tail: OR together the remaining (fewer than 16) bytes one at a time.
UInt8 tail_mask = 0;
|
||||
for (; i < size; i++)
|
||||
tail_mask |= data[i];
|
||||
|
||||
/// Merge the high bit of the tail into the vector result; any set bit means non-ASCII.
mask |= (tail_mask & 0x80);
|
||||
return !mask;
|
||||
})
|
||||
|
||||
/// 256-bit SIMD implementation of isAllASCII; the dispatcher refers to it as
/// TargetSpecific::AVX2::isAllASCII.
/// Returns true when no byte in data[0 .. size) has its high bit (0x80) set.
DECLARE_AVX2_SPECIFIC_CODE(
|
||||
bool isAllASCII(const UInt8 * data, size_t size)
|
||||
{
|
||||
/// Running bitwise OR of every full 32-byte chunk seen so far.
__m256i masks = _mm256_setzero_si256();
|
||||
|
||||
size_t i = 0;
|
||||
/// Main loop: consume 32 bytes per iteration while a full chunk remains.
for (; i + 32 <= size; i += 32)
|
||||
{
|
||||
/// Unaligned load, so `data` needs no particular alignment.
__m256i bytes = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(data + i));
|
||||
masks = _mm256_or_si256(masks, bytes);
|
||||
}
|
||||
/// Gather the most significant bit of each of the 32 accumulated bytes into an int.
int mask = _mm256_movemask_epi8(masks);
|
||||
|
||||
/// Tail: OR together the remaining (fewer than 32) bytes one at a time.
UInt8 tail_mask = 0;
|
||||
for (; i < size; i++)
|
||||
tail_mask |= data[i];
|
||||
|
||||
/// Merge the high bit of the tail into the vector result; any set bit means non-ASCII.
mask |= (tail_mask & 0x80);
|
||||
return !mask;
|
||||
})
|
||||
|
||||
/// Entry point: returns true when every byte in data[0 .. size) is ASCII.
/// Forwards to one of the TargetSpecific implementations. When
/// USE_MULTITARGET_CODE is enabled, AVX2 is tried first, then SSE42,
/// each guarded by isArchSupported(); otherwise, or if neither is
/// reported as supported, the Default implementation is used.
bool isAllASCII(const UInt8* data, size_t size)
|
||||
{
|
||||
#if USE_MULTITARGET_CODE
|
||||
/// Prefer the widest vector implementation first.
if (isArchSupported(TargetArch::AVX2))
|
||||
return TargetSpecific::AVX2::isAllASCII(data, size);
|
||||
if (isArchSupported(TargetArch::SSE42))
|
||||
return TargetSpecific::SSE42::isAllASCII(data, size);
|
||||
#endif
|
||||
/// Fallback used when no specialised variant was selected above.
return TargetSpecific::Default::isAllASCII(data, size);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -136,10 +136,6 @@ size_t computeWidth(const UInt8 * data, size_t size, size_t prefix = 0) noexcept
|
||||
*/
|
||||
size_t computeBytesBeforeWidth(const UInt8 * data, size_t size, size_t prefix, size_t limit) noexcept;
|
||||
|
||||
|
||||
/// If all the characters in the string are ASCII, return true.
|
||||
bool isAllASCII(const UInt8* data, size_t size);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <Functions/LowerUpperImpl.h>
|
||||
#include <base/defines.h>
|
||||
#include <Poco/UTF8Encoding.h>
|
||||
#include <Common/StringUtils.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
|
||||
#ifdef __SSE2__
|
||||
@ -94,7 +95,7 @@ struct LowerUpperUTF8Impl
|
||||
if (data.empty())
|
||||
return;
|
||||
|
||||
bool all_ascii = UTF8::isAllASCII(data.data(), data.size());
|
||||
bool all_ascii = isAllASCII(data.data(), data.size());
|
||||
if (all_ascii)
|
||||
{
|
||||
LowerUpperImpl<not_case_lower_bound, not_case_upper_bound>::vector(data, offsets, res_data, res_offsets);
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <Common/StringUtils.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
@ -237,8 +238,8 @@ namespace
|
||||
void executeForSource(SourceStrings && strings, const ColumnPtr & column_length, const String & pad_string, StringSink & res_sink) const
|
||||
{
|
||||
const auto & chars = strings.getElements();
|
||||
bool all_ascii = UTF8::isAllASCII(reinterpret_cast<const UInt8 *>(pad_string.data()), pad_string.size())
|
||||
&& UTF8::isAllASCII(chars.data(), chars.size());
|
||||
bool all_ascii = isAllASCII(reinterpret_cast<const UInt8 *>(pad_string.data()), pad_string.size())
|
||||
&& isAllASCII(chars.data(), chars.size());
|
||||
bool is_actually_utf8 = is_utf8 && !all_ascii;
|
||||
|
||||
if (!is_actually_utf8)
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionStringToString.h>
|
||||
#include <Common/StringUtils.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include "reverse.h"
|
||||
|
||||
@ -27,7 +28,7 @@ struct ReverseUTF8Impl
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets)
|
||||
{
|
||||
bool all_ascii = UTF8::isAllASCII(data.data(), data.size());
|
||||
bool all_ascii = isAllASCII(data.data(), data.size());
|
||||
if (all_ascii)
|
||||
{
|
||||
ReverseImpl::vector(data, offsets, res_data, res_offsets);
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <Columns/ColumnConst.h>
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Common/StringUtils.h>
|
||||
#include <DataTypes/DataTypeEnum.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
@ -149,7 +150,7 @@ public:
|
||||
{
|
||||
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column_string.get()))
|
||||
{
|
||||
bool all_ascii = UTF8::isAllASCII(col->getChars().data(), col->getChars().size());
|
||||
bool all_ascii = isAllASCII(col->getChars().data(), col->getChars().size());
|
||||
if (all_ascii)
|
||||
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, StringSource(*col), input_rows_count);
|
||||
else
|
||||
@ -159,7 +160,7 @@ public:
|
||||
if (const ColumnConst * col_const = checkAndGetColumnConst<ColumnString>(column_string.get()))
|
||||
{
|
||||
StringRef str_ref = col_const->getDataAt(0);
|
||||
bool all_ascii = UTF8::isAllASCII(reinterpret_cast<const UInt8 *>(str_ref.data), str_ref.size);
|
||||
bool all_ascii = isAllASCII(reinterpret_cast<const UInt8 *>(str_ref.data), str_ref.size);
|
||||
if (all_ascii)
|
||||
return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource<StringSource>(*col_const), input_rows_count);
|
||||
else
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <Functions/PositionImpl.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <base/find_symbols.h>
|
||||
#include <Common/StringUtils.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <Common/register_objects.h>
|
||||
|
||||
@ -129,8 +130,8 @@ namespace
|
||||
res_data.reserve(str_column->getChars().size() / 2);
|
||||
res_offsets.reserve(rows);
|
||||
|
||||
bool all_ascii = UTF8::isAllASCII(str_column->getChars().data(), str_column->getChars().size())
|
||||
&& UTF8::isAllASCII(reinterpret_cast<const UInt8 *>(delim.data()), delim.size());
|
||||
bool all_ascii = isAllASCII(str_column->getChars().data(), str_column->getChars().size())
|
||||
&& isAllASCII(reinterpret_cast<const UInt8 *>(delim.data()), delim.size());
|
||||
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
|
||||
= !is_utf8 || all_ascii ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim.data(), delim.size());
|
||||
|
||||
@ -162,8 +163,8 @@ namespace
|
||||
res_data.reserve(str_column->getChars().size() / 2);
|
||||
res_offsets.reserve(rows);
|
||||
|
||||
bool all_ascii = UTF8::isAllASCII(str_column->getChars().data(), str_column->getChars().size())
|
||||
&& UTF8::isAllASCII(reinterpret_cast<const UInt8 *>(delim.data()), delim.size());
|
||||
bool all_ascii = isAllASCII(str_column->getChars().data(), str_column->getChars().size())
|
||||
&& isAllASCII(reinterpret_cast<const UInt8 *>(delim.data()), delim.size());
|
||||
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
|
||||
= !is_utf8 || all_ascii ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim.data(), delim.size());
|
||||
|
||||
@ -194,8 +195,8 @@ namespace
|
||||
res_data.reserve(str.size() * rows / 2);
|
||||
res_offsets.reserve(rows);
|
||||
|
||||
bool all_ascii = UTF8::isAllASCII(reinterpret_cast<const UInt8 *>(str.data()), str.size())
|
||||
&& UTF8::isAllASCII(reinterpret_cast<const UInt8 *>(delim.data()), delim.size());
|
||||
bool all_ascii = isAllASCII(reinterpret_cast<const UInt8 *>(str.data()), str.size())
|
||||
&& isAllASCII(reinterpret_cast<const UInt8 *>(delim.data()), delim.size());
|
||||
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
|
||||
= !is_utf8 || all_ascii ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim.data(), delim.size());
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user