mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
Merge pull request #45382 from bigo-sg/improve_position
Add fastpath for function position when needle is empty
This commit is contained in:
commit
fdd6c77480
@ -95,6 +95,32 @@ Result:
|
||||
└───────────────────────────────┘
|
||||
```
|
||||
|
||||
Note: If the argument `needle` is empty, the following rules apply:
|
||||
- if no start_pos was specified: return 1
|
||||
- if start_pos = 0: return 1
|
||||
- if start_pos >= 1 and start_pos <= length(haystack) + 1: return start_pos
|
||||
- otherwise: return 0
|
||||
|
||||
The same rules also apply to the functions `positionCaseInsensitive`, `positionUTF8` and `positionCaseInsensitiveUTF8`.
|
||||
|
||||
``` sql
|
||||
SELECT
|
||||
position('abc', ''),
|
||||
position('abc', '', 0),
|
||||
position('abc', '', 1),
|
||||
position('abc', '', 2),
|
||||
position('abc', '', 3),
|
||||
position('abc', '', 4),
|
||||
position('abc', '', 5)
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─position('abc', '')─┬─position('abc', '', 0)─┬─position('abc', '', 1)─┬─position('abc', '', 2)─┬─position('abc', '', 3)─┬─position('abc', '', 4)─┬─position('abc', '', 5)─┐
|
||||
│ 1 │ 1 │ 1 │ 2 │ 3 │ 4 │ 0 │
|
||||
└─────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┘
|
||||
```
|
||||
|
||||
|
||||
**Examples for POSITION(needle IN haystack) syntax**
|
||||
|
||||
Query:
|
||||
|
@ -113,7 +113,7 @@ struct PositionCaseSensitiveUTF8
|
||||
|
||||
static const char * advancePos(const char * pos, const char * end, size_t n)
|
||||
{
|
||||
for (auto it = pos; it != end; ++it)
|
||||
for (const auto *it = pos; it != end; ++it)
|
||||
{
|
||||
if (!UTF8::isContinuationOctet(static_cast<UInt8>(*it)))
|
||||
{
|
||||
@ -128,7 +128,7 @@ struct PositionCaseSensitiveUTF8
|
||||
static size_t countChars(const char * begin, const char * end)
|
||||
{
|
||||
size_t res = 0;
|
||||
for (auto it = begin; it != end; ++it)
|
||||
for (const auto *it = begin; it != end; ++it)
|
||||
if (!UTF8::isContinuationOctet(static_cast<UInt8>(*it)))
|
||||
++res;
|
||||
return res;
|
||||
@ -202,6 +202,53 @@ struct PositionImpl
|
||||
const UInt8 * const end = haystack_data.data() + haystack_data.size();
|
||||
const UInt8 * pos = begin;
|
||||
|
||||
/// Fastpath when needle is empty
|
||||
if (needle.empty())
|
||||
{
|
||||
/// When needle is empty and start_pos doesn't exist, always return 1
|
||||
if (start_pos == nullptr)
|
||||
{
|
||||
for (auto & x : res)
|
||||
x = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
ColumnString::Offset prev_offset = 0;
|
||||
size_t rows = haystack_offsets.size();
|
||||
|
||||
if (const ColumnConst * start_pos_const = typeid_cast<const ColumnConst *>(&*start_pos))
|
||||
{
|
||||
/// When needle is empty and start_pos is constant
|
||||
UInt64 start = std::max(start_pos_const->getUInt(0), UInt64(1));
|
||||
for (size_t i = 0; i < rows; ++i)
|
||||
{
|
||||
size_t haystack_size = Impl::countChars(
|
||||
reinterpret_cast<const char *>(pos), reinterpret_cast<const char *>(pos + haystack_offsets[i] - prev_offset - 1));
|
||||
res[i] = start <= haystack_size + 1 ? start : 0;
|
||||
|
||||
pos = begin + haystack_offsets[i];
|
||||
prev_offset = haystack_offsets[i];
|
||||
}
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
/// When needle is empty and start_pos is not constant
|
||||
for (size_t i = 0; i < rows; ++i)
|
||||
{
|
||||
size_t haystack_size = Impl::countChars(
|
||||
reinterpret_cast<const char *>(pos), reinterpret_cast<const char *>(pos + haystack_offsets[i] - prev_offset - 1));
|
||||
UInt64 start = start_pos->getUInt(i);
|
||||
start = std::max(UInt64(1), start);
|
||||
res[i] = start <= haystack_size + 1 ? start : 0;
|
||||
|
||||
pos = begin + haystack_offsets[i];
|
||||
prev_offset = haystack_offsets[i];
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/// Current index in the array of strings.
|
||||
size_t i = 0;
|
||||
|
||||
@ -253,7 +300,7 @@ struct PositionImpl
|
||||
{
|
||||
auto start = std::max(start_pos, UInt64(1));
|
||||
|
||||
if (needle.size() == 0)
|
||||
if (needle.empty())
|
||||
{
|
||||
size_t haystack_size = Impl::countChars(data.data(), data.data() + data.size());
|
||||
res = start <= haystack_size + 1 ? start : 0;
|
||||
|
13
tests/performance/position_empty_needle.xml
Normal file
13
tests/performance/position_empty_needle.xml
Normal file
@ -0,0 +1,13 @@
|
||||
<test>
|
||||
<query>select position(materialize('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'), '') from numbers(100000000) format Null</query>
|
||||
<query>select position(materialize('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'), '', 10) from numbers(100000000) format Null</query>
|
||||
|
||||
<query>select positionCaseInsensitive(materialize('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'), '') from numbers(100000000) format Null</query>
|
||||
<query>select positionCaseInsensitive(materialize('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'), '', 10) from numbers(100000000) format Null</query>
|
||||
|
||||
<query>select positionUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), '') from numbers(100000000) format Null</query>
|
||||
<query>select positionUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), '', 10) from numbers(100000000) format Null</query>
|
||||
|
||||
<query>select positionCaseInsensitiveUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), '') from numbers(100000000) format Null</query>
|
||||
<query>select positionCaseInsensitiveUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), '', 10) from numbers(100000000) format Null</query>
|
||||
</test>
|
Loading…
Reference in New Issue
Block a user