Merge pull request #45382 from bigo-sg/improve_position

Add fastpath for function position when needle is empty
This commit is contained in:
Robert Schulze 2023-01-31 11:22:31 +01:00 committed by GitHub
commit fdd6c77480
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 89 additions and 3 deletions

View File

@ -95,6 +95,32 @@ Result:
└───────────────────────────────┘
```
Note: If argument `needle` is empty, the following rules apply:
- if no `start_pos` was specified: return 1
- if `start_pos` = 0: return 1
- if `start_pos` >= 1 and `start_pos` <= length(haystack) + 1: return `start_pos`
- otherwise: return 0
The same rules also apply to functions `positionCaseInsensitive`, `positionUTF8` and `positionCaseInsensitiveUTF8`.
``` sql
SELECT
position('abc', ''),
position('abc', '', 0),
position('abc', '', 1),
position('abc', '', 2),
position('abc', '', 3),
position('abc', '', 4),
position('abc', '', 5)
```
``` text
┌─position('abc', '')─┬─position('abc', '', 0)─┬─position('abc', '', 1)─┬─position('abc', '', 2)─┬─position('abc', '', 3)─┬─position('abc', '', 4)─┬─position('abc', '', 5)─┐
│                   1 │                      1 │                      1 │                      2 │                      3 │                      4 │                      0 │
└─────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┘
```
**Examples for POSITION(needle IN haystack) syntax**
Query:

View File

@ -113,7 +113,7 @@ struct PositionCaseSensitiveUTF8
static const char * advancePos(const char * pos, const char * end, size_t n)
{
for (auto it = pos; it != end; ++it)
for (const auto *it = pos; it != end; ++it)
{
if (!UTF8::isContinuationOctet(static_cast<UInt8>(*it)))
{
@ -128,7 +128,7 @@ struct PositionCaseSensitiveUTF8
static size_t countChars(const char * begin, const char * end)
{
size_t res = 0;
for (auto it = begin; it != end; ++it)
for (const auto *it = begin; it != end; ++it)
if (!UTF8::isContinuationOctet(static_cast<UInt8>(*it)))
++res;
return res;
@ -202,6 +202,53 @@ struct PositionImpl
const UInt8 * const end = haystack_data.data() + haystack_data.size();
const UInt8 * pos = begin;
/// Empty needle: no search is required, the result follows from start_pos and the haystack length alone.
if (needle.empty())
{
    /// Without a start_pos argument every row yields 1.
    if (start_pos == nullptr)
    {
        for (auto & position : res)
            position = 1;
        return;
    }

    /// Counts the characters of one haystack row. The row occupies the bytes from `first_byte_offset` up to
    /// haystack_offsets[row_index]; the last of those bytes is left out of the count (presumably the
    /// trailing zero byte stored after each string; confirm against the ColumnString layout).
    auto count_row_chars = [&](size_t row_index, ColumnString::Offset first_byte_offset) -> size_t
    {
        const char * first = reinterpret_cast<const char *>(begin + first_byte_offset);
        const char * last = reinterpret_cast<const char *>(begin + haystack_offsets[row_index] - 1);
        return Impl::countChars(first, last);
    };

    const size_t row_count = haystack_offsets.size();
    ColumnString::Offset row_start = 0;

    const ColumnConst * const_start_pos = typeid_cast<const ColumnConst *>(&*start_pos);
    if (const_start_pos != nullptr)
    {
        /// Constant start_pos: read it once for all rows; a start_pos of 0 is treated like 1.
        const UInt64 start = std::max(const_start_pos->getUInt(0), UInt64(1));
        for (size_t row = 0; row < row_count; ++row)
        {
            const size_t haystack_chars = count_row_chars(row, row_start);

            /// Every position up to one past the last character is a match; anything beyond gives 0.
            if (start <= haystack_chars + 1)
                res[row] = start;
            else
                res[row] = 0;

            row_start = haystack_offsets[row];
        }
        return;
    }

    /// Non-constant start_pos: same rule, but start_pos is read separately for each row.
    for (size_t row = 0; row < row_count; ++row)
    {
        const size_t haystack_chars = count_row_chars(row, row_start);
        const UInt64 start = std::max(UInt64(1), start_pos->getUInt(row));

        if (start <= haystack_chars + 1)
            res[row] = start;
        else
            res[row] = 0;

        row_start = haystack_offsets[row];
    }
    return;
}
/// Current index in the array of strings.
size_t i = 0;
@ -253,7 +300,7 @@ struct PositionImpl
{
auto start = std::max(start_pos, UInt64(1));
if (needle.size() == 0)
if (needle.empty())
{
size_t haystack_size = Impl::countChars(data.data(), data.data() + data.size());
res = start <= haystack_size + 1 ? start : 0;

View File

@ -0,0 +1,13 @@
<test>
<!-- Performance queries for position, positionCaseInsensitive, positionUTF8 and positionCaseInsensitiveUTF8
     called with an empty needle: each function is run once without start_pos and once with start_pos = 10.
     The haystack literal is wrapped in materialize() (presumably so that it is processed as a full column
     rather than as a constant; confirm), and the output is discarded with "format Null". -->
<query>select position(materialize('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'), '') from numbers(100000000) format Null</query>
<query>select position(materialize('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'), '', 10) from numbers(100000000) format Null</query>
<query>select positionCaseInsensitive(materialize('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'), '') from numbers(100000000) format Null</query>
<query>select positionCaseInsensitive(materialize('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'), '', 10) from numbers(100000000) format Null</query>
<query>select positionUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), '') from numbers(100000000) format Null</query>
<query>select positionUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), '', 10) from numbers(100000000) format Null</query>
<query>select positionCaseInsensitiveUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), '') from numbers(100000000) format Null</query>
<query>select positionCaseInsensitiveUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), '', 10) from numbers(100000000) format Null</query>
</test>