Merge pull request #70053 from bigo-sg/regReplace-empty-needle

Allow empty needle in replaceRegexp*
This commit is contained in:
Vitaly Baranov 2024-10-21 17:45:33 +00:00 committed by GitHub
commit f41d604f28
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 78 additions and 30 deletions

View File

@ -13,7 +13,6 @@ namespace DB
namespace ErrorCodes
{
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int BAD_ARGUMENTS;
}
@ -205,7 +204,11 @@ struct ReplaceRegexpImpl
size_t input_rows_count)
{
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.assign(haystack_data);
res_offsets.assign(haystack_offsets);
return;
}
ColumnString::Offset res_offset = 0;
res_data.reserve(haystack_data.size());
@ -240,7 +243,7 @@ struct ReplaceRegexpImpl
for (size_t i = 0; i < input_rows_count; ++i)
{
size_t from = i > 0 ? haystack_offsets[i - 1] : 0;
size_t from = haystack_offsets[i - 1];
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + from);
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - from - 1);
@ -271,17 +274,24 @@ struct ReplaceRegexpImpl
for (size_t i = 0; i < input_rows_count; ++i)
{
size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
size_t hs_from = haystack_offsets[i - 1];
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + hs_from);
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - hs_from - 1);
size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0;
size_t ndl_from = needle_offsets[i - 1];
const char * ndl_data = reinterpret_cast<const char *>(needle_data.data() + ndl_from);
const size_t ndl_length = static_cast<unsigned>(needle_offsets[i] - ndl_from - 1);
std::string_view needle(ndl_data, ndl_length);
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.insert(res_data.end(), hs_data, hs_data + hs_length);
res_data.push_back(0);
res_offset += hs_length + 1;
res_offsets[i] = res_offset;
continue;
}
re2::RE2 searcher(needle, regexp_options);
if (!searcher.ok())
@ -308,7 +318,11 @@ struct ReplaceRegexpImpl
assert(haystack_offsets.size() == replacement_offsets.size());
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.assign(haystack_data);
res_offsets.assign(haystack_offsets);
return;
}
ColumnString::Offset res_offset = 0;
res_data.reserve(haystack_data.size());
@ -325,11 +339,11 @@ struct ReplaceRegexpImpl
for (size_t i = 0; i < input_rows_count; ++i)
{
size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
size_t hs_from = haystack_offsets[i - 1];
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + hs_from);
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - hs_from - 1);
size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0;
size_t repl_from = replacement_offsets[i - 1];
const char * repl_data = reinterpret_cast<const char *>(replacement_data.data() + repl_from);
const size_t repl_length = static_cast<unsigned>(replacement_offsets[i] - repl_from - 1);
std::string_view replacement(repl_data, repl_length);
@ -364,19 +378,25 @@ struct ReplaceRegexpImpl
for (size_t i = 0; i < input_rows_count; ++i)
{
size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
size_t hs_from = haystack_offsets[i - 1];
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + hs_from);
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - hs_from - 1);
size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0;
size_t ndl_from = needle_offsets[i - 1];
const char * ndl_data = reinterpret_cast<const char *>(needle_data.data() + ndl_from);
const size_t ndl_length = static_cast<unsigned>(needle_offsets[i] - ndl_from - 1);
std::string_view needle(ndl_data, ndl_length);
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.insert(res_data.end(), hs_data, hs_data + hs_length);
res_data.push_back(0);
res_offsets[i] = res_offsets[i - 1] + hs_length + 1;
res_offset = res_offsets[i];
continue;
}
size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0;
size_t repl_from = replacement_offsets[i - 1];
const char * repl_data = reinterpret_cast<const char *>(replacement_data.data() + repl_from);
const size_t repl_length = static_cast<unsigned>(replacement_offsets[i] - repl_from - 1);
std::string_view replacement(repl_data, repl_length);
@ -403,7 +423,21 @@ struct ReplaceRegexpImpl
size_t input_rows_count)
{
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
chassert(input_rows_count == haystack_data.size() / n);
/// ColumnFixedString does not store a zero byte at the end of each value, while ColumnString does.
/// So we split haystack_data into strings of length n, append one zero byte to each of them
/// and copy the result to res_data. See ColumnString.h and ColumnFixedString.h.
res_data.reserve(haystack_data.size() + input_rows_count);
res_offsets.resize(input_rows_count);
for (size_t i = 0; i < input_rows_count; ++i)
{
res_data.insert(res_data.end(), haystack_data.begin() + i * n, haystack_data.begin() + (i + 1) * n);
res_data.push_back(0);
res_offsets[i] = res_offsets[i - 1] + n + 1;
}
return;
}
ColumnString::Offset res_offset = 0;
res_data.reserve(haystack_data.size());

View File

@ -134,15 +134,30 @@
3 Hello World not_found x Hello World
4 Hello World [eo] x Hxllo World
5 Hello World . x xello World
Check that whether an exception is thrown if the needle is empty
- should not throw an exception if the needle is empty
- non-const needle, const replacement
Hexxo Worxd
Hello World
Hexlo World
Hello World
Hexxo Worxd
Hello World
Hexlo World
Hello World
- const needle, non-const replacement
Hello World
Hello World
Hello World
Hello World
Hello World
Hello World
Hello World
Hello World
- non-const needle, non-const replacement
Hexxo Worxd
Hello World
Hexlo World
Hello World
Hello World
Hello World
Hello World
Hello World
Hexxo Worxd
Hello World
Hexlo World

View File

@ -69,8 +69,7 @@ SELECT id, haystack, needle, replacement, replaceRegexpOne('Hello World', needle
DROP TABLE IF EXISTS test_tab;
SELECT 'Check that whether an exception is thrown if the needle is empty';
SELECT '- should not throw an exception if the needle is empty';
CREATE TABLE test_tab
(id UInt32, haystack String, needle String, replacement String)
@ -79,22 +78,22 @@ CREATE TABLE test_tab
INSERT INTO test_tab VALUES (1, 'Hello World', 'l', 'x') (2, 'Hello World', '', 'y');
-- needle: non-const, replacement: const
SELECT '- non-const needle, const replacement';
SELECT replaceAll(haystack, needle, 'x') FROM test_tab;
SELECT replaceOne(haystack, needle, 'x') FROM test_tab;
SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab;
SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab;
-- needle: const, replacement: non-const
SELECT '- const needle, non-const replacement';
SELECT replaceAll(haystack, '', replacement) FROM test_tab;
SELECT replaceOne(haystack, '', replacement) FROM test_tab;
SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab;
SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab;
-- needle: non-const, replacement: non-const
SELECT '- non-const needle, non-const replacement';
SELECT replaceAll(haystack, needle, replacement) FROM test_tab;
SELECT replaceOne(haystack, needle, replacement) FROM test_tab;
SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab;
SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab;
DROP TABLE IF EXISTS test_tab;