Fix bugs and add unit tests

This commit is contained in:
taiyang-li 2023-06-28 10:45:52 +08:00
parent 0de5fcfbee
commit ae7a586aea
3 changed files with 496 additions and 249 deletions

View File

@ -32,21 +32,16 @@ public:
static constexpr auto name = is_utf8 ? "substringIndexUTF8" : "substringIndex";
/// Builds a new shared instance of this function. The ContextPtr argument is accepted but not used.
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSubstringIndex>(); }
/// Returns the static `name` of this function, which is chosen at compile time from `is_utf8`.
String getName() const override { return name; }
/// The argument count is fixed at three.
size_t getNumberOfArguments() const override
{
    return 3;
}
/// Unconditionally answers `true`; the argument descriptions passed in are not inspected.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override
{
    return true;
}
/// Unconditionally answers `true`.
bool useDefaultImplementationForConstants() const override
{
    return true;
}
/// Declares the argument at zero-based position 1 (the second argument) as always constant.
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override
{
    return ColumnNumbers{1};
}
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
@ -65,8 +60,11 @@ public:
getName());
if (!isNativeNumber(arguments[2]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of third argument of function {}",
arguments[2]->getName(), getName());
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of third argument of function {}",
arguments[2]->getName(),
getName());
return std::make_shared<DataTypeString>();
}
@ -113,59 +111,59 @@ public:
if (is_index_const)
{
Int64 index = column_index->getInt(0);
vectorConstant(col_str->getChars(), col_str->getOffsets(), delim, index, vec_res, offsets_res);
vectorConstant(col_str, delim, index, vec_res, offsets_res);
}
else
vectorVector(col_str->getChars(), col_str->getOffsets(), delim, column_index.get(), vec_res, offsets_res);
vectorVector(col_str, delim, column_index.get(), vec_res, offsets_res);
}
return column_res;
}
protected:
static void vectorVector(
const ColumnString::Chars & str_data,
const ColumnString::Offsets & str_offsets,
const ColumnString * str_column,
const String & delim,
const IColumn * index_column,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
size_t rows = str_offsets.size();
res_data.reserve(str_data.size() / 2);
size_t rows = str_column->size();
res_data.reserve(str_column->getChars().size() / 2);
res_offsets.reserve(rows);
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim);
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim.data(), delim.size());
for (size_t i = 0; i < rows; ++i)
{
StringRef str_ref{&str_data[str_offsets[i]], str_offsets[i] - str_offsets[i - 1] - 1};
StringRef str_ref = str_column->getDataAt(i);
Int64 index = index_column->getInt(i);
StringRef res_ref
= !is_utf8 ? substringIndex<delim[0]>(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
= !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
appendToResultColumn(res_ref, res_data, res_offsets);
}
}
static void vectorConstant(
const ColumnString::Chars & str_data,
const ColumnString::Offsets & str_offsets,
const ColumnString * str_column,
const String & delim,
Int64 index,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
size_t rows = str_offsets.size();
res_data.reserve(str_data.size() / 2);
size_t rows = str_column->size();
res_data.reserve(str_column->getChars().size() / 2);
res_offsets.reserve(rows);
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim);
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim.data(), delim.size());
for (size_t i = 0; i < rows; ++i)
{
StringRef str_ref{&str_data[str_offsets[i]], str_offsets[i] - str_offsets[i - 1] - 1};
StringRef str_ref = str_column->getDataAt(i);
StringRef res_ref
= !is_utf8 ? substringIndex<delim[0]>(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
= !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
std::cout << "result:" << res_ref.toString() << std::endl;
appendToResultColumn(res_ref, res_data, res_offsets);
}
}
@ -182,20 +180,19 @@ protected:
res_offsets.reserve(rows);
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim);
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim.data(), delim.size());
StringRef str_ref{str.data(), str.size()};
for (size_t i = 0; i < rows; ++i)
{
Int64 index = index_column->getInt(i);
StringRef res_ref
= !is_utf8 ? substringIndex<delim[0]>(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
= !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
appendToResultColumn(res_ref, res_data, res_offsets);
}
}
static void appendToResultColumn(
const StringRef & res_ref, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
static void appendToResultColumn(const StringRef & res_ref, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
{
size_t res_offset = res_data.size();
res_data.resize(res_offset + res_ref.size + 1);
@ -210,6 +207,8 @@ protected:
static StringRef substringIndexUTF8(
const PositionCaseSensitiveUTF8::SearcherInBigHaystack * searcher, const StringRef & str_ref, const String & delim, Int64 index)
{
std::cout << "str:" << str_ref.toString() << ", delim" << delim << ",index:" << index << std::endl;
if (index == 0)
return {str_ref.data, 0};
@ -246,22 +245,26 @@ protected:
return str_ref;
Int64 index_from_left = total + 1 + index;
std::cout << "total:" << total << ", index_from_left" << index_from_left << std::endl;
pos = begin;
Int64 i = 0;
while (pos < end && end != (pos = searcher->search(pos, end - pos)) && i < index_from_left)
while (i < index_from_left && pos < end && end != (pos = searcher->search(pos, end - pos)))
{
pos += delim.size();
++i;
std::cout << "pos offset:" << pos - begin << ", total size:" << end - begin << std::endl;
}
return {pos, static_cast<size_t>(end - pos)};
std::cout << "pos offset:" << pos - begin << ", size:" << end - pos << std::endl;
StringRef res = {pos, static_cast<size_t>(end - pos)};
std::cout << "result:" << res.toString() << std::endl;
return res;
}
}
template <char delim>
static StringRef substringIndex(
const StringRef & str_ref,
Int64 index)
static StringRef substringIndex(const StringRef & str_ref, char delim, Int64 index)
{
std::cout << "str:" << str_ref.toString() << ", delim" << delim << ",index:" << index << std::endl;
if (index == 0)
return {str_ref.data, 0};
@ -272,8 +275,7 @@ protected:
Int64 i = 0;
while (i < index)
{
pos = find_first_symbols<delim>(pos, end);
pos = std::find(pos, end, delim);
if (pos != end)
{
++pos;
@ -289,19 +291,17 @@ protected:
const auto * begin = str_ref.data;
const auto * pos = str_ref.data + str_ref.size;
Int64 i = 0;
while (i < index)
while (i + index < 0)
{
const auto * next_pos = ::detail::find_last_symbols_sse2<true, ::detail::ReturnMode::End, delim>(begin, pos);
--pos;
while (pos >= begin && *pos != delim)
--pos;
if (next_pos != pos)
{
pos = next_pos;
if (pos >= begin)
++i;
}
else
return str_ref;
}
return {pos + 1, static_cast<size_t>(str_ref.data + str_ref.size - pos - 1)};
}
}
@ -319,4 +319,3 @@ REGISTER_FUNCTION(SubstringIndex)
}

View File

@ -0,0 +1,155 @@
-- { echoOn }
select substringIndex('www.clickhouse.com', '.', -4);
www.clickhouse.com
select substringIndex('www.clickhouse.com', '.', -3);
www.clickhouse.com
select substringIndex('www.clickhouse.com', '.', -2);
clickhouse.com
select substringIndex('www.clickhouse.com', '.', -1);
com
select substringIndex('www.clickhouse.com', '.', 0);
select substringIndex('www.clickhouse.com', '.', 1);
www
select substringIndex('www.clickhouse.com', '.', 2);
www.clickhouse
select substringIndex('www.clickhouse.com', '.', 3);
www.clickhouse.com
select substringIndex('www.clickhouse.com', '.', 4);
www.clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', -4);
www.clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', -3);
www.clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', -2);
clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', -1);
com
select substringIndex(materialize('www.clickhouse.com'), '.', 0);
select substringIndex(materialize('www.clickhouse.com'), '.', 1);
www
select substringIndex(materialize('www.clickhouse.com'), '.', 2);
www.clickhouse
select substringIndex(materialize('www.clickhouse.com'), '.', 3);
www.clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', 4);
www.clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-4));
www.clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-3));
www.clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-2));
clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-1));
com
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(0));
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(1));
www
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(2));
www.clickhouse
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(3));
www.clickhouse.com
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(4));
www.clickhouse.com
select substringIndex('www.clickhouse.com', '.', materialize(-4));
www.clickhouse.com
select substringIndex('www.clickhouse.com', '.', materialize(-3));
www.clickhouse.com
select substringIndex('www.clickhouse.com', '.', materialize(-2));
clickhouse.com
select substringIndex('www.clickhouse.com', '.', materialize(-1));
com
select substringIndex('www.clickhouse.com', '.', materialize(0));
select substringIndex('www.clickhouse.com', '.', materialize(1));
www
select substringIndex('www.clickhouse.com', '.', materialize(2));
www.clickhouse
select substringIndex('www.clickhouse.com', '.', materialize(3));
www.clickhouse.com
select substringIndex('www.clickhouse.com', '.', materialize(4));
www.clickhouse.com
select SUBSTRING_INDEX('www.clickhouse.com', '.', 2);
www.clickhouse
select substringIndex('www.clickhouse.com', '..', 2); -- { serverError BAD_ARGUMENTS }
select substringIndex('www.clickhouse.com', '', 2); -- { serverError BAD_ARGUMENTS }
select substringIndex('www.clickhouse.com', materialize('.'), 2); -- { serverError ILLEGAL_COLUMN }
select substringIndex('www.clickhouse.com', '.', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select substringIndexUTF8('富强,民主,文明', ',', -4);
富强,民主,文明
select substringIndexUTF8('富强,民主,文明', ',', -3);
富强,民主,文明
select substringIndexUTF8('富强,民主,文明', ',', -2);
民主,文明
select substringIndexUTF8('富强,民主,文明', ',', -1);
文明
select substringIndexUTF8('富强,民主,文明', ',', 0);
select substringIndexUTF8('富强,民主,文明', ',', 1);
富强
select substringIndexUTF8('富强,民主,文明', ',', 2);
富强,民主
select substringIndexUTF8('富强,民主,文明', ',', 3);
富强,民主,文明
select substringIndexUTF8('富强,民主,文明', ',', 4);
富强,民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -4);
富强,民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -3);
富强,民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -2);
民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -1);
文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 0);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 1);
富强
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 2);
富强,民主
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 3);
富强,民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 4);
富强,民主,文明
select substringIndexUTF8('富强,民主,文明', ',', materialize(-4));
富强,民主,文明
select substringIndexUTF8('富强,民主,文明', ',', materialize(-3));
富强,民主,文明
select substringIndexUTF8('富强,民主,文明', ',', materialize(-2));
民主,文明
select substringIndexUTF8('富强,民主,文明', ',', materialize(-1));
文明
select substringIndexUTF8('富强,民主,文明', ',', materialize(0));
select substringIndexUTF8('富强,民主,文明', ',', materialize(1));
富强
select substringIndexUTF8('富强,民主,文明', ',', materialize(2));
富强,民主
select substringIndexUTF8('富强,民主,文明', ',', materialize(3));
富强,民主,文明
select substringIndexUTF8('富强,民主,文明', ',', materialize(4));
富强,民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-4));
富强,民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-3));
富强,民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-2));
民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-1));
文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(0));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(1));
富强
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(2));
富强,民主
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(3));
富强,民主,文明
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(4));
富强,民主,文明
select substringIndexUTF8('富强,民主,文明', ',,', 2); -- { serverError BAD_ARGUMENTS }
select substringIndexUTF8('富强,民主,文明', '', 2); -- { serverError BAD_ARGUMENTS }
select substringIndexUTF8('富强,民主,文明', materialize(','), 2); -- { serverError ILLEGAL_COLUMN }
select substringIndexUTF8('富强,民主,文明', ',', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }

View File

@ -0,0 +1,93 @@
-- { echoOn }
select substringIndex('www.clickhouse.com', '.', -4);
select substringIndex('www.clickhouse.com', '.', -3);
select substringIndex('www.clickhouse.com', '.', -2);
select substringIndex('www.clickhouse.com', '.', -1);
select substringIndex('www.clickhouse.com', '.', 0);
select substringIndex('www.clickhouse.com', '.', 1);
select substringIndex('www.clickhouse.com', '.', 2);
select substringIndex('www.clickhouse.com', '.', 3);
select substringIndex('www.clickhouse.com', '.', 4);
select substringIndex(materialize('www.clickhouse.com'), '.', -4);
select substringIndex(materialize('www.clickhouse.com'), '.', -3);
select substringIndex(materialize('www.clickhouse.com'), '.', -2);
select substringIndex(materialize('www.clickhouse.com'), '.', -1);
select substringIndex(materialize('www.clickhouse.com'), '.', 0);
select substringIndex(materialize('www.clickhouse.com'), '.', 1);
select substringIndex(materialize('www.clickhouse.com'), '.', 2);
select substringIndex(materialize('www.clickhouse.com'), '.', 3);
select substringIndex(materialize('www.clickhouse.com'), '.', 4);
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-4));
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-3));
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-2));
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-1));
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(0));
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(1));
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(2));
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(3));
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(4));
select substringIndex('www.clickhouse.com', '.', materialize(-4));
select substringIndex('www.clickhouse.com', '.', materialize(-3));
select substringIndex('www.clickhouse.com', '.', materialize(-2));
select substringIndex('www.clickhouse.com', '.', materialize(-1));
select substringIndex('www.clickhouse.com', '.', materialize(0));
select substringIndex('www.clickhouse.com', '.', materialize(1));
select substringIndex('www.clickhouse.com', '.', materialize(2));
select substringIndex('www.clickhouse.com', '.', materialize(3));
select substringIndex('www.clickhouse.com', '.', materialize(4));
select SUBSTRING_INDEX('www.clickhouse.com', '.', 2);
select substringIndex('www.clickhouse.com', '..', 2); -- { serverError BAD_ARGUMENTS }
select substringIndex('www.clickhouse.com', '', 2); -- { serverError BAD_ARGUMENTS }
select substringIndex('www.clickhouse.com', materialize('.'), 2); -- { serverError ILLEGAL_COLUMN }
select substringIndex('www.clickhouse.com', '.', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
select substringIndexUTF8('富强,民主,文明', ',', -4);
select substringIndexUTF8('富强,民主,文明', ',', -3);
select substringIndexUTF8('富强,民主,文明', ',', -2);
select substringIndexUTF8('富强,民主,文明', ',', -1);
select substringIndexUTF8('富强,民主,文明', ',', 0);
select substringIndexUTF8('富强,民主,文明', ',', 1);
select substringIndexUTF8('富强,民主,文明', ',', 2);
select substringIndexUTF8('富强,民主,文明', ',', 3);
select substringIndexUTF8('富强,民主,文明', ',', 4);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -4);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -3);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -2);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -1);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 0);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 1);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 2);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 3);
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 4);
select substringIndexUTF8('富强,民主,文明', ',', materialize(-4));
select substringIndexUTF8('富强,民主,文明', ',', materialize(-3));
select substringIndexUTF8('富强,民主,文明', ',', materialize(-2));
select substringIndexUTF8('富强,民主,文明', ',', materialize(-1));
select substringIndexUTF8('富强,民主,文明', ',', materialize(0));
select substringIndexUTF8('富强,民主,文明', ',', materialize(1));
select substringIndexUTF8('富强,民主,文明', ',', materialize(2));
select substringIndexUTF8('富强,民主,文明', ',', materialize(3));
select substringIndexUTF8('富强,民主,文明', ',', materialize(4));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-4));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-3));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-2));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-1));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(0));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(1));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(2));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(3));
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(4));
select substringIndexUTF8('富强,民主,文明', ',,', 2); -- { serverError BAD_ARGUMENTS }
select substringIndexUTF8('富强,民主,文明', '', 2); -- { serverError BAD_ARGUMENTS }
select substringIndexUTF8('富强,民主,文明', materialize(','), 2); -- { serverError ILLEGAL_COLUMN }
select substringIndexUTF8('富强,民主,文明', ',', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
-- { echoOff }