mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-30 11:32:03 +00:00
fix bugs and add uts
This commit is contained in:
parent
0de5fcfbee
commit
ae7a586aea
@ -25,287 +25,287 @@ namespace ErrorCodes
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
template <bool is_utf8>
|
template <bool is_utf8>
|
||||||
class FunctionSubstringIndex : public IFunction
|
class FunctionSubstringIndex : public IFunction
|
||||||
{
|
|
||||||
public:
|
|
||||||
static constexpr auto name = is_utf8 ? "substringIndexUTF8" : "substringIndex";
|
|
||||||
|
|
||||||
|
|
||||||
static FunctionPtr create(ContextPtr)
|
|
||||||
{
|
{
|
||||||
return std::make_shared<FunctionSubstringIndex>();
|
public:
|
||||||
}
|
static constexpr auto name = is_utf8 ? "substringIndexUTF8" : "substringIndex";
|
||||||
|
|
||||||
String getName() const override
|
|
||||||
{
|
|
||||||
return name;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t getNumberOfArguments() const override { return 3; }
|
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSubstringIndex>(); }
|
||||||
|
|
||||||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
String getName() const override { return name; }
|
||||||
|
|
||||||
bool useDefaultImplementationForConstants() const override { return true; }
|
size_t getNumberOfArguments() const override { return 3; }
|
||||||
|
|
||||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
||||||
{
|
|
||||||
if (!isString(arguments[0]))
|
|
||||||
throw Exception(
|
|
||||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
|
||||||
"Illegal type {} of first argument of function {}",
|
|
||||||
arguments[0]->getName(),
|
|
||||||
getName());
|
|
||||||
|
|
||||||
if (!isString(arguments[1]))
|
bool useDefaultImplementationForConstants() const override { return true; }
|
||||||
throw Exception(
|
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
|
||||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
|
||||||
"Illegal type {} of second argument of function {}",
|
|
||||||
arguments[1]->getName(),
|
|
||||||
getName());
|
|
||||||
|
|
||||||
if (!isNativeNumber(arguments[2]))
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of third argument of function {}",
|
|
||||||
arguments[2]->getName(), getName());
|
|
||||||
|
|
||||||
return std::make_shared<DataTypeString>();
|
|
||||||
}
|
|
||||||
|
|
||||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
|
|
||||||
{
|
|
||||||
ColumnPtr column_string = arguments[0].column;
|
|
||||||
ColumnPtr column_delim = arguments[1].column;
|
|
||||||
ColumnPtr column_index = arguments[2].column;
|
|
||||||
|
|
||||||
const ColumnConst * column_delim_const = checkAndGetColumnConst<ColumnString>(column_delim.get());
|
|
||||||
if (!column_delim_const)
|
|
||||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN , "Second argument to {} must be a constant String", getName());
|
|
||||||
|
|
||||||
String delim = column_delim_const->getValue<String>();
|
|
||||||
if constexpr (!is_utf8)
|
|
||||||
{
|
{
|
||||||
if (delim.size() != 1)
|
if (!isString(arguments[0]))
|
||||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single character", getName());
|
throw Exception(
|
||||||
}
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||||
else
|
"Illegal type {} of first argument of function {}",
|
||||||
{
|
arguments[0]->getName(),
|
||||||
if (UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(delim.data()), delim.size()) != 1)
|
getName());
|
||||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single UTF-8 character", getName());
|
|
||||||
|
if (!isString(arguments[1]))
|
||||||
|
throw Exception(
|
||||||
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||||
|
"Illegal type {} of second argument of function {}",
|
||||||
|
arguments[1]->getName(),
|
||||||
|
getName());
|
||||||
|
|
||||||
|
if (!isNativeNumber(arguments[2]))
|
||||||
|
throw Exception(
|
||||||
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||||
|
"Illegal type {} of third argument of function {}",
|
||||||
|
arguments[2]->getName(),
|
||||||
|
getName());
|
||||||
|
|
||||||
|
return std::make_shared<DataTypeString>();
|
||||||
}
|
}
|
||||||
|
|
||||||
auto column_res = ColumnString::create();
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
|
||||||
ColumnString::Chars & vec_res = column_res->getChars();
|
|
||||||
ColumnString::Offsets & offsets_res = column_res->getOffsets();
|
|
||||||
|
|
||||||
const ColumnConst * column_string_const = checkAndGetColumnConst<ColumnString>(column_string.get());
|
|
||||||
if (column_string_const)
|
|
||||||
{
|
{
|
||||||
String str = column_string_const->getValue<String>();
|
ColumnPtr column_string = arguments[0].column;
|
||||||
constantVector(str, delim, column_index.get(), vec_res, offsets_res);
|
ColumnPtr column_delim = arguments[1].column;
|
||||||
}
|
ColumnPtr column_index = arguments[2].column;
|
||||||
else
|
|
||||||
{
|
|
||||||
const auto * col_str = checkAndGetColumn<ColumnString>(column_string.get());
|
|
||||||
if (!col_str)
|
|
||||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument to {} must be a String", getName());
|
|
||||||
|
|
||||||
bool is_index_const = isColumnConst(*column_index);
|
const ColumnConst * column_delim_const = checkAndGetColumnConst<ColumnString>(column_delim.get());
|
||||||
if (is_index_const)
|
if (!column_delim_const)
|
||||||
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument to {} must be a constant String", getName());
|
||||||
|
|
||||||
|
String delim = column_delim_const->getValue<String>();
|
||||||
|
if constexpr (!is_utf8)
|
||||||
{
|
{
|
||||||
Int64 index = column_index->getInt(0);
|
if (delim.size() != 1)
|
||||||
vectorConstant(col_str->getChars(), col_str->getOffsets(), delim, index, vec_res, offsets_res);
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single character", getName());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
vectorVector(col_str->getChars(), col_str->getOffsets(), delim, column_index.get(), vec_res, offsets_res);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected:
|
|
||||||
static void vectorVector(
|
|
||||||
const ColumnString::Chars & str_data,
|
|
||||||
const ColumnString::Offsets & str_offsets,
|
|
||||||
const String & delim,
|
|
||||||
const IColumn * index_column,
|
|
||||||
ColumnString::Chars & res_data,
|
|
||||||
ColumnString::Offsets & res_offsets)
|
|
||||||
{
|
|
||||||
size_t rows = str_offsets.size();
|
|
||||||
res_data.reserve(str_data.size() / 2);
|
|
||||||
res_offsets.reserve(rows);
|
|
||||||
|
|
||||||
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
|
|
||||||
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < rows; ++i)
|
|
||||||
{
|
|
||||||
StringRef str_ref{&str_data[str_offsets[i]], str_offsets[i] - str_offsets[i - 1] - 1};
|
|
||||||
Int64 index = index_column->getInt(i);
|
|
||||||
StringRef res_ref
|
|
||||||
= !is_utf8 ? substringIndex<delim[0]>(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
|
|
||||||
appendToResultColumn(res_ref, res_data, res_offsets);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void vectorConstant(
|
|
||||||
const ColumnString::Chars & str_data,
|
|
||||||
const ColumnString::Offsets & str_offsets,
|
|
||||||
const String & delim,
|
|
||||||
Int64 index,
|
|
||||||
ColumnString::Chars & res_data,
|
|
||||||
ColumnString::Offsets & res_offsets)
|
|
||||||
{
|
|
||||||
size_t rows = str_offsets.size();
|
|
||||||
res_data.reserve(str_data.size() / 2);
|
|
||||||
res_offsets.reserve(rows);
|
|
||||||
|
|
||||||
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
|
|
||||||
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim);
|
|
||||||
|
|
||||||
for (size_t i = 0; i<rows; ++i)
|
|
||||||
{
|
|
||||||
StringRef str_ref{&str_data[str_offsets[i]], str_offsets[i] - str_offsets[i - 1] - 1};
|
|
||||||
StringRef res_ref
|
|
||||||
= !is_utf8 ? substringIndex<delim[0]>(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
|
|
||||||
appendToResultColumn(res_ref, res_data, res_offsets);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void constantVector(
|
|
||||||
const String & str,
|
|
||||||
const String & delim,
|
|
||||||
const IColumn * index_column,
|
|
||||||
ColumnString::Chars & res_data,
|
|
||||||
ColumnString::Offsets & res_offsets)
|
|
||||||
{
|
|
||||||
size_t rows = index_column->size();
|
|
||||||
res_data.reserve(str.size() * rows / 2);
|
|
||||||
res_offsets.reserve(rows);
|
|
||||||
|
|
||||||
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
|
|
||||||
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim);
|
|
||||||
|
|
||||||
StringRef str_ref{str.data(), str.size()};
|
|
||||||
for (size_t i=0; i<rows; ++i)
|
|
||||||
{
|
|
||||||
Int64 index = index_column->getInt(i);
|
|
||||||
StringRef res_ref
|
|
||||||
= !is_utf8 ? substringIndex<delim[0]>(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
|
|
||||||
appendToResultColumn(res_ref, res_data, res_offsets);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void appendToResultColumn(
|
|
||||||
const StringRef & res_ref, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
|
||||||
{
|
|
||||||
size_t res_offset = res_data.size();
|
|
||||||
res_data.resize(res_offset + res_ref.size + 1);
|
|
||||||
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], res_ref.data, res_ref.size);
|
|
||||||
res_offset += res_ref.size;
|
|
||||||
res_data[res_offset] = 0;
|
|
||||||
++res_offset;
|
|
||||||
|
|
||||||
res_offsets.emplace_back(res_offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
static StringRef substringIndexUTF8(
|
|
||||||
const PositionCaseSensitiveUTF8::SearcherInBigHaystack * searcher, const StringRef & str_ref, const String & delim, Int64 index)
|
|
||||||
{
|
|
||||||
if (index == 0)
|
|
||||||
return {str_ref.data, 0};
|
|
||||||
|
|
||||||
const auto * begin = reinterpret_cast<const UInt8 *>(str_ref.data);
|
|
||||||
const auto * end = reinterpret_cast<const UInt8 *>(str_ref.data + str_ref.size);
|
|
||||||
const auto * pos = begin;
|
|
||||||
if (index > 0)
|
|
||||||
{
|
|
||||||
Int64 i = 0;
|
|
||||||
while (i < index)
|
|
||||||
{
|
{
|
||||||
pos = searcher->search(pos, end - pos);
|
if (UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(delim.data()), delim.size()) != 1)
|
||||||
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single UTF-8 character", getName());
|
||||||
|
}
|
||||||
|
|
||||||
if (pos != end)
|
auto column_res = ColumnString::create();
|
||||||
|
ColumnString::Chars & vec_res = column_res->getChars();
|
||||||
|
ColumnString::Offsets & offsets_res = column_res->getOffsets();
|
||||||
|
|
||||||
|
const ColumnConst * column_string_const = checkAndGetColumnConst<ColumnString>(column_string.get());
|
||||||
|
if (column_string_const)
|
||||||
|
{
|
||||||
|
String str = column_string_const->getValue<String>();
|
||||||
|
constantVector(str, delim, column_index.get(), vec_res, offsets_res);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const auto * col_str = checkAndGetColumn<ColumnString>(column_string.get());
|
||||||
|
if (!col_str)
|
||||||
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument to {} must be a String", getName());
|
||||||
|
|
||||||
|
bool is_index_const = isColumnConst(*column_index);
|
||||||
|
if (is_index_const)
|
||||||
|
{
|
||||||
|
Int64 index = column_index->getInt(0);
|
||||||
|
vectorConstant(col_str, delim, index, vec_res, offsets_res);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
vectorVector(col_str, delim, column_index.get(), vec_res, offsets_res);
|
||||||
|
}
|
||||||
|
return column_res;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
static void vectorVector(
|
||||||
|
const ColumnString * str_column,
|
||||||
|
const String & delim,
|
||||||
|
const IColumn * index_column,
|
||||||
|
ColumnString::Chars & res_data,
|
||||||
|
ColumnString::Offsets & res_offsets)
|
||||||
|
{
|
||||||
|
size_t rows = str_column->size();
|
||||||
|
res_data.reserve(str_column->getChars().size() / 2);
|
||||||
|
res_offsets.reserve(rows);
|
||||||
|
|
||||||
|
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
|
||||||
|
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim.data(), delim.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < rows; ++i)
|
||||||
|
{
|
||||||
|
StringRef str_ref = str_column->getDataAt(i);
|
||||||
|
Int64 index = index_column->getInt(i);
|
||||||
|
StringRef res_ref
|
||||||
|
= !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
|
||||||
|
appendToResultColumn(res_ref, res_data, res_offsets);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void vectorConstant(
|
||||||
|
const ColumnString * str_column,
|
||||||
|
const String & delim,
|
||||||
|
Int64 index,
|
||||||
|
ColumnString::Chars & res_data,
|
||||||
|
ColumnString::Offsets & res_offsets)
|
||||||
|
{
|
||||||
|
size_t rows = str_column->size();
|
||||||
|
res_data.reserve(str_column->getChars().size() / 2);
|
||||||
|
res_offsets.reserve(rows);
|
||||||
|
|
||||||
|
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
|
||||||
|
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim.data(), delim.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < rows; ++i)
|
||||||
|
{
|
||||||
|
StringRef str_ref = str_column->getDataAt(i);
|
||||||
|
StringRef res_ref
|
||||||
|
= !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
|
||||||
|
std::cout << "result:" << res_ref.toString() << std::endl;
|
||||||
|
appendToResultColumn(res_ref, res_data, res_offsets);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void constantVector(
|
||||||
|
const String & str,
|
||||||
|
const String & delim,
|
||||||
|
const IColumn * index_column,
|
||||||
|
ColumnString::Chars & res_data,
|
||||||
|
ColumnString::Offsets & res_offsets)
|
||||||
|
{
|
||||||
|
size_t rows = index_column->size();
|
||||||
|
res_data.reserve(str.size() * rows / 2);
|
||||||
|
res_offsets.reserve(rows);
|
||||||
|
|
||||||
|
std::unique_ptr<PositionCaseSensitiveUTF8::SearcherInBigHaystack> searcher
|
||||||
|
= !is_utf8 ? nullptr : std::make_unique<PositionCaseSensitiveUTF8::SearcherInBigHaystack>(delim.data(), delim.size());
|
||||||
|
|
||||||
|
StringRef str_ref{str.data(), str.size()};
|
||||||
|
for (size_t i = 0; i < rows; ++i)
|
||||||
|
{
|
||||||
|
Int64 index = index_column->getInt(i);
|
||||||
|
StringRef res_ref
|
||||||
|
= !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index);
|
||||||
|
appendToResultColumn(res_ref, res_data, res_offsets);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void appendToResultColumn(const StringRef & res_ref, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
|
||||||
|
{
|
||||||
|
size_t res_offset = res_data.size();
|
||||||
|
res_data.resize(res_offset + res_ref.size + 1);
|
||||||
|
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], res_ref.data, res_ref.size);
|
||||||
|
res_offset += res_ref.size;
|
||||||
|
res_data[res_offset] = 0;
|
||||||
|
++res_offset;
|
||||||
|
|
||||||
|
res_offsets.emplace_back(res_offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
static StringRef substringIndexUTF8(
|
||||||
|
const PositionCaseSensitiveUTF8::SearcherInBigHaystack * searcher, const StringRef & str_ref, const String & delim, Int64 index)
|
||||||
|
{
|
||||||
|
std::cout << "str:" << str_ref.toString() << ", delim" << delim << ",index:" << index << std::endl;
|
||||||
|
|
||||||
|
if (index == 0)
|
||||||
|
return {str_ref.data, 0};
|
||||||
|
|
||||||
|
const auto * begin = reinterpret_cast<const UInt8 *>(str_ref.data);
|
||||||
|
const auto * end = reinterpret_cast<const UInt8 *>(str_ref.data + str_ref.size);
|
||||||
|
const auto * pos = begin;
|
||||||
|
if (index > 0)
|
||||||
|
{
|
||||||
|
Int64 i = 0;
|
||||||
|
while (i < index)
|
||||||
|
{
|
||||||
|
pos = searcher->search(pos, end - pos);
|
||||||
|
|
||||||
|
if (pos != end)
|
||||||
|
{
|
||||||
|
pos += delim.size();
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return str_ref;
|
||||||
|
}
|
||||||
|
return {begin, static_cast<size_t>(pos - begin - delim.size())};
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
Int64 total = 0;
|
||||||
|
while (pos < end && end != (pos = searcher->search(pos, end - pos)))
|
||||||
|
{
|
||||||
|
pos += delim.size();
|
||||||
|
++total;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (total + index < 0)
|
||||||
|
return str_ref;
|
||||||
|
|
||||||
|
Int64 index_from_left = total + 1 + index;
|
||||||
|
std::cout << "total:" << total << ", index_from_left" << index_from_left << std::endl;
|
||||||
|
pos = begin;
|
||||||
|
Int64 i = 0;
|
||||||
|
while (i < index_from_left && pos < end && end != (pos = searcher->search(pos, end - pos)))
|
||||||
{
|
{
|
||||||
pos += delim.size();
|
pos += delim.size();
|
||||||
++i;
|
++i;
|
||||||
|
std::cout << "pos offset:" << pos - begin << ", total size:" << end - begin << std::endl;
|
||||||
}
|
}
|
||||||
else
|
std::cout << "pos offset:" << pos - begin << ", size:" << end - pos << std::endl;
|
||||||
return str_ref;
|
StringRef res = {pos, static_cast<size_t>(end - pos)};
|
||||||
|
std::cout << "result:" << res.toString() << std::endl;
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
return {begin, static_cast<size_t>(pos - begin - delim.size())};
|
|
||||||
}
|
}
|
||||||
else
|
|
||||||
|
static StringRef substringIndex(const StringRef & str_ref, char delim, Int64 index)
|
||||||
{
|
{
|
||||||
Int64 total = 0;
|
std::cout << "str:" << str_ref.toString() << ", delim" << delim << ",index:" << index << std::endl;
|
||||||
while (pos < end && end != (pos = searcher->search(pos, end - pos)))
|
|
||||||
|
if (index == 0)
|
||||||
|
return {str_ref.data, 0};
|
||||||
|
|
||||||
|
if (index > 0)
|
||||||
{
|
{
|
||||||
pos += delim.size();
|
const auto * end = str_ref.data + str_ref.size;
|
||||||
++total;
|
const auto * pos = str_ref.data;
|
||||||
}
|
Int64 i = 0;
|
||||||
|
while (i < index)
|
||||||
if (total + index < 0)
|
|
||||||
return str_ref;
|
|
||||||
|
|
||||||
Int64 index_from_left = total + 1 + index;
|
|
||||||
pos = begin;
|
|
||||||
Int64 i = 0;
|
|
||||||
while (pos < end && end != (pos = searcher->search(pos, end - pos)) && i < index_from_left)
|
|
||||||
{
|
|
||||||
pos += delim.size();
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
return {pos, static_cast<size_t>(end - pos)};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <char delim>
|
|
||||||
static StringRef substringIndex(
|
|
||||||
const StringRef & str_ref,
|
|
||||||
Int64 index)
|
|
||||||
{
|
|
||||||
if (index == 0)
|
|
||||||
return {str_ref.data, 0};
|
|
||||||
|
|
||||||
if (index > 0)
|
|
||||||
{
|
|
||||||
const auto * end = str_ref.data + str_ref.size;
|
|
||||||
const auto * pos = str_ref.data;
|
|
||||||
Int64 i = 0;
|
|
||||||
while (i < index)
|
|
||||||
{
|
|
||||||
pos = find_first_symbols<delim>(pos, end);
|
|
||||||
|
|
||||||
if (pos != end)
|
|
||||||
{
|
{
|
||||||
++pos;
|
pos = std::find(pos, end, delim);
|
||||||
++i;
|
if (pos != end)
|
||||||
|
{
|
||||||
|
++pos;
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return str_ref;
|
||||||
}
|
}
|
||||||
else
|
return {str_ref.data, static_cast<size_t>(pos - str_ref.data - 1)};
|
||||||
return str_ref;
|
|
||||||
}
|
}
|
||||||
return {str_ref.data, static_cast<size_t>(pos - str_ref.data - 1)};
|
else
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
const auto * begin = str_ref.data;
|
|
||||||
const auto * pos = str_ref.data + str_ref.size;
|
|
||||||
Int64 i = 0;
|
|
||||||
while (i < index)
|
|
||||||
{
|
{
|
||||||
const auto * next_pos = ::detail::find_last_symbols_sse2<true, ::detail::ReturnMode::End, delim>(begin, pos);
|
const auto * begin = str_ref.data;
|
||||||
|
const auto * pos = str_ref.data + str_ref.size;
|
||||||
if (next_pos != pos)
|
Int64 i = 0;
|
||||||
|
while (i + index < 0)
|
||||||
{
|
{
|
||||||
pos = next_pos;
|
--pos;
|
||||||
++i;
|
while (pos >= begin && *pos != delim)
|
||||||
}
|
--pos;
|
||||||
else
|
|
||||||
return str_ref;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {pos + 1, static_cast<size_t>(str_ref.data + str_ref.size - pos - 1)};
|
if (pos >= begin)
|
||||||
|
++i;
|
||||||
|
else
|
||||||
|
return str_ref;
|
||||||
|
}
|
||||||
|
return {pos + 1, static_cast<size_t>(str_ref.data + str_ref.size - pos - 1)};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -319,4 +319,3 @@ REGISTER_FUNCTION(SubstringIndex)
|
|||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
155
tests/queries/0_stateless/02798_substring_index.reference
Normal file
155
tests/queries/0_stateless/02798_substring_index.reference
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
-- { echoOn }
|
||||||
|
select substringIndex('www.clickhouse.com', '.', -4);
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', -3);
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', -2);
|
||||||
|
clickhouse.com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', -1);
|
||||||
|
com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 0);
|
||||||
|
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 1);
|
||||||
|
www
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 2);
|
||||||
|
www.clickhouse
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 3);
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 4);
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', -4);
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', -3);
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', -2);
|
||||||
|
clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', -1);
|
||||||
|
com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 0);
|
||||||
|
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 1);
|
||||||
|
www
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 2);
|
||||||
|
www.clickhouse
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 3);
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 4);
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-4));
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-3));
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-2));
|
||||||
|
clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-1));
|
||||||
|
com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(0));
|
||||||
|
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(1));
|
||||||
|
www
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(2));
|
||||||
|
www.clickhouse
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(3));
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(4));
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(-4));
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(-3));
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(-2));
|
||||||
|
clickhouse.com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(-1));
|
||||||
|
com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(0));
|
||||||
|
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(1));
|
||||||
|
www
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(2));
|
||||||
|
www.clickhouse
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(3));
|
||||||
|
www.clickhouse.com
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(4));
|
||||||
|
www.clickhouse.com
|
||||||
|
select SUBSTRING_INDEX('www.clickhouse.com', '.', 2);
|
||||||
|
www.clickhouse
|
||||||
|
select substringIndex('www.clickhouse.com', '..', 2); -- { serverError BAD_ARGUMENTS }
|
||||||
|
select substringIndex('www.clickhouse.com', '', 2); -- { serverError BAD_ARGUMENTS }
|
||||||
|
select substringIndex('www.clickhouse.com', materialize('.'), 2); -- { serverError ILLEGAL_COLUMN }
|
||||||
|
select substringIndex('www.clickhouse.com', '.', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', -4);
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', -3);
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', -2);
|
||||||
|
民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', -1);
|
||||||
|
文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 0);
|
||||||
|
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 1);
|
||||||
|
富强
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 2);
|
||||||
|
富强,民主
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 3);
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 4);
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -4);
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -3);
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -2);
|
||||||
|
民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -1);
|
||||||
|
文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 0);
|
||||||
|
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 1);
|
||||||
|
富强
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 2);
|
||||||
|
富强,民主
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 3);
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 4);
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(-4));
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(-3));
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(-2));
|
||||||
|
民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(-1));
|
||||||
|
文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(0));
|
||||||
|
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(1));
|
||||||
|
富强
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(2));
|
||||||
|
富强,民主
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(3));
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(4));
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-4));
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-3));
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-2));
|
||||||
|
民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-1));
|
||||||
|
文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(0));
|
||||||
|
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(1));
|
||||||
|
富强
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(2));
|
||||||
|
富强,民主
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(3));
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(4));
|
||||||
|
富强,民主,文明
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',,', 2); -- { serverError BAD_ARGUMENTS }
|
||||||
|
select substringIndexUTF8('富强,民主,文明', '', 2); -- { serverError BAD_ARGUMENTS }
|
||||||
|
select substringIndexUTF8('富强,民主,文明', materialize(','), 2); -- { serverError ILLEGAL_COLUMN }
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
|
93
tests/queries/0_stateless/02798_substring_index.sql
Normal file
93
tests/queries/0_stateless/02798_substring_index.sql
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
-- { echoOn }
|
||||||
|
select substringIndex('www.clickhouse.com', '.', -4);
|
||||||
|
select substringIndex('www.clickhouse.com', '.', -3);
|
||||||
|
select substringIndex('www.clickhouse.com', '.', -2);
|
||||||
|
select substringIndex('www.clickhouse.com', '.', -1);
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 0);
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 1);
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 2);
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 3);
|
||||||
|
select substringIndex('www.clickhouse.com', '.', 4);
|
||||||
|
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', -4);
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', -3);
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', -2);
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', -1);
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 0);
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 1);
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 2);
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 3);
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', 4);
|
||||||
|
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-4));
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-3));
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-2));
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-1));
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(0));
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(1));
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(2));
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(3));
|
||||||
|
select substringIndex(materialize('www.clickhouse.com'), '.', materialize(4));
|
||||||
|
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(-4));
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(-3));
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(-2));
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(-1));
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(0));
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(1));
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(2));
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(3));
|
||||||
|
select substringIndex('www.clickhouse.com', '.', materialize(4));
|
||||||
|
|
||||||
|
select SUBSTRING_INDEX('www.clickhouse.com', '.', 2);
|
||||||
|
|
||||||
|
select substringIndex('www.clickhouse.com', '..', 2); -- { serverError BAD_ARGUMENTS }
|
||||||
|
select substringIndex('www.clickhouse.com', '', 2); -- { serverError BAD_ARGUMENTS }
|
||||||
|
select substringIndex('www.clickhouse.com', materialize('.'), 2); -- { serverError ILLEGAL_COLUMN }
|
||||||
|
select substringIndex('www.clickhouse.com', '.', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
|
||||||
|
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', -4);
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', -3);
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', -2);
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', -1);
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 0);
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 1);
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 2);
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 3);
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', 4);
|
||||||
|
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -4);
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -3);
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -2);
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', -1);
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 0);
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 1);
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 2);
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 3);
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', 4);
|
||||||
|
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(-4));
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(-3));
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(-2));
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(-1));
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(0));
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(1));
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(2));
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(3));
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', materialize(4));
|
||||||
|
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-4));
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-3));
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-2));
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-1));
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(0));
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(1));
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(2));
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(3));
|
||||||
|
select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(4));
|
||||||
|
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',,', 2); -- { serverError BAD_ARGUMENTS }
|
||||||
|
select substringIndexUTF8('富强,民主,文明', '', 2); -- { serverError BAD_ARGUMENTS }
|
||||||
|
select substringIndexUTF8('富强,民主,文明', materialize(','), 2); -- { serverError ILLEGAL_COLUMN }
|
||||||
|
select substringIndexUTF8('富强,民主,文明', ',', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
|
||||||
|
-- { echoOff }
|
Loading…
Reference in New Issue
Block a user