ClickHouse/src/Functions/extract.cpp
2023-09-14 16:40:42 +00:00

74 lines
2.0 KiB
C++

#include <Functions/FunctionsStringSearchToString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/Regexps.h>
#include <Common/OptimizedRegularExpression.h>
namespace DB
{
namespace
{
struct ExtractImpl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & pattern,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
res_data.reserve(data.size() / 5);
res_offsets.resize(offsets.size());
const OptimizedRegularExpression regexp = Regexps::createRegexp<false, false, false>(pattern);
unsigned capture = regexp.getNumberOfSubpatterns() > 0 ? 1 : 0;
OptimizedRegularExpression::MatchVec matches;
matches.reserve(capture + 1);
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
size_t cur_offset = offsets[i];
unsigned count
= regexp.match(reinterpret_cast<const char *>(&data[prev_offset]), cur_offset - prev_offset - 1, matches, capture + 1);
if (count > capture && matches[capture].offset != std::string::npos)
{
const auto & match = matches[capture];
res_data.resize(res_offset + match.length + 1);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], &data[prev_offset + match.offset], match.length);
res_offset += match.length;
}
else
{
res_data.resize(res_offset + 1);
}
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
struct NameExtract
{
static constexpr auto name = "extract";
};
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
}
REGISTER_FUNCTION(Extract)
{
factory.registerFunction<FunctionExtract>();
}
}