mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-26 17:41:59 +00:00
74 lines
2.0 KiB
C++
74 lines
2.0 KiB
C++
#include <Functions/FunctionsStringSearchToString.h>
|
|
#include <Functions/FunctionFactory.h>
|
|
#include <Functions/Regexps.h>
|
|
#include <Common/OptimizedRegularExpression.h>
|
|
|
|
|
|
namespace DB
|
|
{
|
|
namespace
|
|
{
|
|
|
|
struct ExtractImpl
|
|
{
|
|
static void vector(
|
|
const ColumnString::Chars & data,
|
|
const ColumnString::Offsets & offsets,
|
|
const std::string & pattern,
|
|
ColumnString::Chars & res_data,
|
|
ColumnString::Offsets & res_offsets)
|
|
{
|
|
res_data.reserve(data.size() / 5);
|
|
res_offsets.resize(offsets.size());
|
|
|
|
const OptimizedRegularExpression regexp = Regexps::createRegexp<false, false, false>(pattern);
|
|
|
|
unsigned capture = regexp.getNumberOfSubpatterns() > 0 ? 1 : 0;
|
|
OptimizedRegularExpression::MatchVec matches;
|
|
matches.reserve(capture + 1);
|
|
size_t prev_offset = 0;
|
|
size_t res_offset = 0;
|
|
|
|
for (size_t i = 0; i < offsets.size(); ++i)
|
|
{
|
|
size_t cur_offset = offsets[i];
|
|
|
|
unsigned count
|
|
= regexp.match(reinterpret_cast<const char *>(&data[prev_offset]), cur_offset - prev_offset - 1, matches, capture + 1);
|
|
if (count > capture && matches[capture].offset != std::string::npos)
|
|
{
|
|
const auto & match = matches[capture];
|
|
res_data.resize(res_offset + match.length + 1);
|
|
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], &data[prev_offset + match.offset], match.length);
|
|
res_offset += match.length;
|
|
}
|
|
else
|
|
{
|
|
res_data.resize(res_offset + 1);
|
|
}
|
|
|
|
res_data[res_offset] = 0;
|
|
++res_offset;
|
|
res_offsets[i] = res_offset;
|
|
|
|
prev_offset = cur_offset;
|
|
}
|
|
}
|
|
};
|
|
|
|
struct NameExtract
|
|
{
|
|
static constexpr auto name = "extract";
|
|
};
|
|
|
|
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
|
|
|
|
}
|
|
|
|
REGISTER_FUNCTION(Extract)
|
|
{
|
|
factory.registerFunction<FunctionExtract>();
|
|
}
|
|
|
|
}
|