mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-03 13:02:00 +00:00
123 lines
3.3 KiB
C++
123 lines
3.3 KiB
C++
#include <Columns/ColumnArray.h>
|
|
#include <Columns/ColumnConst.h>
|
|
#include <Columns/ColumnString.h>
|
|
#include <DataTypes/DataTypeArray.h>
|
|
#include <DataTypes/DataTypeNullable.h>
|
|
#include <DataTypes/DataTypeString.h>
|
|
#include <Functions/FunctionHelpers.h>
|
|
#include <Functions/FunctionTokens.h>
|
|
#include <Functions/FunctionFactory.h>
|
|
#include <Functions/Regexps.h>
|
|
#include <Interpreters/Context.h>
|
|
#include <Common/StringUtils/StringUtils.h>
|
|
#include <Common/assert_cast.h>
|
|
#include <Common/typeid_cast.h>
|
|
|
|
|
|
namespace DB
|
|
{
|
|
|
|
namespace ErrorCodes
|
|
{
|
|
extern const int ILLEGAL_COLUMN;
|
|
}
|
|
|
|
|
|
/** Functions that split strings into an array of strings or vice versa.
  *
  * extractAll(s, regexp) - select from the string all subsequences that match the regexp. Each element is:
  * - the first subpattern, if the regexp has a subpattern;
  * - otherwise, the zero subpattern (the whole matched part).
  * If nothing matches, the result is an empty array.
  */
|
|
namespace
|
|
{
|
|
|
|
using Pos = const char *;
|
|
|
|
class ExtractAllImpl
|
|
{
|
|
private:
|
|
Regexps::RegexpPtr re;
|
|
OptimizedRegularExpression::MatchVec matches;
|
|
size_t capture;
|
|
|
|
Pos pos;
|
|
Pos end;
|
|
public:
|
|
static constexpr auto name = "extractAll";
|
|
static String getName() { return name; }
|
|
static bool isVariadic() { return false; }
|
|
static size_t getNumberOfArguments() { return 2; }
|
|
|
|
static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
|
|
{
|
|
FunctionArgumentDescriptors mandatory_args{
|
|
{"haystack", &isString<IDataType>, nullptr, "String"},
|
|
{"pattern", &isString<IDataType>, isColumnConst, "const String"}
|
|
};
|
|
|
|
validateFunctionArgumentTypes(func, arguments, mandatory_args);
|
|
}
|
|
|
|
static constexpr auto strings_argument_position = 0uz;
|
|
|
|
void init(const ColumnsWithTypeAndName & arguments, bool /*max_substrings_includes_remaining_string*/)
|
|
{
|
|
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
|
|
|
|
if (!col)
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. "
|
|
"Must be constant string.", arguments[1].column->getName(), getName());
|
|
|
|
re = std::make_shared<OptimizedRegularExpression>(Regexps::createRegexp<false, false, false>(col->getValue<String>()));
|
|
capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0;
|
|
|
|
matches.resize(capture + 1);
|
|
}
|
|
|
|
/// Called for each next string.
|
|
void set(Pos pos_, Pos end_)
|
|
{
|
|
pos = pos_;
|
|
end = end_;
|
|
}
|
|
|
|
/// Get the next token, if any, or return false.
|
|
bool get(Pos & token_begin, Pos & token_end)
|
|
{
|
|
if (!pos || pos > end)
|
|
return false;
|
|
|
|
if (!re->match(pos, end - pos, matches) || !matches[0].length)
|
|
return false;
|
|
|
|
if (matches[capture].offset == std::string::npos)
|
|
{
|
|
/// Empty match.
|
|
token_begin = pos;
|
|
token_end = pos;
|
|
}
|
|
else
|
|
{
|
|
token_begin = pos + matches[capture].offset;
|
|
token_end = token_begin + matches[capture].length;
|
|
}
|
|
|
|
pos += matches[0].offset + matches[0].length;
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
/// The `extractAll` function type: the FunctionTokens template instantiated with ExtractAllImpl.
using FunctionExtractAll = FunctionTokens<ExtractAllImpl>;
|
|
|
|
}
|
|
|
|
/// Registration hook: adds FunctionExtractAll to the function factory.
REGISTER_FUNCTION(ExtractAll)
{
    factory.registerFunction<FunctionExtractAll>();
}
|
|
|
|
}
|