#include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int ILLEGAL_COLUMN; } /** Functions that split strings into an array of strings or vice versa. * * splitByRegexp(regexp, s[, max_substrings]) */ namespace { using Pos = const char *; class SplitByRegexpImpl { private: Regexps::RegexpPtr re; OptimizedRegularExpression::MatchVec matches; Pos pos; Pos end; std::optional max_splits; size_t splits; bool max_substrings_includes_remaining_string; public: static constexpr auto name = "splitByRegexp"; static bool isVariadic() { return true; } static size_t getNumberOfArguments() { return 0; } static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {0, 2}; } static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments) { checkArgumentsWithSeparatorAndOptionalMaxSubstrings(func, arguments); } static constexpr auto strings_argument_position = 1uz; void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_) { const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); if (!col) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. " "Must be constant string.", arguments[0].column->getName(), name); if (!col->getValue().empty()) re = std::make_shared(Regexps::createRegexp(col->getValue())); max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_; max_splits = extractMaxSplits(arguments, 2); } /// Called for each next string. void set(Pos pos_, Pos end_) { pos = pos_; end = end_; splits = 0; } /// Get the next token, if any, or return false. bool get(Pos & token_begin, Pos & token_end) { if (!re) { if (pos == end) return false; token_begin = pos; if (max_splits) { if (max_substrings_includes_remaining_string) { if (splits == *max_splits - 1) { token_end = end; pos = end; return true; } } else if (splits == *max_splits) return false; } pos += 1; token_end = pos; ++splits; } else { if (!pos || pos > end) return false; token_begin = pos; if (max_splits) { if (max_substrings_includes_remaining_string) { if (splits == *max_splits - 1) { token_end = end; pos = nullptr; return true; } } else if (splits == *max_splits) return false; } if (!re->match(pos, end - pos, matches) || !matches[0].length) { token_end = end; pos = end + 1; } else { token_end = pos + matches[0].offset; pos = token_end + matches[0].length; ++splits; } } return true; } }; using FunctionSplitByRegexp = FunctionTokens; } REGISTER_FUNCTION(SplitByRegexp) { factory.registerFunction(); } }