mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 07:01:59 +00:00
extractAllGroupsHorizontal and extractAllGroupsVertical
Split tests, fixed some error messages Fixed test and error reporting of extractGroups
This commit is contained in:
parent
5d3383edbe
commit
50a184acac
@ -14,11 +14,6 @@ static bool operator==(const IDataType & left, const IDataType & right)
|
||||
return left.equals(right);
|
||||
}
|
||||
|
||||
std::ostream & operator<<(std::ostream & ostr, const IDataType & dt)
|
||||
{
|
||||
return ostr << dt.getName();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
using namespace DB;
|
||||
|
@ -1,127 +0,0 @@
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Columns/ColumnConst.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/Regexps.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
|
||||
/** Match all groups of given input string with given re, return array of arrays of matches.
|
||||
*
|
||||
* SELECT extractAllGroups('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
|
||||
* should produce:
|
||||
* [['abc', '111'], ['def', '222'], ['ghi', '333']]
|
||||
*/
|
||||
class FunctionExtractAllGroups : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "extractAllGroups";
|
||||
static FunctionPtr create(const Context &) { return std::make_shared<FunctionExtractAllGroups>(); }
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 2; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return false; }
|
||||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
{
|
||||
FunctionArgumentDescriptors args{
|
||||
{"haystack", isStringOrFixedString, nullptr, "const String or const FixedString"},
|
||||
{"needle", isStringOrFixedString, isColumnConst, "const String or const FixedString"},
|
||||
};
|
||||
validateFunctionArgumentTypes(*this, arguments, args);
|
||||
|
||||
/// Two-dimensional array of strings, each `row` of top array represents matching groups.
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()));
|
||||
}
|
||||
|
||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
|
||||
{
|
||||
const ColumnPtr column_haystack = block.getByPosition(arguments[0]).column;
|
||||
const ColumnPtr column_needle = block.getByPosition(arguments[1]).column;
|
||||
|
||||
const auto needle = typeid_cast<const ColumnConst &>(*column_needle).getValue<String>();
|
||||
|
||||
if (needle.empty())
|
||||
throw Exception(getName() + " length of 'needle' argument must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
|
||||
|
||||
const auto regexp = Regexps::get<false, false>(needle);
|
||||
const auto & re2 = regexp->getRE2();
|
||||
|
||||
if (!re2)
|
||||
throw Exception("There is no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
const size_t groups_count = re2->NumberOfCapturingGroups();
|
||||
|
||||
if (!groups_count)
|
||||
throw Exception("There is no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
// Including 0-group, which is the whole regexp.
|
||||
PODArrayWithStackMemory<re2_st::StringPiece, 128> matched_groups(groups_count + 1);
|
||||
|
||||
ColumnArray::ColumnOffsets::MutablePtr root_offsets_col = ColumnArray::ColumnOffsets::create();
|
||||
ColumnArray::ColumnOffsets::MutablePtr nested_offsets_col = ColumnArray::ColumnOffsets::create();
|
||||
ColumnString::MutablePtr data_col = ColumnString::create();
|
||||
|
||||
auto & root_offsets_data = root_offsets_col->getData();
|
||||
auto & nested_offsets_data = nested_offsets_col->getData();
|
||||
|
||||
root_offsets_data.resize(input_rows_count);
|
||||
ColumnArray::Offset current_root_offset = 0;
|
||||
ColumnArray::Offset current_nested_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
StringRef current_row = column_haystack->getDataAt(i);
|
||||
|
||||
// Extract all non-intersecting matches from haystack except group #0.
|
||||
const auto * pos = current_row.data;
|
||||
const auto * end = pos + current_row.size;
|
||||
while (pos < end
|
||||
&& re2->Match(re2_st::StringPiece(pos, end - pos),
|
||||
0, end - pos, re2_st::RE2::UNANCHORED, matched_groups.data(), matched_groups.size()))
|
||||
{
|
||||
// 1 is to exclude group #0 which is whole re match.
|
||||
for (size_t group = 1; group <= groups_count; ++group)
|
||||
data_col->insertData(matched_groups[group].data(), matched_groups[group].size());
|
||||
|
||||
pos = matched_groups[0].data() + matched_groups[0].size();
|
||||
|
||||
current_nested_offset += groups_count;
|
||||
nested_offsets_data.push_back(current_nested_offset);
|
||||
|
||||
++current_root_offset;
|
||||
}
|
||||
|
||||
root_offsets_data[i] = current_root_offset;
|
||||
}
|
||||
ColumnArray::MutablePtr nested_array_col = ColumnArray::create(std::move(data_col), std::move(nested_offsets_col));
|
||||
ColumnArray::MutablePtr root_array_col = ColumnArray::create(std::move(nested_array_col), std::move(root_offsets_col));
|
||||
block.getByPosition(result).column = std::move(root_array_col);
|
||||
}
|
||||
};
|
||||
|
||||
/// Adds FunctionExtractAllGroups to the function factory.
void registerFunctionExtractAllGroups(FunctionFactory & factory)
{
    using Function = FunctionExtractAllGroups;
    factory.registerFunction<Function>();
}
|
||||
|
||||
}
|
238
src/Functions/extractAllGroups.h
Normal file
238
src/Functions/extractAllGroups.h
Normal file
@ -0,0 +1,238 @@
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Columns/ColumnConst.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/IFunctionImpl.h>
|
||||
#include <Functions/Regexps.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <Core/iostream_debug_helpers.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
|
||||
/// Layout of the two-dimensional result produced by FunctionExtractAllGroups.
enum class ExtractAllGroupsResultKind
{
    /// Matches in order of appearance: one inner array per match, holding that match's groups.
    VERTICAL,
    /// Grouped by group id: one inner array per group, holding that group's value from every match.
    HORIZONTAL,
};
|
||||
|
||||
|
||||
/** Match all groups of given input string with given re, return array of arrays of matches.
|
||||
*
|
||||
* Depending on `Impl::Kind`, result is either grouped by grop id (Horizontal) or in order of appearance (Vertical):
|
||||
*
|
||||
* SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
|
||||
* =>
|
||||
* [['abc', '111'], ['def', '222'], ['ghi', '333']]
|
||||
*
|
||||
* SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
|
||||
* =>
|
||||
* [['abc', 'def', 'ghi'], ['111', '222', '333']
|
||||
*/
|
||||
template <typename Impl>
|
||||
class FunctionExtractAllGroups : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto Kind = Impl::Kind;
|
||||
static constexpr auto name = Impl::Name;
|
||||
|
||||
static FunctionPtr create(const Context &) { return std::make_shared<FunctionExtractAllGroups>(); }
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 2; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return false; }
|
||||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
{
|
||||
FunctionArgumentDescriptors args{
|
||||
{"haystack", isStringOrFixedString, nullptr, "const String or const FixedString"},
|
||||
{"needle", isStringOrFixedString, isColumnConst, "const String or const FixedString"},
|
||||
};
|
||||
validateFunctionArgumentTypes(*this, arguments, args);
|
||||
|
||||
/// Two-dimensional array of strings, each `row` of top array represents matching groups.
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()));
|
||||
}
|
||||
|
||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
|
||||
{
|
||||
static const auto MAX_GROUPS_COUNT = 128;
|
||||
|
||||
const ColumnPtr column_haystack = block.getByPosition(arguments[0]).column;
|
||||
const ColumnPtr column_needle = block.getByPosition(arguments[1]).column;
|
||||
|
||||
const auto needle = typeid_cast<const ColumnConst &>(*column_needle).getValue<String>();
|
||||
|
||||
if (needle.empty())
|
||||
throw Exception("Length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
using StringPiece = typename Regexps::Regexp::StringPieceType;
|
||||
const auto & regexp = Regexps::get<false, false>(needle)->getRE2();
|
||||
|
||||
if (!regexp)
|
||||
throw Exception("There are no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
const size_t groups_count = regexp->NumberOfCapturingGroups();
|
||||
|
||||
if (!groups_count)
|
||||
throw Exception("There are no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
if (groups_count > MAX_GROUPS_COUNT - 1)
|
||||
throw Exception("Too many groups in regexp: " + std::to_string(groups_count)
|
||||
+ ", max: " + std::to_string(MAX_GROUPS_COUNT - 1),
|
||||
ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
// Including 0-group, which is the whole regexp.
|
||||
PODArrayWithStackMemory<StringPiece, MAX_GROUPS_COUNT> matched_groups(groups_count + 1);
|
||||
|
||||
ColumnArray::ColumnOffsets::MutablePtr root_offsets_col = ColumnArray::ColumnOffsets::create();
|
||||
ColumnArray::ColumnOffsets::MutablePtr nested_offsets_col = ColumnArray::ColumnOffsets::create();
|
||||
ColumnString::MutablePtr data_col = ColumnString::create();
|
||||
|
||||
auto & root_offsets_data = root_offsets_col->getData();
|
||||
auto & nested_offsets_data = nested_offsets_col->getData();
|
||||
|
||||
ColumnArray::Offset current_root_offset = 0;
|
||||
ColumnArray::Offset current_nested_offset = 0;
|
||||
|
||||
if constexpr (Kind == ExtractAllGroupsResultKind::VERTICAL)
|
||||
{
|
||||
root_offsets_data.resize(input_rows_count);
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
StringRef current_row = column_haystack->getDataAt(i);
|
||||
|
||||
// Extract all non-intersecting matches from haystack except group #0.
|
||||
const auto * pos = current_row.data;
|
||||
const auto * end = pos + current_row.size;
|
||||
while (pos < end
|
||||
&& regexp->Match({pos, static_cast<size_t>(end - pos)},
|
||||
0, end - pos, regexp->UNANCHORED, matched_groups.data(), matched_groups.size()))
|
||||
{
|
||||
// 1 is to exclude group #0 which is whole re match.
|
||||
for (size_t group = 1; group <= groups_count; ++group)
|
||||
data_col->insertData(matched_groups[group].data(), matched_groups[group].size());
|
||||
|
||||
pos = matched_groups[0].data() + matched_groups[0].size();
|
||||
|
||||
current_nested_offset += groups_count;
|
||||
nested_offsets_data.push_back(current_nested_offset);
|
||||
|
||||
++current_root_offset;
|
||||
}
|
||||
|
||||
root_offsets_data[i] = current_root_offset;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::vector<StringPiece> all_matches;
|
||||
// number of times RE matched on each row of haystack column.
|
||||
std::vector<size_t> number_of_matches_per_row;
|
||||
|
||||
// we expect RE to match multiple times on each row, `* 8` is arbitrary to reduce number of re-allocations.
|
||||
all_matches.reserve(input_rows_count * groups_count * 8);
|
||||
number_of_matches_per_row.reserve(input_rows_count);
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
size_t matches_per_row = 0;
|
||||
|
||||
const auto & current_row = column_haystack->getDataAt(i);
|
||||
|
||||
// Extract all non-intersecting matches from haystack except group #0.
|
||||
const auto * pos = current_row.data;
|
||||
const auto * end = pos + current_row.size;
|
||||
while (pos < end
|
||||
&& regexp->Match({pos, static_cast<size_t>(end - pos)},
|
||||
0, end - pos, regexp->UNANCHORED, matched_groups.data(), matched_groups.size()))
|
||||
{
|
||||
// 1 is to exclude group #0 which is whole re match.
|
||||
for (size_t group = 1; group <= groups_count; ++group)
|
||||
all_matches.push_back(matched_groups[group]);
|
||||
|
||||
pos = matched_groups[0].data() + matched_groups[0].size();
|
||||
|
||||
++matches_per_row;
|
||||
}
|
||||
|
||||
number_of_matches_per_row.push_back(matches_per_row);
|
||||
}
|
||||
|
||||
{
|
||||
size_t total_matched_groups_string_len = 0;
|
||||
for (const auto & m : all_matches)
|
||||
total_matched_groups_string_len += m.length();
|
||||
|
||||
data_col->reserve(total_matched_groups_string_len);
|
||||
}
|
||||
|
||||
nested_offsets_col->reserve(matched_groups.size());
|
||||
root_offsets_col->reserve(groups_count);
|
||||
|
||||
// Re-arrange `all_matches` from:
|
||||
// [
|
||||
// "ROW 0: 1st group 1st match",
|
||||
// "ROW 0: 2nd group 1st match",
|
||||
// ...,
|
||||
// "ROW 0: 1st group 2nd match",
|
||||
// "ROW 0: 2nd group 2nd match",
|
||||
// ...,
|
||||
// "ROW 1: 1st group 1st match",
|
||||
// ...
|
||||
// ]
|
||||
//
|
||||
// into column of 2D arrays:
|
||||
// [
|
||||
// /* all matchig groups from ROW 0 of haystack column */
|
||||
// ["ROW 0: 1st group 1st match", "ROW 0: 1st group 2nd match", ...],
|
||||
// ["ROW 0: 2nd group 1st match", "ROW 0: 2nd group 2nd match", ...],
|
||||
// ...
|
||||
// ],
|
||||
// [
|
||||
// /* all matchig groups from row 1 of haystack column */
|
||||
// ["ROW 1: 1st group 1st match", ...],
|
||||
// ...
|
||||
// ]
|
||||
|
||||
size_t row_offset = 0;
|
||||
for (const auto matches_per_row : number_of_matches_per_row)
|
||||
{
|
||||
const size_t next_row_offset = row_offset + matches_per_row * groups_count;
|
||||
for (size_t group_id = 0; group_id < groups_count; ++group_id)
|
||||
{
|
||||
for (size_t i = row_offset + group_id; i < next_row_offset && i < all_matches.size(); i += groups_count)
|
||||
{
|
||||
const auto & match = all_matches[i];
|
||||
data_col->insertData(match.begin(), match.length());
|
||||
}
|
||||
nested_offsets_col->insertValue(data_col->size());
|
||||
}
|
||||
root_offsets_col->insertValue(nested_offsets_col->size());
|
||||
row_offset = next_row_offset;
|
||||
}
|
||||
}
|
||||
DUMP(Kind, needle, column_haystack, root_offsets_col, nested_offsets_col);
|
||||
|
||||
ColumnArray::MutablePtr nested_array_col = ColumnArray::create(std::move(data_col), std::move(nested_offsets_col));
|
||||
ColumnArray::MutablePtr root_array_col = ColumnArray::create(std::move(nested_array_col), std::move(root_offsets_col));
|
||||
block.getByPosition(result).column = std::move(root_array_col);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
23
src/Functions/extractAllGroupsHorizontal.cpp
Normal file
23
src/Functions/extractAllGroupsHorizontal.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/extractAllGroups.h>
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
struct HorizontalImpl
|
||||
{
|
||||
static constexpr auto Kind = DB::ExtractAllGroupsResultKind::HORIZONTAL;
|
||||
static constexpr auto Name = "extractAllGroupsHorizontal";
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Adds extractAllGroupsHorizontal to the function factory.
void registerFunctionExtractAllGroupsHorizontal(FunctionFactory & factory)
{
    using Function = FunctionExtractAllGroups<HorizontalImpl>;
    factory.registerFunction<Function>();
}
|
||||
|
||||
}
|
24
src/Functions/extractAllGroupsVertical.cpp
Normal file
24
src/Functions/extractAllGroupsVertical.cpp
Normal file
@ -0,0 +1,24 @@
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/extractAllGroups.h>
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
struct VerticalImpl
|
||||
{
|
||||
static constexpr auto Kind = DB::ExtractAllGroupsResultKind::VERTICAL;
|
||||
static constexpr auto Name = "extractAllGroupsVertical";
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Adds extractAllGroupsVertical to the function factory, together with its alias `extractAllGroups`.
void registerFunctionExtractAllGroupsVertical(FunctionFactory & factory)
{
    using Function = FunctionExtractAllGroups<VerticalImpl>;
    factory.registerFunction<Function>();

    /// `extractAllGroups` is registered as a case-insensitive alias of the vertical variant.
    factory.registerAlias("extractAllGroups", VerticalImpl::Name, FunctionFactory::CaseInsensitive);
}
|
||||
|
||||
}
|
@ -17,7 +17,6 @@ namespace DB
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
@ -49,7 +48,6 @@ public:
|
||||
};
|
||||
validateFunctionArgumentTypes(*this, arguments, args);
|
||||
|
||||
/// Two-dimensional array of strings, each `row` of top array represents matching groups.
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
|
||||
}
|
||||
|
||||
@ -61,7 +59,7 @@ public:
|
||||
const auto needle = typeid_cast<const ColumnConst &>(*column_needle).getValue<String>();
|
||||
|
||||
if (needle.empty())
|
||||
throw Exception(getName() + " length of 'needle' argument must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
|
||||
throw Exception(getName() + " length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
const auto regexp = Regexps::get<false, false>(needle);
|
||||
const auto & re2 = regexp->getRE2();
|
||||
|
@ -18,7 +18,8 @@ void registerFunctionMultiFuzzyMatchAny(FunctionFactory &);
|
||||
void registerFunctionMultiFuzzyMatchAnyIndex(FunctionFactory &);
|
||||
void registerFunctionMultiFuzzyMatchAllIndices(FunctionFactory &);
|
||||
void registerFunctionExtractGroups(FunctionFactory &);
|
||||
void registerFunctionExtractAllGroups(FunctionFactory &);
|
||||
void registerFunctionExtractAllGroupsVertical(FunctionFactory &);
|
||||
void registerFunctionExtractAllGroupsHorizontal(FunctionFactory &);
|
||||
|
||||
void registerFunctionsStringRegexp(FunctionFactory & factory)
|
||||
{
|
||||
@ -37,7 +38,8 @@ void registerFunctionsStringRegexp(FunctionFactory & factory)
|
||||
registerFunctionMultiFuzzyMatchAnyIndex(factory);
|
||||
registerFunctionMultiFuzzyMatchAllIndices(factory);
|
||||
registerFunctionExtractGroups(factory);
|
||||
registerFunctionExtractAllGroups(factory);
|
||||
registerFunctionExtractAllGroupsVertical(factory);
|
||||
registerFunctionExtractAllGroupsHorizontal(factory);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -6,5 +6,6 @@
|
||||
<query>SELECT count() FROM test.hits WHERE NOT ignore(extract(URL, '(\\w+=\\w+)'))</query>
|
||||
<query>SELECT count() FROM test.hits WHERE NOT ignore(extractAll(URL, '(\\w+=\\w+)'))</query>
|
||||
<query>SELECT count() FROM test.hits WHERE NOT ignore(extractGroups(URL, '(\\w+)=(\\w+)'))</query>
|
||||
<query>SELECT count() FROM test.hits WHERE NOT ignore(extractAllGroups(URL, '(\\w+)=(\\w+)'))</query>
|
||||
<query>SELECT count() FROM test.hits WHERE NOT ignore(extractAllGroupsVertical(URL, '(\\w+)=(\\w+)'))</query>
|
||||
<query>SELECT count() FROM test.hits WHERE NOT ignore(extractAllGroupsHorizontal(URL, '(\\w+)=(\\w+)'))</query>
|
||||
</test>
|
||||
|
@ -1,51 +0,0 @@
|
||||
-- error cases
|
||||
SELECT extractAllGroups(); --{serverError 42} not enough arguments
|
||||
SELECT extractAllGroups('hello'); --{serverError 42} not enough arguments
|
||||
SELECT extractAllGroups('hello', 123); --{serverError 43} invalid argument type
|
||||
SELECT extractAllGroups(123, 'world'); --{serverError 43} invalid argument type
|
||||
SELECT extractAllGroups('hello world', '((('); --{serverError 427} invalid re
|
||||
SELECT extractAllGroups('hello world', materialize('\\w+')); --{serverError 44} non-const needle
|
||||
|
||||
SELECT '0 groups, zero matches';
|
||||
SELECT extractAllGroups('hello world', '\\w+'); -- { serverError 36 }
|
||||
|
||||
SELECT '1 group, multiple matches, String and FixedString';
|
||||
SELECT extractAllGroups('hello world', '(\\w+)');
|
||||
SELECT extractAllGroups('hello world', CAST('(\\w+)' as FixedString(5)));
|
||||
SELECT extractAllGroups(CAST('hello world' AS FixedString(12)), '(\\w+)');
|
||||
SELECT extractAllGroups(CAST('hello world' AS FixedString(12)), CAST('(\\w+)' as FixedString(5)));
|
||||
SELECT extractAllGroups(materialize(CAST('hello world' AS FixedString(12))), '(\\w+)');
|
||||
SELECT extractAllGroups(materialize(CAST('hello world' AS FixedString(12))), CAST('(\\w+)' as FixedString(5)));
|
||||
|
||||
SELECT 'mutiple groups, multiple matches';
|
||||
SELECT extractAllGroups('abc=111, def=222, ghi=333 "jkl mno"="444 foo bar"', '("[^"]+"|\\w+)=("[^"]+"|\\w+)');
|
||||
|
||||
SELECT 'big match';
|
||||
SELECT
|
||||
length(haystack), length(matches[1]), length(matches), arrayMap((x) -> length(x), arrayMap(x -> x[1], matches))
|
||||
FROM (
|
||||
SELECT
|
||||
repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack,
|
||||
extractAllGroups(haystack, '(abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz)') AS matches
|
||||
FROM numbers(3)
|
||||
);
|
||||
|
||||
SELECT 'lots of matches';
|
||||
SELECT
|
||||
length(haystack), length(matches[1]), length(matches), arrayReduce('sum', arrayMap((x) -> length(x), arrayMap(x -> x[1], matches)))
|
||||
FROM (
|
||||
SELECT
|
||||
repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack,
|
||||
extractAllGroups(haystack, '(\\w)') AS matches
|
||||
FROM numbers(3)
|
||||
);
|
||||
|
||||
SELECT 'lots of groups';
|
||||
SELECT
|
||||
length(haystack), length(matches[1]), length(matches), arrayMap((x) -> length(x), arrayMap(x -> x[1], matches))
|
||||
FROM (
|
||||
SELECT
|
||||
repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack,
|
||||
extractAllGroups(haystack, repeat('(\\w)', 100)) AS matches
|
||||
FROM numbers(3)
|
||||
);
|
@ -0,0 +1,22 @@
|
||||
0 groups, zero matches
|
||||
1 group, multiple matches, String and FixedString
|
||||
[['hello','world']]
|
||||
[['hello','world']]
|
||||
[['hello','world']]
|
||||
[['hello','world']]
|
||||
[['hello','world']]
|
||||
[['hello','world']]
|
||||
mutiple groups, multiple matches
|
||||
[['abc','def','ghi','"jkl mno"'],['111','222','333','"444 foo bar"']]
|
||||
big match
|
||||
0 1 0 []
|
||||
260 1 1 [156]
|
||||
520 1 3 [156,156,156]
|
||||
lots of matches
|
||||
0 1 0 0
|
||||
260 1 260 260
|
||||
520 1 520 520
|
||||
lots of groups
|
||||
0 100 0 []
|
||||
260 100 2 [1,1]
|
||||
520 100 5 [1,1,1,1,1]
|
@ -0,0 +1,51 @@
|
||||
-- error cases
|
||||
SELECT extractAllGroupsHorizontal(); --{serverError 42} not enough arguments
|
||||
SELECT extractAllGroupsHorizontal('hello'); --{serverError 42} not enough arguments
|
||||
SELECT extractAllGroupsHorizontal('hello', 123); --{serverError 43} invalid argument type
|
||||
SELECT extractAllGroupsHorizontal(123, 'world'); --{serverError 43} invalid argument type
|
||||
SELECT extractAllGroupsHorizontal('hello world', '((('); --{serverError 427} invalid re
|
||||
SELECT extractAllGroupsHorizontal('hello world', materialize('\\w+')); --{serverError 44} non-const needle
|
||||
|
||||
SELECT '0 groups, zero matches';
|
||||
SELECT extractAllGroupsHorizontal('hello world', '\\w+'); -- { serverError 36 }
|
||||
|
||||
SELECT '1 group, multiple matches, String and FixedString';
|
||||
SELECT extractAllGroupsHorizontal('hello world', '(\\w+)');
|
||||
SELECT extractAllGroupsHorizontal('hello world', CAST('(\\w+)' as FixedString(5)));
|
||||
SELECT extractAllGroupsHorizontal(CAST('hello world' AS FixedString(12)), '(\\w+)');
|
||||
SELECT extractAllGroupsHorizontal(CAST('hello world' AS FixedString(12)), CAST('(\\w+)' as FixedString(5)));
|
||||
SELECT extractAllGroupsHorizontal(materialize(CAST('hello world' AS FixedString(12))), '(\\w+)');
|
||||
SELECT extractAllGroupsHorizontal(materialize(CAST('hello world' AS FixedString(12))), CAST('(\\w+)' as FixedString(5)));
|
||||
|
||||
SELECT 'mutiple groups, multiple matches';
|
||||
SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333 "jkl mno"="444 foo bar"', '("[^"]+"|\\w+)=("[^"]+"|\\w+)');
|
||||
|
||||
SELECT 'big match';
|
||||
SELECT
|
||||
length(haystack), length(matches), length(matches[1]), arrayMap((x) -> length(x), matches[1])
|
||||
FROM (
|
||||
SELECT
|
||||
repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack,
|
||||
extractAllGroupsHorizontal(haystack, '(abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz)') AS matches
|
||||
FROM numbers(3)
|
||||
);
|
||||
|
||||
SELECT 'lots of matches';
|
||||
SELECT
|
||||
length(haystack), length(matches), length(matches[1]), arrayReduce('sum', arrayMap((x) -> length(x), matches[1]))
|
||||
FROM (
|
||||
SELECT
|
||||
repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack,
|
||||
extractAllGroupsHorizontal(haystack, '(\\w)') AS matches
|
||||
FROM numbers(3)
|
||||
);
|
||||
|
||||
SELECT 'lots of groups';
|
||||
SELECT
|
||||
length(haystack), length(matches), length(matches[1]), arrayMap((x) -> length(x), matches[1])
|
||||
FROM (
|
||||
SELECT
|
||||
repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack,
|
||||
extractAllGroupsHorizontal(haystack, repeat('(\\w)', 100)) AS matches
|
||||
FROM numbers(3)
|
||||
);
|
51
tests/queries/0_stateless/01246_extractAllGroupsVertical.sql
Normal file
51
tests/queries/0_stateless/01246_extractAllGroupsVertical.sql
Normal file
@ -0,0 +1,51 @@
|
||||
-- error cases
|
||||
SELECT extractAllGroupsVertical(); --{serverError 42} not enough arguments
|
||||
SELECT extractAllGroupsVertical('hello'); --{serverError 42} not enough arguments
|
||||
SELECT extractAllGroupsVertical('hello', 123); --{serverError 43} invalid argument type
|
||||
SELECT extractAllGroupsVertical(123, 'world'); --{serverError 43} invalid argument type
|
||||
SELECT extractAllGroupsVertical('hello world', '((('); --{serverError 427} invalid re
|
||||
SELECT extractAllGroupsVertical('hello world', materialize('\\w+')); --{serverError 44} non-const needle
|
||||
|
||||
SELECT '0 groups, zero matches';
|
||||
SELECT extractAllGroupsVertical('hello world', '\\w+'); -- { serverError 36 }
|
||||
|
||||
SELECT '1 group, multiple matches, String and FixedString';
|
||||
SELECT extractAllGroupsVertical('hello world', '(\\w+)');
|
||||
SELECT extractAllGroupsVertical('hello world', CAST('(\\w+)' as FixedString(5)));
|
||||
SELECT extractAllGroupsVertical(CAST('hello world' AS FixedString(12)), '(\\w+)');
|
||||
SELECT extractAllGroupsVertical(CAST('hello world' AS FixedString(12)), CAST('(\\w+)' as FixedString(5)));
|
||||
SELECT extractAllGroupsVertical(materialize(CAST('hello world' AS FixedString(12))), '(\\w+)');
|
||||
SELECT extractAllGroupsVertical(materialize(CAST('hello world' AS FixedString(12))), CAST('(\\w+)' as FixedString(5)));
|
||||
|
||||
SELECT 'mutiple groups, multiple matches';
|
||||
SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333 "jkl mno"="444 foo bar"', '("[^"]+"|\\w+)=("[^"]+"|\\w+)');
|
||||
|
||||
SELECT 'big match';
|
||||
SELECT
|
||||
length(haystack), length(matches[1]), length(matches), arrayMap((x) -> length(x), arrayMap(x -> x[1], matches))
|
||||
FROM (
|
||||
SELECT
|
||||
repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack,
|
||||
extractAllGroupsVertical(haystack, '(abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz)') AS matches
|
||||
FROM numbers(3)
|
||||
);
|
||||
|
||||
SELECT 'lots of matches';
|
||||
SELECT
|
||||
length(haystack), length(matches[1]), length(matches), arrayReduce('sum', arrayMap((x) -> length(x), arrayMap(x -> x[1], matches)))
|
||||
FROM (
|
||||
SELECT
|
||||
repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack,
|
||||
extractAllGroupsVertical(haystack, '(\\w)') AS matches
|
||||
FROM numbers(3)
|
||||
);
|
||||
|
||||
SELECT 'lots of groups';
|
||||
SELECT
|
||||
length(haystack), length(matches[1]), length(matches), arrayMap((x) -> length(x), arrayMap(x -> x[1], matches))
|
||||
FROM (
|
||||
SELECT
|
||||
repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack,
|
||||
extractAllGroupsVertical(haystack, repeat('(\\w)', 100)) AS matches
|
||||
FROM numbers(3)
|
||||
);
|
@ -1,5 +1,5 @@
|
||||
SELECT extractGroups('hello', ''); -- { serverError 69 }
|
||||
SELECT extractAllGroups('hello', ''); -- { serverError 69 }
|
||||
SELECT extractGroups('hello', ''); -- { serverError 36 }
|
||||
SELECT extractAllGroups('hello', ''); -- { serverError 36 }
|
||||
|
||||
SELECT extractGroups('hello', ' '); -- { serverError 36 }
|
||||
SELECT extractAllGroups('hello', ' '); -- { serverError 36 }
|
||||
|
Loading…
Reference in New Issue
Block a user