2020-05-07 00:55:54 +00:00
|
|
|
#include <Columns/ColumnString.h>
|
|
|
|
#include <Columns/ColumnArray.h>
|
|
|
|
#include <Columns/ColumnConst.h>
|
|
|
|
#include <DataTypes/DataTypeArray.h>
|
|
|
|
#include <DataTypes/DataTypeString.h>
|
|
|
|
#include <Functions/FunctionFactory.h>
|
|
|
|
#include <Functions/FunctionHelpers.h>
|
|
|
|
#include <Functions/Regexps.h>
|
|
|
|
|
|
|
|
#include <memory>
|
|
|
|
#include <string>
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
namespace ErrorCodes
{
    /// Declared here, defined in another translation unit.
    /// Thrown by extractGroups when the needle is empty or has no capturing groups.
    extern const int BAD_ARGUMENTS;
}
|
|
|
|
|
2020-09-07 18:00:37 +00:00
|
|
|
namespace
|
|
|
|
{
|
2020-05-07 00:55:54 +00:00
|
|
|
|
|
|
|
/** Match all groups of given input string with given re, return array of arrays of matches.
|
|
|
|
*
|
|
|
|
* SELECT extractGroups('hello abc=111 world', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
|
|
|
|
* should produce:
|
|
|
|
* ['abc', '111']
|
|
|
|
*/
|
|
|
|
class FunctionExtractGroups : public IFunction
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
static constexpr auto name = "extractGroups";
|
2021-06-01 12:20:52 +00:00
|
|
|
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionExtractGroups>(); }
|
2020-05-07 00:55:54 +00:00
|
|
|
|
|
|
|
String getName() const override { return name; }
|
|
|
|
|
|
|
|
size_t getNumberOfArguments() const override { return 2; }
|
|
|
|
|
2021-06-22 16:21:23 +00:00
|
|
|
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
2021-04-29 14:48:26 +00:00
|
|
|
|
2020-05-07 00:55:54 +00:00
|
|
|
bool useDefaultImplementationForConstants() const override { return false; }
|
|
|
|
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
|
|
|
|
|
|
|
|
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
|
|
|
{
|
|
|
|
FunctionArgumentDescriptors args{
|
2021-09-30 11:35:24 +00:00
|
|
|
{"haystack", &isStringOrFixedString<IDataType>, nullptr, "const String or const FixedString"},
|
|
|
|
{"needle", &isStringOrFixedString<IDataType>, isColumnConst, "const String or const FixedString"},
|
2020-05-07 00:55:54 +00:00
|
|
|
};
|
|
|
|
validateFunctionArgumentTypes(*this, arguments, args);
|
|
|
|
|
|
|
|
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
|
|
|
|
}
|
|
|
|
|
2020-11-17 13:24:45 +00:00
|
|
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
|
2020-05-07 00:55:54 +00:00
|
|
|
{
|
2020-10-17 21:41:50 +00:00
|
|
|
const ColumnPtr column_haystack = arguments[0].column;
|
|
|
|
const ColumnPtr column_needle = arguments[1].column;
|
2020-05-07 00:55:54 +00:00
|
|
|
|
|
|
|
const auto needle = typeid_cast<const ColumnConst &>(*column_needle).getValue<String>();
|
|
|
|
|
|
|
|
if (needle.empty())
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} length of 'needle' argument must be greater than 0.", getName());
|
2020-05-07 00:55:54 +00:00
|
|
|
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
Fo A) and C), caching does not help but these queries still allow to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over on multiple measurements;
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. It was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implementes two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
const Regexps::Regexp regexp = Regexps::createRegexp<false, false, false>(needle);
|
|
|
|
const auto & re2 = regexp.getRE2();
|
2020-05-07 23:31:15 +00:00
|
|
|
|
|
|
|
if (!re2)
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "There are no groups in regexp: {}", needle);
|
2020-05-07 23:31:15 +00:00
|
|
|
|
2020-05-07 01:25:06 +00:00
|
|
|
const size_t groups_count = re2->NumberOfCapturingGroups();
|
2020-05-07 00:55:54 +00:00
|
|
|
|
2020-05-07 23:31:15 +00:00
|
|
|
if (!groups_count)
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "There are no groups in regexp: {}", needle);
|
2020-05-07 23:31:15 +00:00
|
|
|
|
2020-05-07 00:55:54 +00:00
|
|
|
// Including 0-group, which is the whole regexp.
|
2020-05-07 01:25:06 +00:00
|
|
|
PODArrayWithStackMemory<re2_st::StringPiece, 128> matched_groups(groups_count + 1);
|
2020-05-07 00:55:54 +00:00
|
|
|
|
|
|
|
ColumnArray::ColumnOffsets::MutablePtr offsets_col = ColumnArray::ColumnOffsets::create();
|
|
|
|
ColumnString::MutablePtr data_col = ColumnString::create();
|
|
|
|
|
|
|
|
auto & offsets_data = offsets_col->getData();
|
|
|
|
|
|
|
|
offsets_data.resize(input_rows_count);
|
|
|
|
ColumnArray::Offset current_offset = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < input_rows_count; ++i)
|
|
|
|
{
|
2022-07-18 17:31:34 +00:00
|
|
|
std::string_view current_row = column_haystack->getDataAt(i).toView();
|
2020-05-07 00:55:54 +00:00
|
|
|
|
2022-07-18 17:31:34 +00:00
|
|
|
if (re2->Match(re2_st::StringPiece(current_row.data(), current_row.size()),
|
2022-10-07 10:46:45 +00:00
|
|
|
0, current_row.size(), re2_st::RE2::UNANCHORED, matched_groups.data(),
|
|
|
|
static_cast<int>(matched_groups.size())))
|
2020-05-07 00:55:54 +00:00
|
|
|
{
|
|
|
|
// 1 is to exclude group #0 which is whole re match.
|
|
|
|
for (size_t group = 1; group <= groups_count; ++group)
|
2020-05-07 01:25:06 +00:00
|
|
|
data_col->insertData(matched_groups[group].data(), matched_groups[group].size());
|
2020-05-07 00:55:54 +00:00
|
|
|
|
|
|
|
current_offset += groups_count;
|
|
|
|
}
|
|
|
|
|
|
|
|
offsets_data[i] = current_offset;
|
|
|
|
}
|
|
|
|
|
2020-10-17 21:41:50 +00:00
|
|
|
return ColumnArray::create(std::move(data_col), std::move(offsets_col));
|
2020-05-07 00:55:54 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-09-07 18:00:37 +00:00
|
|
|
}
|
|
|
|
|
2022-07-04 07:01:39 +00:00
|
|
|
/// Registers FunctionExtractGroups in the function factory.
REGISTER_FUNCTION(ExtractGroups)
{
    factory.registerFunction<FunctionExtractGroups>();
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|