mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
write docs and optimize regex compile
This commit is contained in:
parent
7f705686d4
commit
f2a9eea995
@ -0,0 +1,76 @@
|
||||
---
|
||||
slug: /en/sql-reference/dictionaries/external-dictionaries/regexp-tree
|
||||
sidebar_position: 47
|
||||
sidebar_label: RegExp Tree Dictionary
|
||||
title: "RegExp Tree Dictionary"
|
||||
---
|
||||
import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md';
|
||||
|
||||
The Regexp Tree dictionary stores multiple trees of regular expressions with attributes. Users can retrieve strings in the dictionary. If a string matches the root of a regexp tree, we collect the corresponding attributes of the matched root and continue to walk the children. If any of the children matches the string, we collect its attributes, overwriting the old ones if conflicts occur, and then continue the traversal until we reach the leaf nodes.
|
||||
|
||||
Example of the DDL query for creating a Regexp Tree dictionary:
|
||||
|
||||
<CloudDetails />
|
||||
|
||||
```sql
|
||||
create dictionary regexp_dict
|
||||
(
|
||||
regexp String,
|
||||
name String,
|
||||
version String
|
||||
)
|
||||
PRIMARY KEY(regexp)
|
||||
SOURCE(YAMLRegExpTree(PATH '/var/lib/clickhouse/user_files/regexp_tree.yaml'))
|
||||
LAYOUT(regexp_tree)
|
||||
...
|
||||
```
|
||||
|
||||
We only allow `YAMLRegExpTree` to work with the regexp_tree dictionary layout. If you want to use other sources, please set the variable `regexp_dict_allow_other_sources` to true.
|
||||
|
||||
**Source**
|
||||
|
||||
We introduce a type of source called `YAMLRegExpTree` representing the structure of a Regexp Tree dictionary. An example of a valid YAML config looks like:
|
||||
|
||||
```yaml
|
||||
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
|
||||
name: 'TencentOS'
|
||||
version: '\1'
|
||||
|
||||
- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
|
||||
name: 'Andriod'
|
||||
versions:
|
||||
- regexp: '33/tclwebkit'
|
||||
version: '13'
|
||||
- regexp: '3[12]/tclwebkit'
|
||||
version: '12'
|
||||
- regexp: '30/tclwebkit'
|
||||
version: '11'
|
||||
- regexp: '29/tclwebkit'
|
||||
version: '10'
|
||||
```
|
||||
|
||||
The key `regexp` represents the regular expression of a tree node. The name of the key is the same as the dictionary key. `name` and `version` are user-defined attributes in the dictionary. `versions` (which can be any name that does not appear among the attributes or the key) indicates the child nodes of this tree.
|
||||
|
||||
**Back Reference**
|
||||
|
||||
The value of an attribute can contain a back reference, which refers to a capture group of the matched regular expression. The reference number ranges from 1 to 9 and is written as `$1` or `\1`.
|
||||
|
||||
During the query execution, the back reference in the value will be replaced by the matched capture group.
|
||||
|
||||
**Query**
|
||||
|
||||
Due to the special nature of the Regexp Tree dictionary, we only allow the functions `dictGet`, `dictGetOrDefault` and `dictGetOrNull` to work with it.
|
||||
|
||||
Example:
|
||||
|
||||
```sql
|
||||
SELECT dictGet('regexp_dict', ('name', 'version'), '31/tclwebkit1024');
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```
|
||||
┌─dictGet('regexp_dict', ('name', 'version'), '31/tclwebkit1024')─┐
|
||||
│ ('Andriod','12') │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
@ -64,7 +64,6 @@ namespace
|
||||
explicit StringPiece(int ref_) : ref_num(ref_) {}
|
||||
};
|
||||
|
||||
/// TODO: We should consider what kind of types we should support.
|
||||
Field parseStringToField(const String & raw, DataTypePtr data_type)
|
||||
try
|
||||
{
|
||||
@ -244,6 +243,13 @@ void RegExpTreeDictionary::loadData()
|
||||
initRegexNodes(block);
|
||||
}
|
||||
initGraph();
|
||||
#if USE_VECTORSCAN
|
||||
std::vector<std::string_view> regexps_views(regexps.begin(), regexps.end());
|
||||
hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
|
||||
/// TODO: fallback when exceptions occur.
|
||||
hyperscan_regex->get();
|
||||
#endif
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -364,10 +370,6 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndice
|
||||
[[maybe_unused]] const std::unordered_map<String, ColumnPtr> & defaults) const
|
||||
{
|
||||
#if USE_VECTORSCAN
|
||||
std::vector<std::string_view> regexps_views(regexps.begin(), regexps.end());
|
||||
|
||||
const auto & hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
|
||||
|
||||
hs_scratch_t * scratch = nullptr;
|
||||
hs_error_t err = hs_clone_scratch(hyperscan_regex->get()->getScratch(), &scratch);
|
||||
|
||||
@ -454,7 +456,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndice
|
||||
if (attributes_to_set.contains(name))
|
||||
continue;
|
||||
|
||||
/// TODO: default value might be a back-reference.
|
||||
/// TODO: default value might be a back-reference, that is useful in lib ua-core
|
||||
DefaultValueProvider default_value(attr.null_value, defaults.at(name));
|
||||
columns[name]->insert(default_value.getDefaultValue(key_idx));
|
||||
}
|
||||
|
@ -8,21 +8,17 @@
|
||||
|
||||
#include <base/types.h>
|
||||
|
||||
#include <Columns/IColumn.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Common/Arena.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/HashTable/Hash.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
#include "Core/ColumnWithTypeAndName.h"
|
||||
#include "Core/Field.h"
|
||||
|
||||
#include <DataTypes/IDataType.h>
|
||||
|
||||
#include <Columns/IColumn.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
|
||||
#include <QueryPipeline/Pipe.h>
|
||||
|
||||
#include <Core/Block.h>
|
||||
#include <Core/Field.h>
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <Functions/Regexps.h>
|
||||
#include <QueryPipeline/Pipe.h>
|
||||
|
||||
#include <Dictionaries/DictionaryStructure.h>
|
||||
#include <Dictionaries/IDictionary.h>
|
||||
@ -35,8 +31,6 @@ namespace ErrorCodes
|
||||
extern const int UNSUPPORTED_METHOD;
|
||||
}
|
||||
|
||||
class DeferredConstructedRegexpsPtr;
|
||||
|
||||
class RegExpTreeDictionary : public IDictionary
|
||||
{
|
||||
public:
|
||||
@ -159,7 +153,9 @@ private:
|
||||
|
||||
std::map<UInt64, RegexTreeNodePtr> regex_nodes;
|
||||
std::unordered_map<UInt64, UInt64> topology_order;
|
||||
|
||||
#if USE_VECTORSCAN
|
||||
MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex;
|
||||
#endif
|
||||
};
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user