write docs and optimize regex compile

This commit is contained in:
Han Fei 2023-01-05 17:38:01 +01:00
parent 7f705686d4
commit f2a9eea995
3 changed files with 93 additions and 19 deletions

View File

@ -0,0 +1,76 @@
---
slug: /en/sql-reference/dictionaries/external-dictionaries/regexp-tree
sidebar_position: 47
sidebar_label: RegExp Tree Dictionary
title: "RegExp Tree Dictionary"
---
import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md';
Regexp Tree dictionary stores multiple trees of regular expressions with attributes. Users can retrieve strings in the dictionary. If a string matches the root of a regexp tree, we collect the corresponding attributes of the matched root and continue to walk its children. If any of the children matches the string, we collect its attributes, overwriting the old ones if conflicts occur, and then continue the traversal until we reach the leaf nodes.
Example of the DDL query for creating a Regexp Tree dictionary:
<CloudDetails />
```sql
create dictionary regexp_dict
(
regexp String,
name String,
version String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '/var/lib/clickhouse/user_files/regexp_tree.yaml'))
LAYOUT(regexp_tree)
...
```
We only allow `YAMLRegExpTree` to work with the regexp_tree dictionary layout. If you want to use other sources, please set the variable `regexp_dict_allow_other_sources` to true.
**Source**
We introduce a type of source called `YAMLRegExpTree` representing the structure of a Regexp Tree dictionary. An example of a valid YAML config looks like this:
```yaml
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
name: 'TencentOS'
version: '\1'
- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
name: 'Andriod'
versions:
- regexp: '33/tclwebkit'
version: '13'
- regexp: '3[12]/tclwebkit'
version: '12'
- regexp: '30/tclwebkit'
version: '11'
- regexp: '29/tclwebkit'
version: '10'
```
The key `regexp` represents the regular expression of a tree node. The name of this key is the same as the dictionary key. `name` and `version` are user-defined attributes in the dictionary. `versions` (which can be any name that does not appear among the attributes or the key) indicates the child nodes of this tree.
**Back Reference**
The value of an attribute can contain a back reference, which refers to a capture group of the matched regular expression. Reference numbers range from 1 to 9 and are written as `$1` or `\1`.
During query execution, the back reference in the value is replaced by the matched capture group.
**Query**
Due to the special nature of the Regexp Tree dictionary, we only allow the functions `dictGet`, `dictGetOrDefault` and `dictGetOrNull` to work with it.
Example:
```sql
SELECT dictGet('regexp_dict', ('name', 'version'), '31/tclwebkit1024');
```
Result:
```
┌─dictGet('regexp_dict', ('name', 'version'), '31/tclwebkit1024')─┐
│ ('Andriod','12') │
└─────────────────────────────────────────────────────────────────┘
```

View File

@ -64,7 +64,6 @@ namespace
explicit StringPiece(int ref_) : ref_num(ref_) {}
};
/// TODO: We should consider what kind of types we should support.
Field parseStringToField(const String & raw, DataTypePtr data_type)
try
{
@ -244,6 +243,13 @@ void RegExpTreeDictionary::loadData()
initRegexNodes(block);
}
initGraph();
#if USE_VECTORSCAN
std::vector<std::string_view> regexps_views(regexps.begin(), regexps.end());
hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
/// TODO: fallback when exceptions occur.
hyperscan_regex->get();
#endif
}
else
{
@ -364,10 +370,6 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndice
[[maybe_unused]] const std::unordered_map<String, ColumnPtr> & defaults) const
{
#if USE_VECTORSCAN
std::vector<std::string_view> regexps_views(regexps.begin(), regexps.end());
const auto & hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
hs_scratch_t * scratch = nullptr;
hs_error_t err = hs_clone_scratch(hyperscan_regex->get()->getScratch(), &scratch);
@ -454,7 +456,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndice
if (attributes_to_set.contains(name))
continue;
/// TODO: default value might be a back-reference.
/// TODO: default value might be a back-reference, that is useful in lib ua-core
DefaultValueProvider default_value(attr.null_value, defaults.at(name));
columns[name]->insert(default_value.getDefaultValue(key_idx));
}

View File

@ -8,21 +8,17 @@
#include <base/types.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnString.h>
#include <Common/Arena.h>
#include <Common/Exception.h>
#include <Common/HashTable/Hash.h>
#include <Common/HashTable/HashSet.h>
#include "Core/ColumnWithTypeAndName.h"
#include "Core/Field.h"
#include <DataTypes/IDataType.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnString.h>
#include <QueryPipeline/Pipe.h>
#include <Core/Block.h>
#include <Core/Field.h>
#include <DataTypes/IDataType.h>
#include <Functions/Regexps.h>
#include <QueryPipeline/Pipe.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/IDictionary.h>
@ -35,8 +31,6 @@ namespace ErrorCodes
extern const int UNSUPPORTED_METHOD;
}
class DeferredConstructedRegexpsPtr;
class RegExpTreeDictionary : public IDictionary
{
public:
@ -159,7 +153,9 @@ private:
std::map<UInt64, RegexTreeNodePtr> regex_nodes;
std::unordered_map<UInt64, UInt64> topology_order;
#if USE_VECTORSCAN
MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex;
#endif
};
}