ClickHouse/src/Dictionaries/RegExpTreeDictionary.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

174 lines
5.3 KiB
C++
Raw Normal View History

2022-08-31 19:34:50 +00:00
#pragma once
#include <atomic>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <variant>
#include <base/types.h>
2023-01-05 16:38:01 +00:00
#include <Columns/IColumn.h>
#include <Columns/ColumnString.h>
2022-08-31 19:34:50 +00:00
#include <Common/Arena.h>
#include <Common/Exception.h>
#include <Common/HashTable/Hash.h>
#include <Common/HashTable/HashSet.h>
2023-01-05 16:38:01 +00:00
#include <Core/Block.h>
#include <Core/Field.h>
2022-08-31 19:34:50 +00:00
#include <DataTypes/IDataType.h>
2023-01-05 16:38:01 +00:00
#include <Functions/Regexps.h>
2022-08-31 19:34:50 +00:00
#include <QueryPipeline/Pipe.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/IDictionary.h>
namespace DB
{
/// Error codes referenced by this header. `extern` only declares the constant here;
/// its definition lives outside this file. It is thrown by the dictionary methods
/// below that are not supported (`hasKeys` and `read`).
namespace ErrorCodes
{
extern const int UNSUPPORTED_METHOD;
}
class RegExpTreeDictionary : public IDictionary
{
2023-03-15 14:38:11 +00:00
friend struct MatchContext;
2022-08-31 19:34:50 +00:00
public:
struct Configuration
{
bool require_nonempty;
DictionaryLifetime lifetime;
};
const std::string name = "RegExpTree";
RegExpTreeDictionary(
2023-02-04 09:53:54 +00:00
const StorageID & id_,
const DictionaryStructure & structure_,
DictionarySourcePtr source_ptr_,
Configuration configuration_,
bool use_vectorscan_);
2022-08-31 19:34:50 +00:00
std::string getTypeName() const override { return name; }
size_t getBytesAllocated() const override { return bytes_allocated; }
size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); }
double getFoundRate() const override
{
const auto queries = query_count.load(std::memory_order_relaxed);
if (!queries)
return 0;
return static_cast<double>(found_count.load(std::memory_order_relaxed)) / queries;
}
double getHitRate() const override { return 1.0; }
size_t getElementCount() const override { return element_count; }
double getLoadFactor() const override { return static_cast<double>(element_count) / bucket_count; }
DictionarySourcePtr getSource() const override { return source_ptr; }
const DictionaryLifetime & getLifetime() const override { return configuration.lifetime; }
const DictionaryStructure & getStructure() const override { return structure; }
bool isInjective(const std::string & attribute_name) const override { return structure.getAttribute(attribute_name).injective; }
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::Simple; }
bool hasHierarchy() const override { return false; }
std::shared_ptr<const IExternalLoadable> clone() const override
{
2023-02-04 09:53:54 +00:00
return std::make_shared<RegExpTreeDictionary>(getDictionaryID(), structure, source_ptr->clone(), configuration, use_vectorscan);
2022-08-31 19:34:50 +00:00
}
ColumnUInt8::Ptr hasKeys(const Columns &, const DataTypes &) const override
{
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Dictionary {} does not support method `hasKeys`", name);
}
Pipe read(const Names &, size_t, size_t) const override
{
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Dictionary {} does not support method `read`", name);
}
ColumnPtr getColumn(
const std::string & attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
2022-12-14 10:22:29 +00:00
const ColumnPtr & default_values_column) const override
{
return getColumns(Strings({attribute_name}), DataTypes({result_type}), key_columns, key_types, Columns({default_values_column}))[0];
}
Columns getColumns(
const Strings & attribute_names,
const DataTypes & result_types,
const Columns & key_columns,
const DataTypes & key_types,
const Columns & default_values_columns) const override;
2022-08-31 19:34:50 +00:00
private:
const DictionaryStructure structure;
2022-12-22 21:42:11 +00:00
DictionarySourcePtr source_ptr;
2022-08-31 19:34:50 +00:00
const Configuration configuration;
size_t bytes_allocated = 0;
size_t bucket_count = 0;
size_t element_count = 0;
mutable std::atomic<size_t> query_count{0};
mutable std::atomic<size_t> found_count{0};
void calculateBytesAllocated();
void loadData();
2022-12-14 10:22:29 +00:00
void initRegexNodes(Block & block);
void initTopologyOrder(UInt64 node_idx, std::set<UInt64> & visited, UInt64 & topology_id);
void initGraph();
2023-02-04 09:53:54 +00:00
std::unordered_map<String, ColumnPtr> match(
2022-12-14 10:22:29 +00:00
const ColumnString::Chars & keys_data,
const ColumnString::Offsets & keys_offsets,
const std::unordered_map<String, const DictionaryAttribute &> & attributes,
const std::unordered_map<String, ColumnPtr> & defaults) const;
2022-08-31 19:34:50 +00:00
2022-12-14 10:22:29 +00:00
bool setAttributes(
UInt64 id,
std::unordered_map<String, Field> & attributes_to_set,
const String & data,
std::unordered_set<UInt64> & visited_nodes,
2023-02-01 14:09:04 +00:00
const std::unordered_map<String, const DictionaryAttribute &> & attributes,
const std::unordered_map<String, ColumnPtr> & defaults,
size_t key_index) const;
2022-08-31 19:34:50 +00:00
2022-12-14 10:22:29 +00:00
struct RegexTreeNode;
using RegexTreeNodePtr = std::shared_ptr<RegexTreeNode>;
2023-02-04 09:53:54 +00:00
bool use_vectorscan;
std::vector<std::string> simple_regexps;
std::vector<UInt64> regexp_ids;
std::vector<RegexTreeNodePtr> complex_regexp_nodes;
2022-08-31 19:34:50 +00:00
2022-12-22 21:42:11 +00:00
std::map<UInt64, RegexTreeNodePtr> regex_nodes;
2022-12-14 10:22:29 +00:00
std::unordered_map<UInt64, UInt64> topology_order;
2023-01-05 16:38:01 +00:00
#if USE_VECTORSCAN
MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex;
2023-03-15 14:38:11 +00:00
MultiRegexps::ScratchPtr origin_scratch;
hs_database_t* origin_db;
2023-01-05 16:38:01 +00:00
#endif
2023-02-04 09:53:54 +00:00
Poco::Logger * logger;
2022-08-31 19:34:50 +00:00
};
}