2022-08-31 19:34:50 +00:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <atomic>
|
|
|
|
#include <string>
|
|
|
|
#include <unordered_map>
|
|
|
|
#include <unordered_set>
|
|
|
|
#include <variant>
|
|
|
|
|
|
|
|
#include <base/types.h>
|
|
|
|
|
2023-01-05 16:38:01 +00:00
|
|
|
#include <Columns/IColumn.h>
|
|
|
|
#include <Columns/ColumnString.h>
|
2022-08-31 19:34:50 +00:00
|
|
|
#include <Common/Arena.h>
|
|
|
|
#include <Common/Exception.h>
|
|
|
|
#include <Common/HashTable/Hash.h>
|
|
|
|
#include <Common/HashTable/HashSet.h>
|
2023-01-05 16:38:01 +00:00
|
|
|
#include <Core/Block.h>
|
|
|
|
#include <Core/Field.h>
|
2022-08-31 19:34:50 +00:00
|
|
|
#include <DataTypes/IDataType.h>
|
2023-01-05 16:38:01 +00:00
|
|
|
#include <Functions/Regexps.h>
|
2022-08-31 19:34:50 +00:00
|
|
|
#include <QueryPipeline/Pipe.h>
|
|
|
|
|
|
|
|
#include <Dictionaries/DictionaryStructure.h>
|
|
|
|
#include <Dictionaries/IDictionary.h>
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Error codes referenced by the inline method bodies in this header.
/// Only declared here (`extern`); the definition lives outside this file.
namespace ErrorCodes
{
    extern const int UNSUPPORTED_METHOD;
}
|
|
|
|
|
|
|
|
class RegExpTreeDictionary : public IDictionary
|
|
|
|
{
|
2023-03-15 14:38:11 +00:00
|
|
|
friend struct MatchContext;
|
2022-08-31 19:34:50 +00:00
|
|
|
public:
|
|
|
|
struct Configuration
|
|
|
|
{
|
|
|
|
bool require_nonempty;
|
|
|
|
DictionaryLifetime lifetime;
|
|
|
|
};
|
|
|
|
|
|
|
|
const std::string name = "RegExpTree";
|
|
|
|
|
|
|
|
RegExpTreeDictionary(
|
2023-02-04 09:53:54 +00:00
|
|
|
const StorageID & id_,
|
|
|
|
const DictionaryStructure & structure_,
|
|
|
|
DictionarySourcePtr source_ptr_,
|
|
|
|
Configuration configuration_,
|
|
|
|
bool use_vectorscan_);
|
2022-08-31 19:34:50 +00:00
|
|
|
|
|
|
|
std::string getTypeName() const override { return name; }
|
|
|
|
|
|
|
|
size_t getBytesAllocated() const override { return bytes_allocated; }
|
|
|
|
|
|
|
|
size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); }
|
|
|
|
|
|
|
|
double getFoundRate() const override
|
|
|
|
{
|
|
|
|
const auto queries = query_count.load(std::memory_order_relaxed);
|
|
|
|
if (!queries)
|
|
|
|
return 0;
|
|
|
|
return static_cast<double>(found_count.load(std::memory_order_relaxed)) / queries;
|
|
|
|
}
|
|
|
|
|
|
|
|
double getHitRate() const override { return 1.0; }
|
|
|
|
|
|
|
|
size_t getElementCount() const override { return element_count; }
|
|
|
|
|
|
|
|
double getLoadFactor() const override { return static_cast<double>(element_count) / bucket_count; }
|
|
|
|
|
|
|
|
DictionarySourcePtr getSource() const override { return source_ptr; }
|
|
|
|
|
|
|
|
const DictionaryLifetime & getLifetime() const override { return configuration.lifetime; }
|
|
|
|
|
|
|
|
const DictionaryStructure & getStructure() const override { return structure; }
|
|
|
|
|
|
|
|
bool isInjective(const std::string & attribute_name) const override { return structure.getAttribute(attribute_name).injective; }
|
|
|
|
|
|
|
|
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::Simple; }
|
|
|
|
|
|
|
|
bool hasHierarchy() const override { return false; }
|
|
|
|
|
|
|
|
std::shared_ptr<const IExternalLoadable> clone() const override
|
|
|
|
{
|
2023-02-04 09:53:54 +00:00
|
|
|
return std::make_shared<RegExpTreeDictionary>(getDictionaryID(), structure, source_ptr->clone(), configuration, use_vectorscan);
|
2022-08-31 19:34:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ColumnUInt8::Ptr hasKeys(const Columns &, const DataTypes &) const override
|
|
|
|
{
|
|
|
|
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Dictionary {} does not support method `hasKeys`", name);
|
|
|
|
}
|
|
|
|
|
|
|
|
Pipe read(const Names &, size_t, size_t) const override
|
|
|
|
{
|
|
|
|
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Dictionary {} does not support method `read`", name);
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnPtr getColumn(
|
|
|
|
const std::string & attribute_name,
|
|
|
|
const DataTypePtr & result_type,
|
|
|
|
const Columns & key_columns,
|
|
|
|
const DataTypes & key_types,
|
2022-12-14 10:22:29 +00:00
|
|
|
const ColumnPtr & default_values_column) const override
|
|
|
|
{
|
|
|
|
return getColumns(Strings({attribute_name}), DataTypes({result_type}), key_columns, key_types, Columns({default_values_column}))[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
Columns getColumns(
|
|
|
|
const Strings & attribute_names,
|
|
|
|
const DataTypes & result_types,
|
|
|
|
const Columns & key_columns,
|
|
|
|
const DataTypes & key_types,
|
|
|
|
const Columns & default_values_columns) const override;
|
2022-08-31 19:34:50 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
const DictionaryStructure structure;
|
2022-12-22 21:42:11 +00:00
|
|
|
DictionarySourcePtr source_ptr;
|
2022-08-31 19:34:50 +00:00
|
|
|
const Configuration configuration;
|
|
|
|
|
|
|
|
size_t bytes_allocated = 0;
|
|
|
|
|
|
|
|
size_t bucket_count = 0;
|
|
|
|
size_t element_count = 0;
|
|
|
|
|
|
|
|
mutable std::atomic<size_t> query_count{0};
|
|
|
|
mutable std::atomic<size_t> found_count{0};
|
|
|
|
|
|
|
|
void calculateBytesAllocated();
|
|
|
|
|
|
|
|
void loadData();
|
|
|
|
|
2022-12-14 10:22:29 +00:00
|
|
|
void initRegexNodes(Block & block);
|
|
|
|
void initTopologyOrder(UInt64 node_idx, std::set<UInt64> & visited, UInt64 & topology_id);
|
|
|
|
void initGraph();
|
|
|
|
|
2023-02-04 09:53:54 +00:00
|
|
|
std::unordered_map<String, ColumnPtr> match(
|
2022-12-14 10:22:29 +00:00
|
|
|
const ColumnString::Chars & keys_data,
|
|
|
|
const ColumnString::Offsets & keys_offsets,
|
|
|
|
const std::unordered_map<String, const DictionaryAttribute &> & attributes,
|
|
|
|
const std::unordered_map<String, ColumnPtr> & defaults) const;
|
2022-08-31 19:34:50 +00:00
|
|
|
|
2022-12-14 10:22:29 +00:00
|
|
|
bool setAttributes(
|
|
|
|
UInt64 id,
|
|
|
|
std::unordered_map<String, Field> & attributes_to_set,
|
|
|
|
const String & data,
|
|
|
|
std::unordered_set<UInt64> & visited_nodes,
|
2023-02-01 14:09:04 +00:00
|
|
|
const std::unordered_map<String, const DictionaryAttribute &> & attributes,
|
|
|
|
const std::unordered_map<String, ColumnPtr> & defaults,
|
|
|
|
size_t key_index) const;
|
2022-08-31 19:34:50 +00:00
|
|
|
|
2022-12-14 10:22:29 +00:00
|
|
|
struct RegexTreeNode;
|
2023-01-25 09:38:51 +00:00
|
|
|
using RegexTreeNodePtr = std::shared_ptr<RegexTreeNode>;
|
|
|
|
|
2023-02-04 09:53:54 +00:00
|
|
|
bool use_vectorscan;
|
|
|
|
|
|
|
|
std::vector<std::string> simple_regexps;
|
|
|
|
std::vector<UInt64> regexp_ids;
|
2023-01-25 09:38:51 +00:00
|
|
|
std::vector<RegexTreeNodePtr> complex_regexp_nodes;
|
2022-08-31 19:34:50 +00:00
|
|
|
|
2022-12-22 21:42:11 +00:00
|
|
|
std::map<UInt64, RegexTreeNodePtr> regex_nodes;
|
2022-12-14 10:22:29 +00:00
|
|
|
std::unordered_map<UInt64, UInt64> topology_order;
|
2023-01-05 16:38:01 +00:00
|
|
|
#if USE_VECTORSCAN
|
|
|
|
MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex;
|
2023-03-15 14:38:11 +00:00
|
|
|
MultiRegexps::ScratchPtr origin_scratch;
|
|
|
|
hs_database_t* origin_db;
|
2023-01-05 16:38:01 +00:00
|
|
|
#endif
|
2023-02-04 09:53:54 +00:00
|
|
|
|
|
|
|
Poco::Logger * logger;
|
2022-08-31 19:34:50 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|