ClickHouse/src/Common/FST.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

183 lines
4.9 KiB
C++
Raw Normal View History

2022-06-24 01:56:15 +00:00
#pragma once
#include <array>
2022-06-24 01:56:15 +00:00
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
2022-06-24 01:56:15 +00:00
#include <Core/Types.h>
#include <IO/ReadHelpers.h>
2022-06-24 01:56:15 +00:00
#include <IO/WriteBuffer.h>
#include <base/types.h>
2022-06-24 01:56:15 +00:00
namespace DB
{
/// Finite State Transducer is an efficient way to represent a term dictionary.
/// It can be viewed as a map of <term, output> where output is an integer.
/// A detailed explanation can be found in the following paper:
/// [Direct Construction of Minimal Acyclic Subsequential Transducers] by Stoyan Mihov and Denis Maurel, University of Tours, France
namespace FST
2022-06-24 01:56:15 +00:00
{
2023-01-10 16:26:27 +00:00
/// Integer output that the FST associates with a term; also the type of a per-arc output.
using Output = UInt64;

class State;

/// Shared-ownership handle to a State; this is the type of an arc's target.
using StatePtr = std::shared_ptr<State>;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Arc represents a transition from one state to another
/// It includes the target state to which the arc points and the arc's output.
struct Arc
{
Arc() = default;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
explicit Arc(Output output_, const StatePtr & target_) : output{output_}, target{target_} { }
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// 0 means the arc has no output
Output output = 0;
2023-01-10 16:26:27 +00:00
StatePtr target;
UInt64 serialize(WriteBuffer & write_buffer) const;
};
2022-07-19 11:01:09 +00:00
2023-01-10 16:26:27 +00:00
/// Equality comparison of two arcs (declared here, defined out of line).
bool operator==(const Arc & arc1, const Arc & arc2);
/// LabelsAsBitmap is a 256-bit bitmap over all labels of a state: bit N is set
/// when the label with value N is present, so the bit index identifies the label.
class LabelsAsBitmap
{
public:
    /// Marks `label` as present.
    void addLabel(char label);

    /// Tells whether `label` is present.
    bool hasLabel(char label) const;

    /// Computes the rank of `label`.
    UInt64 getIndex(char label) const;

    /// Serializes the bitmap into `write_buffer` (defined out of line).
    /// NOTE(review): the returned UInt64 is presumably the number of bytes written - confirm in the implementation.
    UInt64 serialize(WriteBuffer & write_buffer);

private:
    friend class State;
    friend class FiniteStateTransducer;

    /// The 256-bit bitmap itself; its 256 bits correspond to the 256 possible label values.
    UInt256 data{ 0 };
};
/// State implements the State in Finite State Transducer
/// Each state contains all its arcs and a flag indicating if it is final state
class State
{
public:
static constexpr size_t MAX_ARCS_IN_SEQUENTIAL_METHOD = 32;
enum class EncodingMethod
2022-06-24 01:56:15 +00:00
{
2023-01-10 16:26:27 +00:00
/// Serialize arcs sequentially
Sequential = 0,
/// Serialize arcs by using bitmap
/// Note this is NOT enabled for now since it is experimental
Bitmap,
2022-06-24 01:56:15 +00:00
};
2023-01-10 16:26:27 +00:00
State() = default;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
State(const State & state) = default;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
UInt64 hash() const;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
Arc * getArc(char label);
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
void addArc(char label, Output output, StatePtr target);
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
void clear();
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
UInt64 serialize(WriteBuffer & write_buffer);
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
bool isFinal() const
{
return flag_values.is_final == 1;
}
void setFinal(bool value)
{
flag_values.is_final = value;
}
EncodingMethod getEncodingMethod() const
{
return flag_values.encoding_method;
}
void readFlag(ReadBuffer & read_buffer)
{
read_buffer.readStrict(reinterpret_cast<char&>(flag));
}
/// Transient ID of the state which is used for building FST. It won't be serialized
UInt64 id = 0;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// State index which indicates location of state in FST
UInt64 state_index = 0;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Arcs which are started from state, the 'char' is the label on the arc
std::unordered_map<char, Arc> arcs;
private:
struct FlagValues
{
unsigned int is_final : 1;
EncodingMethod encoding_method : 3;
};
2023-01-10 16:26:27 +00:00
union
{
2023-01-10 16:26:27 +00:00
FlagValues flag_values;
uint8_t flag = 0;
};
2023-01-10 16:26:27 +00:00
};
/// Equality comparison of two states (declared here, defined out of line).
bool operator==(const State & state1, const State & state2);
/// Sizes FSTBuilder::temp_states, which holds MAX_TERM_LENGTH + 1 entries.
/// NOTE(review): presumably the maximum length of a term FSTBuilder::add accepts - confirm in the implementation.
inline constexpr size_t MAX_TERM_LENGTH = 256;
/// FSTBuilder is used to build Finite State Transducer by adding words incrementally.
/// Note that all the words have to be added in sorted order in order to achieve minimized result.
/// In the end, the caller should call build() to serialize minimized FST to WriteBuffer
class FSTBuilder
{
public:
explicit FSTBuilder(WriteBuffer & write_buffer_);
void add(const std::string & word, Output output);
UInt64 build();
private:
StatePtr findMinimized(const State & s, bool & found);
void minimizePreviousWordSuffix(Int64 down_to);
static size_t getCommonPrefixLength(const String & word1, const String & word2);
std::array<StatePtr, MAX_TERM_LENGTH + 1> temp_states;
String previous_word;
StatePtr initial_state;
2023-01-11 21:40:20 +00:00
/// map of (state_hash, StatePtr)
2023-01-10 16:26:27 +00:00
std::unordered_map<UInt64, StatePtr> minimized_states;
/// Next available ID of state
UInt64 next_id = 1;
WriteBuffer & write_buffer;
UInt64 previous_written_bytes = 0;
UInt64 previous_state_index = 0;
};
/// FiniteStateTransducer is constructed from a minimized FST blob (which is loaded from index storage).
/// It is used to retrieve the output for a given term.
class FiniteStateTransducer
{
public:
    FiniteStateTransducer() = default;

    /// Takes the serialized FST bytes by value.
    FiniteStateTransducer(std::vector<UInt8> data_);

    /// Looks up `term` (defined out of line).
    /// NOTE(review): presumably returns (output, found) - confirm in the implementation.
    std::pair<UInt64, bool> getOutput(const String & term);

    void clear();

    /// Mutable access to the underlying FST bytes.
    std::vector<UInt8> & getData() { return data; }

private:
    /// Serialized FST bytes.
    std::vector<UInt8> data;
};
}
2022-06-24 01:56:15 +00:00
}