ClickHouse/src/Common/FST.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

172 lines
4.9 KiB
C++
Raw Normal View History

2022-06-24 01:56:15 +00:00
#pragma once
#include <array>
2022-06-24 01:56:15 +00:00
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
2022-06-24 01:56:15 +00:00
#include <Core/Types.h>
#include <IO/ReadHelpers.h>
2022-06-24 01:56:15 +00:00
#include <IO/WriteBuffer.h>
#include <base/types.h>
2022-06-24 01:56:15 +00:00
namespace DB
{
/// A Finite State Transducer (FST) is an efficient way to represent a term dictionary.
/// It can be viewed as a map of <term, offset>.
2022-09-08 12:00:00 +00:00
/// Detailed explanation can be found in the following paper
2022-09-08 12:47:05 +00:00
/// [Direct Construction of Minimal Acyclic Subsequential Transducers] by Stoyan Mihov and Denis Maurel, University of Tours, France
namespace FST
2022-06-24 01:56:15 +00:00
{
/// Type of the value stored on an arc (Arc::output) and passed to FSTBuilder::add() together with each word.
using Output = UInt64;
2022-06-24 01:56:15 +00:00
/// State is declared ahead of its definition because Arc stores a StatePtr to its target state.
class State;
/// Shared-ownership handle to a State.
using StatePtr = std::shared_ptr<State>;
2022-06-24 01:56:15 +00:00
/// Arc represents a transition from one state to another
/// It includes the target state to which the arc points and its output.
struct Arc
2022-06-24 01:56:15 +00:00
{
Arc() = default;
explicit Arc(Output output_, const StatePtr & target_) : output{output_}, target{target_} { }
Output output = 0;
StatePtr target;
2022-06-24 01:56:15 +00:00
UInt64 serialize(WriteBuffer & write_buffer) const;
};
2022-06-24 01:56:15 +00:00
/// Equality comparison for two arcs. Defined out of line.
/// NOTE(review): presumably compares both the output and the target of the arcs - confirm against the definition.
bool operator==(const Arc & arc1, const Arc & arc2);
/// ArcsAsBitmap implements a 256-bit bitmap for all arcs of a state. Each bit represents
/// an arc's presence and the index value of the bit represents the corresponding label
class ArcsAsBitmap
2022-07-19 11:01:09 +00:00
{
public:
void addArc(char label);
bool hasArc(char label) const;
int getIndex(char label) const;
private:
friend class State;
friend class FiniteStateTransducer;
/// data holds a 256-bit bitmap for all arcs of a state. Ita 256 bits correspond to 256
/// possible label values.
UInt256 data{ 0 };
2022-07-19 11:01:09 +00:00
};
/// State implements the State in Finite State Transducer
/// Each state contains all its arcs and a flag indicating if it is final state
class State
2022-06-24 01:56:15 +00:00
{
public:
static constexpr size_t MAX_ARCS_IN_SEQUENTIAL_METHOD = 32;
enum class EncodingMethod
{
ENCODING_METHOD_SEQUENTIAL = 0,
ENCODING_METHOD_BITMAP,
};
State() = default;
State(const State & state) = default;
UInt64 hash() const;
Arc * getArc(char label);
void addArc(char label, Output output, StatePtr target);
void clear()
{
id = 0;
state_index = 0;
flag = 0;
arcs.clear();
}
UInt64 serialize(WriteBuffer & write_buffer);
inline bool isFinal() const
{
return flag_values.is_final == 1;
}
inline void setFinal(bool value)
{
flag_values.is_final = value;
}
inline EncodingMethod getEncodingMethod() const
{
return flag_values.encoding_method;
}
inline void readFlag(ReadBuffer & read_buffer)
{
read_buffer.readStrict(reinterpret_cast<char&>(flag));
}
UInt64 id = 0;
UInt64 state_index = 0;
std::unordered_map<char, Arc> arcs;
private:
struct FlagValues
{
unsigned int is_final : 1;
EncodingMethod encoding_method : 3;
};
union
{
FlagValues flag_values;
uint8_t flag = 0;
};
2022-06-24 01:56:15 +00:00
};
/// Equality comparison for two states. Defined out of line.
/// NOTE(review): presumably compares the arcs and the flag of both states - confirm against the definition.
bool operator==(const State & state1, const State & state2);
2022-06-24 01:56:15 +00:00
/// Length bound for terms; FSTBuilder sizes its temp_states array as MAX_TERM_LENGTH + 1.
/// NOTE(review): how terms longer than this are handled is decided in the implementation - confirm there.
inline constexpr size_t MAX_TERM_LENGTH = 256;
2022-06-24 01:56:15 +00:00
/// FSTBuilder is used to build Finite State Transducer by adding words incrementally.
/// Note that all the words have to be added in sorted order in order to achieve minimized result.
/// In the end, the caller should call build() to serialize minimized FST to WriteBuffer
class FSTBuilder
{
public:
FSTBuilder(WriteBuffer & write_buffer_);
StatePtr getMinimized(const State & s, bool & found);
2022-06-24 01:56:15 +00:00
void add(const std::string & word, Output output);
UInt64 build();
2022-06-24 01:56:15 +00:00
UInt64 state_count = 0;
2022-06-24 01:56:15 +00:00
private:
void minimizePreviousWordSuffix(Int64 down_to);
static size_t getCommonPrefix(const std::string & word1, const std::string & word2);
2022-06-24 01:56:15 +00:00
std::array<StatePtr, MAX_TERM_LENGTH + 1> temp_states;
std::string previous_word;
StatePtr initial_state;
std::unordered_map<UInt64, StatePtr> minimized_states;
2022-06-24 01:56:15 +00:00
UInt64 next_id = 1;
2022-06-24 01:56:15 +00:00
WriteBuffer & write_buffer;
UInt64 previous_written_bytes = 0;
UInt64 previous_state_index = 0;
};
/// FiniteStateTransducer is constructed from a minimized FST blob (which is loaded from index storage).
/// It is used to retrieve the output for a given term.
class FiniteStateTransducer
{
public:
    FiniteStateTransducer() = default;

    /// Takes the serialized FST bytes by value.
    /// The constructor is explicit so that a plain byte vector is never converted into a transducer implicitly.
    explicit FiniteStateTransducer(std::vector<UInt8> data_);

    /// Looks up term in the FST. Defined out of line.
    /// NOTE(review): the pair is presumably {output, found} - confirm against the definition.
    std::pair<UInt64, bool> getOutput(const String & term);

    /// Defined out of line.
    void clear();

    /// Mutable access to the underlying serialized bytes.
    std::vector<UInt8> & getData() { return data; }

private:
    std::vector<UInt8> data;
};
}
2022-06-24 01:56:15 +00:00
}