2022-06-24 01:56:15 +00:00
|
|
|
#pragma once
|
2022-09-07 18:22:09 +00:00
|
|
|
#include <array>
|
2022-06-24 01:56:15 +00:00
|
|
|
#include <map>
|
|
|
|
#include <memory>
|
2022-09-07 18:22:09 +00:00
|
|
|
#include <string>
|
|
|
|
#include <unordered_map>
|
|
|
|
#include <unordered_set>
|
|
|
|
#include <vector>
|
2022-06-24 01:56:15 +00:00
|
|
|
#include <Core/Types.h>
|
2022-09-25 23:29:30 +00:00
|
|
|
#include <IO/ReadHelpers.h>
|
2022-06-24 01:56:15 +00:00
|
|
|
#include <IO/WriteBuffer.h>
|
2022-09-07 18:22:09 +00:00
|
|
|
#include <base/types.h>
|
2022-06-24 01:56:15 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2022-09-07 18:22:09 +00:00
|
|
|
/// A Finite State Transducer (FST) is an efficient way to represent a term dictionary.
|
2023-01-10 16:26:27 +00:00
|
|
|
/// It can be viewed as a map of <term, output> where output is an integer.
|
2022-09-08 12:00:00 +00:00
|
|
|
/// Detailed explanation can be found in the following paper
|
2022-09-08 12:47:05 +00:00
|
|
|
/// [Direct Construction of Minimal Acyclic Subsequential Transducers] by Stoyan Mihov and Denis Maurel, University of Tours, France
|
2022-09-07 18:22:09 +00:00
|
|
|
namespace FST
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2023-01-10 16:26:27 +00:00
|
|
|
using Output = UInt64;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
class State;
|
|
|
|
using StatePtr = std::shared_ptr<State>;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Arc represents a transition from one state to another
|
|
|
|
/// It includes the target state to which the arc points and the arc's output.
|
|
|
|
struct Arc
|
|
|
|
{
|
|
|
|
Arc() = default;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
explicit Arc(Output output_, const StatePtr & target_) : output{output_}, target{target_} { }
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// 0 means the arc has no output
|
|
|
|
Output output = 0;
|
2022-09-25 23:29:30 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
StatePtr target;
|
|
|
|
|
|
|
|
UInt64 serialize(WriteBuffer & write_buffer) const;
|
|
|
|
};
|
2022-07-19 11:01:09 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Equality comparison for two arcs. Only the declaration is visible here.
/// NOTE(review): presumably compares both output and target - confirm against the definition.
bool operator==(const Arc & arc1, const Arc & arc2);
|
|
|
|
|
|
|
|
/// LabelsAsBitmap implements a 256-bit bitmap for all labels of a state. Each bit represents
|
|
|
|
/// a label's presence and the index value of the bit represents the corresponding label
|
|
|
|
class LabelsAsBitmap
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
void addLabel(char label);
|
|
|
|
bool hasLabel(char label) const;
|
|
|
|
|
|
|
|
/// computes the rank
|
|
|
|
UInt64 getIndex(char label) const;
|
|
|
|
|
|
|
|
UInt64 serialize(WriteBuffer& write_buffer);
|
|
|
|
private:
|
|
|
|
friend class State;
|
|
|
|
friend class FiniteStateTransducer;
|
|
|
|
/// data holds a 256-bit bitmap for all labels of a state. Its 256 bits correspond to 256
|
|
|
|
/// possible label values.
|
|
|
|
UInt256 data{ 0 };
|
|
|
|
};
|
|
|
|
|
|
|
|
/// State implements the State in Finite State Transducer
|
|
|
|
/// Each state contains all its arcs and a flag indicating if it is final state
|
|
|
|
class State
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
static constexpr size_t MAX_ARCS_IN_SEQUENTIAL_METHOD = 32;
|
|
|
|
enum class EncodingMethod
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Serialize arcs sequentially
|
|
|
|
Sequential = 0,
|
|
|
|
|
|
|
|
/// Serialize arcs by using bitmap
|
|
|
|
/// Note this is NOT enabled for now since it is experimental
|
|
|
|
Bitmap,
|
2022-06-24 01:56:15 +00:00
|
|
|
};
|
2023-01-10 16:26:27 +00:00
|
|
|
State() = default;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
State(const State & state) = default;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
UInt64 hash() const;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
Arc * getArc(char label);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
void addArc(char label, Output output, StatePtr target);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
void clear();
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
UInt64 serialize(WriteBuffer & write_buffer);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
bool isFinal() const
|
|
|
|
{
|
|
|
|
return flag_values.is_final == 1;
|
|
|
|
}
|
|
|
|
void setFinal(bool value)
|
|
|
|
{
|
|
|
|
flag_values.is_final = value;
|
|
|
|
}
|
|
|
|
EncodingMethod getEncodingMethod() const
|
|
|
|
{
|
|
|
|
return flag_values.encoding_method;
|
|
|
|
}
|
|
|
|
void readFlag(ReadBuffer & read_buffer)
|
|
|
|
{
|
|
|
|
read_buffer.readStrict(reinterpret_cast<char&>(flag));
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Transient ID of the state which is used for building FST. It won't be serialized
|
|
|
|
UInt64 id = 0;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// State index which indicates location of state in FST
|
|
|
|
UInt64 state_index = 0;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Arcs which are started from state, the 'char' is the label on the arc
|
|
|
|
std::unordered_map<char, Arc> arcs;
|
|
|
|
private:
|
|
|
|
struct FlagValues
|
|
|
|
{
|
|
|
|
unsigned int is_final : 1;
|
|
|
|
EncodingMethod encoding_method : 3;
|
2022-09-07 18:22:09 +00:00
|
|
|
};
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
union
|
2022-09-07 18:22:09 +00:00
|
|
|
{
|
2023-01-10 16:26:27 +00:00
|
|
|
FlagValues flag_values;
|
|
|
|
uint8_t flag = 0;
|
2022-09-07 18:22:09 +00:00
|
|
|
};
|
2023-01-10 16:26:27 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/// Equality comparison for two states. Only the declaration is visible here.
/// NOTE(review): which members take part in the comparison must be checked against the definition.
bool operator==(const State & state1, const State & state2);
|
|
|
|
|
|
|
|
/// Bound used to size FSTBuilder's array of temporary states, which has MAX_TERM_LENGTH + 1 entries.
/// NOTE(review): presumably the maximum supported length of a term - confirm where FSTBuilder::add() is defined.
inline constexpr size_t MAX_TERM_LENGTH = 256;
|
|
|
|
|
|
|
|
/// FSTBuilder is used to build Finite State Transducer by adding words incrementally.
|
|
|
|
/// Note that all the words have to be added in sorted order in order to achieve minimized result.
|
|
|
|
/// In the end, the caller should call build() to serialize minimized FST to WriteBuffer
|
|
|
|
class FSTBuilder
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
explicit FSTBuilder(WriteBuffer & write_buffer_);
|
|
|
|
|
|
|
|
void add(const std::string & word, Output output);
|
|
|
|
UInt64 build();
|
|
|
|
private:
|
|
|
|
StatePtr findMinimized(const State & s, bool & found);
|
|
|
|
void minimizePreviousWordSuffix(Int64 down_to);
|
|
|
|
static size_t getCommonPrefixLength(const String & word1, const String & word2);
|
|
|
|
|
|
|
|
std::array<StatePtr, MAX_TERM_LENGTH + 1> temp_states;
|
|
|
|
String previous_word;
|
|
|
|
StatePtr initial_state;
|
2023-01-11 21:40:20 +00:00
|
|
|
|
|
|
|
/// map of (state_hash, StatePtr)
|
2023-01-10 16:26:27 +00:00
|
|
|
std::unordered_map<UInt64, StatePtr> minimized_states;
|
|
|
|
|
|
|
|
/// Next available ID of state
|
|
|
|
UInt64 next_id = 1;
|
|
|
|
|
|
|
|
WriteBuffer & write_buffer;
|
|
|
|
UInt64 previous_written_bytes = 0;
|
|
|
|
UInt64 previous_state_index = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
//FiniteStateTransducer is constructed by using minimized FST blob(which is loaded from index storage)
|
|
|
|
// It is used to retrieve output by given term
|
|
|
|
class FiniteStateTransducer
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
FiniteStateTransducer() = default;
|
|
|
|
FiniteStateTransducer(std::vector<UInt8> data_);
|
|
|
|
std::pair<UInt64, bool> getOutput(const String & term);
|
|
|
|
void clear();
|
|
|
|
std::vector<UInt8> & getData() { return data; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::vector<UInt8> data;
|
|
|
|
};
|
2022-09-07 18:22:09 +00:00
|
|
|
}
|
2022-06-24 01:56:15 +00:00
|
|
|
}
|