2022-06-24 01:56:15 +00:00
|
|
|
#pragma once
|
2022-09-07 18:22:09 +00:00
|
|
|
#include <array>
|
2022-06-24 01:56:15 +00:00
|
|
|
#include <map>
|
|
|
|
#include <memory>
|
2022-09-07 18:22:09 +00:00
|
|
|
#include <string>
|
|
|
|
#include <unordered_map>
|
|
|
|
#include <unordered_set>
|
|
|
|
#include <vector>
|
2022-06-24 01:56:15 +00:00
|
|
|
#include <Core/Types.h>
|
2022-09-25 23:29:30 +00:00
|
|
|
#include <IO/ReadHelpers.h>
|
2022-06-24 01:56:15 +00:00
|
|
|
#include <IO/WriteBuffer.h>
|
2022-09-07 18:22:09 +00:00
|
|
|
#include <base/types.h>
|
2022-06-24 01:56:15 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2022-09-07 18:22:09 +00:00
|
|
|
/// Finite State Transducer is an efficient way to represent term dictionary.
|
|
|
|
/// It can be viewed as a map of <term, offset>.
|
2022-09-08 12:00:00 +00:00
|
|
|
/// Detailed explanation can be found in the following paper
|
2022-09-08 12:47:05 +00:00
|
|
|
/// [Direct Construction of Minimal Acyclic Subsequential Transducers] by Stoyan Mihov and Denis Maurel, University of Tours, France
|
2022-09-07 18:22:09 +00:00
|
|
|
namespace FST
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2022-09-07 18:22:09 +00:00
|
|
|
using Output = UInt64;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-25 23:29:30 +00:00
|
|
|
class State;
|
2022-09-07 18:22:09 +00:00
|
|
|
using StatePtr = std::shared_ptr<State>;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-28 14:28:28 +00:00
|
|
|
/// Arc represents a transition from one state to another
|
|
|
|
/// It includes the target state to which the arc points and its output.
|
2022-09-07 18:22:09 +00:00
|
|
|
struct Arc
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2022-09-07 18:22:09 +00:00
|
|
|
Arc() = default;
|
2022-09-25 23:29:30 +00:00
|
|
|
explicit Arc(Output output_, const StatePtr & target_) : output{output_}, target{target_} { }
|
|
|
|
Output output = 0;
|
2022-09-07 18:22:09 +00:00
|
|
|
StatePtr target;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-25 23:29:30 +00:00
|
|
|
UInt64 serialize(WriteBuffer & write_buffer) const;
|
2022-09-07 18:22:09 +00:00
|
|
|
};
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-25 23:29:30 +00:00
|
|
|
/// Equality comparison for two arcs. Only declared here; the definition lives outside this header.
bool operator==(const Arc & arc1, const Arc & arc2);
|
|
|
|
|
2022-09-28 14:28:28 +00:00
|
|
|
/// ArcsAsBitmap implements a 256-bit bitmap for all arcs of a state. Each bit represents
|
|
|
|
/// an arc's presence and the index value of the bit represents the corresponding label
|
2022-09-25 23:29:30 +00:00
|
|
|
class ArcsAsBitmap
|
2022-07-19 11:01:09 +00:00
|
|
|
{
|
2022-09-07 18:22:09 +00:00
|
|
|
public:
|
|
|
|
void addArc(char label);
|
|
|
|
bool hasArc(char label) const;
|
|
|
|
int getIndex(char label) const;
|
|
|
|
|
|
|
|
private:
|
2022-09-25 23:29:30 +00:00
|
|
|
friend class State;
|
2022-09-07 18:22:09 +00:00
|
|
|
friend class FiniteStateTransducer;
|
2022-09-28 14:28:28 +00:00
|
|
|
/// data holds a 256-bit bitmap for all arcs of a state. Ita 256 bits correspond to 256
|
2022-09-25 23:29:30 +00:00
|
|
|
/// possible label values.
|
|
|
|
UInt256 data{ 0 };
|
2022-07-19 11:01:09 +00:00
|
|
|
};
|
|
|
|
|
2022-09-28 14:28:28 +00:00
|
|
|
/// State implements the State in Finite State Transducer
|
|
|
|
/// Each state contains all its arcs and a flag indicating if it is final state
|
2022-09-25 23:29:30 +00:00
|
|
|
class State
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2022-09-25 23:29:30 +00:00
|
|
|
public:
|
2022-09-07 18:22:09 +00:00
|
|
|
static constexpr size_t MAX_ARCS_IN_SEQUENTIAL_METHOD = 32;
|
2022-09-25 23:29:30 +00:00
|
|
|
enum class EncodingMethod
|
2022-09-07 18:22:09 +00:00
|
|
|
{
|
|
|
|
ENCODING_METHOD_SEQUENTIAL = 0,
|
|
|
|
ENCODING_METHOD_BITMAP,
|
|
|
|
};
|
|
|
|
State() = default;
|
|
|
|
State(const State & state) = default;
|
|
|
|
|
|
|
|
UInt64 hash() const;
|
|
|
|
|
|
|
|
Arc * getArc(char label);
|
|
|
|
void addArc(char label, Output output, StatePtr target);
|
|
|
|
|
|
|
|
void clear()
|
|
|
|
{
|
|
|
|
id = 0;
|
|
|
|
state_index = 0;
|
|
|
|
flag = 0;
|
|
|
|
|
|
|
|
arcs.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
UInt64 serialize(WriteBuffer & write_buffer);
|
|
|
|
|
2022-09-25 23:29:30 +00:00
|
|
|
inline bool isFinal() const
|
|
|
|
{
|
|
|
|
return flag_values.is_final == 1;
|
|
|
|
}
|
|
|
|
inline void setFinal(bool value)
|
|
|
|
{
|
|
|
|
flag_values.is_final = value;
|
|
|
|
}
|
|
|
|
inline EncodingMethod getEncodingMethod() const
|
|
|
|
{
|
|
|
|
return flag_values.encoding_method;
|
|
|
|
}
|
|
|
|
inline void readFlag(ReadBuffer & read_buffer)
|
|
|
|
{
|
2022-12-29 16:00:17 +00:00
|
|
|
read_buffer.readStrict(reinterpret_cast<char&>(flag));
|
2022-09-25 23:29:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
UInt64 id = 0;
|
|
|
|
UInt64 state_index = 0;
|
|
|
|
std::unordered_map<char, Arc> arcs;
|
|
|
|
private:
|
|
|
|
struct FlagValues
|
2022-09-07 18:22:09 +00:00
|
|
|
{
|
|
|
|
unsigned int is_final : 1;
|
2022-09-25 23:29:30 +00:00
|
|
|
EncodingMethod encoding_method : 3;
|
2022-09-07 18:22:09 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
union
|
|
|
|
{
|
2022-09-25 23:29:30 +00:00
|
|
|
FlagValues flag_values;
|
|
|
|
uint8_t flag = 0;
|
2022-09-07 18:22:09 +00:00
|
|
|
};
|
2022-06-24 01:56:15 +00:00
|
|
|
};
|
|
|
|
|
2022-09-07 18:22:09 +00:00
|
|
|
/// Equality comparison for two states. Only declared here; the definition lives outside this header.
bool operator==(const State & state1, const State & state2);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-25 23:29:30 +00:00
|
|
|
/// Sizes FSTBuilder::temp_states, which holds MAX_TERM_LENGTH + 1 entries.
/// NOTE(review): presumably the longest term (in bytes) the builder accepts - confirm in FSTBuilder::add().
inline constexpr size_t MAX_TERM_LENGTH = 256;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-28 14:28:28 +00:00
|
|
|
/// FSTBuilder is used to build Finite State Transducer by adding words incrementally.
|
|
|
|
/// Note that all the words have to be added in sorted order in order to achieve minimized result.
|
|
|
|
/// In the end, the caller should call build() to serialize minimized FST to WriteBuffer
|
2022-09-07 18:22:09 +00:00
|
|
|
class FSTBuilder
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
FSTBuilder(WriteBuffer & write_buffer_);
|
2022-09-25 23:29:30 +00:00
|
|
|
StatePtr getMinimized(const State & s, bool & found);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-07 18:22:09 +00:00
|
|
|
void add(const std::string & word, Output output);
|
|
|
|
UInt64 build();
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-25 23:29:30 +00:00
|
|
|
UInt64 state_count = 0;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-07 18:22:09 +00:00
|
|
|
private:
|
2022-09-25 23:29:30 +00:00
|
|
|
void minimizePreviousWordSuffix(Int64 down_to);
|
2022-09-07 18:22:09 +00:00
|
|
|
static size_t getCommonPrefix(const std::string & word1, const std::string & word2);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-07 18:22:09 +00:00
|
|
|
std::array<StatePtr, MAX_TERM_LENGTH + 1> temp_states;
|
|
|
|
std::string previous_word;
|
|
|
|
StatePtr initial_state;
|
|
|
|
std::unordered_map<UInt64, StatePtr> minimized_states;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-25 23:29:30 +00:00
|
|
|
UInt64 next_id = 1;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2022-09-07 18:22:09 +00:00
|
|
|
WriteBuffer & write_buffer;
|
2022-09-25 23:29:30 +00:00
|
|
|
UInt64 previous_written_bytes = 0;
|
|
|
|
UInt64 previous_state_index = 0;
|
2022-09-07 18:22:09 +00:00
|
|
|
};
|
|
|
|
|
2022-09-28 14:28:28 +00:00
|
|
|
/// FiniteStateTransducer is constructed from a minimized FST blob (which is loaded from index storage).
|
|
|
|
/// It is used to retrieve the output for a given term.
|
2022-09-07 18:22:09 +00:00
|
|
|
class FiniteStateTransducer
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
FiniteStateTransducer() = default;
|
2022-09-25 23:29:30 +00:00
|
|
|
FiniteStateTransducer(std::vector<UInt8> data_);
|
|
|
|
std::pair<UInt64, bool> getOutput(const String & term);
|
2022-09-07 18:22:09 +00:00
|
|
|
void clear();
|
|
|
|
std::vector<UInt8> & getData() { return data; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::vector<UInt8> data;
|
|
|
|
};
|
|
|
|
}
|
2022-06-24 01:56:15 +00:00
|
|
|
}
|