mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-14 19:45:11 +00:00
381 lines
17 KiB
C
381 lines
17 KiB
C
|
/* ******************************************************************
|
||
|
FSE : Finite State Entropy coder
|
||
|
header file
|
||
|
Copyright (C) 2013-2015, Yann Collet.
|
||
|
|
||
|
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
|
||
|
|
||
|
Redistribution and use in source and binary forms, with or without
|
||
|
modification, are permitted provided that the following conditions are
|
||
|
met:
|
||
|
|
||
|
* Redistributions of source code must retain the above copyright
|
||
|
notice, this list of conditions and the following disclaimer.
|
||
|
* Redistributions in binary form must reproduce the above
|
||
|
copyright notice, this list of conditions and the following disclaimer
|
||
|
in the documentation and/or other materials provided with the
|
||
|
distribution.
|
||
|
|
||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
|
||
|
You can contact the author at :
|
||
|
- Source repository : https://github.com/Cyan4973/FiniteStateEntropy
|
||
|
- Public forum : https://groups.google.com/forum/#!forum/lz4c
|
||
|
****************************************************************** */
|
||
|
#pragma once
|
||
|
|
||
|
#if defined (__cplusplus)
|
||
|
extern "C" {
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/******************************************
|
||
|
* Includes
|
||
|
******************************************/
|
||
|
#include <stddef.h> // size_t, ptrdiff_t
|
||
|
|
||
|
|
||
|
/******************************************
|
||
|
* FSE simple functions
|
||
|
******************************************/
|
||
|
/* Compress 'src' (srcSize bytes) into the already allocated buffer 'dst' (maxDstSize bytes).
   'dst' should be sized for the worst case, as given by FSE_compressBound().
   Returns the size of compressed data; 0 means the data is not compressible,
   1 means the input is a single symbol repeated srcSize times (use RLE),
   and error codes are detected with FSE_isError(). */
size_t FSE_compress(void* dst, size_t maxDstSize,
                    const void* src, size_t srcSize);
|
||
|
/* Decompress FSE data from 'cSrc' (cSrcSize bytes) into the already allocated
   buffer 'dst' (maxDstSize bytes). Does not handle uncompressed or RLE blocks.
   Returns the size of regenerated data (<= maxDstSize),
   or an error code which can be tested using FSE_isError(). */
size_t FSE_decompress(void* dst, size_t maxDstSize,
                      const void* cSrc, size_t cSrcSize);
|
||
|
/*
|
||
|
FSE_compress():
|
||
|
Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
|
||
|
'dst' buffer must be already allocated, and sized to handle worst case situations.
|
||
|
Worst case size evaluation is provided by FSE_compressBound().
|
||
|
return : size of compressed data
|
||
|
Special values : if result == 0, data is not compressible => nothing is stored within 'dst' !!
|
||
|
if result == 1, data is one constant element x srcSize times. Use RLE compression.
|
||
|
if FSE_isError(result), it's an error code.
|
||
|
|
||
|
FSE_decompress():
|
||
|
Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
|
||
|
into already allocated destination buffer 'dst', of size 'maxDstSize'.
|
||
|
** Important ** : This function doesn't decompress uncompressed nor RLE data !
|
||
|
return : size of regenerated data (<= maxDstSize)
|
||
|
or an error code, which can be tested using FSE_isError()
|
||
|
*/
|
||
|
|
||
|
|
||
|
/* Decompress the RLE corner case (equivalent to memset()).
   cSrcSize must be == 1 and originalSize must be exact.
   Returns the size of regenerated data (== originalSize),
   or an error code which can be tested using FSE_isError(). */
size_t FSE_decompressRLE(void* dst, size_t originalSize,
                         const void* cSrc, size_t cSrcSize);
|
||
|
/*
|
||
|
FSE_decompressRLE():
|
||
|
Decompress specific RLE corner case (equivalent to memset()).
|
||
|
cSrcSize must be == 1. originalSize must be exact.
|
||
|
return : size of regenerated data (==originalSize)
|
||
|
or an error code, which can be tested using FSE_isError()
|
||
|
|
||
|
Note : there is no function provided for uncompressed data, as it's just a simple memcpy()
|
||
|
*/
|
||
|
|
||
|
|
||
|
/******************************************
|
||
|
* Tool functions
|
||
|
******************************************/
|
||
|
size_t FSE_compressBound(size_t size); /* maximum compressed size */
|
||
|
|
||
|
/* Error Management */
|
||
|
unsigned FSE_isError(size_t code); /* tells if a return value is an error code */
|
||
|
const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */
|
||
|
|
||
|
|
||
|
/******************************************
|
||
|
* FSE advanced functions
|
||
|
******************************************/
|
||
|
/*
|
||
|
FSE_compress2():
|
||
|
Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
|
||
|
Both parameters can be defined as '0' to mean : use default value
|
||
|
return : size of compressed data
|
||
|
or an error code, which can be tested using FSE_isError()
|
||
|
*/
|
||
|
size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
|
||
|
|
||
|
|
||
|
/******************************************
|
||
|
FSE detailed API
|
||
|
******************************************/
|
||
|
/*
|
||
|
size_t FSE_compress(void* dst, size_t maxDstSize, const void* src, size_t srcSize) does the following:
|
||
|
1. count symbol occurrence from source[] into table count[]
|
||
|
2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
|
||
|
3. save normalized counters to memory buffer using writeHeader()
|
||
|
4. build encoding table 'CTable' from normalized counters
|
||
|
5. encode the data stream using encoding table
|
||
|
|
||
|
size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize) performs:
|
||
|
1. read normalized counters with readHeader()
|
||
|
2. build decoding table 'DTable' from normalized counters
|
||
|
3. decode the data stream using decoding table
|
||
|
|
||
|
The following API allows triggering specific sub-functions.
|
||
|
*/
|
||
|
|
||
|
/* *** COMPRESSION *** */
|
||
|
|
||
|
size_t FSE_count(unsigned* count, const unsigned char* src, size_t srcSize, unsigned* maxSymbolValuePtr);
|
||
|
|
||
|
unsigned FSE_optimalTableLog(unsigned tableLog, size_t srcSize, unsigned maxSymbolValue);
|
||
|
size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t total, unsigned maxSymbolValue);
|
||
|
|
||
|
size_t FSE_headerBound(unsigned maxSymbolValue, unsigned tableLog);
|
||
|
size_t FSE_writeHeader (void* headerBuffer, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
|
||
|
|
||
|
void* FSE_createCTable (unsigned tableLog, unsigned maxSymbolValue);
|
||
|
void FSE_freeCTable (void* CTable);
|
||
|
size_t FSE_buildCTable(void* CTable, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
|
||
|
|
||
|
size_t FSE_compress_usingCTable (void* dst, size_t dstSize, const void* src, size_t srcSize, const void* CTable);
|
||
|
|
||
|
/*
|
||
|
The first step is to count all symbols. FSE_count() provides one quick way to do this job.
|
||
|
Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have '*maxSymbolValuePtr+1' cells.
|
||
|
'src' is a table of unsigned char of size 'srcSize'. All values within 'src' MUST be <= *maxSymbolValuePtr
|
||
|
*maxSymbolValuePtr will be updated, with its real value (necessarily <= original value)
|
||
|
FSE_count() will return the number of occurrence of the most frequent symbol.
|
||
|
If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
|
||
|
|
||
|
The next step is to normalize the frequencies.
|
||
|
FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
|
||
|
It also guarantees a minimum of 1 to any symbol whose frequency is >= 1.
|
||
|
You can use input 'tableLog'==0 to mean "use default tableLog value".
|
||
|
If you are unsure of which tableLog value to use, you can optionally call FSE_optimalTableLog(),
|
||
|
which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
|
||
|
|
||
|
The result of FSE_normalizeCount() will be saved into a table,
|
||
|
called 'normalizedCounter', which is a table of signed short.
|
||
|
'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
|
||
|
The return value is tableLog if everything proceeded as expected.
|
||
|
It is 0 if there is a single symbol within distribution.
|
||
|
If there is an error(typically, invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
|
||
|
|
||
|
'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeHeader().
|
||
|
'header' buffer must be already allocated.
|
||
|
For guaranteed success, buffer size must be at least FSE_headerBound().
|
||
|
The result of the function is the number of bytes written into 'header'.
|
||
|
If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()) (for example, buffer size too small).
|
||
|
|
||
|
'normalizedCounter' can then be used to create the compression tables 'CTable'.
|
||
|
The space required by 'CTable' must be already allocated. Its size is provided by FSE_sizeof_CTable().
|
||
|
'CTable' must be aligned on 4-byte boundaries.
|
||
|
You can then use FSE_buildCTable() to fill 'CTable'.
|
||
|
In both cases, if there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
|
||
|
|
||
|
'CTable' can then be used to compress 'source', with FSE_compress_usingCTable().
|
||
|
Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
|
||
|
The function returns the size of compressed data (without header), or an error code (which can be tested using FSE_isError()) if it failed.
|
||
|
*/
|
||
|
|
||
|
|
||
|
/* *** DECOMPRESSION *** */
|
||
|
|
||
|
size_t FSE_readHeader (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* headerBuffer, size_t hbSize);
|
||
|
|
||
|
void* FSE_createDTable(unsigned tableLog);
|
||
|
void FSE_freeDTable(void* DTable);
|
||
|
size_t FSE_buildDTable (void* DTable, const short* const normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
|
||
|
|
||
|
size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const void* DTable, size_t fastMode);
|
||
|
|
||
|
/*
|
||
|
If the block is RLE compressed, or uncompressed, use the relevant specific functions.
|
||
|
|
||
|
The first step is to obtain the normalized frequencies of symbols.
|
||
|
This can be performed by reading a header with FSE_readHeader().
|
||
|
'normalizedCounter' must be already allocated, and have at least '*maxSymbolValuePtr+1' cells of short.
|
||
|
In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
|
||
|
or size the table to handle worst case situations (typically 256).
|
||
|
FSE_readHeader will provide 'tableLog' and 'maxSymbolValue' stored into the header.
|
||
|
The result of FSE_readHeader() is the number of bytes read from 'header'.
|
||
|
The following values have special meaning :
|
||
|
return 2 : there is only a single symbol value. The value is provided into the second byte of header.
|
||
|
return 1 : data is uncompressed
|
||
|
If there is an error, the function will return an error code, which can be tested using FSE_isError().
|
||
|
|
||
|
The next step is to create the decompression tables 'DTable' from 'normalizedCounter'.
|
||
|
This is performed by the function FSE_buildDTable().
|
||
|
The space required by 'DTable' must be already allocated and properly aligned.
|
||
|
One can create a DTable using FSE_createDTable().
|
||
|
The function will return 1 if DTable is compatible with fastMode, 0 otherwise.
|
||
|
If there is an error, the function will return an error code, which can be tested using FSE_isError().
|
||
|
|
||
|
'DTable' can then be used to decompress 'compressed', with FSE_decompress_usingDTable().
|
||
|
Only trigger fastMode if it was authorized by result of FSE_buildDTable(), otherwise decompression will fail.
|
||
|
cSrcSize must be correct, otherwise decompression will fail.
|
||
|
FSE_decompress_usingDTable() result will tell how many bytes were regenerated.
|
||
|
If there is an error, the function will return an error code, which can be tested using FSE_isError().
|
||
|
*/
|
||
|
|
||
|
|
||
|
/******************************************
|
||
|
* FSE streaming compression API
|
||
|
******************************************/
|
||
|
/* Bit-stream tracking structure for the streaming compression API.
   Set up by FSE_initCStream(), fed by FSE_encodeByte() / FSE_addBits(),
   written out by FSE_flushBits(), and finalized by FSE_closeCStream(). */
typedef struct
{
    size_t bitContainer;   /* local bit register (size_t wide); holds bits until they are flushed */
    int    bitPos;         /* presumably the number of bits currently held in bitContainer - confirm in fse.c */
    char*  startPtr;       /* presumably the start of the destination buffer given to FSE_initCStream() - confirm */
    char*  ptr;            /* presumably the current write position inside that buffer - confirm */
} FSE_CStream_t;
|
||
|
|
||
|
/* Compression state tracking structure for the streaming compression API.
   Initialized from a CTable by FSE_initCState(), advanced by FSE_encodeByte(),
   and written to the bit stream by FSE_flushCState(). */
typedef struct
{
    ptrdiff_t   value;        /* presumably the current FSE state value - confirm in fse.c */
    const void* stateTable;   /* presumably points into the CTable given to FSE_initCState() - confirm */
    const void* symbolTT;     /* presumably the per-symbol transformation table of that CTable - confirm */
    unsigned    stateLog;     /* presumably the tableLog of that CTable - confirm */
} FSE_CState_t;
|
||
|
|
||
|
void FSE_initCStream(FSE_CStream_t* bitC, void* dstBuffer);
|
||
|
void FSE_initCState(FSE_CState_t* CStatePtr, const void* CTable);
|
||
|
|
||
|
void FSE_encodeByte(FSE_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned char symbol);
|
||
|
void FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits);
|
||
|
void FSE_flushBits(FSE_CStream_t* bitC);
|
||
|
|
||
|
void FSE_flushCState(FSE_CStream_t* bitC, const FSE_CState_t* CStatePtr);
|
||
|
size_t FSE_closeCStream(FSE_CStream_t* bitC);
|
||
|
|
||
|
/*
|
||
|
These functions are inner components of FSE_compress_usingCTable().
|
||
|
They allow creation of custom streams, mixing multiple tables and bit sources.
|
||
|
|
||
|
A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
|
||
|
So the first symbol you will encode is the last you will decode, like a lifo stack.
|
||
|
|
||
|
You will need a few variables to track your CStream. They are :
|
||
|
|
||
|
void* CTable; // Provided by FSE_buildCTable()
|
||
|
FSE_CStream_t bitC; // bitStream tracking structure
|
||
|
FSE_CState_t state; // State tracking structure
|
||
|
|
||
|
|
||
|
The first thing to do is to init the bitStream, and the state.
|
||
|
FSE_initCStream(&bitC, dstBuffer);
|
||
|
FSE_initCState(&state, CTable);
|
||
|
|
||
|
You can then encode your input data, byte after byte.
|
||
|
FSE_encodeByte() outputs a maximum of 'tableLog' bits at a time.
|
||
|
Remember decoding will be done in reverse direction.
|
||
|
FSE_encodeByte(&bitC, &state, symbol);
|
||
|
|
||
|
At any time, you can add any bit sequence.
|
||
|
Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
|
||
|
FSE_addBits(&bitC, bitField, nbBits);
|
||
|
|
||
|
The above methods don't commit data to memory, they just store it into local register, for speed.
|
||
|
Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
|
||
|
Writing data to memory is a manual operation, performed by the flushBits function.
|
||
|
FSE_flushBits(&bitC);
|
||
|
|
||
|
Your last FSE encoding operation shall be to flush your last state value(s).
|
||
|
FSE_flushCState(&bitC, &state);
|
||
|
|
||
|
You must then close the bitStream if you opened it with FSE_initCStream().
|
||
|
It's possible to embed some user-info into the header, as an optionalId [0-31].
|
||
|
The function returns the size in bytes of CStream.
|
||
|
If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
|
||
|
size_t size = FSE_closeCStream(&bitC);
|
||
|
*/
|
||
|
|
||
|
|
||
|
/******************************************
|
||
|
* FSE streaming decompression API
|
||
|
******************************************/
|
||
|
//typedef unsigned int bitD_t;
typedef size_t bitD_t;   /* type of the decoder's local bit register */

/* Stream context for the streaming decompression API.
   Set up by FSE_initDStream(), consumed by FSE_decodeSymbol() / FSE_readBits(),
   refilled from memory by FSE_reloadDStream(), and checked with FSE_endOfDStream(). */
typedef struct
{
    bitD_t      bitContainer;   /* local bit register that the decoding operations read from */
    unsigned    bitsConsumed;   /* presumably the number of bits already read from bitContainer - confirm in fse.c */
    const char* ptr;            /* presumably the current read position in the source buffer - confirm */
    const char* start;          /* presumably the start of the source buffer given to FSE_initDStream() - confirm */
} FSE_DStream_t;
|
||
|
|
||
|
typedef struct
|
||
|
{
|
||
|
bitD_t state;
|
||
|
const void* table;
|
||
|
} FSE_DState_t;
|
||
|
|
||
|
|
||
|
size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
|
||
|
void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const void* DTable);
|
||
|
|
||
|
unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD);
|
||
|
bitD_t FSE_readBits(FSE_DStream_t* bitD, unsigned nbBits);
|
||
|
unsigned int FSE_reloadDStream(FSE_DStream_t* bitD);
|
||
|
|
||
|
unsigned FSE_endOfDStream(const FSE_DStream_t* bitD);
|
||
|
unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
|
||
|
|
||
|
/*
|
||
|
Let's now decompose FSE_decompress_usingDTable() into its unitary elements.
|
||
|
You will decode FSE-encoded symbols from the bitStream,
|
||
|
and also any other bitFields you put in, **in reverse order**.
|
||
|
|
||
|
You will need a few variables to track your bitStream. They are :
|
||
|
|
||
|
FSE_DStream_t DStream; // Stream context
|
||
|
FSE_DState_t DState; // State context. Multiple ones are possible
|
||
|
const void* DTable; // Decoding table, provided by FSE_buildDTable()
|
||
|
U32 tableLog; // Provided by FSE_readHeader()
|
||
|
|
||
|
The first thing to do is to init the bitStream.
|
||
|
errorCode = FSE_initDStream(&DStream, srcBuffer, srcSize);
|
||
|
|
||
|
You should then retrieve your initial state(s) (multiple ones are possible) :
|
||
|
FSE_initDState(&DState, &DStream, DTable);
|
||
|
|
||
|
You can then decode your data, symbol after symbol.
|
||
|
For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
|
||
|
Keep in mind that symbols are decoded in reverse order, like a lifo stack (last in, first out).
|
||
|
unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
|
||
|
|
||
|
You can retrieve any bitfield you may have stored into the bitStream (in reverse order)
|
||
|
Note : maximum allowed nbBits is 25
|
||
|
unsigned int bitField = FSE_readBits(&DStream, nbBits);
|
||
|
|
||
|
All above operations only read from the local register (whose size is that of bitD_t, i.e. size_t).
|
||
|
Reading data from memory is manually performed by the reload method.
|
||
|
endSignal = FSE_reloadDStream(&DStream);
|
||
|
|
||
|
FSE_reloadDStream() result tells if there is still some more data to read from DStream.
|
||
|
0 : there is still some data left into the DStream.
|
||
|
1 : DStream reached end of buffer, but is not yet fully extracted. It will not load data from memory any more.
|
||
|
2 : DStream reached its exact end, corresponding in general to decompression completed.
|
||
|
3 : DStream went too far. Decompression result is corrupted.
|
||
|
|
||
|
When reaching end of buffer(1), progress slowly if you decode multiple symbols per loop,
|
||
|
to properly detect the exact end of stream.
|
||
|
After each decoded symbol, check if DStream is fully consumed using this simple test :
|
||
|
FSE_reloadDStream(&DStream) >= 2
|
||
|
|
||
|
When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
|
||
|
Checking if DStream has reached its end is performed by :
|
||
|
FSE_endOfDStream(&DStream);
|
||
|
Check also the states. There might be some entropy left there, still able to decode some high probability symbol.
|
||
|
FSE_endOfDState(&DState);
|
||
|
*/
|
||
|
|
||
|
|
||
|
#if defined (__cplusplus)
|
||
|
}
|
||
|
#endif
|