2018-06-15 05:33:39 +00:00
# include <Columns/IColumn.h>
# include <Columns/ColumnVector.h>
# include <Columns/ColumnString.h>
2018-06-15 07:09:54 +00:00
# include <Columns/ColumnArray.h>
# include <Columns/ColumnNullable.h>
# include <Columns/ColumnFixedString.h>
2018-06-15 05:33:39 +00:00
# include <DataTypes/IDataType.h>
# include <DataTypes/DataTypesNumber.h>
# include <DataTypes/DataTypeDate.h>
# include <DataTypes/DataTypeDateTime.h>
# include <DataTypes/DataTypeString.h>
2018-06-15 07:09:54 +00:00
# include <DataTypes/DataTypeFixedString.h>
# include <DataTypes/DataTypeArray.h>
# include <DataTypes/DataTypeNullable.h>
2018-06-15 05:33:39 +00:00
# include <DataTypes/DataTypeFactory.h>
2020-09-02 11:20:20 +00:00
# include <DataTypes/DataTypeUUID.h>
2018-06-15 05:33:39 +00:00
# include <Interpreters/Context.h>
2021-10-16 14:03:50 +00:00
# include <QueryPipeline/Pipe.h>
2021-07-22 16:05:52 +00:00
# include <Processors/LimitTransform.h>
2018-06-15 05:33:39 +00:00
# include <Common/SipHash.h>
# include <Common/UTF8Helpers.h>
2018-06-15 09:07:42 +00:00
# include <Common/StringUtils/StringUtils.h>
2018-06-15 05:33:39 +00:00
# include <Common/HashTable/HashMap.h>
# include <Common/typeid_cast.h>
2019-08-21 02:28:04 +00:00
# include <Common/assert_cast.h>
2020-10-29 04:27:12 +00:00
# include <Formats/registerFormats.h>
2022-08-11 11:32:32 +00:00
# include <Formats/ReadSchemaUtils.h>
2021-07-22 16:05:52 +00:00
# include <Processors/Formats/IInputFormat.h>
2021-10-16 14:03:50 +00:00
# include <QueryPipeline/QueryPipelineBuilder.h>
2021-07-22 16:05:52 +00:00
# include <Processors/Executors/PullingPipelineExecutor.h>
2021-10-11 16:11:50 +00:00
# include <Processors/Executors/PushingPipelineExecutor.h>
2018-06-15 05:33:39 +00:00
# include <Core/Block.h>
2021-12-21 13:41:53 +00:00
# include <Common/DateLUT.h>
2018-06-15 05:33:39 +00:00
# include <IO/ReadBufferFromFileDescriptor.h>
# include <IO/WriteBufferFromFileDescriptor.h>
2022-07-25 01:27:10 +00:00
# include <IO/ReadBufferFromFile.h>
# include <IO/WriteBufferFromFile.h>
# include <Compression/CompressedReadBuffer.h>
# include <Compression/CompressedWriteBuffer.h>
2022-09-01 13:23:34 +00:00
# include <Interpreters/parseColumnsListForTableFunction.h>
2018-06-15 05:33:39 +00:00
# include <memory>
2018-06-15 08:53:06 +00:00
# include <cmath>
2018-06-16 03:35:23 +00:00
# include <unistd.h>
2018-06-15 05:33:39 +00:00
# include <boost/program_options/options_description.hpp>
# include <boost/program_options.hpp>
# include <boost/algorithm/string.hpp>
2018-06-15 09:40:40 +00:00
# include <boost/container/flat_map.hpp>
2019-08-23 15:47:27 +00:00
# include <Common/TerminalSize.h>
2022-07-31 14:34:05 +00:00
# include <bit>
2018-06-15 05:33:39 +00:00
2018-06-16 01:44:25 +00:00
2020-01-11 09:50:41 +00:00
/// Human-readable description of the tool: which properties of the source data it is designed to retain,
/// how the transformation is determined by the seed, and which privacy caveats apply.
/// This is a runtime string; its text must not be edited as if it were a comment.
static const char * documentation = R " (
2018-06-16 01:44:25 +00:00
Simple tool for table data obfuscation .
It reads input table and produces output table , that retain some properties of input , but contains different data .
It allows to publish almost real production data for usage in benchmarks .
It is designed to retain the following properties of data :
- cardinalities of values ( number of distinct values ) for every column and for every tuple of columns ;
- conditional cardinalities : number of distinct values of one column under condition on value of another column ;
- probability distributions of absolute value of integers ; sign of signed integers ; exponent and sign for floats ;
- probability distributions of length of strings ;
- probability of zero values of numbers ; empty strings and arrays , NULLs ;
- data compression ratio when compressed with LZ77 and entropy family of codecs ;
2019-01-22 19:56:53 +00:00
- continuity ( magnitude of difference ) of time values across table ; continuity of floating point values .
2018-06-16 01:44:25 +00:00
- date component of DateTime values ;
- UTF - 8 validity of string values ;
- string values continue to look somewhat natural .
Most of the properties above are viable for performance testing :
- reading data , filtering , aggregation and sorting will work at almost the same speed
2018-07-24 18:46:23 +00:00
as on original data due to saved cardinalities , magnitudes , compression ratios , etc .
2018-06-16 01:44:25 +00:00
It works in deterministic fashion : you define a seed value and transform is totally determined by input data and by seed .
Some transforms are one to one and could be reversed , so you need to have large enough seed and keep it in secret .
It use some cryptographic primitives to transform data , but from the cryptographic point of view ,
2018-07-24 18:46:23 +00:00
it doesn ' t do anything properly and you should never consider the result as secure , unless you have other reasons for it .
2018-06-16 01:44:25 +00:00
It may retain some data you don ' t want to publish .
It always leave numbers 0 , 1 , - 1 as is . Also it leaves dates , lengths of arrays and null flags exactly as in source data .
For example , you have a column IsMobile in your table with values 0 and 1. In transformed data , it will have the same value .
So , the user will be able to count exact ratio of mobile traffic .
Another example , suppose you have some private data in your table , like user email and you don ' t want to publish any single email address .
If your table is large enough and contain multiple different emails and there is no email that have very high frequency than all others ,
2018-07-24 18:46:23 +00:00
it will perfectly anonymize all data . But if you have small amount of different values in a column , it can possibly reproduce some of them .
2018-06-16 01:44:25 +00:00
And you should take care and look at exact algorithm , how this tool works , and probably fine tune some of it command line parameters .
This tool works fine only with reasonable amount of data ( at least 1000 s of rows ) .
) " ;
2018-06-15 05:33:39 +00:00
namespace DB
{
/// Error codes used by this tool. They are only declared here (extern);
/// the definitions have to come from another translation unit.
namespace ErrorCodes
{
    extern const int CANNOT_SEEK_THROUGH_FILE;
    extern const int INCORRECT_NUMBER_OF_COLUMNS;
    extern const int LOGICAL_ERROR;
    extern const int NOT_IMPLEMENTED;
    extern const int TYPE_MISMATCH;
    extern const int UNKNOWN_FORMAT_VERSION;
}
/// Model is used to transform columns with source data to columns
/// with similar by structure and by probability distributions but anonymized data.
class IModel
{
public :
/// Call train iteratively for each block to train a model.
2021-02-07 14:32:19 +00:00
virtual void train ( const IColumn & column ) = 0 ;
2018-06-15 05:33:39 +00:00
/// Call finalize one time after training before generating.
2021-02-07 14:32:19 +00:00
virtual void finalize ( ) = 0 ;
2018-06-15 05:33:39 +00:00
/// Call generate: pass source data column to obtain a column with anonymized data as a result.
2021-02-07 14:32:19 +00:00
virtual ColumnPtr generate ( const IColumn & column ) = 0 ;
2018-06-15 05:33:39 +00:00
2020-02-17 07:52:13 +00:00
/// Deterministically change seed to some other value. This can be used to generate more values than were in source.
2021-02-07 14:32:19 +00:00
virtual void updateSeed ( ) = 0 ;
2020-02-17 07:52:13 +00:00
2022-07-25 01:27:10 +00:00
/// Save into file. Binary, platform-dependent, version-dependent serialization.
virtual void serialize ( WriteBuffer & out ) const = 0 ;
/// Read from file
virtual void deserialize ( ReadBuffer & in ) = 0 ;
2020-03-09 03:38:43 +00:00
virtual ~ IModel ( ) = default ;
2018-06-15 05:33:39 +00:00
} ;
using ModelPtr = std : : unique_ptr < IModel > ;
/// Feed all arguments, in order, into a single SipHash state and return its 64-bit digest.
template <typename... Ts>
UInt64 hash(Ts... xs)
{
    SipHash state;
    (state.update(xs), ...);
    return state.get64();
}
2019-12-15 06:34:43 +00:00
/// Keep the num_bits least significant bits of x and clear all the others.
/// A width of 64 or more keeps x unchanged: shifting a 64-bit value by its full width
/// is undefined behaviour, so that case is handled explicitly instead of being shifted.
static UInt64 maskBits(UInt64 x, size_t num_bits)
{
    if (num_bits >= 64)
        return x;

    return x & ((1ULL << num_bits) - 1);
}
/// Apply Feistel network round to least significant num_bits part of x.
2019-12-15 06:34:43 +00:00
static UInt64 feistelRound ( UInt64 x , size_t num_bits , UInt64 seed , size_t round )
2018-06-15 05:33:39 +00:00
{
2018-06-15 06:14:39 +00:00
size_t num_bits_left_half = num_bits / 2 ;
size_t num_bits_right_half = num_bits - num_bits_left_half ;
2018-06-15 05:33:39 +00:00
UInt64 left_half = maskBits ( x > > num_bits_right_half , num_bits_left_half ) ;
2018-06-15 06:14:39 +00:00
UInt64 right_half = maskBits ( x , num_bits_right_half ) ;
2018-06-15 05:33:39 +00:00
UInt64 new_left_half = right_half ;
2018-06-15 06:14:39 +00:00
UInt64 new_right_half = left_half ^ maskBits ( hash ( right_half , seed , round ) , num_bits_left_half ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 06:14:39 +00:00
return ( new_left_half < < num_bits_left_half ) ^ new_right_half ;
2018-06-15 05:33:39 +00:00
}
/// Apply Feistel network with num_rounds to least significant num_bits part of x.
2019-12-15 06:34:43 +00:00
static UInt64 feistelNetwork ( UInt64 x , size_t num_bits , UInt64 seed , size_t num_rounds = 4 )
2018-06-15 05:33:39 +00:00
{
2018-06-15 06:14:39 +00:00
UInt64 bits = maskBits ( x , num_bits ) ;
2018-06-15 05:33:39 +00:00
for ( size_t i = 0 ; i < num_rounds ; + + i )
2018-06-15 06:14:39 +00:00
bits = feistelRound ( bits , num_bits , seed , i ) ;
2019-01-09 15:44:20 +00:00
return ( x & ~ ( ( 1ULL < < num_bits ) - 1 ) ) ^ bits ;
2018-06-15 05:33:39 +00:00
}
/// Pseudorandom permutation within set of numbers with the same log2(x).
2019-12-15 06:34:43 +00:00
static UInt64 transform ( UInt64 x , UInt64 seed )
2018-06-15 05:33:39 +00:00
{
/// Keep 0 and 1 as is.
if ( x = = 0 | | x = = 1 )
return x ;
/// Pseudorandom permutation of two elements.
if ( x = = 2 | | x = = 3 )
return x ^ ( seed & 1 ) ;
2022-07-31 14:34:05 +00:00
size_t num_leading_zeros = std : : countl_zero ( x ) ;
2018-06-15 06:14:39 +00:00
2018-06-15 05:33:39 +00:00
return feistelNetwork ( x , 64 - num_leading_zeros - 1 , seed ) ;
}
class UnsignedIntegerModel : public IModel
{
private :
2020-02-17 07:52:13 +00:00
UInt64 seed ;
2018-06-15 05:33:39 +00:00
public :
2020-03-18 03:27:32 +00:00
explicit UnsignedIntegerModel ( UInt64 seed_ ) : seed ( seed_ ) { }
2018-06-15 05:33:39 +00:00
void train ( const IColumn & ) override { }
void finalize ( ) override { }
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & ) const override { }
void deserialize ( ReadBuffer & ) override { }
2018-06-15 05:33:39 +00:00
ColumnPtr generate ( const IColumn & column ) override
{
MutableColumnPtr res = column . cloneEmpty ( ) ;
size_t size = column . size ( ) ;
res - > reserve ( size ) ;
for ( size_t i = 0 ; i < size ; + + i )
res - > insert ( transform ( column . getUInt ( i ) , seed ) ) ;
return res ;
}
2020-02-17 07:52:13 +00:00
void updateSeed ( ) override
{
seed = hash ( seed ) ;
}
2018-06-15 05:33:39 +00:00
} ;
/// Keep sign and apply pseudorandom permutation after converting to unsigned as above.
2019-12-15 06:34:43 +00:00
static Int64 transformSigned ( Int64 x , UInt64 seed )
2018-06-15 05:33:39 +00:00
{
if ( x > = 0 )
return transform ( x , seed ) ;
else
return - transform ( - x , seed ) ; /// It works Ok even for minimum signed number.
}
class SignedIntegerModel : public IModel
{
private :
2020-02-17 07:52:13 +00:00
UInt64 seed ;
2018-06-15 05:33:39 +00:00
public :
2020-03-18 03:27:32 +00:00
explicit SignedIntegerModel ( UInt64 seed_ ) : seed ( seed_ ) { }
2018-06-15 05:33:39 +00:00
void train ( const IColumn & ) override { }
void finalize ( ) override { }
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & ) const override { }
void deserialize ( ReadBuffer & ) override { }
2018-06-15 05:33:39 +00:00
ColumnPtr generate ( const IColumn & column ) override
{
MutableColumnPtr res = column . cloneEmpty ( ) ;
size_t size = column . size ( ) ;
res - > reserve ( size ) ;
for ( size_t i = 0 ; i < size ; + + i )
res - > insert ( transformSigned ( column . getInt ( i ) , seed ) ) ;
return res ;
}
2020-02-17 07:52:13 +00:00
void updateSeed ( ) override
{
seed = hash ( seed ) ;
}
2018-06-15 05:33:39 +00:00
} ;
/// Pseudorandom permutation of mantissa.
template < typename Float >
Float transformFloatMantissa ( Float x , UInt64 seed )
{
using UInt = std : : conditional_t < std : : is_same_v < Float , Float32 > , UInt32 , UInt64 > ;
constexpr size_t mantissa_num_bits = std : : is_same_v < Float , Float32 > ? 23 : 52 ;
2022-11-04 15:52:48 +00:00
UInt x_uint = std : : bit_cast < UInt > ( x ) ;
2022-10-07 10:46:45 +00:00
x_uint = static_cast < UInt > ( feistelNetwork ( x_uint , mantissa_num_bits , seed ) ) ;
2022-11-04 15:52:48 +00:00
return std : : bit_cast < Float > ( x_uint ) ;
2018-06-15 05:33:39 +00:00
}
/// Transform difference from previous number by applying pseudorandom permutation to mantissa part of it.
2019-01-22 19:56:53 +00:00
/// It allows to retain some continuity property of source data.
2018-06-15 05:33:39 +00:00
template < typename Float >
class FloatModel : public IModel
{
private :
2020-02-17 07:52:13 +00:00
UInt64 seed ;
2018-06-15 05:33:39 +00:00
Float src_prev_value = 0 ;
Float res_prev_value = 0 ;
public :
2020-03-18 03:27:32 +00:00
explicit FloatModel ( UInt64 seed_ ) : seed ( seed_ ) { }
2018-06-15 05:33:39 +00:00
void train ( const IColumn & ) override { }
void finalize ( ) override { }
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & ) const override { }
void deserialize ( ReadBuffer & ) override { }
2018-06-15 05:33:39 +00:00
ColumnPtr generate ( const IColumn & column ) override
{
2019-08-21 02:28:04 +00:00
const auto & src_data = assert_cast < const ColumnVector < Float > & > ( column ) . getData ( ) ;
2018-06-15 05:33:39 +00:00
size_t size = src_data . size ( ) ;
auto res_column = ColumnVector < Float > : : create ( size ) ;
2019-08-21 02:28:04 +00:00
auto & res_data = assert_cast < ColumnVector < Float > & > ( * res_column ) . getData ( ) ;
2018-06-15 05:33:39 +00:00
for ( size_t i = 0 ; i < size ; + + i )
{
res_data [ i ] = res_prev_value + transformFloatMantissa ( src_data [ i ] - src_prev_value , seed ) ;
src_prev_value = src_data [ i ] ;
res_prev_value = res_data [ i ] ;
}
return res_column ;
}
2020-02-17 07:52:13 +00:00
void updateSeed ( ) override
{
seed = hash ( seed ) ;
}
2018-06-15 05:33:39 +00:00
} ;
/// Leave all data as is. For example, it is used for columns of type Date.
class IdentityModel : public IModel
{
public :
void train ( const IColumn & ) override { }
void finalize ( ) override { }
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & ) const override { }
void deserialize ( ReadBuffer & ) override { }
2018-06-15 05:33:39 +00:00
ColumnPtr generate ( const IColumn & column ) override
{
return column . cloneResized ( column . size ( ) ) ;
}
2020-02-17 07:52:13 +00:00
void updateSeed ( ) override
{
}
2018-06-15 05:33:39 +00:00
} ;
2018-06-15 09:07:42 +00:00
/// Pseudorandom function, but keep word characters as word characters.
2019-12-15 06:34:43 +00:00
static void transformFixedString ( const UInt8 * src , UInt8 * dst , size_t size , UInt64 seed )
2018-06-15 07:09:54 +00:00
{
{
SipHash hash ;
hash . update ( seed ) ;
hash . update ( reinterpret_cast < const char * > ( src ) , size ) ;
seed = hash . get64 ( ) ;
}
UInt8 * pos = dst ;
UInt8 * end = dst + size ;
size_t i = 0 ;
while ( pos < end )
{
SipHash hash ;
hash . update ( seed ) ;
hash . update ( i ) ;
2023-06-22 03:15:12 +00:00
const auto checksum = getSipHash128AsArray ( hash ) ;
2018-06-15 10:42:13 +00:00
if ( size > = 16 )
{
2023-06-22 03:15:12 +00:00
auto * hash_dst = std : : min ( pos , end - 16 ) ;
memcpy ( hash_dst , checksum . data ( ) , checksum . size ( ) ) ;
2018-06-15 10:42:13 +00:00
}
else
2023-06-22 03:15:12 +00:00
memcpy ( dst , checksum . data ( ) , end - dst ) ;
2018-06-15 07:09:54 +00:00
pos + = 16 ;
+ + i ;
}
2018-06-15 09:07:42 +00:00
for ( size_t j = 0 ; j < size ; + + j )
{
if ( isWordCharASCII ( src [ j ] ) )
{
static constexpr char word_chars [ ] = " _01234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ " ;
2018-06-16 02:04:46 +00:00
dst [ j ] = word_chars [ dst [ j ] % ( sizeof ( word_chars ) - 1 ) ] ;
2018-06-15 09:07:42 +00:00
}
}
2018-06-15 07:09:54 +00:00
}
2021-05-08 20:21:27 +00:00
static void transformUUID ( const UUID & src_uuid , UUID & dst_uuid , UInt64 seed )
2020-09-02 11:14:49 +00:00
{
2023-08-16 20:12:31 +00:00
auto src_copy = src_uuid ;
2023-08-18 18:25:08 +00:00
transformEndianness < std : : endian : : little , std : : endian : : native > ( src_copy ) ;
2023-08-16 20:12:31 +00:00
const UInt128 & src = src_copy . toUnderType ( ) ;
2021-05-08 20:21:27 +00:00
UInt128 & dst = dst_uuid . toUnderType ( ) ;
2020-09-02 11:14:49 +00:00
SipHash hash ;
hash . update ( seed ) ;
2021-05-08 20:21:27 +00:00
hash . update ( reinterpret_cast < const char * > ( & src ) , sizeof ( UUID ) ) ;
2020-09-02 11:14:49 +00:00
/// Saving version and variant from an old UUID
2023-06-22 15:41:30 +00:00
dst = hash . get128 ( ) ;
2021-05-08 20:21:27 +00:00
2023-08-16 20:12:31 +00:00
const UInt64 trace [ 2 ] = { 0x000000000000f000ull , 0xe000000000000000ull } ;
2023-08-18 20:50:31 +00:00
UUIDHelpers : : getLowBytes ( dst_uuid ) = ( UUIDHelpers : : getLowBytes ( dst_uuid ) & ( 0xffffffffffffffffull - trace [ 1 ] ) ) | ( UUIDHelpers : : getLowBytes ( src_uuid ) & trace [ 1 ] ) ;
UUIDHelpers : : getHighBytes ( dst_uuid ) = ( UUIDHelpers : : getHighBytes ( dst_uuid ) & ( 0xffffffffffffffffull - trace [ 0 ] ) ) | ( UUIDHelpers : : getHighBytes ( src_uuid ) & trace [ 0 ] ) ;
2020-09-02 11:14:49 +00:00
}
2018-06-15 07:09:54 +00:00
class FixedStringModel : public IModel
{
private :
2020-02-17 07:52:13 +00:00
UInt64 seed ;
2018-06-15 07:09:54 +00:00
public :
2020-03-18 03:27:32 +00:00
explicit FixedStringModel ( UInt64 seed_ ) : seed ( seed_ ) { }
2018-06-15 07:09:54 +00:00
void train ( const IColumn & ) override { }
void finalize ( ) override { }
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & ) const override { }
void deserialize ( ReadBuffer & ) override { }
2018-06-15 07:09:54 +00:00
ColumnPtr generate ( const IColumn & column ) override
{
2019-08-21 02:28:04 +00:00
const ColumnFixedString & column_fixed_string = assert_cast < const ColumnFixedString & > ( column ) ;
2018-06-15 07:09:54 +00:00
const size_t string_size = column_fixed_string . getN ( ) ;
const auto & src_data = column_fixed_string . getChars ( ) ;
size_t size = column_fixed_string . size ( ) ;
auto res_column = ColumnFixedString : : create ( string_size ) ;
auto & res_data = res_column - > getChars ( ) ;
res_data . resize ( src_data . size ( ) ) ;
for ( size_t i = 0 ; i < size ; + + i )
transformFixedString ( & src_data [ i * string_size ] , & res_data [ i * string_size ] , string_size , seed ) ;
return res_column ;
}
2020-02-17 07:52:13 +00:00
void updateSeed ( ) override
{
seed = hash ( seed ) ;
}
2018-06-15 07:09:54 +00:00
} ;
2020-09-02 11:14:49 +00:00
class UUIDModel : public IModel
{
private :
UInt64 seed ;
public :
explicit UUIDModel ( UInt64 seed_ ) : seed ( seed_ ) { }
void train ( const IColumn & ) override { }
void finalize ( ) override { }
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & ) const override { }
void deserialize ( ReadBuffer & ) override { }
2020-09-02 11:14:49 +00:00
ColumnPtr generate ( const IColumn & column ) override
{
2021-05-08 20:21:27 +00:00
const ColumnUUID & src_column = assert_cast < const ColumnUUID & > ( column ) ;
2020-09-05 21:35:24 +00:00
const auto & src_data = src_column . getData ( ) ;
2020-09-02 11:14:49 +00:00
2021-05-08 20:21:27 +00:00
auto res_column = ColumnUUID : : create ( ) ;
2020-09-05 21:35:24 +00:00
auto & res_data = res_column - > getData ( ) ;
2020-09-02 11:14:49 +00:00
res_data . resize ( src_data . size ( ) ) ;
2020-09-05 21:35:24 +00:00
for ( size_t i = 0 ; i < src_column . size ( ) ; + + i )
transformUUID ( src_data [ i ] , res_data [ i ] , seed ) ;
2020-09-02 11:14:49 +00:00
return res_column ;
}
void updateSeed ( ) override
{
seed = hash ( seed ) ;
}
} ;
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
/// Leave date part as is and apply pseudorandom permutation to time difference with previous value within the same log2 class.
class DateTimeModel : public IModel
{
private :
2020-02-17 07:52:13 +00:00
UInt64 seed ;
2018-06-15 05:33:39 +00:00
UInt32 src_prev_value = 0 ;
UInt32 res_prev_value = 0 ;
const DateLUTImpl & date_lut ;
public :
2023-04-12 10:47:05 +00:00
explicit DateTimeModel ( UInt64 seed_ ) : seed ( seed_ ) , date_lut ( DateLUT : : serverTimezoneInstance ( ) ) { }
2018-06-15 05:33:39 +00:00
void train ( const IColumn & ) override { }
void finalize ( ) override { }
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & ) const override { }
void deserialize ( ReadBuffer & ) override { }
2018-06-15 05:33:39 +00:00
ColumnPtr generate ( const IColumn & column ) override
{
2019-08-21 02:28:04 +00:00
const auto & src_data = assert_cast < const ColumnVector < UInt32 > & > ( column ) . getData ( ) ;
2018-06-15 05:33:39 +00:00
size_t size = src_data . size ( ) ;
auto res_column = ColumnVector < UInt32 > : : create ( size ) ;
2019-08-21 02:28:04 +00:00
auto & res_data = assert_cast < ColumnVector < UInt32 > & > ( * res_column ) . getData ( ) ;
2018-06-15 05:33:39 +00:00
for ( size_t i = 0 ; i < size ; + + i )
{
2018-06-15 09:21:53 +00:00
UInt32 src_datetime = src_data [ i ] ;
2022-10-07 10:46:45 +00:00
UInt32 src_date = static_cast < UInt32 > ( date_lut . toDate ( src_datetime ) ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 09:21:53 +00:00
Int32 src_diff = src_datetime - src_prev_value ;
2022-10-07 10:46:45 +00:00
Int32 res_diff = static_cast < Int32 > ( transformSigned ( src_diff , seed ) ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 09:21:53 +00:00
UInt32 new_datetime = res_prev_value + res_diff ;
2022-10-07 10:46:45 +00:00
UInt32 new_time = new_datetime - static_cast < UInt32 > ( date_lut . toDate ( new_datetime ) ) ;
2018-06-15 09:21:53 +00:00
res_data [ i ] = src_date + new_time ;
2018-06-15 05:33:39 +00:00
2018-06-15 09:21:53 +00:00
src_prev_value = src_datetime ;
2018-06-15 05:33:39 +00:00
res_prev_value = res_data [ i ] ;
}
return res_column ;
}
2020-02-17 07:52:13 +00:00
void updateSeed ( ) override
{
seed = hash ( seed ) ;
}
2018-06-15 05:33:39 +00:00
} ;
2018-06-15 23:54:33 +00:00
struct MarkovModelParameters
{
size_t order ;
size_t frequency_cutoff ;
2018-06-16 00:50:52 +00:00
size_t num_buckets_cutoff ;
size_t frequency_add ;
double frequency_desaturate ;
2018-06-16 01:44:25 +00:00
size_t determinator_sliding_window_size ;
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & out ) const
{
writeBinary ( order , out ) ;
writeBinary ( frequency_cutoff , out ) ;
writeBinary ( num_buckets_cutoff , out ) ;
writeBinary ( frequency_add , out ) ;
writeBinary ( frequency_desaturate , out ) ;
writeBinary ( determinator_sliding_window_size , out ) ;
}
void deserialize ( ReadBuffer & in )
{
readBinary ( order , in ) ;
readBinary ( frequency_cutoff , in ) ;
readBinary ( num_buckets_cutoff , in ) ;
readBinary ( frequency_add , in ) ;
readBinary ( frequency_desaturate , in ) ;
readBinary ( determinator_sliding_window_size , in ) ;
}
2018-06-15 23:54:33 +00:00
} ;
2018-06-16 00:50:52 +00:00
/** Actually it's not an order-N model, but a mix of order-{0..N} models.
  *
  * We calculate code point counts for every context of 0..N previous code points.
  * Then we discard the contexts that have too few statistics.
  *
  * When generating data, we try to find statistics for a context of maximum order.
  * If none is found, we fall back to a context of smaller order, down to 0.
  */
2018-06-15 05:33:39 +00:00
class MarkovModel
{
private :
using CodePoint = UInt32 ;
using NGramHash = UInt32 ;
struct Histogram
{
2018-06-15 08:53:06 +00:00
UInt64 total = 0 ; /// Not including count_end.
UInt64 count_end = 0 ;
2018-06-15 09:40:40 +00:00
using Buckets = boost : : container : : flat_map < CodePoint , UInt64 > ;
Buckets buckets ;
2018-06-15 05:33:39 +00:00
void add ( CodePoint code )
{
+ + total ;
2018-06-15 09:40:40 +00:00
+ + buckets [ code ] ;
2018-06-15 05:33:39 +00:00
}
2018-06-15 08:53:06 +00:00
void addEnd ( )
2018-06-15 05:33:39 +00:00
{
2018-06-15 08:53:06 +00:00
+ + count_end ;
}
CodePoint sample ( UInt64 random , double end_multiplier ) const
{
2022-04-18 08:18:31 +00:00
UInt64 range = total + static_cast < UInt64 > ( count_end * end_multiplier ) ;
2018-06-15 08:53:06 +00:00
if ( range = = 0 )
return END ;
random % = range ;
2018-06-15 05:33:39 +00:00
UInt64 sum = 0 ;
2018-06-15 09:40:40 +00:00
for ( const auto & elem : buckets )
2018-06-15 05:33:39 +00:00
{
2018-06-15 09:40:40 +00:00
sum + = elem . second ;
2018-06-15 05:33:39 +00:00
if ( sum > random )
2018-06-15 09:40:40 +00:00
return elem . first ;
2018-06-15 05:33:39 +00:00
}
2018-06-15 08:53:06 +00:00
return END ;
2018-06-15 05:33:39 +00:00
}
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & out ) const
{
writeBinary ( total , out ) ;
writeBinary ( count_end , out ) ;
size_t size = buckets . size ( ) ;
writeBinary ( size , out ) ;
for ( const auto & elem : buckets )
{
writeBinary ( elem . first , out ) ;
writeBinary ( elem . second , out ) ;
}
}
void deserialize ( ReadBuffer & in )
{
readBinary ( total , in ) ;
readBinary ( count_end , in ) ;
size_t size = 0 ;
readBinary ( size , in ) ;
buckets . reserve ( size ) ;
for ( size_t i = 0 ; i < size ; + + i )
{
Buckets : : value_type elem ;
readBinary ( elem . first , in ) ;
readBinary ( elem . second , in ) ;
buckets . emplace ( std : : move ( elem ) ) ;
}
}
2018-06-15 05:33:39 +00:00
} ;
using Table = HashMap < NGramHash , Histogram , TrivialHash > ;
Table table ;
2018-06-15 23:54:33 +00:00
MarkovModelParameters params ;
2018-06-15 05:33:39 +00:00
std : : vector < CodePoint > code_points ;
2018-06-15 23:54:33 +00:00
/// Special code point to form context before beginning of string.
2018-06-15 08:53:06 +00:00
static constexpr CodePoint BEGIN = - 1 ;
2018-06-15 23:54:33 +00:00
/// Special code point to indicate end of string.
2018-06-15 08:53:06 +00:00
static constexpr CodePoint END = - 2 ;
2018-06-15 05:33:39 +00:00
2020-03-18 00:57:00 +00:00
/// CRC32 hash of the raw bytes of the code points in [begin, end). Used as the key of a context in the table.
static NGramHash hashContext(const CodePoint * begin, const CodePoint * end)
{
    const size_t num_bytes = (end - begin) * sizeof(CodePoint);
    const StringRef context_bytes(reinterpret_cast<const char *>(begin), num_bytes);
    return CRC32Hash()(context_bytes);
}
2018-10-13 14:33:43 +00:00
/// By the way, we don't have to use actual Unicode numbers. We use just arbitrary bijective mapping.
2020-03-18 00:57:00 +00:00
static CodePoint readCodePoint ( const char * & pos , const char * end )
2018-06-15 05:33:39 +00:00
{
size_t length = UTF8 : : seqLength ( * pos ) ;
2018-06-29 23:27:56 +00:00
2018-06-15 05:33:39 +00:00
if ( pos + length > end )
length = end - pos ;
2018-06-29 23:27:56 +00:00
if ( length > sizeof ( CodePoint ) )
length = sizeof ( CodePoint ) ;
2018-06-15 05:33:39 +00:00
CodePoint res = 0 ;
memcpy ( & res , pos , length ) ;
2018-06-15 06:14:39 +00:00
pos + = length ;
2018-06-15 05:33:39 +00:00
return res ;
}
2020-03-18 00:57:00 +00:00
/// Write a code point produced by readCodePoint to pos and move pos past it.
/// The number of bytes written (1..4) is chosen by the highest non-zero byte of the value;
/// they are the first bytes of the in-memory representation of `code`.
/// Returns false, writing nothing, if the bytes don't fit before `end`.
static bool writeCodePoint(CodePoint code, char *& pos, const char * end)
{
    size_t length = 1;
    if (code >= 0x01000000)
        length = 4;
    else if (code >= 0x00010000)
        length = 3;
    else if (code >= 0x00000100)
        length = 2;

    const bool fits = pos + length <= end;
    if (!fits)
        return false;

    memcpy(pos, &code, length);
    pos += length;
    return true;
}
public :
2020-03-18 03:27:32 +00:00
/// Store the parameters and pre-fill the context buffer with `order` BEGIN markers.
/// `params` is declared before `code_points`, so it is already initialized when its order is read here.
explicit MarkovModel(MarkovModelParameters params_)
    : params(std::move(params_))
    , code_points(params.order, BEGIN)
{
}
2018-06-15 05:33:39 +00:00
2022-07-25 01:27:10 +00:00
/// Write the parameters, then the number of contexts, then every (context hash, histogram) pair.
void serialize(WriteBuffer & out) const
{
    params.serialize(out);

    const size_t num_contexts = table.size();
    writeBinary(num_contexts, out);

    for (const auto & entry : table)
    {
        writeBinary(entry.getKey(), out);
        entry.getMapped().serialize(out);
    }
}
/// Read the parameters and the table of histograms written by serialize().
/// Histograms are read into the entries of `table` with the stored keys.
void deserialize(ReadBuffer & in)
{
    params.deserialize(in);

    /// The context buffer was sized for the order given to the constructor, and the stored order may differ.
    /// Re-prime it with BEGIN markers: a later resize() to a larger order would pad the context
    /// with zero code points instead of BEGIN.
    code_points.assign(params.order, BEGIN);

    size_t size = 0;
    readBinary(size, in);
    table.reserve(size);

    for (size_t i = 0; i < size; ++i)
    {
        NGramHash key{};
        readBinary(key, in);

        Histogram & histogram = table[key];
        histogram.deserialize(in);
    }
}
2018-06-15 05:33:39 +00:00
void consume ( const char * data , size_t size )
{
2018-06-15 23:54:33 +00:00
/// First 'order' number of code points are pre-filled with BEGIN.
code_points . resize ( params . order ) ;
2018-06-15 05:33:39 +00:00
const char * pos = data ;
const char * end = data + size ;
2018-06-15 08:53:06 +00:00
while ( true )
2018-06-15 05:33:39 +00:00
{
2018-06-15 09:21:53 +00:00
const bool inside = pos < end ;
2018-06-15 08:53:06 +00:00
2018-06-15 09:21:53 +00:00
CodePoint next_code_point { } ;
2018-06-15 08:53:06 +00:00
if ( inside )
next_code_point = readCodePoint ( pos , end ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 23:54:33 +00:00
for ( size_t context_size = 0 ; context_size < params . order ; + + context_size )
2018-06-15 08:53:06 +00:00
{
NGramHash context_hash = hashContext ( code_points . data ( ) + code_points . size ( ) - context_size , code_points . data ( ) + code_points . size ( ) ) ;
if ( inside )
table [ context_hash ] . add ( next_code_point ) ;
else /// if (context_size != 0 || order == 0) /// Don't allow to break string without context (except order-0 model).
table [ context_hash ] . addEnd ( ) ;
}
if ( inside )
code_points . push_back ( next_code_point ) ;
else
break ;
2018-06-15 05:33:39 +00:00
}
}
void finalize ( )
{
2018-06-16 00:50:52 +00:00
if ( params . num_buckets_cutoff )
{
for ( auto & elem : table )
{
2019-10-29 15:16:51 +00:00
Histogram & histogram = elem . getMapped ( ) ;
2018-06-16 00:50:52 +00:00
if ( histogram . buckets . size ( ) < params . num_buckets_cutoff )
{
histogram . buckets . clear ( ) ;
histogram . total = 0 ;
}
}
}
2018-06-15 08:53:06 +00:00
2018-06-16 00:50:52 +00:00
if ( params . frequency_cutoff )
2018-06-15 08:53:06 +00:00
{
2018-06-16 00:50:52 +00:00
for ( auto & elem : table )
{
2019-10-29 15:16:51 +00:00
Histogram & histogram = elem . getMapped ( ) ;
2018-06-16 01:44:25 +00:00
if ( ! histogram . total )
continue ;
2018-06-15 08:53:06 +00:00
2018-06-16 00:50:52 +00:00
if ( histogram . total + histogram . count_end < params . frequency_cutoff )
{
histogram . buckets . clear ( ) ;
histogram . total = 0 ;
}
else
{
Histogram : : Buckets new_buckets ;
UInt64 erased_count = 0 ;
for ( const auto & bucket : histogram . buckets )
{
if ( bucket . second > = params . frequency_cutoff )
new_buckets . emplace ( bucket ) ;
else
erased_count + = bucket . second ;
}
histogram . buckets . swap ( new_buckets ) ;
histogram . total - = erased_count ;
}
}
}
if ( params . frequency_add )
{
for ( auto & elem : table )
2018-06-15 08:53:06 +00:00
{
2019-10-29 15:16:51 +00:00
Histogram & histogram = elem . getMapped ( ) ;
2018-06-16 01:44:25 +00:00
if ( ! histogram . total )
continue ;
2018-06-16 00:50:52 +00:00
for ( auto & bucket : histogram . buckets )
bucket . second + = params . frequency_add ;
histogram . count_end + = params . frequency_add ;
histogram . total + = params . frequency_add * histogram . buckets . size ( ) ;
2018-06-15 08:53:06 +00:00
}
2018-06-16 00:50:52 +00:00
}
2022-09-10 02:07:51 +00:00
if ( params . frequency_desaturate > 0.0 )
2018-06-16 00:50:52 +00:00
{
for ( auto & elem : table )
2018-06-15 08:53:06 +00:00
{
2019-10-29 15:16:51 +00:00
Histogram & histogram = elem . getMapped ( ) ;
2018-06-16 01:44:25 +00:00
if ( ! histogram . total )
continue ;
2018-06-16 00:50:52 +00:00
2022-04-18 08:18:31 +00:00
double average = static_cast < double > ( histogram . total ) / histogram . buckets . size ( ) ;
2018-06-15 08:53:06 +00:00
2018-06-16 00:50:52 +00:00
UInt64 new_total = 0 ;
for ( auto & bucket : histogram . buckets )
2018-06-15 09:40:40 +00:00
{
2022-09-10 02:07:51 +00:00
bucket . second = static_cast < UInt64 > ( bucket . second * ( 1.0 - params . frequency_desaturate ) + average * params . frequency_desaturate ) ;
2018-06-16 00:50:52 +00:00
new_total + = bucket . second ;
2018-06-15 09:40:40 +00:00
}
2018-06-15 08:53:06 +00:00
2018-06-16 00:50:52 +00:00
histogram . total = new_total ;
2018-06-15 08:53:06 +00:00
}
}
2018-06-15 05:33:39 +00:00
}
2018-06-15 08:53:06 +00:00
size_t generate ( char * data , size_t desired_size , size_t buffer_size ,
2018-06-15 05:33:39 +00:00
UInt64 seed , const char * determinator_data , size_t determinator_size )
{
2018-06-15 23:54:33 +00:00
code_points . resize ( params . order ) ;
2018-06-15 05:33:39 +00:00
char * pos = data ;
2018-06-15 08:53:06 +00:00
char * end = data + buffer_size ;
2018-06-15 05:33:39 +00:00
while ( pos < end )
{
2019-08-20 09:58:44 +00:00
Table : : LookupResult it ;
2018-06-15 05:33:39 +00:00
2018-06-15 23:54:33 +00:00
size_t context_size = params . order ;
2018-06-15 05:33:39 +00:00
while ( true )
{
it = table . find ( hashContext ( code_points . data ( ) + code_points . size ( ) - context_size , code_points . data ( ) + code_points . size ( ) ) ) ;
2019-10-29 15:16:51 +00:00
if ( it & & it - > getMapped ( ) . total + it - > getMapped ( ) . count_end ! = 0 )
2018-06-15 05:33:39 +00:00
break ;
if ( context_size = = 0 )
break ;
- - context_size ;
}
2019-08-20 09:58:44 +00:00
if ( ! it )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Logical error in markov model " ) ;
2018-06-15 05:33:39 +00:00
size_t offset_from_begin_of_string = pos - data ;
2018-06-16 01:44:25 +00:00
size_t determinator_sliding_window_size = params . determinator_sliding_window_size ;
if ( determinator_sliding_window_size > determinator_size )
determinator_sliding_window_size = determinator_size ;
2018-06-15 05:33:39 +00:00
size_t determinator_sliding_window_overflow = offset_from_begin_of_string + determinator_sliding_window_size > determinator_size
? offset_from_begin_of_string + determinator_sliding_window_size - determinator_size : 0 ;
const char * determinator_sliding_window_begin = determinator_data + offset_from_begin_of_string - determinator_sliding_window_overflow ;
SipHash hash ;
hash . update ( seed ) ;
hash . update ( determinator_sliding_window_begin , determinator_sliding_window_size ) ;
hash . update ( determinator_sliding_window_overflow ) ;
UInt64 determinator = hash . get64 ( ) ;
2018-06-15 08:53:06 +00:00
/// If string is greater than desired_size, increase probability of end.
double end_probability_multiplier = 0 ;
Int64 num_bytes_after_desired_size = ( pos - data ) - desired_size ;
2018-06-30 19:03:26 +00:00
if ( num_bytes_after_desired_size > 0 )
2018-06-15 08:53:06 +00:00
end_probability_multiplier = std : : pow ( 1.25 , num_bytes_after_desired_size ) ;
2019-10-29 15:16:51 +00:00
CodePoint code = it - > getMapped ( ) . sample ( determinator , end_probability_multiplier ) ;
2018-06-15 08:53:06 +00:00
if ( code = = END )
break ;
2018-06-15 05:33:39 +00:00
2018-06-30 19:03:26 +00:00
if ( num_bytes_after_desired_size > 0 )
{
/// Heuristic: break at ASCII non-alnum code point.
/// This allows to be close to desired_size but not break natural looking words.
if ( code < 128 & & ! isAlphaNumericASCII ( code ) )
break ;
}
2018-06-15 05:33:39 +00:00
if ( ! writeCodePoint ( code , pos , end ) )
break ;
2018-06-15 08:53:06 +00:00
code_points . push_back ( code ) ;
2018-06-15 05:33:39 +00:00
}
return pos - data ;
}
} ;
/// Generate length of strings as above.
/// To generate content of strings, use
/// order-N Markov model on Unicode code points,
/// and to generate next code point use deterministic RNG
2018-06-16 02:06:51 +00:00
/// determined by hash of a sliding window (default 8 bytes) of source string.
2018-06-15 05:33:39 +00:00
/// This is intended to generate locally-similar strings from locally-similar sources.
class StringModel : public IModel
{
private :
UInt64 seed ;
2018-06-15 08:53:06 +00:00
MarkovModel markov_model ;
2018-06-15 05:33:39 +00:00
public :
2022-06-07 11:53:10 +00:00
StringModel ( UInt64 seed_ , MarkovModelParameters params_ ) : seed ( seed_ ) , markov_model ( std : : move ( params_ ) ) { }
2018-06-15 05:33:39 +00:00
void train ( const IColumn & column ) override
{
2019-08-21 02:28:04 +00:00
const ColumnString & column_string = assert_cast < const ColumnString & > ( column ) ;
2018-06-15 05:33:39 +00:00
size_t size = column_string . size ( ) ;
for ( size_t i = 0 ; i < size ; + + i )
{
StringRef string = column_string . getDataAt ( i ) ;
markov_model . consume ( string . data , string . size ) ;
}
}
void finalize ( ) override
{
2018-06-15 08:53:06 +00:00
markov_model . finalize ( ) ;
2018-06-15 05:33:39 +00:00
}
ColumnPtr generate ( const IColumn & column ) override
{
2019-08-21 02:28:04 +00:00
const ColumnString & column_string = assert_cast < const ColumnString & > ( column ) ;
2018-06-15 05:33:39 +00:00
size_t size = column_string . size ( ) ;
auto res_column = ColumnString : : create ( ) ;
res_column - > reserve ( size ) ;
std : : string new_string ;
for ( size_t i = 0 ; i < size ; + + i )
{
StringRef src_string = column_string . getDataAt ( i ) ;
size_t desired_string_size = transform ( src_string . size , seed ) ;
2018-06-15 08:53:06 +00:00
new_string . resize ( desired_string_size * 2 ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 07:09:54 +00:00
size_t actual_size = 0 ;
if ( desired_string_size ! = 0 )
2018-06-15 08:53:06 +00:00
actual_size = markov_model . generate ( new_string . data ( ) , desired_string_size , new_string . size ( ) , seed , src_string . data , src_string . size ) ;
2018-06-15 05:33:39 +00:00
res_column - > insertData ( new_string . data ( ) , actual_size ) ;
}
return res_column ;
}
2020-02-17 07:52:13 +00:00
void updateSeed ( ) override
{
seed = hash ( seed ) ;
}
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & out ) const override
{
markov_model . serialize ( out ) ;
}
void deserialize ( ReadBuffer & in ) override
{
markov_model . deserialize ( in ) ;
}
2018-06-15 05:33:39 +00:00
} ;
2018-06-15 07:09:54 +00:00
class ArrayModel : public IModel
{
private :
ModelPtr nested_model ;
public :
2020-03-18 03:27:32 +00:00
explicit ArrayModel ( ModelPtr nested_model_ ) : nested_model ( std : : move ( nested_model_ ) ) { }
2018-06-15 07:09:54 +00:00
void train ( const IColumn & column ) override
{
2019-08-21 02:28:04 +00:00
const ColumnArray & column_array = assert_cast < const ColumnArray & > ( column ) ;
2018-06-15 07:09:54 +00:00
const IColumn & nested_column = column_array . getData ( ) ;
nested_model - > train ( nested_column ) ;
}
void finalize ( ) override
{
nested_model - > finalize ( ) ;
}
ColumnPtr generate ( const IColumn & column ) override
{
2019-08-21 02:28:04 +00:00
const ColumnArray & column_array = assert_cast < const ColumnArray & > ( column ) ;
2018-06-15 07:09:54 +00:00
const IColumn & nested_column = column_array . getData ( ) ;
ColumnPtr new_nested_column = nested_model - > generate ( nested_column ) ;
2022-02-25 19:04:48 +00:00
return ColumnArray : : create ( IColumn : : mutate ( std : : move ( new_nested_column ) ) , IColumn : : mutate ( column_array . getOffsetsPtr ( ) ) ) ;
2018-06-15 07:09:54 +00:00
}
2020-02-17 07:52:13 +00:00
void updateSeed ( ) override
{
nested_model - > updateSeed ( ) ;
}
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & out ) const override
{
nested_model - > serialize ( out ) ;
}
void deserialize ( ReadBuffer & in ) override
{
nested_model - > deserialize ( in ) ;
}
2018-06-15 07:09:54 +00:00
} ;
class NullableModel : public IModel
{
private :
ModelPtr nested_model ;
public :
2020-03-18 03:27:32 +00:00
explicit NullableModel ( ModelPtr nested_model_ ) : nested_model ( std : : move ( nested_model_ ) ) { }
2018-06-15 07:09:54 +00:00
void train ( const IColumn & column ) override
{
2019-08-21 02:28:04 +00:00
const ColumnNullable & column_nullable = assert_cast < const ColumnNullable & > ( column ) ;
2018-06-15 07:09:54 +00:00
const IColumn & nested_column = column_nullable . getNestedColumn ( ) ;
nested_model - > train ( nested_column ) ;
}
void finalize ( ) override
{
nested_model - > finalize ( ) ;
}
ColumnPtr generate ( const IColumn & column ) override
{
2019-08-21 02:28:04 +00:00
const ColumnNullable & column_nullable = assert_cast < const ColumnNullable & > ( column ) ;
2018-06-15 07:09:54 +00:00
const IColumn & nested_column = column_nullable . getNestedColumn ( ) ;
ColumnPtr new_nested_column = nested_model - > generate ( nested_column ) ;
2022-02-25 19:04:48 +00:00
return ColumnNullable : : create ( IColumn : : mutate ( std : : move ( new_nested_column ) ) , IColumn : : mutate ( column_nullable . getNullMapColumnPtr ( ) ) ) ;
2018-06-15 07:09:54 +00:00
}
2020-02-17 07:52:13 +00:00
void updateSeed ( ) override
{
nested_model - > updateSeed ( ) ;
}
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & out ) const override
{
nested_model - > serialize ( out ) ;
}
void deserialize ( ReadBuffer & in ) override
{
nested_model - > deserialize ( in ) ;
}
2018-06-15 07:09:54 +00:00
} ;
2018-06-15 05:33:39 +00:00
class ModelFactory
{
public :
2018-06-15 23:54:33 +00:00
ModelPtr get ( const IDataType & data_type , UInt64 seed , MarkovModelParameters markov_model_params ) const
2018-06-15 05:33:39 +00:00
{
2018-09-07 14:37:26 +00:00
if ( isInteger ( data_type ) )
2018-06-15 05:33:39 +00:00
{
2023-11-06 10:29:29 +00:00
if ( isUInt ( data_type ) )
2018-06-15 05:33:39 +00:00
return std : : make_unique < UnsignedIntegerModel > ( seed ) ;
else
return std : : make_unique < SignedIntegerModel > ( seed ) ;
}
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeFloat32 * > ( & data_type ) )
return std : : make_unique < FloatModel < Float32 > > ( seed ) ;
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeFloat64 * > ( & data_type ) )
return std : : make_unique < FloatModel < Float64 > > ( seed ) ;
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeDate * > ( & data_type ) )
return std : : make_unique < IdentityModel > ( ) ;
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeDateTime * > ( & data_type ) )
return std : : make_unique < DateTimeModel > ( seed ) ;
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeString * > ( & data_type ) )
2018-06-15 23:54:33 +00:00
return std : : make_unique < StringModel > ( seed , markov_model_params ) ;
2018-06-15 07:09:54 +00:00
if ( typeid_cast < const DataTypeFixedString * > ( & data_type ) )
return std : : make_unique < FixedStringModel > ( seed ) ;
2020-09-02 11:14:49 +00:00
if ( typeid_cast < const DataTypeUUID * > ( & data_type ) )
return std : : make_unique < UUIDModel > ( seed ) ;
2020-05-18 08:08:55 +00:00
if ( const auto * type = typeid_cast < const DataTypeArray * > ( & data_type ) )
2018-06-15 23:54:33 +00:00
return std : : make_unique < ArrayModel > ( get ( * type - > getNestedType ( ) , seed , markov_model_params ) ) ;
2018-06-15 07:09:54 +00:00
2020-05-18 08:08:55 +00:00
if ( const auto * type = typeid_cast < const DataTypeNullable * > ( & data_type ) )
2018-06-15 23:54:33 +00:00
return std : : make_unique < NullableModel > ( get ( * type - > getNestedType ( ) , seed , markov_model_params ) ) ;
2018-06-15 07:09:54 +00:00
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NOT_IMPLEMENTED , " Unsupported data type " ) ;
2018-06-15 05:33:39 +00:00
}
} ;
2018-06-15 23:54:33 +00:00
class Obfuscator
2018-06-15 05:33:39 +00:00
{
private :
std : : vector < ModelPtr > models ;
public :
2018-06-15 23:54:33 +00:00
Obfuscator ( const Block & header , UInt64 seed , MarkovModelParameters markov_model_params )
2018-06-15 05:33:39 +00:00
{
ModelFactory factory ;
size_t columns = header . columns ( ) ;
models . reserve ( columns ) ;
2019-05-02 22:09:38 +00:00
for ( const auto & elem : header )
models . emplace_back ( factory . get ( * elem . type , hash ( seed , elem . name ) , markov_model_params ) ) ;
2018-06-15 05:33:39 +00:00
}
void train ( const Columns & columns )
{
size_t size = columns . size ( ) ;
for ( size_t i = 0 ; i < size ; + + i )
models [ i ] - > train ( * columns [ i ] ) ;
}
void finalize ( )
{
for ( auto & model : models )
model - > finalize ( ) ;
}
Columns generate ( const Columns & columns )
{
size_t size = columns . size ( ) ;
Columns res ( size ) ;
for ( size_t i = 0 ; i < size ; + + i )
res [ i ] = models [ i ] - > generate ( * columns [ i ] ) ;
return res ;
}
2020-02-17 07:52:13 +00:00
void updateSeed ( )
{
for ( auto & model : models )
model - > updateSeed ( ) ;
}
2022-07-25 01:27:10 +00:00
void serialize ( WriteBuffer & out ) const
{
for ( const auto & model : models )
model - > serialize ( out ) ;
}
void deserialize ( ReadBuffer & in )
{
for ( auto & model : models )
model - > deserialize ( in ) ;
}
2018-06-15 05:33:39 +00:00
} ;
}
2024-03-11 12:54:34 +00:00
# pragma clang diagnostic ignored "-Wunused-function"
2024-03-11 15:34:02 +00:00
# pragma clang diagnostic ignored "-Wmissing-declarations"
2018-06-15 05:33:39 +00:00
2018-06-16 00:27:59 +00:00
int mainEntryClickHouseObfuscator ( int argc , char * * argv )
2018-06-15 06:14:39 +00:00
try
2018-06-15 05:33:39 +00:00
{
using namespace DB ;
namespace po = boost : : program_options ;
2020-10-29 05:18:42 +00:00
registerFormats ( ) ;
2019-08-23 15:47:27 +00:00
po : : options_description description = createOptionsDescription ( " Options " , getTerminalWidth ( ) ) ;
2018-06-15 05:33:39 +00:00
description . add_options ( )
( " help " , " produce help message " )
( " structure,S " , po : : value < std : : string > ( ) , " structure of the initial table (list of column and type names) " )
( " input-format " , po : : value < std : : string > ( ) , " input format of the initial table data " )
( " output-format " , po : : value < std : : string > ( ) , " default output format " )
2019-05-02 22:09:38 +00:00
( " seed " , po : : value < std : : string > ( ) , " seed (arbitrary string), must be random string with at least 10 bytes length; note that a seed for each column is derived from this seed and a column name: you can obfuscate data for different tables and as long as you use identical seed and identical column names, the data for corresponding non-text columns for different tables will be transformed in the same way, so the data for different tables can be JOINed after obfuscation " )
2022-07-25 01:32:01 +00:00
( " limit " , po : : value < UInt64 > ( ) , " if specified - stop after generating that number of rows; the limit can be also greater than the number of source dataset - in this case it will process the dataset in a loop more than one time, using different seeds on every iteration, generating result as large as needed " )
2018-06-16 00:27:59 +00:00
( " silent " , po : : value < bool > ( ) - > default_value ( false ) , " don't print information messages to stderr " )
2022-07-25 01:27:10 +00:00
( " save " , po : : value < std : : string > ( ) , " save the models after training to the specified file. You can use --limit 0 to skip the generation step. The file is using binary, platform-dependent, opaque serialization format. The model parameters are saved, while the seed is not. " )
( " load " , po : : value < std : : string > ( ) , " load the models instead of training from the specified file. The table structure must match the saved file. The seed should be specified separately, while other model parameters are loaded. " )
2018-06-15 08:53:06 +00:00
( " order " , po : : value < UInt64 > ( ) - > default_value ( 5 ) , " order of markov model to generate strings " )
2018-06-16 00:50:52 +00:00
( " frequency-cutoff " , po : : value < UInt64 > ( ) - > default_value ( 5 ) , " frequency cutoff for markov model: remove all buckets with count less than specified " )
2018-06-16 23:34:34 +00:00
( " num-buckets-cutoff " , po : : value < UInt64 > ( ) - > default_value ( 0 ) , " cutoff for number of different possible continuations for a context: remove all histograms with less than specified number of buckets " )
2018-06-16 00:50:52 +00:00
( " frequency-add " , po : : value < UInt64 > ( ) - > default_value ( 0 ) , " add a constant to every count to lower probability distribution skew " )
( " frequency-desaturate " , po : : value < double > ( ) - > default_value ( 0 ) , " 0..1 - move every frequency towards average to lower probability distribution skew " )
2018-06-16 01:44:25 +00:00
( " determinator-sliding-window-size " , po : : value < UInt64 > ( ) - > default_value ( 8 ) , " size of a sliding window in a source string - its hash is used as a seed for RNG in markov model " )
2018-06-15 05:33:39 +00:00
;
po : : parsed_options parsed = po : : command_line_parser ( argc , argv ) . options ( description ) . run ( ) ;
po : : variables_map options ;
po : : store ( parsed , options ) ;
2018-06-15 23:54:33 +00:00
if ( options . count ( " help " )
| | ! options . count ( " seed " )
| | ! options . count ( " input-format " )
| | ! options . count ( " output-format " ) )
2018-06-15 05:33:39 +00:00
{
2020-01-11 09:50:41 +00:00
std : : cout < < documentation < < " \n "
2023-04-20 22:54:34 +00:00
< < " \n Usage: " < < argv [ 0 ] < < " [options] < in > out \n "
2018-06-15 10:03:02 +00:00
< < " \n Input must be seekable file (it will be read twice). \n "
< < " \n " < < description < < " \n "
2023-04-20 22:54:34 +00:00
< < " \n Example: \n " < < argv [ 0 ] < < " --seed \" $(head -c16 /dev/urandom | base64) \" --input-format TSV --output-format TSV --structure 'CounterID UInt32, URLDomain String, URL String, SearchPhrase String, Title String' < stats.tsv \n " ;
2018-06-15 05:33:39 +00:00
return 0 ;
}
2022-07-25 01:27:10 +00:00
if ( options . count ( " save " ) & & options . count ( " load " ) )
{
std : : cerr < < " The options --save and --load cannot be used together. \n " ;
return 1 ;
}
2018-06-15 05:33:39 +00:00
UInt64 seed = sipHash64 ( options [ " seed " ] . as < std : : string > ( ) ) ;
2022-08-11 11:32:32 +00:00
std : : string structure ;
if ( options . count ( " structure " ) )
structure = options [ " structure " ] . as < std : : string > ( ) ;
2018-06-15 05:33:39 +00:00
std : : string input_format = options [ " input-format " ] . as < std : : string > ( ) ;
std : : string output_format = options [ " output-format " ] . as < std : : string > ( ) ;
2022-07-25 01:49:09 +00:00
std : : string load_from_file ;
std : : string save_into_file ;
if ( options . count ( " load " ) )
load_from_file = options [ " load " ] . as < std : : string > ( ) ;
else if ( options . count ( " save " ) )
save_into_file = options [ " save " ] . as < std : : string > ( ) ;
2022-07-25 01:27:10 +00:00
2020-02-17 07:52:13 +00:00
UInt64 limit = 0 ;
2018-06-16 00:27:59 +00:00
if ( options . count ( " limit " ) )
limit = options [ " limit " ] . as < UInt64 > ( ) ;
bool silent = options [ " silent " ] . as < bool > ( ) ;
2018-06-15 23:54:33 +00:00
MarkovModelParameters markov_model_params ;
markov_model_params . order = options [ " order " ] . as < UInt64 > ( ) ;
2018-06-16 00:50:52 +00:00
markov_model_params . frequency_cutoff = options [ " frequency-cutoff " ] . as < UInt64 > ( ) ;
markov_model_params . num_buckets_cutoff = options [ " num-buckets-cutoff " ] . as < UInt64 > ( ) ;
markov_model_params . frequency_add = options [ " frequency-add " ] . as < UInt64 > ( ) ;
markov_model_params . frequency_desaturate = options [ " frequency-desaturate " ] . as < double > ( ) ;
2018-06-16 01:44:25 +00:00
markov_model_params . determinator_sliding_window_size = options [ " determinator-sliding-window-size " ] . as < UInt64 > ( ) ;
2018-06-15 08:53:06 +00:00
2022-07-25 01:27:10 +00:00
/// Create the header block
2022-08-11 11:32:32 +00:00
SharedContextHolder shared_context = Context : : createShared ( ) ;
auto context = Context : : createGlobal ( shared_context . get ( ) ) ;
2022-09-01 13:23:34 +00:00
auto context_const = WithContext ( context ) . getContext ( ) ;
2022-08-11 11:32:32 +00:00
context - > makeGlobalContext ( ) ;
2018-06-15 05:33:39 +00:00
Block header ;
2022-09-01 13:23:34 +00:00
ColumnsDescription schema_columns ;
2022-08-11 11:32:32 +00:00
if ( structure . empty ( ) )
2018-06-15 05:33:39 +00:00
{
2023-08-22 12:55:00 +00:00
auto file = std : : make_unique < ReadBufferFromFileDescriptor > ( STDIN_FILENO ) ;
2022-08-11 11:32:32 +00:00
2023-08-22 12:55:00 +00:00
/// stdin must be seekable
auto res = lseek ( file - > getFD ( ) , 0 , SEEK_SET ) ;
if ( - 1 = = res )
2023-12-15 18:25:49 +00:00
throw ErrnoException ( ErrorCodes : : CANNOT_SEEK_THROUGH_FILE , " Input must be seekable file (it will be read twice) " ) ;
2022-08-11 11:32:32 +00:00
2023-08-22 12:55:00 +00:00
SingleReadBufferIterator read_buffer_iterator ( std : : move ( file ) ) ;
2024-01-22 22:55:50 +00:00
schema_columns = readSchemaFromFormat ( input_format , { } , read_buffer_iterator , context_const ) ;
2018-06-15 05:33:39 +00:00
}
2022-08-11 11:32:32 +00:00
else
{
2022-09-01 13:23:34 +00:00
schema_columns = parseColumnsListFromString ( structure , context_const ) ;
}
2022-08-11 11:32:32 +00:00
2022-09-01 13:23:34 +00:00
auto schema_columns_info = schema_columns . getOrdinary ( ) ;
2022-08-11 11:32:32 +00:00
2022-09-01 13:23:34 +00:00
for ( auto & info : schema_columns_info )
{
ColumnWithTypeAndName column ;
column . name = info . name ;
column . type = info . type ;
column . column = column . type - > createColumn ( ) ;
header . insert ( std : : move ( column ) ) ;
2022-08-11 11:32:32 +00:00
}
2018-06-15 05:33:39 +00:00
ReadBufferFromFileDescriptor file_in ( STDIN_FILENO ) ;
WriteBufferFromFileDescriptor file_out ( STDOUT_FILENO ) ;
2022-08-11 11:32:32 +00:00
if ( load_from_file . empty ( ) | | structure . empty ( ) )
2018-06-16 00:27:59 +00:00
{
/// stdin must be seekable
2018-06-16 03:35:23 +00:00
auto res = lseek ( file_in . getFD ( ) , 0 , SEEK_SET ) ;
if ( - 1 = = res )
2023-12-15 18:25:49 +00:00
throw ErrnoException ( ErrorCodes : : CANNOT_SEEK_THROUGH_FILE , " Input must be seekable file (it will be read twice) " ) ;
2018-06-16 00:27:59 +00:00
}
2018-06-15 23:54:33 +00:00
Obfuscator obfuscator ( header , seed , markov_model_params ) ;
2018-06-15 05:33:39 +00:00
2019-02-10 16:55:12 +00:00
UInt64 max_block_size = 8192 ;
2018-06-15 05:33:39 +00:00
/// Train step
2020-02-17 07:52:13 +00:00
UInt64 source_rows = 0 ;
2022-07-25 01:27:10 +00:00
bool rewind_needed = false ;
if ( load_from_file . empty ( ) )
2018-06-15 05:33:39 +00:00
{
2018-06-16 00:27:59 +00:00
if ( ! silent )
std : : cerr < < " Training models \n " ;
2021-10-11 16:11:50 +00:00
Pipe pipe ( context - > getInputFormat ( input_format , file_in , header , max_block_size ) ) ;
2018-06-15 05:33:39 +00:00
2021-09-15 19:35:48 +00:00
QueryPipeline pipeline ( std : : move ( pipe ) ) ;
2021-07-22 16:05:52 +00:00
PullingPipelineExecutor executor ( pipeline ) ;
Block block ;
while ( executor . pull ( block ) )
2018-06-15 09:44:14 +00:00
{
2018-06-15 23:54:33 +00:00
obfuscator . train ( block . getColumns ( ) ) ;
2020-02-17 07:52:13 +00:00
source_rows + = block . rows ( ) ;
2018-06-16 00:27:59 +00:00
if ( ! silent )
2020-02-17 07:52:13 +00:00
std : : cerr < < " Processed " < < source_rows < < " rows \n " ;
2018-06-15 09:44:14 +00:00
}
2022-07-25 01:27:10 +00:00
obfuscator . finalize ( ) ;
rewind_needed = true ;
}
else
{
2022-07-25 01:56:32 +00:00
if ( ! silent )
std : : cerr < < " Loading models \n " ;
2022-07-25 01:27:10 +00:00
ReadBufferFromFile model_file_in ( load_from_file ) ;
CompressedReadBuffer model_in ( model_file_in ) ;
UInt8 version = 0 ;
readBinary ( version , model_in ) ;
if ( version ! = 0 )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : UNKNOWN_FORMAT_VERSION , " Unknown version of the model file " ) ;
2022-07-25 01:27:10 +00:00
readBinary ( source_rows , model_in ) ;
Names data_types = header . getDataTypeNames ( ) ;
size_t header_size = 0 ;
readBinary ( header_size , model_in ) ;
if ( header_size ! = data_types . size ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : INCORRECT_NUMBER_OF_COLUMNS , " The saved model was created for different number of columns " ) ;
2022-07-25 01:27:10 +00:00
for ( size_t i = 0 ; i < header_size ; + + i )
{
String type ;
readBinary ( type , model_in ) ;
if ( type ! = data_types [ i ] )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : TYPE_MISMATCH , " The saved model was created for different types of columns " ) ;
2022-07-25 01:27:10 +00:00
}
obfuscator . deserialize ( model_in ) ;
2018-06-15 05:33:39 +00:00
}
2022-07-25 01:27:10 +00:00
if ( ! save_into_file . empty ( ) )
{
2022-07-25 01:56:32 +00:00
if ( ! silent )
std : : cerr < < " Saving models \n " ;
2022-07-25 01:27:10 +00:00
WriteBufferFromFile model_file_out ( save_into_file ) ;
CompressedWriteBuffer model_out ( model_file_out , CompressionCodecFactory : : instance ( ) . get ( " ZSTD " , 1 ) ) ;
/// You can change version on format change, it is currently set to zero.
UInt8 version = 0 ;
writeBinary ( version , model_out ) ;
writeBinary ( source_rows , model_out ) ;
/// We are writing the data types for validation, because the models serialization depends on the data types.
Names data_types = header . getDataTypeNames ( ) ;
size_t header_size = data_types . size ( ) ;
writeBinary ( header_size , model_out ) ;
for ( const auto & type : data_types )
writeBinary ( type , model_out ) ;
/// Write the models.
obfuscator . serialize ( model_out ) ;
model_out . finalize ( ) ;
model_file_out . finalize ( ) ;
}
2018-06-15 05:33:39 +00:00
2022-07-25 01:49:09 +00:00
if ( ! options . count ( " limit " ) )
2020-02-17 07:52:13 +00:00
limit = source_rows ;
2018-06-15 05:33:39 +00:00
/// Generation step
2020-02-17 07:52:13 +00:00
UInt64 processed_rows = 0 ;
while ( processed_rows < limit )
2018-06-15 05:33:39 +00:00
{
2018-06-16 00:27:59 +00:00
if ( ! silent )
std : : cerr < < " Generating data \n " ;
2022-07-25 01:27:10 +00:00
if ( rewind_needed )
file_in . rewind ( ) ;
2018-06-15 05:33:39 +00:00
2021-10-11 16:11:50 +00:00
Pipe pipe ( context - > getInputFormat ( input_format , file_in , header , max_block_size ) ) ;
2018-06-15 05:33:39 +00:00
2020-02-17 07:52:13 +00:00
if ( processed_rows + source_rows > limit )
2021-07-22 16:05:52 +00:00
{
pipe . addSimpleTransform ( [ & ] ( const Block & cur_header )
{
return std : : make_shared < LimitTransform > ( cur_header , limit - processed_rows , 0 ) ;
} ) ;
}
2021-10-11 16:11:50 +00:00
QueryPipeline in_pipeline ( std : : move ( pipe ) ) ;
2021-07-22 16:05:52 +00:00
2021-10-11 16:11:50 +00:00
auto output = context - > getOutputFormatParallelIfPossible ( output_format , file_out , header ) ;
QueryPipeline out_pipeline ( std : : move ( output ) ) ;
2021-07-22 16:05:52 +00:00
2021-10-11 16:11:50 +00:00
PullingPipelineExecutor in_executor ( in_pipeline ) ;
PushingPipelineExecutor out_executor ( out_pipeline ) ;
2018-06-16 00:27:59 +00:00
2021-07-22 16:05:52 +00:00
Block block ;
2021-10-11 16:11:50 +00:00
out_executor . start ( ) ;
while ( in_executor . pull ( block ) )
2018-06-15 05:33:39 +00:00
{
2018-06-15 23:54:33 +00:00
Columns columns = obfuscator . generate ( block . getColumns ( ) ) ;
2021-10-11 16:11:50 +00:00
out_executor . push ( header . cloneWithColumns ( columns ) ) ;
2018-06-15 09:44:14 +00:00
processed_rows + = block . rows ( ) ;
2018-06-16 00:27:59 +00:00
if ( ! silent )
std : : cerr < < " Processed " < < processed_rows < < " rows \n " ;
2018-06-15 05:33:39 +00:00
}
2021-10-11 16:11:50 +00:00
out_executor . finish ( ) ;
2020-02-17 07:52:13 +00:00
obfuscator . updateSeed ( ) ;
2022-07-25 01:27:10 +00:00
rewind_needed = true ;
2018-06-15 05:33:39 +00:00
}
return 0 ;
}
2018-06-15 06:14:39 +00:00
catch ( . . . )
{
std : : cerr < < DB : : getCurrentExceptionMessage ( true ) < < " \n " ;
auto code = DB : : getCurrentExceptionCode ( ) ;
return code ? code : 1 ;
}