2018-06-15 05:33:39 +00:00
# include <Columns/IColumn.h>
# include <Columns/ColumnVector.h>
# include <Columns/ColumnString.h>
2018-06-15 07:09:54 +00:00
# include <Columns/ColumnArray.h>
# include <Columns/ColumnNullable.h>
# include <Columns/ColumnFixedString.h>
2018-06-15 05:33:39 +00:00
# include <DataTypes/IDataType.h>
# include <DataTypes/DataTypesNumber.h>
# include <DataTypes/DataTypeDate.h>
# include <DataTypes/DataTypeDateTime.h>
# include <DataTypes/DataTypeString.h>
2018-06-15 07:09:54 +00:00
# include <DataTypes/DataTypeFixedString.h>
# include <DataTypes/DataTypeArray.h>
# include <DataTypes/DataTypeNullable.h>
2018-06-15 05:33:39 +00:00
# include <DataTypes/DataTypeFactory.h>
# include <Interpreters/Context.h>
# include <DataStreams/IBlockInputStream.h>
# include <DataStreams/IBlockOutputStream.h>
2018-06-16 00:27:59 +00:00
# include <DataStreams/LimitBlockInputStream.h>
2018-06-15 05:33:39 +00:00
# include <Common/SipHash.h>
# include <Common/UTF8Helpers.h>
2018-06-15 09:07:42 +00:00
# include <Common/StringUtils/StringUtils.h>
2018-06-15 05:33:39 +00:00
# include <Common/HashTable/HashMap.h>
# include <Common/typeid_cast.h>
# include <Core/Block.h>
# include <common/StringRef.h>
# include <common/DateLUT.h>
# include <IO/ReadBufferFromFileDescriptor.h>
# include <IO/WriteBufferFromFileDescriptor.h>
# include <ext/bit_cast.h>
# include <memory>
2018-06-15 08:53:06 +00:00
# include <cmath>
2018-06-19 18:09:09 +00:00
# include <optional>
2018-06-16 03:35:23 +00:00
# include <unistd.h>
2018-06-15 05:33:39 +00:00
# include <boost/program_options/options_description.hpp>
# include <boost/program_options.hpp>
# include <boost/algorithm/string.hpp>
2018-06-15 09:40:40 +00:00
# include <boost/container/flat_map.hpp>
2018-06-15 05:33:39 +00:00
2018-06-16 01:44:25 +00:00
static const char * documantation = R " (
Simple tool for table data obfuscation .
It reads input table and produces output table , that retain some properties of input , but contains different data .
It allows to publish almost real production data for usage in benchmarks .
It is designed to retain the following properties of data :
- cardinalities of values ( number of distinct values ) for every column and for every tuple of columns ;
- conditional cardinalities : number of distinct values of one column under condition on value of another column ;
- probability distributions of absolute value of integers ; sign of signed integers ; exponent and sign for floats ;
- probability distributions of length of strings ;
- probability of zero values of numbers ; empty strings and arrays , NULLs ;
- data compression ratio when compressed with LZ77 and entropy family of codecs ;
2019-01-22 19:56:53 +00:00
- continuity ( magnitude of difference ) of time values across table ; continuity of floating point values .
2018-06-16 01:44:25 +00:00
- date component of DateTime values ;
- UTF - 8 validity of string values ;
- string values continue to look somewhat natural .
Most of the properties above are viable for performance testing :
- reading data , filtering , aggregation and sorting will work at almost the same speed
2018-07-24 18:46:23 +00:00
as on original data due to saved cardinalities , magnitudes , compression ratios , etc .
2018-06-16 01:44:25 +00:00
It works in deterministic fashion : you define a seed value and transform is totally determined by input data and by seed .
Some transforms are one to one and could be reversed , so you need to have large enough seed and keep it in secret .
It use some cryptographic primitives to transform data , but from the cryptographic point of view ,
2018-07-24 18:46:23 +00:00
it doesn ' t do anything properly and you should never consider the result as secure , unless you have other reasons for it .
2018-06-16 01:44:25 +00:00
It may retain some data you don ' t want to publish .
It always leave numbers 0 , 1 , - 1 as is . Also it leaves dates , lengths of arrays and null flags exactly as in source data .
For example , you have a column IsMobile in your table with values 0 and 1. In transformed data , it will have the same value .
So , the user will be able to count exact ratio of mobile traffic .
Another example , suppose you have some private data in your table , like user email and you don ' t want to publish any single email address .
If your table is large enough and contain multiple different emails and there is no email that have very high frequency than all others ,
2018-07-24 18:46:23 +00:00
it will perfectly anonymize all data . But if you have small amount of different values in a column , it can possibly reproduce some of them .
2018-06-16 01:44:25 +00:00
And you should take care and look at exact algorithm , how this tool works , and probably fine tune some of it command line parameters .
This tool works fine only with reasonable amount of data ( at least 1000 s of rows ) .
) " ;
2018-06-15 05:33:39 +00:00
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR ;
2018-06-30 19:03:26 +00:00
extern const int NOT_IMPLEMENTED ;
2018-06-16 03:35:23 +00:00
extern const int CANNOT_SEEK_THROUGH_FILE ;
2018-06-15 05:33:39 +00:00
}
/// Model is used to transform columns with source data to columns
/// with similar by structure and by probability distributions but anonymized data.
class IModel
{
public :
/// Call train iteratively for each block to train a model.
virtual void train ( const IColumn & column ) ;
/// Call finalize one time after training before generating.
virtual void finalize ( ) ;
/// Call generate: pass source data column to obtain a column with anonymized data as a result.
virtual ColumnPtr generate ( const IColumn & column ) ;
virtual ~ IModel ( ) { }
} ;
using ModelPtr = std : : unique_ptr < IModel > ;
template < typename . . . Ts >
UInt64 hash ( Ts . . . xs )
{
SipHash hash ;
( hash . update ( xs ) , . . . ) ;
return hash . get64 ( ) ;
}
UInt64 maskBits ( UInt64 x , size_t num_bits )
{
2019-01-09 15:44:20 +00:00
return x & ( ( 1ULL < < num_bits ) - 1 ) ;
2018-06-15 05:33:39 +00:00
}
/// Apply Feistel network round to least significant num_bits part of x.
UInt64 feistelRound ( UInt64 x , size_t num_bits , UInt64 seed , size_t round )
{
2018-06-15 06:14:39 +00:00
size_t num_bits_left_half = num_bits / 2 ;
size_t num_bits_right_half = num_bits - num_bits_left_half ;
2018-06-15 05:33:39 +00:00
UInt64 left_half = maskBits ( x > > num_bits_right_half , num_bits_left_half ) ;
2018-06-15 06:14:39 +00:00
UInt64 right_half = maskBits ( x , num_bits_right_half ) ;
2018-06-15 05:33:39 +00:00
UInt64 new_left_half = right_half ;
2018-06-15 06:14:39 +00:00
UInt64 new_right_half = left_half ^ maskBits ( hash ( right_half , seed , round ) , num_bits_left_half ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 06:14:39 +00:00
return ( new_left_half < < num_bits_left_half ) ^ new_right_half ;
2018-06-15 05:33:39 +00:00
}
/// Apply Feistel network with num_rounds to least significant num_bits part of x.
UInt64 feistelNetwork ( UInt64 x , size_t num_bits , UInt64 seed , size_t num_rounds = 4 )
{
2018-06-15 06:14:39 +00:00
UInt64 bits = maskBits ( x , num_bits ) ;
2018-06-15 05:33:39 +00:00
for ( size_t i = 0 ; i < num_rounds ; + + i )
2018-06-15 06:14:39 +00:00
bits = feistelRound ( bits , num_bits , seed , i ) ;
2019-01-09 15:44:20 +00:00
return ( x & ~ ( ( 1ULL < < num_bits ) - 1 ) ) ^ bits ;
2018-06-15 05:33:39 +00:00
}
/// Pseudorandom permutation within set of numbers with the same log2(x).
UInt64 transform ( UInt64 x , UInt64 seed )
{
/// Keep 0 and 1 as is.
if ( x = = 0 | | x = = 1 )
return x ;
/// Pseudorandom permutation of two elements.
if ( x = = 2 | | x = = 3 )
return x ^ ( seed & 1 ) ;
size_t num_leading_zeros = __builtin_clzll ( x ) ;
2018-06-15 06:14:39 +00:00
2018-06-15 05:33:39 +00:00
return feistelNetwork ( x , 64 - num_leading_zeros - 1 , seed ) ;
}
class UnsignedIntegerModel : public IModel
{
private :
const UInt64 seed ;
public :
UnsignedIntegerModel ( UInt64 seed ) : seed ( seed ) { }
void train ( const IColumn & ) override { }
void finalize ( ) override { }
ColumnPtr generate ( const IColumn & column ) override
{
MutableColumnPtr res = column . cloneEmpty ( ) ;
size_t size = column . size ( ) ;
res - > reserve ( size ) ;
for ( size_t i = 0 ; i < size ; + + i )
res - > insert ( transform ( column . getUInt ( i ) , seed ) ) ;
return res ;
}
} ;
/// Keep sign and apply pseudorandom permutation after converting to unsigned as above.
Int64 transformSigned ( Int64 x , UInt64 seed )
{
if ( x > = 0 )
return transform ( x , seed ) ;
else
return - transform ( - x , seed ) ; /// It works Ok even for minimum signed number.
}
class SignedIntegerModel : public IModel
{
private :
const UInt64 seed ;
public :
SignedIntegerModel ( UInt64 seed ) : seed ( seed ) { }
void train ( const IColumn & ) override { }
void finalize ( ) override { }
ColumnPtr generate ( const IColumn & column ) override
{
MutableColumnPtr res = column . cloneEmpty ( ) ;
size_t size = column . size ( ) ;
res - > reserve ( size ) ;
for ( size_t i = 0 ; i < size ; + + i )
res - > insert ( transformSigned ( column . getInt ( i ) , seed ) ) ;
return res ;
}
} ;
/// Pseudorandom permutation of mantissa.
template < typename Float >
Float transformFloatMantissa ( Float x , UInt64 seed )
{
using UInt = std : : conditional_t < std : : is_same_v < Float , Float32 > , UInt32 , UInt64 > ;
constexpr size_t mantissa_num_bits = std : : is_same_v < Float , Float32 > ? 23 : 52 ;
UInt x_uint = ext : : bit_cast < UInt > ( x ) ;
x_uint = feistelNetwork ( x_uint , mantissa_num_bits , seed ) ;
return ext : : bit_cast < Float > ( x_uint ) ;
}
/// Transform difference from previous number by applying pseudorandom permutation to mantissa part of it.
2019-01-22 19:56:53 +00:00
/// It allows to retain some continuity property of source data.
2018-06-15 05:33:39 +00:00
template < typename Float >
class FloatModel : public IModel
{
private :
const UInt64 seed ;
Float src_prev_value = 0 ;
Float res_prev_value = 0 ;
public :
FloatModel ( UInt64 seed ) : seed ( seed ) { }
void train ( const IColumn & ) override { }
void finalize ( ) override { }
ColumnPtr generate ( const IColumn & column ) override
{
const auto & src_data = static_cast < const ColumnVector < Float > & > ( column ) . getData ( ) ;
size_t size = src_data . size ( ) ;
auto res_column = ColumnVector < Float > : : create ( size ) ;
auto & res_data = static_cast < ColumnVector < Float > & > ( * res_column ) . getData ( ) ;
for ( size_t i = 0 ; i < size ; + + i )
{
res_data [ i ] = res_prev_value + transformFloatMantissa ( src_data [ i ] - src_prev_value , seed ) ;
src_prev_value = src_data [ i ] ;
res_prev_value = res_data [ i ] ;
}
return res_column ;
}
} ;
/// Leave all data as is. For example, it is used for columns of type Date.
class IdentityModel : public IModel
{
public :
void train ( const IColumn & ) override { }
void finalize ( ) override { }
ColumnPtr generate ( const IColumn & column ) override
{
return column . cloneResized ( column . size ( ) ) ;
}
} ;
2018-06-15 09:07:42 +00:00
/// Pseudorandom function, but keep word characters as word characters.
2018-06-15 07:09:54 +00:00
void transformFixedString ( const UInt8 * src , UInt8 * dst , size_t size , UInt64 seed )
{
{
SipHash hash ;
hash . update ( seed ) ;
hash . update ( reinterpret_cast < const char * > ( src ) , size ) ;
seed = hash . get64 ( ) ;
}
UInt8 * pos = dst ;
UInt8 * end = dst + size ;
size_t i = 0 ;
while ( pos < end )
{
SipHash hash ;
hash . update ( seed ) ;
hash . update ( i ) ;
2018-06-15 10:42:13 +00:00
if ( size > = 16 )
{
2019-01-04 13:32:08 +00:00
char * hash_dst = reinterpret_cast < char * > ( std : : min ( pos , end - 16 ) ) ;
hash . get128 ( hash_dst ) ;
2018-06-15 10:42:13 +00:00
}
else
{
char value [ 16 ] ;
hash . get128 ( value ) ;
memcpy ( dst , value , end - dst ) ;
}
2018-06-15 07:09:54 +00:00
pos + = 16 ;
+ + i ;
}
2018-06-15 09:07:42 +00:00
for ( size_t j = 0 ; j < size ; + + j )
{
if ( isWordCharASCII ( src [ j ] ) )
{
static constexpr char word_chars [ ] = " _01234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ " ;
2018-06-16 02:04:46 +00:00
dst [ j ] = word_chars [ dst [ j ] % ( sizeof ( word_chars ) - 1 ) ] ;
2018-06-15 09:07:42 +00:00
}
}
2018-06-15 07:09:54 +00:00
}
class FixedStringModel : public IModel
{
private :
const UInt64 seed ;
public :
FixedStringModel ( UInt64 seed ) : seed ( seed ) { }
void train ( const IColumn & ) override { }
void finalize ( ) override { }
ColumnPtr generate ( const IColumn & column ) override
{
const ColumnFixedString & column_fixed_string = static_cast < const ColumnFixedString & > ( column ) ;
const size_t string_size = column_fixed_string . getN ( ) ;
const auto & src_data = column_fixed_string . getChars ( ) ;
size_t size = column_fixed_string . size ( ) ;
auto res_column = ColumnFixedString : : create ( string_size ) ;
auto & res_data = res_column - > getChars ( ) ;
res_data . resize ( src_data . size ( ) ) ;
for ( size_t i = 0 ; i < size ; + + i )
transformFixedString ( & src_data [ i * string_size ] , & res_data [ i * string_size ] , string_size , seed ) ;
return res_column ;
}
} ;
2018-06-15 05:33:39 +00:00
/// Leave date part as is and apply pseudorandom permutation to time difference with previous value within the same log2 class.
class DateTimeModel : public IModel
{
private :
const UInt64 seed ;
UInt32 src_prev_value = 0 ;
UInt32 res_prev_value = 0 ;
const DateLUTImpl & date_lut ;
public :
DateTimeModel ( UInt64 seed ) : seed ( seed ) , date_lut ( DateLUT : : instance ( ) ) { }
void train ( const IColumn & ) override { }
void finalize ( ) override { }
ColumnPtr generate ( const IColumn & column ) override
{
const auto & src_data = static_cast < const ColumnVector < UInt32 > & > ( column ) . getData ( ) ;
size_t size = src_data . size ( ) ;
auto res_column = ColumnVector < UInt32 > : : create ( size ) ;
auto & res_data = static_cast < ColumnVector < UInt32 > & > ( * res_column ) . getData ( ) ;
for ( size_t i = 0 ; i < size ; + + i )
{
2018-06-15 09:21:53 +00:00
UInt32 src_datetime = src_data [ i ] ;
UInt32 src_date = date_lut . toDate ( src_datetime ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 09:21:53 +00:00
Int32 src_diff = src_datetime - src_prev_value ;
Int32 res_diff = transformSigned ( src_diff , seed ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 09:21:53 +00:00
UInt32 new_datetime = res_prev_value + res_diff ;
UInt32 new_time = new_datetime - date_lut . toDate ( new_datetime ) ;
res_data [ i ] = src_date + new_time ;
2018-06-15 05:33:39 +00:00
2018-06-15 09:21:53 +00:00
src_prev_value = src_datetime ;
2018-06-15 05:33:39 +00:00
res_prev_value = res_data [ i ] ;
}
return res_column ;
}
} ;
2018-06-15 23:54:33 +00:00
struct MarkovModelParameters
{
size_t order ;
size_t frequency_cutoff ;
2018-06-16 00:50:52 +00:00
size_t num_buckets_cutoff ;
size_t frequency_add ;
double frequency_desaturate ;
2018-06-16 01:44:25 +00:00
size_t determinator_sliding_window_size ;
2018-06-15 23:54:33 +00:00
} ;
2018-06-16 00:50:52 +00:00
/** Actually it's not an order-N model, but a mix of order-{0..N} models.
*
* We calculate code point counts for every context of 0. . N previous code points .
* Then throw off some context with low amount of statistics .
*
* When generating data , we try to find statistics for a context of maximum order .
* And if not found - use context of smaller order , up to 0.
*/
2018-06-15 05:33:39 +00:00
class MarkovModel
{
private :
using CodePoint = UInt32 ;
using NGramHash = UInt32 ;
struct Histogram
{
2018-06-15 08:53:06 +00:00
UInt64 total = 0 ; /// Not including count_end.
UInt64 count_end = 0 ;
2018-06-15 09:40:40 +00:00
using Buckets = boost : : container : : flat_map < CodePoint , UInt64 > ;
Buckets buckets ;
2018-06-15 05:33:39 +00:00
void add ( CodePoint code )
{
+ + total ;
2018-06-15 09:40:40 +00:00
+ + buckets [ code ] ;
2018-06-15 05:33:39 +00:00
}
2018-06-15 08:53:06 +00:00
void addEnd ( )
2018-06-15 05:33:39 +00:00
{
2018-06-15 08:53:06 +00:00
+ + count_end ;
}
CodePoint sample ( UInt64 random , double end_multiplier ) const
{
UInt64 range = total + UInt64 ( count_end * end_multiplier ) ;
if ( range = = 0 )
return END ;
random % = range ;
2018-06-15 05:33:39 +00:00
UInt64 sum = 0 ;
2018-06-15 09:40:40 +00:00
for ( const auto & elem : buckets )
2018-06-15 05:33:39 +00:00
{
2018-06-15 09:40:40 +00:00
sum + = elem . second ;
2018-06-15 05:33:39 +00:00
if ( sum > random )
2018-06-15 09:40:40 +00:00
return elem . first ;
2018-06-15 05:33:39 +00:00
}
2018-06-15 08:53:06 +00:00
return END ;
2018-06-15 05:33:39 +00:00
}
} ;
using Table = HashMap < NGramHash , Histogram , TrivialHash > ;
Table table ;
2018-06-15 23:54:33 +00:00
MarkovModelParameters params ;
2018-06-15 05:33:39 +00:00
std : : vector < CodePoint > code_points ;
2018-06-15 23:54:33 +00:00
/// Special code point to form context before beginning of string.
2018-06-15 08:53:06 +00:00
static constexpr CodePoint BEGIN = - 1 ;
2018-06-15 23:54:33 +00:00
/// Special code point to indicate end of string.
2018-06-15 08:53:06 +00:00
static constexpr CodePoint END = - 2 ;
2018-06-15 05:33:39 +00:00
NGramHash hashContext ( const CodePoint * begin , const CodePoint * end ) const
{
return CRC32Hash ( ) ( StringRef ( reinterpret_cast < const char * > ( begin ) , ( end - begin ) * sizeof ( CodePoint ) ) ) ;
}
2018-10-13 14:33:43 +00:00
/// By the way, we don't have to use actual Unicode numbers. We use just arbitrary bijective mapping.
2018-06-15 05:33:39 +00:00
CodePoint readCodePoint ( const char * & pos , const char * end )
{
size_t length = UTF8 : : seqLength ( * pos ) ;
2018-06-29 23:27:56 +00:00
2018-06-15 05:33:39 +00:00
if ( pos + length > end )
length = end - pos ;
2018-06-29 23:27:56 +00:00
if ( length > sizeof ( CodePoint ) )
length = sizeof ( CodePoint ) ;
2018-06-15 05:33:39 +00:00
CodePoint res = 0 ;
memcpy ( & res , pos , length ) ;
2018-06-15 06:14:39 +00:00
pos + = length ;
2018-06-15 05:33:39 +00:00
return res ;
}
bool writeCodePoint ( CodePoint code , char * & pos , char * end )
{
size_t length
= ( code & 0xFF000000 ) ? 4
: ( code & 0xFFFF0000 ) ? 3
: ( code & 0xFFFFFF00 ) ? 2
: 1 ;
if ( pos + length > end )
return false ;
memcpy ( pos , & code , length ) ;
pos + = length ;
return true ;
}
public :
2018-06-15 23:54:33 +00:00
MarkovModel ( MarkovModelParameters params )
: params ( std : : move ( params ) ) , code_points ( params . order , BEGIN ) { }
2018-06-15 05:33:39 +00:00
void consume ( const char * data , size_t size )
{
2018-06-15 23:54:33 +00:00
/// First 'order' number of code points are pre-filled with BEGIN.
code_points . resize ( params . order ) ;
2018-06-15 05:33:39 +00:00
const char * pos = data ;
const char * end = data + size ;
2018-06-15 08:53:06 +00:00
while ( true )
2018-06-15 05:33:39 +00:00
{
2018-06-15 09:21:53 +00:00
const bool inside = pos < end ;
2018-06-15 08:53:06 +00:00
2018-06-15 09:21:53 +00:00
CodePoint next_code_point { } ;
2018-06-15 08:53:06 +00:00
if ( inside )
next_code_point = readCodePoint ( pos , end ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 23:54:33 +00:00
for ( size_t context_size = 0 ; context_size < params . order ; + + context_size )
2018-06-15 08:53:06 +00:00
{
NGramHash context_hash = hashContext ( code_points . data ( ) + code_points . size ( ) - context_size , code_points . data ( ) + code_points . size ( ) ) ;
if ( inside )
table [ context_hash ] . add ( next_code_point ) ;
else /// if (context_size != 0 || order == 0) /// Don't allow to break string without context (except order-0 model).
table [ context_hash ] . addEnd ( ) ;
}
if ( inside )
code_points . push_back ( next_code_point ) ;
else
break ;
2018-06-15 05:33:39 +00:00
}
}
void finalize ( )
{
2018-06-16 00:50:52 +00:00
if ( params . num_buckets_cutoff )
{
for ( auto & elem : table )
{
2019-02-28 09:35:38 +00:00
Histogram & histogram = elem . getSecond ( ) ;
2018-06-16 00:50:52 +00:00
if ( histogram . buckets . size ( ) < params . num_buckets_cutoff )
{
histogram . buckets . clear ( ) ;
histogram . total = 0 ;
}
}
}
2018-06-15 08:53:06 +00:00
2018-06-16 00:50:52 +00:00
if ( params . frequency_cutoff )
2018-06-15 08:53:06 +00:00
{
2018-06-16 00:50:52 +00:00
for ( auto & elem : table )
{
2019-02-28 09:35:38 +00:00
Histogram & histogram = elem . getSecond ( ) ;
2018-06-16 01:44:25 +00:00
if ( ! histogram . total )
continue ;
2018-06-15 08:53:06 +00:00
2018-06-16 00:50:52 +00:00
if ( histogram . total + histogram . count_end < params . frequency_cutoff )
{
histogram . buckets . clear ( ) ;
histogram . total = 0 ;
}
else
{
Histogram : : Buckets new_buckets ;
UInt64 erased_count = 0 ;
for ( const auto & bucket : histogram . buckets )
{
if ( bucket . second > = params . frequency_cutoff )
new_buckets . emplace ( bucket ) ;
else
erased_count + = bucket . second ;
}
histogram . buckets . swap ( new_buckets ) ;
histogram . total - = erased_count ;
}
}
}
if ( params . frequency_add )
{
for ( auto & elem : table )
2018-06-15 08:53:06 +00:00
{
2019-02-28 09:35:38 +00:00
Histogram & histogram = elem . getSecond ( ) ;
2018-06-16 01:44:25 +00:00
if ( ! histogram . total )
continue ;
2018-06-16 00:50:52 +00:00
for ( auto & bucket : histogram . buckets )
bucket . second + = params . frequency_add ;
histogram . count_end + = params . frequency_add ;
histogram . total + = params . frequency_add * histogram . buckets . size ( ) ;
2018-06-15 08:53:06 +00:00
}
2018-06-16 00:50:52 +00:00
}
if ( params . frequency_desaturate )
{
for ( auto & elem : table )
2018-06-15 08:53:06 +00:00
{
2019-02-28 09:35:38 +00:00
Histogram & histogram = elem . getSecond ( ) ;
2018-06-16 01:44:25 +00:00
if ( ! histogram . total )
continue ;
2018-06-16 00:50:52 +00:00
double average = histogram . total / histogram . buckets . size ( ) ;
2018-06-15 08:53:06 +00:00
2018-06-16 00:50:52 +00:00
UInt64 new_total = 0 ;
for ( auto & bucket : histogram . buckets )
2018-06-15 09:40:40 +00:00
{
2018-06-16 01:44:25 +00:00
bucket . second = bucket . second * ( 1.0 - params . frequency_desaturate ) + average * params . frequency_desaturate ;
2018-06-16 00:50:52 +00:00
new_total + = bucket . second ;
2018-06-15 09:40:40 +00:00
}
2018-06-15 08:53:06 +00:00
2018-06-16 00:50:52 +00:00
histogram . total = new_total ;
2018-06-15 08:53:06 +00:00
}
}
2018-06-15 05:33:39 +00:00
}
2018-06-15 08:53:06 +00:00
size_t generate ( char * data , size_t desired_size , size_t buffer_size ,
2018-06-15 05:33:39 +00:00
UInt64 seed , const char * determinator_data , size_t determinator_size )
{
2018-06-15 23:54:33 +00:00
code_points . resize ( params . order ) ;
2018-06-15 05:33:39 +00:00
char * pos = data ;
2018-06-15 08:53:06 +00:00
char * end = data + buffer_size ;
2018-06-15 05:33:39 +00:00
while ( pos < end )
{
Table : : iterator it = table . end ( ) ;
2018-06-15 23:54:33 +00:00
size_t context_size = params . order ;
2018-06-15 05:33:39 +00:00
while ( true )
{
it = table . find ( hashContext ( code_points . data ( ) + code_points . size ( ) - context_size , code_points . data ( ) + code_points . size ( ) ) ) ;
2019-02-28 09:35:38 +00:00
if ( table . end ( ) ! = it & & it - > getSecond ( ) . total + it - > getSecond ( ) . count_end ! = 0 )
2018-06-15 05:33:39 +00:00
break ;
if ( context_size = = 0 )
break ;
- - context_size ;
}
if ( table . end ( ) = = it )
2018-06-30 19:03:26 +00:00
throw Exception ( " Logical error in markov model " , ErrorCodes : : LOGICAL_ERROR ) ;
2018-06-15 05:33:39 +00:00
size_t offset_from_begin_of_string = pos - data ;
2018-06-16 01:44:25 +00:00
size_t determinator_sliding_window_size = params . determinator_sliding_window_size ;
if ( determinator_sliding_window_size > determinator_size )
determinator_sliding_window_size = determinator_size ;
2018-06-15 05:33:39 +00:00
size_t determinator_sliding_window_overflow = offset_from_begin_of_string + determinator_sliding_window_size > determinator_size
? offset_from_begin_of_string + determinator_sliding_window_size - determinator_size : 0 ;
const char * determinator_sliding_window_begin = determinator_data + offset_from_begin_of_string - determinator_sliding_window_overflow ;
SipHash hash ;
hash . update ( seed ) ;
hash . update ( determinator_sliding_window_begin , determinator_sliding_window_size ) ;
hash . update ( determinator_sliding_window_overflow ) ;
UInt64 determinator = hash . get64 ( ) ;
2018-06-15 08:53:06 +00:00
/// If string is greater than desired_size, increase probability of end.
double end_probability_multiplier = 0 ;
Int64 num_bytes_after_desired_size = ( pos - data ) - desired_size ;
2018-06-30 19:03:26 +00:00
if ( num_bytes_after_desired_size > 0 )
2018-06-15 08:53:06 +00:00
end_probability_multiplier = std : : pow ( 1.25 , num_bytes_after_desired_size ) ;
2019-02-28 09:35:38 +00:00
CodePoint code = it - > getSecond ( ) . sample ( determinator , end_probability_multiplier ) ;
2018-06-15 08:53:06 +00:00
if ( code = = END )
break ;
2018-06-15 05:33:39 +00:00
2018-06-30 19:03:26 +00:00
if ( num_bytes_after_desired_size > 0 )
{
/// Heuristic: break at ASCII non-alnum code point.
/// This allows to be close to desired_size but not break natural looking words.
if ( code < 128 & & ! isAlphaNumericASCII ( code ) )
break ;
}
2018-06-15 05:33:39 +00:00
if ( ! writeCodePoint ( code , pos , end ) )
break ;
2018-06-15 08:53:06 +00:00
code_points . push_back ( code ) ;
2018-06-15 05:33:39 +00:00
}
return pos - data ;
}
} ;
/// Generate length of strings as above.
/// To generate content of strings, use
/// order-N Markov model on Unicode code points,
/// and to generate next code point use deterministic RNG
2018-06-16 02:06:51 +00:00
/// determined by hash of a sliding window (default 8 bytes) of source string.
2018-06-15 05:33:39 +00:00
/// This is intended to generate locally-similar strings from locally-similar sources.
class StringModel : public IModel
{
private :
UInt64 seed ;
2018-06-15 08:53:06 +00:00
MarkovModel markov_model ;
2018-06-15 05:33:39 +00:00
public :
2018-06-15 23:54:33 +00:00
StringModel ( UInt64 seed , MarkovModelParameters params ) : seed ( seed ) , markov_model ( std : : move ( params ) ) { }
2018-06-15 05:33:39 +00:00
void train ( const IColumn & column ) override
{
const ColumnString & column_string = static_cast < const ColumnString & > ( column ) ;
size_t size = column_string . size ( ) ;
for ( size_t i = 0 ; i < size ; + + i )
{
StringRef string = column_string . getDataAt ( i ) ;
markov_model . consume ( string . data , string . size ) ;
}
}
void finalize ( ) override
{
2018-06-15 08:53:06 +00:00
markov_model . finalize ( ) ;
2018-06-15 05:33:39 +00:00
}
ColumnPtr generate ( const IColumn & column ) override
{
const ColumnString & column_string = static_cast < const ColumnString & > ( column ) ;
size_t size = column_string . size ( ) ;
auto res_column = ColumnString : : create ( ) ;
res_column - > reserve ( size ) ;
std : : string new_string ;
for ( size_t i = 0 ; i < size ; + + i )
{
StringRef src_string = column_string . getDataAt ( i ) ;
size_t desired_string_size = transform ( src_string . size , seed ) ;
2018-06-15 08:53:06 +00:00
new_string . resize ( desired_string_size * 2 ) ;
2018-06-15 05:33:39 +00:00
2018-06-15 07:09:54 +00:00
size_t actual_size = 0 ;
if ( desired_string_size ! = 0 )
2018-06-15 08:53:06 +00:00
actual_size = markov_model . generate ( new_string . data ( ) , desired_string_size , new_string . size ( ) , seed , src_string . data , src_string . size ) ;
2018-06-15 05:33:39 +00:00
res_column - > insertData ( new_string . data ( ) , actual_size ) ;
}
return res_column ;
}
} ;
2018-06-15 07:09:54 +00:00
class ArrayModel : public IModel
{
private :
ModelPtr nested_model ;
public :
ArrayModel ( ModelPtr nested_model ) : nested_model ( std : : move ( nested_model ) ) { }
void train ( const IColumn & column ) override
{
const ColumnArray & column_array = static_cast < const ColumnArray & > ( column ) ;
const IColumn & nested_column = column_array . getData ( ) ;
nested_model - > train ( nested_column ) ;
}
void finalize ( ) override
{
nested_model - > finalize ( ) ;
}
ColumnPtr generate ( const IColumn & column ) override
{
const ColumnArray & column_array = static_cast < const ColumnArray & > ( column ) ;
const IColumn & nested_column = column_array . getData ( ) ;
ColumnPtr new_nested_column = nested_model - > generate ( nested_column ) ;
return ColumnArray : : create ( ( * std : : move ( new_nested_column ) ) . mutate ( ) , ( * std : : move ( column_array . getOffsetsPtr ( ) ) ) . mutate ( ) ) ;
}
} ;
class NullableModel : public IModel
{
private :
ModelPtr nested_model ;
public :
NullableModel ( ModelPtr nested_model ) : nested_model ( std : : move ( nested_model ) ) { }
void train ( const IColumn & column ) override
{
const ColumnNullable & column_nullable = static_cast < const ColumnNullable & > ( column ) ;
const IColumn & nested_column = column_nullable . getNestedColumn ( ) ;
nested_model - > train ( nested_column ) ;
}
void finalize ( ) override
{
nested_model - > finalize ( ) ;
}
ColumnPtr generate ( const IColumn & column ) override
{
const ColumnNullable & column_nullable = static_cast < const ColumnNullable & > ( column ) ;
const IColumn & nested_column = column_nullable . getNestedColumn ( ) ;
ColumnPtr new_nested_column = nested_model - > generate ( nested_column ) ;
return ColumnNullable : : create ( ( * std : : move ( new_nested_column ) ) . mutate ( ) , ( * std : : move ( column_nullable . getNullMapColumnPtr ( ) ) ) . mutate ( ) ) ;
}
} ;
2018-06-15 05:33:39 +00:00
class ModelFactory
{
public :
2018-06-15 23:54:33 +00:00
ModelPtr get ( const IDataType & data_type , UInt64 seed , MarkovModelParameters markov_model_params ) const
2018-06-15 05:33:39 +00:00
{
2018-09-07 14:37:26 +00:00
if ( isInteger ( data_type ) )
2018-06-15 05:33:39 +00:00
{
2018-09-07 14:37:26 +00:00
if ( isUnsignedInteger ( data_type ) )
2018-06-15 05:33:39 +00:00
return std : : make_unique < UnsignedIntegerModel > ( seed ) ;
else
return std : : make_unique < SignedIntegerModel > ( seed ) ;
}
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeFloat32 * > ( & data_type ) )
return std : : make_unique < FloatModel < Float32 > > ( seed ) ;
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeFloat64 * > ( & data_type ) )
return std : : make_unique < FloatModel < Float64 > > ( seed ) ;
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeDate * > ( & data_type ) )
return std : : make_unique < IdentityModel > ( ) ;
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeDateTime * > ( & data_type ) )
return std : : make_unique < DateTimeModel > ( seed ) ;
2018-06-15 07:09:54 +00:00
2018-06-15 05:33:39 +00:00
if ( typeid_cast < const DataTypeString * > ( & data_type ) )
2018-06-15 23:54:33 +00:00
return std : : make_unique < StringModel > ( seed , markov_model_params ) ;
2018-06-15 07:09:54 +00:00
if ( typeid_cast < const DataTypeFixedString * > ( & data_type ) )
return std : : make_unique < FixedStringModel > ( seed ) ;
if ( auto type = typeid_cast < const DataTypeArray * > ( & data_type ) )
2018-06-15 23:54:33 +00:00
return std : : make_unique < ArrayModel > ( get ( * type - > getNestedType ( ) , seed , markov_model_params ) ) ;
2018-06-15 07:09:54 +00:00
if ( auto type = typeid_cast < const DataTypeNullable * > ( & data_type ) )
2018-06-15 23:54:33 +00:00
return std : : make_unique < NullableModel > ( get ( * type - > getNestedType ( ) , seed , markov_model_params ) ) ;
2018-06-15 07:09:54 +00:00
2018-06-30 19:03:26 +00:00
throw Exception ( " Unsupported data type " , ErrorCodes : : NOT_IMPLEMENTED ) ;
2018-06-15 05:33:39 +00:00
}
} ;
2018-06-15 23:54:33 +00:00
class Obfuscator
2018-06-15 05:33:39 +00:00
{
private :
std : : vector < ModelPtr > models ;
public :
2018-06-15 23:54:33 +00:00
Obfuscator ( const Block & header , UInt64 seed , MarkovModelParameters markov_model_params )
2018-06-15 05:33:39 +00:00
{
ModelFactory factory ;
size_t columns = header . columns ( ) ;
models . reserve ( columns ) ;
2019-05-02 22:09:38 +00:00
for ( const auto & elem : header )
models . emplace_back ( factory . get ( * elem . type , hash ( seed , elem . name ) , markov_model_params ) ) ;
2018-06-15 05:33:39 +00:00
}
void train ( const Columns & columns )
{
size_t size = columns . size ( ) ;
for ( size_t i = 0 ; i < size ; + + i )
models [ i ] - > train ( * columns [ i ] ) ;
}
void finalize ( )
{
for ( auto & model : models )
model - > finalize ( ) ;
}
Columns generate ( const Columns & columns )
{
size_t size = columns . size ( ) ;
Columns res ( size ) ;
for ( size_t i = 0 ; i < size ; + + i )
res [ i ] = models [ i ] - > generate ( * columns [ i ] ) ;
return res ;
}
} ;
}
2018-06-16 00:27:59 +00:00
int mainEntryClickHouseObfuscator ( int argc , char * * argv )
2018-06-15 06:14:39 +00:00
try
2018-06-15 05:33:39 +00:00
{
using namespace DB ;
namespace po = boost : : program_options ;
2018-06-15 10:42:13 +00:00
po : : options_description description ( " Options " ) ;
2018-06-15 05:33:39 +00:00
description . add_options ( )
( " help " , " produce help message " )
( " structure,S " , po : : value < std : : string > ( ) , " structure of the initial table (list of column and type names) " )
( " input-format " , po : : value < std : : string > ( ) , " input format of the initial table data " )
( " output-format " , po : : value < std : : string > ( ) , " default output format " )
2019-05-02 22:09:38 +00:00
( " seed " , po : : value < std : : string > ( ) , " seed (arbitrary string), must be random string with at least 10 bytes length; note that a seed for each column is derived from this seed and a column name: you can obfuscate data for different tables and as long as you use identical seed and identical column names, the data for corresponding non-text columns for different tables will be transformed in the same way, so the data for different tables can be JOINed after obfuscation " )
2018-06-16 00:27:59 +00:00
( " limit " , po : : value < UInt64 > ( ) , " if specified - stop after generating that number of rows " )
( " silent " , po : : value < bool > ( ) - > default_value ( false ) , " don't print information messages to stderr " )
2018-06-15 08:53:06 +00:00
( " order " , po : : value < UInt64 > ( ) - > default_value ( 5 ) , " order of markov model to generate strings " )
2018-06-16 00:50:52 +00:00
( " frequency-cutoff " , po : : value < UInt64 > ( ) - > default_value ( 5 ) , " frequency cutoff for markov model: remove all buckets with count less than specified " )
2018-06-16 23:34:34 +00:00
( " num-buckets-cutoff " , po : : value < UInt64 > ( ) - > default_value ( 0 ) , " cutoff for number of different possible continuations for a context: remove all histograms with less than specified number of buckets " )
2018-06-16 00:50:52 +00:00
( " frequency-add " , po : : value < UInt64 > ( ) - > default_value ( 0 ) , " add a constant to every count to lower probability distribution skew " )
( " frequency-desaturate " , po : : value < double > ( ) - > default_value ( 0 ) , " 0..1 - move every frequency towards average to lower probability distribution skew " )
2018-06-16 01:44:25 +00:00
( " determinator-sliding-window-size " , po : : value < UInt64 > ( ) - > default_value ( 8 ) , " size of a sliding window in a source string - its hash is used as a seed for RNG in markov model " )
2018-06-15 05:33:39 +00:00
;
po : : parsed_options parsed = po : : command_line_parser ( argc , argv ) . options ( description ) . run ( ) ;
po : : variables_map options ;
po : : store ( parsed , options ) ;
2018-06-15 23:54:33 +00:00
if ( options . count ( " help " )
| | ! options . count ( " seed " )
| | ! options . count ( " structure " )
| | ! options . count ( " input-format " )
| | ! options . count ( " output-format " ) )
2018-06-15 05:33:39 +00:00
{
2018-06-16 01:44:25 +00:00
std : : cout < < documantation < < " \n "
< < " \n Usage: " < < argv [ 0 ] < < " [options] < in > out \n "
2018-06-15 10:03:02 +00:00
< < " \n Input must be seekable file (it will be read twice). \n "
< < " \n " < < description < < " \n "
2018-06-16 03:31:49 +00:00
< < " \n Example: \n " < < argv [ 0 ] < < " --seed \" $(head -c16 /dev/urandom | base64) \" --input-format TSV --output-format TSV --structure 'CounterID UInt32, URLDomain String, URL String, SearchPhrase String, Title String' < stats.tsv \n " ;
2018-06-15 05:33:39 +00:00
return 0 ;
}
UInt64 seed = sipHash64 ( options [ " seed " ] . as < std : : string > ( ) ) ;
std : : string structure = options [ " structure " ] . as < std : : string > ( ) ;
std : : string input_format = options [ " input-format " ] . as < std : : string > ( ) ;
std : : string output_format = options [ " output-format " ] . as < std : : string > ( ) ;
2018-06-16 00:27:59 +00:00
std : : optional < UInt64 > limit ;
if ( options . count ( " limit " ) )
limit = options [ " limit " ] . as < UInt64 > ( ) ;
bool silent = options [ " silent " ] . as < bool > ( ) ;
2018-06-15 23:54:33 +00:00
MarkovModelParameters markov_model_params ;
markov_model_params . order = options [ " order " ] . as < UInt64 > ( ) ;
2018-06-16 00:50:52 +00:00
markov_model_params . frequency_cutoff = options [ " frequency-cutoff " ] . as < UInt64 > ( ) ;
markov_model_params . num_buckets_cutoff = options [ " num-buckets-cutoff " ] . as < UInt64 > ( ) ;
markov_model_params . frequency_add = options [ " frequency-add " ] . as < UInt64 > ( ) ;
markov_model_params . frequency_desaturate = options [ " frequency-desaturate " ] . as < double > ( ) ;
2018-06-16 01:44:25 +00:00
markov_model_params . determinator_sliding_window_size = options [ " determinator-sliding-window-size " ] . as < UInt64 > ( ) ;
2018-06-15 08:53:06 +00:00
2018-06-15 05:33:39 +00:00
// Create header block
std : : vector < std : : string > structure_vals ;
boost : : split ( structure_vals , structure , boost : : algorithm : : is_any_of ( " , " ) , boost : : algorithm : : token_compress_on ) ;
if ( structure_vals . size ( ) % 2 ! = 0 )
throw Exception ( " Odd number of elements in section structure: must be a list of name type pairs " , ErrorCodes : : LOGICAL_ERROR ) ;
Block header ;
const DataTypeFactory & data_type_factory = DataTypeFactory : : instance ( ) ;
for ( size_t i = 0 , size = structure_vals . size ( ) ; i < size ; i + = 2 )
{
ColumnWithTypeAndName column ;
column . name = structure_vals [ i ] ;
column . type = data_type_factory . get ( structure_vals [ i + 1 ] ) ;
column . column = column . type - > createColumn ( ) ;
header . insert ( std : : move ( column ) ) ;
}
Context context = Context : : createGlobal ( ) ;
ReadBufferFromFileDescriptor file_in ( STDIN_FILENO ) ;
WriteBufferFromFileDescriptor file_out ( STDOUT_FILENO ) ;
2018-06-16 00:27:59 +00:00
{
/// stdin must be seekable
2018-06-16 03:35:23 +00:00
auto res = lseek ( file_in . getFD ( ) , 0 , SEEK_SET ) ;
if ( - 1 = = res )
throwFromErrno ( " Input must be seekable file (it will be read twice). " , ErrorCodes : : CANNOT_SEEK_THROUGH_FILE ) ;
2018-06-16 00:27:59 +00:00
}
2018-06-15 23:54:33 +00:00
Obfuscator obfuscator ( header , seed , markov_model_params ) ;
2018-06-15 05:33:39 +00:00
2019-02-10 16:55:12 +00:00
UInt64 max_block_size = 8192 ;
2018-06-15 05:33:39 +00:00
/// Train step
{
2018-06-16 00:27:59 +00:00
if ( ! silent )
std : : cerr < < " Training models \n " ;
2018-06-15 05:33:39 +00:00
BlockInputStreamPtr input = context . getInputFormat ( input_format , file_in , header , max_block_size ) ;
2018-06-15 09:44:14 +00:00
UInt64 processed_rows = 0 ;
2018-06-15 05:33:39 +00:00
input - > readPrefix ( ) ;
while ( Block block = input - > read ( ) )
2018-06-15 09:44:14 +00:00
{
2018-06-15 23:54:33 +00:00
obfuscator . train ( block . getColumns ( ) ) ;
2018-06-15 09:44:14 +00:00
processed_rows + = block . rows ( ) ;
2018-06-16 00:27:59 +00:00
if ( ! silent )
std : : cerr < < " Processed " < < processed_rows < < " rows \n " ;
2018-06-15 09:44:14 +00:00
}
2018-06-15 05:33:39 +00:00
input - > readSuffix ( ) ;
}
2018-06-15 23:54:33 +00:00
obfuscator . finalize ( ) ;
2018-06-15 05:33:39 +00:00
/// Generation step
{
2018-06-16 00:27:59 +00:00
if ( ! silent )
std : : cerr < < " Generating data \n " ;
2018-06-15 05:33:39 +00:00
file_in . seek ( 0 ) ;
BlockInputStreamPtr input = context . getInputFormat ( input_format , file_in , header , max_block_size ) ;
BlockOutputStreamPtr output = context . getOutputFormat ( output_format , file_out , header ) ;
2018-06-16 00:27:59 +00:00
if ( limit )
input = std : : make_shared < LimitBlockInputStream > ( input , * limit , 0 ) ;
2018-06-15 09:44:14 +00:00
UInt64 processed_rows = 0 ;
2018-06-15 05:33:39 +00:00
input - > readPrefix ( ) ;
output - > writePrefix ( ) ;
while ( Block block = input - > read ( ) )
{
2018-06-15 23:54:33 +00:00
Columns columns = obfuscator . generate ( block . getColumns ( ) ) ;
2018-06-15 05:33:39 +00:00
output - > write ( header . cloneWithColumns ( columns ) ) ;
2018-06-15 09:44:14 +00:00
processed_rows + = block . rows ( ) ;
2018-06-16 00:27:59 +00:00
if ( ! silent )
std : : cerr < < " Processed " < < processed_rows < < " rows \n " ;
2018-06-15 05:33:39 +00:00
}
output - > writeSuffix ( ) ;
input - > readSuffix ( ) ;
}
return 0 ;
}
2018-06-15 06:14:39 +00:00
catch ( . . . )
{
std : : cerr < < DB : : getCurrentExceptionMessage ( true ) < < " \n " ;
auto code = DB : : getCurrentExceptionCode ( ) ;
return code ? code : 1 ;
}