From 1b8ca90fccba6ea92711c2e2f46180dc875c4f9a Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Thu, 11 Aug 2022 11:32:32 +0000 Subject: [PATCH] Add schema inference to clickhouse-obfuscator --- programs/obfuscator/Obfuscator.cpp | 77 ++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 6891d2113a5..2607d9d0902 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1239,7 +1240,6 @@ try if (options.count("help") || !options.count("seed") - || !options.count("structure") || !options.count("input-format") || !options.count("output-format")) { @@ -1259,7 +1259,11 @@ try UInt64 seed = sipHash64(options["seed"].as()); - std::string structure = options["structure"].as(); + std::string structure; + + if (options.count("structure")) + structure = options["structure"].as(); + std::string input_format = options["input-format"].as(); std::string output_format = options["output-format"].as(); @@ -1287,32 +1291,63 @@ try markov_model_params.determinator_sliding_window_size = options["determinator-sliding-window-size"].as(); /// Create the header block - std::vector structure_vals; - boost::split(structure_vals, structure, boost::algorithm::is_any_of(" ,"), boost::algorithm::token_compress_on); - - if (structure_vals.size() % 2 != 0) - throw Exception("Odd number of elements in section structure: must be a list of name type pairs", ErrorCodes::LOGICAL_ERROR); - - Block header; - const DataTypeFactory & data_type_factory = DataTypeFactory::instance(); - - for (size_t i = 0, size = structure_vals.size(); i < size; i += 2) - { - ColumnWithTypeAndName column; - column.name = structure_vals[i]; - column.type = data_type_factory.get(structure_vals[i + 1]); - column.column = column.type->createColumn(); - header.insert(std::move(column)); - } - SharedContextHolder shared_context = Context::createShared(); auto context = Context::createGlobal(shared_context.get()); context->makeGlobalContext(); + Block header; + + if (structure.empty()) + { + std::unique_ptr read_buffer_from_fd; + std::unique_ptr peekable_read_buffer_from_fd; + + ReadBufferIterator read_buffer_iterator = [&]() + { + read_buffer_from_fd = std::make_unique(STDIN_FILENO); + auto read_buf = std::make_unique(*read_buffer_from_fd); + read_buf->setCheckpoint(); + return read_buf; + }; + + auto context_const = WithContext(context).getContext(); + + auto schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, false, context_const, peekable_read_buffer_from_fd); + auto schema_columns_info = schema_columns.getOrdinary(); + + for (auto & info : schema_columns_info) + { + ColumnWithTypeAndName column; + column.name = info.name; + column.type = info.type; + column.column = column.type->createColumn(); + header.insert(std::move(column)); + } + } + else + { + std::vector structure_vals; + boost::split(structure_vals, structure, boost::algorithm::is_any_of(" ,"), boost::algorithm::token_compress_on); + + if (structure_vals.size() % 2 != 0) + throw Exception("Odd number of elements in section structure: must be a list of name type pairs", ErrorCodes::LOGICAL_ERROR); + + const DataTypeFactory & data_type_factory = DataTypeFactory::instance(); + + for (size_t i = 0, size = structure_vals.size(); i < size; i += 2) + { + ColumnWithTypeAndName column; + column.name = structure_vals[i]; + column.type = data_type_factory.get(structure_vals[i + 1]); + column.column = column.type->createColumn(); + header.insert(std::move(column)); + } + } + ReadBufferFromFileDescriptor file_in(STDIN_FILENO); WriteBufferFromFileDescriptor file_out(STDOUT_FILENO); - if (load_from_file.empty()) + if (load_from_file.empty() || structure.empty()) { /// stdin must be seekable auto res = lseek(file_in.getFD(), 0, SEEK_SET);