#include #if USE_SIMDJSON || USE_RAPIDJSON #include #include #include #include #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int LOGICAL_ERROR; extern const int INCORRECT_DATA; } namespace { using uniform = std::uniform_int_distribution; struct JSONNode; using JSONNodeList = std::list>; struct JSONValue { enum class Type : size_t { Fixed = 0, Array = 1, Object = 2, }; static Type getType(const JSONValue & v); // The node value must be one of the following: // Examples: 5, true, "abc" std::optional fixed; // Examples: [], ["a"], [1, true] std::optional array; // Examples: {}, {"a": [1,2], "b": "c"} std::optional object; }; JSONValue::Type JSONValue::getType(const JSONValue & v) { if (v.fixed) { assert(!v.array); assert(!v.object); return JSONValue::Type::Fixed; } else if (v.array) { assert(!v.fixed); assert(!v.object); return JSONValue::Type::Array; } else if (v.object) { assert(!v.fixed); assert(!v.array); return JSONValue::Type::Object; } else throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to determine JSON node type."); } // A node represents either a JSON field (a key-value pair) or a JSON value. // The key is not set for the JSON root and for the array items. struct JSONNode { std::optional key; JSONValue value; }; #if USE_SIMDJSON using ParserImpl = DB::SimdJSONParser; #elif USE_RAPIDJSON using ParserImpl = DB::RapidJSONParser; #endif std::optional getFixedValue(const ParserImpl::Element & e) { return e.isBool() ? e.getBool() : e.isInt64() ? e.getInt64() : e.isUInt64() ? e.getUInt64() : e.isDouble() ? e.getDouble() : e.isString() ? e.getString() : e.isNull() ? Field() : std::optional(); } void traverse(const ParserImpl::Element & e, std::shared_ptr node) { checkStackSize(); assert(node); auto & val = node->value; if (e.isObject()) { const auto & obj = e.getObject(); if (!val.object) val.object = JSONNodeList{}; for (const auto [k, v] : obj) { auto child = std::make_shared(); child->key = k; traverse(v, child); val.object->push_back(child); } } else if (e.isArray()) { if (!val.array) val.array = JSONNodeList{}; const auto arr = e.getArray(); for (const auto a : arr) { auto child = std::make_shared(); traverse(a, child); val.array->push_back(child); } } else { auto field = getFixedValue(e); if (!field) throw Exception(ErrorCodes::INCORRECT_DATA, "Failed to parse a fixed JSON value '{}'.", escapeString(e.getString())); val.fixed = std::move(field); } } std::shared_ptr parseJSON(const String & json) { std::string_view view{json.begin(), json.end()}; ParserImpl::Element document; ParserImpl p; if (!p.parse(json, document)) { constexpr size_t max_length = 128LU; throw Exception( ErrorCodes::INCORRECT_DATA, "Failed to parse JSON from the string '{}'{}.", escapeString(json.substr(0, max_length)), json.size() >= max_length ? "... (truncated)" : ""); } auto root = std::make_shared(); traverse(document, root); return root; } char generateRandomCharacter(pcg64 & rnd, const std::string_view & charset) { assert(!charset.empty()); auto idx = uniform(0, charset.size() - 1)(rnd); return charset[idx]; } char generateRandomKeyCharacter(pcg64 & rnd) { static constexpr std::string_view charset = "0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"; return generateRandomCharacter(rnd, charset); } char generateRandomStringValueCharacter(pcg64 & rnd) { static constexpr std::string_view charset = "0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "!@#$%^&*-+_"; return generateRandomCharacter(rnd, charset); } String generateRandomStringValue(UInt64 min_length, UInt64 max_length, pcg64 & rnd) { size_t size = min_length + rnd() % (max_length - min_length + 1); String res; res.reserve(size); for (size_t i = 0; i < size; ++i) res.push_back(generateRandomStringValueCharacter(rnd)); return res; } String generateRandomKey(UInt64 min_length, UInt64 max_length, pcg64 & rnd) { size_t size = min_length + rnd() % (max_length - min_length + 1); String res; res.reserve(size); for (size_t i = 0; i < size; ++i) res.push_back(generateRandomKeyCharacter(rnd)); return res; } enum class FuzzAction : size_t { Skip = 0, Edit = 1, Add = 2, Delete = 3, }; Field generateRandomFixedValue(const StorageFuzzJSON::Configuration & config, pcg64 & rnd) { // TODO (@jkartseva): support more field types. static std::array possible_types{ Field::Types::Which::UInt64, Field::Types::Which::String, Field::Types::Which::Bool}; Field f; auto idx = rnd() % possible_types.size(); switch (possible_types[idx]) { case Field::Types::Which::UInt64: { f = rnd(); break; } case Field::Types::Which::String: f = generateRandomStringValue(/*min_length*/ 0, config.max_string_value_length, rnd); break; case Field::Types::Which::Bool: f = bool(rnd() % 2); break; default: } return f; } String fuzzString(UInt64 min_length, UInt64 max_length, pcg64 & rnd, const String & source, std::function charGen) { String result; result.reserve(max_length); using FA = FuzzAction; auto get_action = [&]() -> FuzzAction { static constexpr std::array actions{FA::Skip, FA::Edit, FA::Add, FA::Delete}; return actions[uniform(0, 3)(rnd)]; }; size_t i = 0; while (i < source.size() && result.size() < max_length) { auto action = get_action(); switch (action) { case FA::Skip: { result.push_back(source[i++]); } break; case FA::Edit: { result.push_back(charGen(rnd)); ++i; } break; case FA::Add: { result.push_back(charGen(rnd)); } break; default: ++i; } } while (result.size() < min_length) result.push_back(charGen(rnd)); return result; } String fuzzJSONKey(const StorageFuzzJSON::Configuration & config, pcg64 & rnd, const String & key) { return fuzzString(config.min_key_length, config.max_key_length, rnd, key, generateRandomKeyCharacter); } // Randomly modify structural characters (e.g. '{', '}', '[', ']', ':', '"') to generate output that cannot be parsed as JSON. String fuzzJSONStructure(const StorageFuzzJSON::Configuration & config, pcg64 & rnd, const String & s) { return config.should_malform_output ? fuzzString(/*min_length*/ 0, /*max_length*/ s.size(), rnd, s, generateRandomStringValueCharacter) : s; } std::shared_ptr generateRandomJSONNode(const StorageFuzzJSON::Configuration & config, pcg64 & rnd, bool with_key, JSONValue::Type type) { auto node = std::make_shared(); if (with_key) node->key = generateRandomKey(config.min_key_length, config.max_key_length, rnd); auto & val = node->value; switch (type) { case JSONValue::Type::Fixed: { val.fixed = generateRandomFixedValue(config, rnd); break; } case JSONValue::Type::Array: { val.array = JSONNodeList{}; break; } case JSONValue::Type::Object: { val.object = JSONNodeList{}; break; } } return node; } template std::shared_ptr generateRandomJSONNode( const StorageFuzzJSON::Configuration & config, pcg64 & rnd, bool with_key, const std::array & possible_types) { auto type = possible_types[uniform(0, possible_types.size() - 1)(rnd)]; return generateRandomJSONNode(config, rnd, with_key, type); } std::shared_ptr generateRandomJSONNode(const StorageFuzzJSON::Configuration & config, pcg64 & rnd, bool with_key, size_t depth) { if (depth >= config.max_nesting_level) return generateRandomJSONNode(config, rnd, with_key, JSONValue::Type::Fixed); static constexpr std::array possible_types = {JSONValue::Type::Fixed, JSONValue::Type::Array, JSONValue::Type::Object}; return generateRandomJSONNode(config, rnd, with_key, possible_types); } JSONNode & fuzzSingleJSONNode(JSONNode & n, const StorageFuzzJSON::Configuration & config, pcg64 & rnd, size_t depth, size_t & node_count) { auto & val = n.value; static constexpr size_t update_key = 1; static constexpr size_t update_value = 2; auto action = 1 + rnd() % static_cast(update_key | update_value); if (n.key && (action & update_key)) n.key = fuzzJSONKey(config, rnd, *n.key); if ((action & update_value) == 0) return n; if (val.fixed) val.fixed = generateRandomFixedValue(config, rnd); else if (val.array && val.array->size() < config.max_array_size && node_count + val.array->size() < config.value_number_limit) { if (val.array->empty()) val.array->push_back(generateRandomJSONNode(config, rnd, /*with_key*/ false, depth)); else { // Use the type of the preceding element. const auto & prev = val.array->back(); auto value_type = JSONValue::getType(prev->value); val.array->push_back(generateRandomJSONNode(config, rnd, /*with_key*/ false, value_type)); } ++node_count; } else if (val.object && val.object->size() < config.max_object_size && node_count + val.object->size() < config.value_number_limit) { val.object->push_back(generateRandomJSONNode(config, rnd, /*with_key*/ true, depth)); ++node_count; } return n; } void fuzzJSONObject( const std::shared_ptr & node, WriteBuffer & out, const StorageFuzzJSON::Configuration & config, pcg64 & rnd, size_t depth, size_t & node_count) { checkStackSize(); ++node_count; bool should_fuzz = rnd() % 100 < 100 * config.probability; const auto & next_node = should_fuzz && !config.should_reuse_output ? std::make_shared(*node) : node; if (should_fuzz) fuzzSingleJSONNode(*next_node, config, rnd, depth, node_count); if (next_node->key) { writeDoubleQuoted(*next_node->key, out); out << fuzzJSONStructure(config, rnd, ":"); } auto & val = next_node->value; if (val.fixed) { if (val.fixed->getType() == Field::Types::Which::String) { out << fuzzJSONStructure(config, rnd, "\""); writeText(val.fixed->get(), out); out << fuzzJSONStructure(config, rnd, "\""); } else writeFieldText(*val.fixed, out); } else { if (!val.array && !val.object) return; const auto & [op, cl, node_list] = val.array ? std::make_tuple("[", "]", *val.array) : std::make_tuple("{", "}", *val.object); out << fuzzJSONStructure(config, rnd, op); bool first = true; for (const auto & ptr : node_list) { if (node_count >= config.value_number_limit) break; WriteBufferFromOwnString child_out; if (!first) child_out << fuzzJSONStructure(config, rnd, ", "); first = false; fuzzJSONObject(ptr, child_out, config, rnd, depth + 1, node_count); // Should not exceed the maximum length of the output string. if (out.count() + child_out.count() >= config.max_output_length) break; out << child_out.str(); } out << fuzzJSONStructure(config, rnd, cl); } } void fuzzJSONObject(std::shared_ptr n, WriteBuffer & out, const StorageFuzzJSON::Configuration & config, pcg64 & rnd) { size_t node_count = 0; return fuzzJSONObject(n, out, config, rnd, /*depth*/ 0, node_count); } class FuzzJSONSource : public ISource { public: FuzzJSONSource( UInt64 block_size_, Block block_header_, const StorageFuzzJSON::Configuration & config_, std::shared_ptr json_root_) : ISource(block_header_) , block_size(block_size_) , block_header(std::move(block_header_)) , config(config_) , rnd(config.random_seed) , json_root(json_root_) { } String getName() const override { return "FuzzJSON"; } protected: Chunk generate() override { Columns columns; columns.reserve(block_header.columns()); for (const auto& col : block_header) { chassert(col.type->getTypeId() == TypeIndex::String); columns.emplace_back(createColumn()); } return {std::move(columns), block_size}; } private: ColumnPtr createColumn(); UInt64 block_size; Block block_header; StorageFuzzJSON::Configuration config; pcg64 rnd; std::shared_ptr json_root; }; ColumnPtr FuzzJSONSource::createColumn() { auto column = ColumnString::create(); ColumnString::Chars & data_to = column->getChars(); ColumnString::Offsets & offsets_to = column->getOffsets(); offsets_to.resize(block_size); IColumn::Offset offset = 0; for (size_t row_num = 0; row_num < block_size; ++row_num) { WriteBufferFromOwnString out; fuzzJSONObject(json_root, out, config, rnd); auto data = out.str(); size_t data_len = data.size(); IColumn::Offset next_offset = offset + data_len + 1; data_to.resize(next_offset); std::copy(data.begin(), data.end(), &data_to[offset]); data_to[offset + data_len] = 0; offsets_to[row_num] = next_offset; offset = next_offset; } return column; } } StorageFuzzJSON::StorageFuzzJSON( const StorageID & table_id_, const ColumnsDescription & columns_, const String & comment_, const Configuration & config_) : IStorage(table_id_), config(config_) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); storage_metadata.setComment(comment_); setInMemoryMetadata(storage_metadata); } Pipe StorageFuzzJSON::read( const Names & column_names, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, size_t num_streams) { storage_snapshot->check(column_names); Pipes pipes; pipes.reserve(num_streams); const ColumnsDescription & our_columns = storage_snapshot->metadata->getColumns(); Block block_header; for (const auto & name : column_names) { const auto & name_type = our_columns.get(name); MutableColumnPtr column = name_type.type->createColumn(); block_header.insert({std::move(column), name_type.type, name_type.name}); } for (UInt64 i = 0; i < num_streams; ++i) pipes.emplace_back(std::make_shared(max_block_size, block_header, config, parseJSON(config.json_str))); return Pipe::unitePipes(std::move(pipes)); } static constexpr std::array optional_configuration_keys = {"json_str", "random_seed", "reuse_output", "malform_output", "probability", "max_output_length", "max_nesting_level", "max_array_size", "max_object_size", "max_string_value_length", "min_key_length", "max_key_length"}; void StorageFuzzJSON::processNamedCollectionResult(Configuration & configuration, const NamedCollection & collection) { validateNamedCollection( collection, std::unordered_set(), std::unordered_set(optional_configuration_keys.begin(), optional_configuration_keys.end())); if (collection.has("json_str")) configuration.json_str = collection.get("json_str"); if (collection.has("random_seed")) configuration.random_seed = collection.get("random_seed"); if (collection.has("reuse_output")) configuration.should_reuse_output = static_cast(collection.get("reuse_output")); if (collection.has("malform_output")) configuration.should_malform_output = static_cast(collection.get("malform_output")); if (collection.has("probability")) { configuration.probability = collection.get("probability"); if (configuration.probability < 0.0 || configuration.probability > 1.0) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The value of the 'probability' argument must be within the interval [0, 1]."); } if (collection.has("max_output_length")) { configuration.max_output_length = collection.get("max_output_length"); if (configuration.max_output_length < 2 || configuration.max_output_length > configuration.output_length_limit) throw Exception( ErrorCodes::BAD_ARGUMENTS, "The value of the 'max_output_length' argument must be within the interval [2, {}.]", configuration.output_length_limit); } if (collection.has("max_nesting_level")) configuration.max_nesting_level = collection.get("max_nesting_level"); if (collection.has("max_array_size")) configuration.max_array_size = collection.get("max_array_size"); if (collection.has("max_object_size")) configuration.max_object_size = collection.get("max_object_size"); if (collection.has("max_string_value_length")) { auto max_string_value_length = collection.get("max_string_value_length"); if (max_string_value_length > configuration.output_length_limit) throw Exception( ErrorCodes::BAD_ARGUMENTS, "The value of the 'max_string_value_length' argument must be at most {}.", configuration.output_length_limit); configuration.max_string_value_length = std::min(max_string_value_length, configuration.max_output_length); } if (collection.has("max_key_length")) { auto max_key_length = collection.get("max_key_length"); if (max_key_length > configuration.output_length_limit) throw Exception( ErrorCodes::BAD_ARGUMENTS, "The value of the 'max_key_length' argument must be less or equal than {}.", configuration.output_length_limit); configuration.max_key_length = std::min(max_key_length, configuration.max_output_length); configuration.min_key_length = std::min(configuration.min_key_length, configuration.max_key_length); } if (collection.has("min_key_length")) { auto min_key_length = collection.get("min_key_length"); if (min_key_length == 0) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The value of the 'min_key_length' argument must be at least 1."); if (collection.has("max_key_length") && collection.get("max_key_length") < min_key_length) throw Exception( ErrorCodes::BAD_ARGUMENTS, "The value of the 'min_key_length' argument must be less or equal than " "the value of the 'max_key_lenght' argument."); configuration.min_key_length = min_key_length; configuration.max_key_length = std::max(configuration.max_key_length, configuration.min_key_length); } } StorageFuzzJSON::Configuration StorageFuzzJSON::getConfiguration(ASTs & engine_args, ContextPtr local_context) { StorageFuzzJSON::Configuration configuration{}; if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) { StorageFuzzJSON::processNamedCollectionResult(configuration, *named_collection); } else { // Supported signatures: // // FuzzJSON('json_str') // FuzzJSON('json_str', 'random_seed') if (engine_args.empty() || engine_args.size() > 2) throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "FuzzJSON requires 1 to 2 arguments: " "json_str, random_seed"); for (auto & engine_arg : engine_args) engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, local_context); auto first_arg = checkAndGetLiteralArgument(engine_args[0], "json_str"); configuration.json_str = std::move(first_arg); if (engine_args.size() == 2) { const auto & literal = engine_args[1]->as(); if (!literal.value.isNull()) configuration.random_seed = checkAndGetLiteralArgument(literal, "random_seed"); } } return configuration; } void registerStorageFuzzJSON(StorageFactory & factory) { factory.registerStorage( "FuzzJSON", [](const StorageFactory::Arguments & args) -> std::shared_ptr { ASTs & engine_args = args.engine_args; if (engine_args.empty()) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Storage FuzzJSON must have arguments."); StorageFuzzJSON::Configuration configuration = StorageFuzzJSON::getConfiguration(engine_args, args.getLocalContext()); for (const auto& col : args.columns) if (col.type->getTypeId() != TypeIndex::String) throw Exception(ErrorCodes::BAD_ARGUMENTS, "'StorageFuzzJSON' supports only columns of String type, got {}.", col.type->getName()); return std::make_shared(args.table_id, args.columns, args.comment, configuration); }); } } #endif