#include #include #include #include #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int BAD_ARGUMENTS; } FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule) { if (escaping_rule.empty()) return FormatSettings::EscapingRule::None; else if (escaping_rule == "None") return FormatSettings::EscapingRule::None; else if (escaping_rule == "Escaped") return FormatSettings::EscapingRule::Escaped; else if (escaping_rule == "Quoted") return FormatSettings::EscapingRule::Quoted; else if (escaping_rule == "CSV") return FormatSettings::EscapingRule::CSV; else if (escaping_rule == "JSON") return FormatSettings::EscapingRule::JSON; else if (escaping_rule == "XML") return FormatSettings::EscapingRule::XML; else if (escaping_rule == "Raw") return FormatSettings::EscapingRule::Raw; else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown escaping rule \"{}\"", escaping_rule); } String escapingRuleToString(FormatSettings::EscapingRule escaping_rule) { switch (escaping_rule) { case FormatSettings::EscapingRule::None: return "None"; case FormatSettings::EscapingRule::Escaped: return "Escaped"; case FormatSettings::EscapingRule::Quoted: return "Quoted"; case FormatSettings::EscapingRule::CSV: return "CSV"; case FormatSettings::EscapingRule::JSON: return "JSON"; case FormatSettings::EscapingRule::XML: return "XML"; case FormatSettings::EscapingRule::Raw: return "Raw"; } UNREACHABLE(); } void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { NullOutput out; constexpr const char * field_name = ""; constexpr size_t field_name_len = 16; switch (escaping_rule) { case FormatSettings::EscapingRule::None: /// Empty field, just skip spaces break; case FormatSettings::EscapingRule::Escaped: readEscapedStringInto(out, buf); break; case FormatSettings::EscapingRule::Quoted: readQuotedFieldInto(out, buf); break; case FormatSettings::EscapingRule::CSV: readCSVStringInto(out, buf, format_settings.csv); break; case FormatSettings::EscapingRule::JSON: skipJSONField(buf, StringRef(field_name, field_name_len)); break; case FormatSettings::EscapingRule::Raw: readStringInto(out, buf); break; default: UNREACHABLE(); } } bool deserializeFieldByEscapingRule( const DataTypePtr & type, const SerializationPtr & serialization, IColumn & column, ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { bool read = true; bool parse_as_nullable = format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type); switch (escaping_rule) { case FormatSettings::EscapingRule::Escaped: if (parse_as_nullable) read = SerializationNullable::deserializeTextEscapedImpl(column, buf, format_settings, serialization); else serialization->deserializeTextEscaped(column, buf, format_settings); break; case FormatSettings::EscapingRule::Quoted: if (parse_as_nullable) read = SerializationNullable::deserializeTextQuotedImpl(column, buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, buf, format_settings); break; case FormatSettings::EscapingRule::CSV: if (parse_as_nullable) read = SerializationNullable::deserializeTextCSVImpl(column, buf, format_settings, serialization); else serialization->deserializeTextCSV(column, buf, format_settings); break; case FormatSettings::EscapingRule::JSON: if (parse_as_nullable) read = SerializationNullable::deserializeTextJSONImpl(column, buf, format_settings, serialization); else serialization->deserializeTextJSON(column, buf, format_settings); break; case FormatSettings::EscapingRule::Raw: if (parse_as_nullable) read = SerializationNullable::deserializeTextRawImpl(column, buf, format_settings, serialization); else serialization->deserializeTextRaw(column, buf, format_settings); break; default: throw Exception( ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule)); } return read; } void serializeFieldByEscapingRule( const IColumn & column, const ISerialization & serialization, WriteBuffer & out, size_t row_num, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { switch (escaping_rule) { case FormatSettings::EscapingRule::Escaped: serialization.serializeTextEscaped(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::Quoted: serialization.serializeTextQuoted(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::CSV: serialization.serializeTextCSV(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::JSON: serialization.serializeTextJSON(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::XML: serialization.serializeTextXML(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::Raw: serialization.serializeTextRaw(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::None: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot serialize field with None escaping rule"); } } void writeStringByEscapingRule( const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: writeQuotedString(value, out); break; case FormatSettings::EscapingRule::JSON: writeJSONString(value, out, format_settings); break; case FormatSettings::EscapingRule::Raw: writeString(value, out); break; case FormatSettings::EscapingRule::CSV: writeCSVString(value, out); break; case FormatSettings::EscapingRule::Escaped: writeEscapedString(value, out); break; case FormatSettings::EscapingRule::XML: writeXMLStringForTextElement(value, out); break; case FormatSettings::EscapingRule::None: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot serialize string with None escaping rule"); } } template String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { String result; switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: if constexpr (read_string) readQuotedString(result, buf); else readQuotedField(result, buf); break; case FormatSettings::EscapingRule::JSON: if constexpr (read_string) readJSONString(result, buf); else readJSONField(result, buf); break; case FormatSettings::EscapingRule::Raw: readString(result, buf); break; case FormatSettings::EscapingRule::CSV: if constexpr (read_string) readCSVString(result, buf, format_settings.csv); else readCSVField(result, buf, format_settings.csv); break; case FormatSettings::EscapingRule::Escaped: if constexpr (read_string) readEscapedString(result, buf); else readTSVField(result, buf); break; default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule)); } return result; } String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { return readByEscapingRule(buf, escaping_rule, format_settings); } String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { return readByEscapingRule(buf, escaping_rule, format_settings); } String readStringOrFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { /// For Quoted escaping rule we can read value as string only if it starts with `'`. /// If there is no `'` it can be any other field number/array/etc. if (escaping_rule == FormatSettings::EscapingRule::Quoted && !buf.eof() && *buf.position() != '\'') return readFieldByEscapingRule(buf, escaping_rule, format_settings); /// For JSON it's the same as for Quoted, but we check `"`. if (escaping_rule == FormatSettings::EscapingRule::JSON && !buf.eof() && *buf.position() != '"') return readFieldByEscapingRule(buf, escaping_rule, format_settings); /// For other escaping rules we can read any field as string value. return readStringByEscapingRule(buf, escaping_rule, format_settings); } DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info) { switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: return tryInferDataTypeForSingleField(field, format_settings); case FormatSettings::EscapingRule::JSON: return tryInferDataTypeForSingleJSONField(field, format_settings, json_info); case FormatSettings::EscapingRule::CSV: { if (!format_settings.csv.use_best_effort_in_schema_inference) return std::make_shared(); if (field.empty()) return nullptr; if (field == format_settings.csv.null_representation) return makeNullable(std::make_shared()); if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) return DataTypeFactory::instance().get("Bool"); /// In CSV complex types are serialized in quotes. If we have quotes, we should try to infer type /// from data inside quotes. if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"'))) { auto data = std::string_view(field.data() + 1, field.size() - 2); /// First, try to infer dates and datetimes. if (auto date_type = tryInferDateOrDateTimeFromString(data, format_settings)) return date_type; /// Try to determine the type of value inside quotes auto type = tryInferDataTypeForSingleField(data, format_settings); /// If we couldn't infer any type or it's a number or tuple in quotes, we determine it as a string. if (!type || isNumber(removeNullable(type)) || isTuple(type)) return std::make_shared(); return type; } /// Case when CSV value is not in quotes. Check if it's a number or date/datetime, and if not, determine it as a string. if (auto number_type = tryInferNumberFromString(field, format_settings)) return number_type; if (auto date_type = tryInferDateOrDateTimeFromString(field, format_settings)) return date_type; return std::make_shared(); } case FormatSettings::EscapingRule::Raw: [[fallthrough]]; case FormatSettings::EscapingRule::Escaped: { if (!format_settings.tsv.use_best_effort_in_schema_inference) return std::make_shared(); if (field.empty()) return nullptr; if (field == format_settings.tsv.null_representation) return makeNullable(std::make_shared()); if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) return DataTypeFactory::instance().get("Bool"); if (auto date_type = tryInferDateOrDateTimeFromString(field, format_settings)) return date_type; /// Special case when we have number that starts with 0. In TSV we don't parse such numbers, /// see readIntTextUnsafe in ReadHelpers.h. If we see data started with 0, we can determine it /// as a String, so parsing won't fail. if (field[0] == '0' && field.size() != 1) return std::make_shared(); auto type = tryInferDataTypeForSingleField(field, format_settings); if (!type) return std::make_shared(); return type; } default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule)); } } DataTypes tryInferDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info) { DataTypes data_types; data_types.reserve(fields.size()); for (const auto & field : fields) data_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, escaping_rule, json_info)); return data_types; } void transformInferredTypesByEscapingRuleIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info) { switch (escaping_rule) { case FormatSettings::EscapingRule::JSON: transformInferredJSONTypesIfNeeded(first, second, settings, json_info); break; case FormatSettings::EscapingRule::Escaped: [[fallthrough]]; case FormatSettings::EscapingRule::Raw: [[fallthrough]]; case FormatSettings::EscapingRule::Quoted: [[fallthrough]]; case FormatSettings::EscapingRule::CSV: transformInferredTypesIfNeeded(first, second, settings); break; default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot transform inferred types for value with {} escaping rule", escapingRuleToString(escaping_rule)); } } DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule) { switch (escaping_rule) { case FormatSettings::EscapingRule::CSV: case FormatSettings::EscapingRule::Escaped: case FormatSettings::EscapingRule::Raw: return std::make_shared(); default: return nullptr; } } DataTypes getDefaultDataTypeForEscapingRules(const std::vector & escaping_rules) { DataTypes data_types; for (const auto & rule : escaping_rules) data_types.push_back(getDefaultDataTypeForEscapingRule(rule)); return data_types; } String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings) { return fmt::format( "schema_inference_hints={}, max_rows_to_read_for_schema_inference={}, schema_inference_make_columns_nullable={}", settings.schema_inference_hints, settings.max_rows_to_read_for_schema_inference, settings.schema_inference_make_columns_nullable); } String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule) { String result = getAdditionalFormatInfoForAllRowBasedFormats(settings); /// First, settings that are common for all text formats: result += fmt::format( ", try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}", settings.try_infer_integers, settings.try_infer_dates, settings.try_infer_datetimes); /// Second, format-specific settings: switch (escaping_rule) { case FormatSettings::EscapingRule::Escaped: case FormatSettings::EscapingRule::Raw: result += fmt::format( ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={}, null_representation={}", settings.tsv.use_best_effort_in_schema_inference, settings.bool_true_representation, settings.bool_false_representation, settings.tsv.null_representation); break; case FormatSettings::EscapingRule::CSV: result += fmt::format( ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={}," " null_representation={}, delimiter={}, tuple_delimiter={}", settings.csv.use_best_effort_in_schema_inference, settings.bool_true_representation, settings.bool_false_representation, settings.csv.null_representation, settings.csv.delimiter, settings.csv.tuple_delimiter); break; case FormatSettings::EscapingRule::JSON: result += fmt::format( ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_objects_as_strings={}, read_numbers_as_strings={}, try_infer_objects={}", settings.json.try_infer_numbers_from_strings, settings.json.read_bools_as_numbers, settings.json.read_objects_as_strings, settings.json.read_numbers_as_strings, settings.json.allow_object_type); break; default: break; } return result; } void checkSupportedDelimiterAfterField(FormatSettings::EscapingRule escaping_rule, const String & delimiter, const DataTypePtr & type) { if (escaping_rule != FormatSettings::EscapingRule::Escaped) return; bool is_supported_delimiter_after_string = !delimiter.empty() && (delimiter.front() == '\t' || delimiter.front() == '\n'); if (is_supported_delimiter_after_string) return; /// Nullptr means that field is skipped and it's equivalent to String if (!type || isString(removeNullable(removeLowCardinality(type)))) throw Exception(ErrorCodes::BAD_ARGUMENTS, "'Escaped' serialization requires delimiter after String field to start with '\\t' or '\\n'"); } }