#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int BAD_ARGUMENTS; } FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule) { if (escaping_rule.empty()) return FormatSettings::EscapingRule::None; else if (escaping_rule == "None") return FormatSettings::EscapingRule::None; else if (escaping_rule == "Escaped") return FormatSettings::EscapingRule::Escaped; else if (escaping_rule == "Quoted") return FormatSettings::EscapingRule::Quoted; else if (escaping_rule == "CSV") return FormatSettings::EscapingRule::CSV; else if (escaping_rule == "JSON") return FormatSettings::EscapingRule::JSON; else if (escaping_rule == "XML") return FormatSettings::EscapingRule::XML; else if (escaping_rule == "Raw") return FormatSettings::EscapingRule::Raw; else throw Exception("Unknown escaping rule \"" + escaping_rule + "\"", ErrorCodes::BAD_ARGUMENTS); } String escapingRuleToString(FormatSettings::EscapingRule escaping_rule) { switch (escaping_rule) { case FormatSettings::EscapingRule::None: return "None"; case FormatSettings::EscapingRule::Escaped: return "Escaped"; case FormatSettings::EscapingRule::Quoted: return "Quoted"; case FormatSettings::EscapingRule::CSV: return "CSV"; case FormatSettings::EscapingRule::JSON: return "JSON"; case FormatSettings::EscapingRule::XML: return "XML"; case FormatSettings::EscapingRule::Raw: return "Raw"; } UNREACHABLE(); } void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { NullOutput out; constexpr const char * field_name = ""; constexpr size_t field_name_len = 16; switch (escaping_rule) { case FormatSettings::EscapingRule::None: /// Empty field, just skip spaces break; case FormatSettings::EscapingRule::Escaped: readEscapedStringInto(out, buf); break; case FormatSettings::EscapingRule::Quoted: readQuotedFieldInto(out, buf); break; case FormatSettings::EscapingRule::CSV: readCSVStringInto(out, buf, format_settings.csv); break; case FormatSettings::EscapingRule::JSON: skipJSONField(buf, StringRef(field_name, field_name_len)); break; case FormatSettings::EscapingRule::Raw: readStringInto(out, buf); break; default: UNREACHABLE(); } } bool deserializeFieldByEscapingRule( const DataTypePtr & type, const SerializationPtr & serialization, IColumn & column, ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { bool read = true; bool parse_as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); switch (escaping_rule) { case FormatSettings::EscapingRule::Escaped: if (parse_as_nullable) read = SerializationNullable::deserializeTextEscapedImpl(column, buf, format_settings, serialization); else serialization->deserializeTextEscaped(column, buf, format_settings); break; case FormatSettings::EscapingRule::Quoted: if (parse_as_nullable) read = SerializationNullable::deserializeTextQuotedImpl(column, buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, buf, format_settings); break; case FormatSettings::EscapingRule::CSV: if (parse_as_nullable) read = SerializationNullable::deserializeTextCSVImpl(column, buf, format_settings, serialization); else serialization->deserializeTextCSV(column, buf, format_settings); break; case FormatSettings::EscapingRule::JSON: if (parse_as_nullable) read = SerializationNullable::deserializeTextJSONImpl(column, buf, format_settings, serialization); else serialization->deserializeTextJSON(column, buf, format_settings); break; case FormatSettings::EscapingRule::Raw: if (parse_as_nullable) read = SerializationNullable::deserializeTextRawImpl(column, buf, format_settings, serialization); else serialization->deserializeTextRaw(column, buf, format_settings); break; default: throw Exception( ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule)); } return read; } void serializeFieldByEscapingRule( const IColumn & column, const ISerialization & serialization, WriteBuffer & out, size_t row_num, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { switch (escaping_rule) { case FormatSettings::EscapingRule::Escaped: serialization.serializeTextEscaped(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::Quoted: serialization.serializeTextQuoted(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::CSV: serialization.serializeTextCSV(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::JSON: serialization.serializeTextJSON(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::XML: serialization.serializeTextXML(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::Raw: serialization.serializeTextRaw(column, row_num, out, format_settings); break; case FormatSettings::EscapingRule::None: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot serialize field with None escaping rule"); } } void writeStringByEscapingRule( const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: writeQuotedString(value, out); break; case FormatSettings::EscapingRule::JSON: writeJSONString(value, out, format_settings); break; case FormatSettings::EscapingRule::Raw: writeString(value, out); break; case FormatSettings::EscapingRule::CSV: writeCSVString(value, out); break; case FormatSettings::EscapingRule::Escaped: writeEscapedString(value, out); break; case FormatSettings::EscapingRule::XML: writeXMLStringForTextElement(value, out); break; case FormatSettings::EscapingRule::None: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot serialize string with None escaping rule"); } } template String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { String result; switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: if constexpr (read_string) readQuotedString(result, buf); else readQuotedField(result, buf); break; case FormatSettings::EscapingRule::JSON: if constexpr (read_string) readJSONString(result, buf); else readJSONField(result, buf); break; case FormatSettings::EscapingRule::Raw: readString(result, buf); break; case FormatSettings::EscapingRule::CSV: if constexpr (read_string) readCSVString(result, buf, format_settings.csv); else readCSVField(result, buf, format_settings.csv); break; case FormatSettings::EscapingRule::Escaped: readEscapedString(result, buf); break; default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule)); } return result; } String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { return readByEscapingRule(buf, escaping_rule, format_settings); } String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { return readByEscapingRule(buf, escaping_rule, format_settings); } void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, bool is_json, const std::unordered_set * numbers_parsed_from_json_strings = nullptr) { /// Do nothing if we didn't try to infer something special. if (!settings.try_infer_integers && !settings.try_infer_dates && !settings.try_infer_datetimes && !is_json) return; auto transform_simple_types = [&](DataTypes & data_types) { /// If we have floats and integers convert them all to float. if (settings.try_infer_integers) { bool have_floats = false; bool have_integers = false; for (const auto & type : data_types) { have_floats |= isFloat(type); have_integers |= isInteger(type) && !isBool(type); } if (have_floats && have_integers) { for (auto & type : data_types) { if (isInteger(type)) type = std::make_shared(); } } } /// If we have only dates and datetimes, convert dates to datetime. /// If we have date/datetimes and smth else, convert them to string, because /// There is a special case when we inferred both Date/DateTime and Int64 from Strings, /// for example: "arr: ["2020-01-01", "2000"]" -> Tuple(Date, Int64), /// so if we have Date/DateTime and smth else (not only String) we should /// convert Date/DateTime back to String, so then we will be able to /// convert Int64 back to String as well. if (settings.try_infer_dates || settings.try_infer_datetimes) { bool have_dates = false; bool have_datetimes = false; bool all_dates_or_datetimes = true; for (const auto & type : data_types) { have_dates |= isDate(type); have_datetimes |= isDateTime64(type); all_dates_or_datetimes &= isDate(type) || isDateTime64(type); } if (!all_dates_or_datetimes && (have_dates || have_datetimes)) { for (auto & type : data_types) { if (isDate(type) || isDateTime64(type)) type = std::make_shared(); } } else if (have_dates && have_datetimes) { for (auto & type : data_types) { if (isDate(type)) type = std::make_shared(9); } } } if (!is_json) return; /// Check settings specific for JSON formats. /// If we have numbers and strings, convert numbers to strings. if (settings.json.try_infer_numbers_from_strings || settings.json.read_numbers_as_strings) { bool have_strings = false; bool have_numbers = false; for (const auto & type : data_types) { have_strings |= isString(type); have_numbers |= isNumber(type); } if (have_strings && have_numbers) { for (auto & type : data_types) { if (isNumber(type) && (settings.json.read_numbers_as_strings || !numbers_parsed_from_json_strings || numbers_parsed_from_json_strings->contains(type.get()))) type = std::make_shared(); } } } if (settings.json.read_bools_as_numbers) { /// Note that have_floats and have_integers both cannot be /// equal to true as in one of previous checks we convert /// integers to floats if we have both. bool have_floats = false; bool have_integers = false; bool have_bools = false; for (const auto & type : data_types) { have_floats |= isFloat(type); have_integers |= isInteger(type) && !isBool(type); have_bools |= isBool(type); } if (have_bools && (have_integers || have_floats)) { for (auto & type : data_types) { if (isBool(type)) { if (have_integers) type = std::make_shared(); else type = std::make_shared(); } } } } }; auto transform_complex_types = [&](DataTypes & data_types) { if (!is_json) return; bool have_maps = false; bool have_objects = false; bool are_maps_equal = true; DataTypePtr first_map_type; for (const auto & type : data_types) { if (isMap(type)) { if (!have_maps) { first_map_type = type; have_maps = true; } else { are_maps_equal &= type->equals(*first_map_type); } } else if (isObject(type)) { have_objects = true; } } if (have_maps && (have_objects || !are_maps_equal)) { for (auto & type : data_types) { if (isMap(type)) type = std::make_shared("json", true); } } }; transformTypesRecursively(types, transform_simple_types, transform_complex_types); } void transformInferredTypesIfNeeded(DataTypes & types, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule) { transformInferredTypesIfNeededImpl(types, settings, escaping_rule == FormatSettings::EscapingRule::JSON); } void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule) { DataTypes types = {first, second}; transformInferredTypesIfNeeded(types, settings, escaping_rule); first = std::move(types[0]); second = std::move(types[1]); } void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set * numbers_parsed_from_json_strings) { transformInferredTypesIfNeededImpl(types, settings, true, numbers_parsed_from_json_strings); } void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings) { DataTypes types = {first, second}; transformInferredJSONTypesIfNeeded(types, settings); first = std::move(types[0]); second = std::move(types[1]); } bool tryInferDate(const std::string_view & field) { ReadBufferFromString buf(field); DayNum tmp; return tryReadDateText(tmp, buf) && buf.eof(); } bool tryInferDateTime(const std::string_view & field, const FormatSettings & settings) { ReadBufferFromString buf(field); Float64 tmp_float; /// Check if it's just a number, and if so, don't try to infer DateTime from it, /// because we can interpret this number as a timestamp and it will lead to /// inferring DateTime instead of simple Int64/Float64 in some cases. if (tryReadFloatText(tmp_float, buf) && buf.eof()) return false; buf.seek(0, SEEK_SET); /// Return position to the beginning DateTime64 tmp; switch (settings.date_time_input_format) { case FormatSettings::DateTimeInputFormat::Basic: if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof()) return true; break; case FormatSettings::DateTimeInputFormat::BestEffort: if (tryParseDateTime64BestEffort(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof()) return true; break; case FormatSettings::DateTimeInputFormat::BestEffortUS: if (tryParseDateTime64BestEffortUS(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof()) return true; break; } return false; } DataTypePtr tryInferDateOrDateTime(const std::string_view & field, const FormatSettings & settings) { if (settings.try_infer_dates && tryInferDate(field)) return makeNullable(std::make_shared()); if (settings.try_infer_datetimes && tryInferDateTime(field, settings)) return makeNullable(std::make_shared(9)); return nullptr; } static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBufferFromString & buf, const FormatSettings & settings) { if (buf.eof()) return nullptr; /// Array if (checkChar('[', buf)) { skipWhitespaceIfAny(buf); DataTypes nested_types; bool first = true; while (!buf.eof() && *buf.position() != ']') { if (!first) { skipWhitespaceIfAny(buf); if (!checkChar(',', buf)) return nullptr; skipWhitespaceIfAny(buf); } else first = false; auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings); if (!nested_type) return nullptr; nested_types.push_back(nested_type); } if (buf.eof()) return nullptr; ++buf.position(); if (nested_types.empty()) return std::make_shared(std::make_shared()); transformInferredTypesIfNeeded(nested_types, settings); auto least_supertype = tryGetLeastSupertype(nested_types); if (!least_supertype) return nullptr; return std::make_shared(least_supertype); } /// Tuple if (checkChar('(', buf)) { skipWhitespaceIfAny(buf); DataTypes nested_types; bool first = true; while (!buf.eof() && *buf.position() != ')') { if (!first) { skipWhitespaceIfAny(buf); if (!checkChar(',', buf)) return nullptr; skipWhitespaceIfAny(buf); } else first = false; auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings); if (!nested_type) return nullptr; nested_types.push_back(nested_type); } if (buf.eof() || nested_types.empty()) return nullptr; ++buf.position(); return std::make_shared(nested_types); } /// Map if (checkChar('{', buf)) { skipWhitespaceIfAny(buf); DataTypes key_types; DataTypes value_types; bool first = true; while (!buf.eof() && *buf.position() != '}') { if (!first) { skipWhitespaceIfAny(buf); if (!checkChar(',', buf)) return nullptr; skipWhitespaceIfAny(buf); } else first = false; auto key_type = determineDataTypeForSingleFieldImpl(buf, settings); if (!key_type) return nullptr; key_types.push_back(key_type); skipWhitespaceIfAny(buf); if (!checkChar(':', buf)) return nullptr; skipWhitespaceIfAny(buf); auto value_type = determineDataTypeForSingleFieldImpl(buf, settings); if (!value_type) return nullptr; value_types.push_back(value_type); } if (buf.eof()) return nullptr; ++buf.position(); skipWhitespaceIfAny(buf); if (key_types.empty()) return std::make_shared(std::make_shared(), std::make_shared()); transformInferredTypesIfNeeded(key_types, settings); transformInferredTypesIfNeeded(value_types, settings); auto key_least_supertype = tryGetLeastSupertype(key_types); auto value_least_supertype = tryGetLeastSupertype(value_types); if (!key_least_supertype || !value_least_supertype) return nullptr; if (!DataTypeMap::checkKeyType(key_least_supertype)) return nullptr; return std::make_shared(key_least_supertype, value_least_supertype); } /// String if (*buf.position() == '\'') { ++buf.position(); String field; while (!buf.eof()) { char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end()); field.append(buf.position(), next_pos); buf.position() = next_pos; if (!buf.hasPendingData()) continue; if (*buf.position() == '\'') break; field.push_back(*buf.position()); if (*buf.position() == '\\') ++buf.position(); } if (buf.eof()) return nullptr; ++buf.position(); if (auto type = tryInferDateOrDateTime(field, settings)) return type; return std::make_shared(); } /// Bool if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf)) return DataTypeFactory::instance().get("Bool"); /// Null if (checkStringCaseInsensitive("NULL", buf)) return std::make_shared(); /// Number Float64 tmp; auto * pos_before_float = buf.position(); if (tryReadFloatText(tmp, buf)) { if (settings.try_infer_integers) { auto * float_end_pos = buf.position(); buf.position() = pos_before_float; Int64 tmp_int; if (tryReadIntText(tmp_int, buf) && buf.position() == float_end_pos) return std::make_shared(); buf.position() = float_end_pos; } return std::make_shared(); } return nullptr; } static DataTypePtr determineDataTypeForSingleField(ReadBufferFromString & buf, const FormatSettings & settings) { return makeNullableRecursivelyAndCheckForNothing(determineDataTypeForSingleFieldImpl(buf, settings)); } DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule) { switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: { ReadBufferFromString buf(field); auto type = determineDataTypeForSingleField(buf, format_settings); return buf.eof() ? type : nullptr; } case FormatSettings::EscapingRule::JSON: return JSONUtils::getDataTypeFromField(field, format_settings); case FormatSettings::EscapingRule::CSV: { if (!format_settings.csv.use_best_effort_in_schema_inference) return makeNullable(std::make_shared()); if (field.empty() || field == format_settings.csv.null_representation) return nullptr; if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) return DataTypeFactory::instance().get("Nullable(Bool)"); if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"'))) { auto data = std::string_view(field.data() + 1, field.size() - 2); if (auto date_type = tryInferDateOrDateTime(data, format_settings)) return date_type; ReadBufferFromString buf(data); /// Try to determine the type of value inside quotes auto type = determineDataTypeForSingleField(buf, format_settings); if (!type) return nullptr; /// If it's a number or tuple in quotes or there is some unread data in buffer, we determine it as a string. if (isNumber(removeNullable(type)) || isTuple(type) || !buf.eof()) return makeNullable(std::make_shared()); return type; } /// Case when CSV value is not in quotes. Check if it's a number, and if not, determine it's as a string. if (format_settings.try_infer_integers) { ReadBufferFromString buf(field); Int64 tmp_int; if (tryReadIntText(tmp_int, buf) && buf.eof()) return makeNullable(std::make_shared()); } ReadBufferFromString buf(field); Float64 tmp; if (tryReadFloatText(tmp, buf) && buf.eof()) return makeNullable(std::make_shared()); return makeNullable(std::make_shared()); } case FormatSettings::EscapingRule::Raw: [[fallthrough]]; case FormatSettings::EscapingRule::Escaped: { if (!format_settings.tsv.use_best_effort_in_schema_inference) return makeNullable(std::make_shared()); if (field.empty() || field == format_settings.tsv.null_representation) return nullptr; if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) return DataTypeFactory::instance().get("Nullable(Bool)"); if (auto date_type = tryInferDateOrDateTime(field, format_settings)) return date_type; ReadBufferFromString buf(field); auto type = determineDataTypeForSingleField(buf, format_settings); if (!buf.eof()) return makeNullable(std::make_shared()); return type; } default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule)); } } DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule) { DataTypes data_types; data_types.reserve(fields.size()); for (const auto & field : fields) data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule)); return data_types; } DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule) { switch (escaping_rule) { case FormatSettings::EscapingRule::CSV: case FormatSettings::EscapingRule::Escaped: case FormatSettings::EscapingRule::Raw: return makeNullable(std::make_shared()); default: return nullptr; } } DataTypes getDefaultDataTypeForEscapingRules(const std::vector & escaping_rules) { DataTypes data_types; for (const auto & rule : escaping_rules) data_types.push_back(getDefaultDataTypeForEscapingRule(rule)); return data_types; } String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule) { String result; /// First, settings that are common for all text formats: result = fmt::format( "schema_inference_hints={}, try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}, max_rows_to_read_for_schema_inference={}", settings.schema_inference_hints, settings.try_infer_integers, settings.try_infer_dates, settings.try_infer_datetimes, settings.max_rows_to_read_for_schema_inference); /// Second, format-specific settings: switch (escaping_rule) { case FormatSettings::EscapingRule::Escaped: case FormatSettings::EscapingRule::Raw: result += fmt::format( ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={}, null_representation={}", settings.tsv.use_best_effort_in_schema_inference, settings.bool_true_representation, settings.bool_false_representation, settings.tsv.null_representation); break; case FormatSettings::EscapingRule::CSV: result += fmt::format( ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={}," " null_representation={}, delimiter={}, tuple_delimiter={}", settings.tsv.use_best_effort_in_schema_inference, settings.bool_true_representation, settings.bool_false_representation, settings.csv.null_representation, settings.csv.delimiter, settings.csv.tuple_delimiter); break; case FormatSettings::EscapingRule::JSON: result += fmt::format(", try_infer_numbers_from_strings={}, read_bools_as_numbers={}", settings.json.try_infer_numbers_from_strings, settings.json.read_bools_as_numbers); break; default: break; } return result; } }