ClickHouse/src/Formats/EscapingRuleUtils.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

895 lines
32 KiB
C++
Raw Normal View History

2021-11-09 13:14:07 +00:00
#include <Formats/EscapingRuleUtils.h>
2022-05-06 16:48:48 +00:00
#include <Formats/JSONUtils.h>
#include <Formats/ReadSchemaUtils.h>
2021-11-09 13:14:07 +00:00
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeTuple.h>
2022-07-13 15:57:55 +00:00
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>
2022-07-13 15:57:55 +00:00
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/transformTypesRecursively.h>
2021-11-09 13:14:07 +00:00
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/parseDateTimeBestEffort.h>
#include <Parsers/TokenIterator.h>
2021-11-09 13:14:07 +00:00
namespace DB
{
/// Error codes used by the functions in this file.
/// They are only declared here (extern); the definitions are not part of this file.
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
/// Converts the textual name of an escaping rule into the corresponding enum value.
/// An empty name is accepted and mapped to EscapingRule::None, exactly like "None".
/// The comparison is exact (case-sensitive); any other name results in a BAD_ARGUMENTS exception.
FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule)
{
    using Rule = FormatSettings::EscapingRule;

    if (escaping_rule.empty() || escaping_rule == "None")
        return Rule::None;
    if (escaping_rule == "Escaped")
        return Rule::Escaped;
    if (escaping_rule == "Quoted")
        return Rule::Quoted;
    if (escaping_rule == "CSV")
        return Rule::CSV;
    if (escaping_rule == "JSON")
        return Rule::JSON;
    if (escaping_rule == "XML")
        return Rule::XML;
    if (escaping_rule == "Raw")
        return Rule::Raw;

    throw Exception("Unknown escaping rule \"" + escaping_rule + "\"", ErrorCodes::BAD_ARGUMENTS);
}
/// Returns the textual name of an escaping rule.
/// The names are the same ones that stringToEscapingRule accepts.
String escapingRuleToString(FormatSettings::EscapingRule escaping_rule)
{
    /// No default branch on purpose: every enumerator handled in this file is listed explicitly.
    switch (escaping_rule)
    {
        case FormatSettings::EscapingRule::None:
            return "None";
        case FormatSettings::EscapingRule::Escaped:
            return "Escaped";
        case FormatSettings::EscapingRule::Quoted:
            return "Quoted";
        case FormatSettings::EscapingRule::CSV:
            return "CSV";
        case FormatSettings::EscapingRule::JSON:
            return "JSON";
        case FormatSettings::EscapingRule::XML:
            return "XML";
        case FormatSettings::EscapingRule::Raw:
            return "Raw";
    }

    /// Reached only if escaping_rule holds a value that is not one of the enumerators above.
    UNREACHABLE();
}
/// Reads a single field from `buf` using the given escaping rule and throws its contents away.
///
/// buf             - input to read from.
/// escaping_rule   - how the field is encoded. XML is not handled and ends up in UNREACHABLE().
/// format_settings - only the CSV settings are used (for the CSV rule).
void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
{
    /// Sink that is passed to the read*Into helpers so that the field contents are not kept.
    NullOutput out;

    /// Name handed to skipJSONField for the skipped field.
    /// The length is taken from the literal itself instead of a separately maintained constant.
    constexpr std::string_view field_name = "<SKIPPED COLUMN>";

    switch (escaping_rule)
    {
        case FormatSettings::EscapingRule::None:
            /// Empty field: nothing is read from the buffer here.
            break;
        case FormatSettings::EscapingRule::Escaped:
            readEscapedStringInto(out, buf);
            break;
        case FormatSettings::EscapingRule::Quoted:
            readQuotedFieldInto(out, buf);
            break;
        case FormatSettings::EscapingRule::CSV:
            readCSVStringInto(out, buf, format_settings.csv);
            break;
        case FormatSettings::EscapingRule::JSON:
            skipJSONField(buf, StringRef(field_name.data(), field_name.size()));
            break;
        case FormatSettings::EscapingRule::Raw:
            readStringInto(out, buf);
            break;
        default:
            UNREACHABLE();
    }
}
/// Reads one value from `buf` with the given escaping rule and inserts it into `column`.
///
/// type            - data type of the column; used only to decide whether NULLs need special treatment.
/// serialization   - serialization that performs the actual text parsing.
/// column          - destination column.
/// buf             - input to read from.
/// escaping_rule   - Escaped, Quoted, CSV, JSON or Raw. Any other rule throws BAD_ARGUMENTS.
/// format_settings - passed through to the serialization; `null_as_default` is also read here.
///
/// Returns the result of the SerializationNullable::deserializeText*Impl helper when the value was parsed
/// "as nullable", and true otherwise.
/// NOTE(review): presumably `false` means that NULL was met and a default value was inserted - confirm
/// against SerializationNullable.
bool deserializeFieldByEscapingRule(
    const DataTypePtr & type,
    const SerializationPtr & serialization,
    IColumn & column,
    ReadBuffer & buf,
    FormatSettings::EscapingRule escaping_rule,
    const FormatSettings & format_settings)
{
    bool read = true;

    /// With null_as_default, a non-nullable column goes through the Nullable deserialization helpers.
    /// Types that are already Nullable or LowCardinality(Nullable) use their own serialization as is.
    bool parse_as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable();

    switch (escaping_rule)
    {
        case FormatSettings::EscapingRule::Escaped:
            if (parse_as_nullable)
                read = SerializationNullable::deserializeTextEscapedImpl(column, buf, format_settings, serialization);
            else
                serialization->deserializeTextEscaped(column, buf, format_settings);
            break;
        case FormatSettings::EscapingRule::Quoted:
            if (parse_as_nullable)
                read = SerializationNullable::deserializeTextQuotedImpl(column, buf, format_settings, serialization);
            else
                serialization->deserializeTextQuoted(column, buf, format_settings);
            break;
        case FormatSettings::EscapingRule::CSV:
            if (parse_as_nullable)
                read = SerializationNullable::deserializeTextCSVImpl(column, buf, format_settings, serialization);
            else
                serialization->deserializeTextCSV(column, buf, format_settings);
            break;
        case FormatSettings::EscapingRule::JSON:
            if (parse_as_nullable)
                read = SerializationNullable::deserializeTextJSONImpl(column, buf, format_settings, serialization);
            else
                serialization->deserializeTextJSON(column, buf, format_settings);
            break;
        case FormatSettings::EscapingRule::Raw:
            if (parse_as_nullable)
                read = SerializationNullable::deserializeTextRawImpl(column, buf, format_settings, serialization);
            else
                serialization->deserializeTextRaw(column, buf, format_settings);
            break;
        default:
            /// None and XML cannot be used for reading values.
            throw Exception(
                ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule));
    }

    return read;
}
void serializeFieldByEscapingRule(
const IColumn & column,
const ISerialization & serialization,
WriteBuffer & out,
size_t row_num,
FormatSettings::EscapingRule escaping_rule,
const FormatSettings & format_settings)
{
switch (escaping_rule)
{
case FormatSettings::EscapingRule::Escaped:
serialization.serializeTextEscaped(column, row_num, out, format_settings);
break;
case FormatSettings::EscapingRule::Quoted:
serialization.serializeTextQuoted(column, row_num, out, format_settings);
break;
case FormatSettings::EscapingRule::CSV:
serialization.serializeTextCSV(column, row_num, out, format_settings);
break;
case FormatSettings::EscapingRule::JSON:
serialization.serializeTextJSON(column, row_num, out, format_settings);
break;
case FormatSettings::EscapingRule::XML:
serialization.serializeTextXML(column, row_num, out, format_settings);
break;
case FormatSettings::EscapingRule::Raw:
serialization.serializeTextRaw(column, row_num, out, format_settings);
break;
case FormatSettings::EscapingRule::None:
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot serialize field with None escaping rule");
}
}
void writeStringByEscapingRule(
const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
2021-11-09 13:14:07 +00:00
{
switch (escaping_rule)
{
case FormatSettings::EscapingRule::Quoted:
writeQuotedString(value, out);
break;
case FormatSettings::EscapingRule::JSON:
writeJSONString(value, out, format_settings);
break;
case FormatSettings::EscapingRule::Raw:
writeString(value, out);
break;
case FormatSettings::EscapingRule::CSV:
writeCSVString(value, out);
break;
case FormatSettings::EscapingRule::Escaped:
writeEscapedString(value, out);
break;
case FormatSettings::EscapingRule::XML:
writeXMLStringForTextElement(value, out);
break;
case FormatSettings::EscapingRule::None:
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot serialize string with None escaping rule");
}
}
/// Reads one value from `buf` according to `escaping_rule` and returns what was read as a String.
///
/// read_string - selects between the read*String helpers (true) and the read*Field helpers (false)
///               for the Quoted, JSON and CSV rules. The Raw and Escaped rules use the same helper in both modes.
/// format_settings - only the CSV settings are used (for the CSV rule).
///
/// Rules other than Quoted, JSON, Raw, CSV and Escaped result in a BAD_ARGUMENTS exception.
template <bool read_string>
String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
{
    String result;
    switch (escaping_rule)
    {
        case FormatSettings::EscapingRule::Quoted:
            if constexpr (read_string)
                readQuotedString(result, buf);
            else
                readQuotedField(result, buf);
            break;
        case FormatSettings::EscapingRule::JSON:
            if constexpr (read_string)
                readJSONString(result, buf);
            else
                readJSONField(result, buf);
            break;
        case FormatSettings::EscapingRule::Raw:
            readString(result, buf);
            break;
        case FormatSettings::EscapingRule::CSV:
            if constexpr (read_string)
                readCSVString(result, buf, format_settings.csv);
            else
                readCSVField(result, buf, format_settings.csv);
            break;
        case FormatSettings::EscapingRule::Escaped:
            readEscapedString(result, buf);
            break;
        default:
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule));
    }
    return result;
}
/// Reads one value using the "field" flavour of the readers (readByEscapingRule with read_string = false).
String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
{
    constexpr bool read_as_string = false;
    return readByEscapingRule<read_as_string>(buf, escaping_rule, format_settings);
}
/// Reads one value using the "string" flavour of the readers (readByEscapingRule with read_string = true).
String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
{
    constexpr bool read_as_string = true;
    return readByEscapingRule<read_as_string>(buf, escaping_rule, format_settings);
}
2022-07-13 15:57:55 +00:00
/// Adjusts a list of inferred types in place so that values of "compatible" kinds get the same type.
/// The two lambdas below are handed to transformTypesRecursively together with `types`.
///
/// types    - inferred types, modified in place.
/// settings - try_infer_integers / try_infer_dates / try_infer_datetimes and the JSON-specific flags
///            decide which of the transformations are applied.
/// is_json  - enables the JSON-only transformations (numbers vs strings, bools vs numbers, maps vs objects).
/// numbers_parsed_from_json_strings - optional set of number types; when it is given and
///            json.read_numbers_as_strings is off, only numbers contained in this set are turned into String.
void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, bool is_json, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings = nullptr)
{
    /// Do nothing if we didn't try to infer something special.
    if (!settings.try_infer_integers && !settings.try_infer_dates && !settings.try_infer_datetimes && !is_json)
        return;

    auto transform_simple_types = [&](DataTypes & data_types)
    {
        /// If we have floats and integers convert them all to float.
        if (settings.try_infer_integers)
        {
            bool have_floats = false;
            bool have_integers = false;
            for (const auto & type : data_types)
            {
                have_floats |= isFloat(type);
                have_integers |= isInteger(type) && !isBool(type);
            }

            if (have_floats && have_integers)
            {
                for (auto & type : data_types)
                {
                    /// Bool is skipped here for the same reason it is not counted in have_integers above:
                    /// bools may become numbers only in the read_bools_as_numbers branch below.
                    if (isInteger(type) && !isBool(type))
                        type = std::make_shared<DataTypeFloat64>();
                }
            }
        }

        /// If we have only dates and datetimes, convert dates to datetime.
        /// If we have dates/datetimes and something else, convert them to String.
        /// The reason is a special case when both Date/DateTime and Int64 were inferred from strings,
        /// for example: "arr: ["2020-01-01", "2000"]" -> Tuple(Date, Int64).
        /// So if we have Date/DateTime and something else (not only String), we should
        /// convert Date/DateTime back to String, so that later we will be able to
        /// convert Int64 back to String as well.
        if (settings.try_infer_dates || settings.try_infer_datetimes)
        {
            bool have_dates = false;
            bool have_datetimes = false;
            bool all_dates_or_datetimes = true;

            for (const auto & type : data_types)
            {
                have_dates |= isDate(type);
                have_datetimes |= isDateTime64(type);
                all_dates_or_datetimes &= isDate(type) || isDateTime64(type);
            }

            if (!all_dates_or_datetimes && (have_dates || have_datetimes))
            {
                for (auto & type : data_types)
                {
                    if (isDate(type) || isDateTime64(type))
                        type = std::make_shared<DataTypeString>();
                }
            }
            else if (have_dates && have_datetimes)
            {
                for (auto & type : data_types)
                {
                    if (isDate(type))
                        type = std::make_shared<DataTypeDateTime64>(9);
                }
            }
        }

        if (!is_json)
            return;

        /// Check settings specific for JSON formats.

        /// If we have numbers and strings, convert numbers to strings.
        if (settings.json.try_infer_numbers_from_strings || settings.json.read_numbers_as_strings)
        {
            bool have_strings = false;
            bool have_numbers = false;
            for (const auto & type : data_types)
            {
                have_strings |= isString(type);
                have_numbers |= isNumber(type);
            }

            if (have_strings && have_numbers)
            {
                for (auto & type : data_types)
                {
                    /// A number becomes String when either all numbers may be read as strings,
                    /// or no set of "numbers parsed from strings" was provided,
                    /// or this particular type is present in that set.
                    if (isNumber(type)
                        && (settings.json.read_numbers_as_strings || !numbers_parsed_from_json_strings
                            || numbers_parsed_from_json_strings->contains(type.get())))
                        type = std::make_shared<DataTypeString>();
                }
            }
        }

        if (settings.json.read_bools_as_numbers)
        {
            /// Note that have_floats and have_integers both cannot be
            /// equal to true as in one of previous checks we convert
            /// integers to floats if we have both.
            bool have_floats = false;
            bool have_integers = false;
            bool have_bools = false;
            for (const auto & type : data_types)
            {
                have_floats |= isFloat(type);
                have_integers |= isInteger(type) && !isBool(type);
                have_bools |= isBool(type);
            }

            if (have_bools && (have_integers || have_floats))
            {
                for (auto & type : data_types)
                {
                    if (isBool(type))
                    {
                        if (have_integers)
                            type = std::make_shared<DataTypeInt64>();
                        else
                            type = std::make_shared<DataTypeFloat64>();
                    }
                }
            }
        }
    };

    auto transform_complex_types = [&](DataTypes & data_types)
    {
        if (!is_json)
            return;

        bool have_maps = false;
        bool have_objects = false;
        bool are_maps_equal = true;
        DataTypePtr first_map_type;

        for (const auto & type : data_types)
        {
            if (isMap(type))
            {
                if (!have_maps)
                {
                    /// Remember the first Map to compare all the following Maps with it.
                    first_map_type = type;
                    have_maps = true;
                }
                else
                {
                    are_maps_equal &= type->equals(*first_map_type);
                }
            }
            else if (isObject(type))
            {
                have_objects = true;
            }
        }

        /// Maps mixed with Objects, or Maps of different types, are all replaced with Object('json').
        if (have_maps && (have_objects || !are_maps_equal))
        {
            for (auto & type : data_types)
            {
                if (isMap(type))
                    type = std::make_shared<DataTypeObject>("json", true);
            }
        }
    };

    transformTypesRecursively(types, transform_simple_types, transform_complex_types);
}
/// Transforms the inferred types in place; the JSON-specific transformations are enabled
/// only when the escaping rule is JSON.
void transformInferredTypesIfNeeded(DataTypes & types, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
{
    const bool is_json = (escaping_rule == FormatSettings::EscapingRule::JSON);
    transformInferredTypesIfNeededImpl(types, settings, is_json);
}
/// Two-type convenience overload: both types are put into a temporary list, the list is transformed,
/// and the (possibly changed) types are written back into `first` and `second`.
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
{
    DataTypes both_types{first, second};
    transformInferredTypesIfNeeded(both_types, settings, escaping_rule);

    first = std::move(both_types[0]);
    second = std::move(both_types[1]);
}
2022-07-18 13:40:28 +00:00
/// Transforms the inferred types in place with the JSON-specific transformations always enabled.
/// numbers_parsed_from_json_strings is passed through to transformInferredTypesIfNeededImpl unchanged.
void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings)
{
    transformInferredTypesIfNeededImpl(types, settings, true, numbers_parsed_from_json_strings);
}
/// Two-type convenience overload of the JSON variant: both types are put into a temporary list,
/// the list is transformed, and the (possibly changed) types are written back.
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings)
{
    DataTypes both_types{first, second};
    transformInferredJSONTypesIfNeeded(both_types, settings);

    first = std::move(both_types[0]);
    second = std::move(both_types[1]);
}
bool tryInferDate(const std::string_view & field)
2022-07-13 15:57:55 +00:00
{
ReadBufferFromString buf(field);
DayNum tmp;
return tryReadDateText(tmp, buf) && buf.eof();
}
2022-07-13 15:57:55 +00:00
bool tryInferDateTime(const std::string_view & field, const FormatSettings & settings)
{
ReadBufferFromString buf(field);
Float64 tmp_float;
/// Check if it's just a number, and if so, don't try to infer DateTime from it,
/// because we can interpret this number as a timestamp and it will lead to
/// inferring DateTime instead of simple Int64/Float64 in some cases.
if (tryReadFloatText(tmp_float, buf) && buf.eof())
return false;
buf.seek(0, SEEK_SET); /// Return position to the beginning
DateTime64 tmp;
switch (settings.date_time_input_format)
2022-07-13 15:57:55 +00:00
{
case FormatSettings::DateTimeInputFormat::Basic:
if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof())
return true;
break;
case FormatSettings::DateTimeInputFormat::BestEffort:
if (tryParseDateTime64BestEffort(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
break;
case FormatSettings::DateTimeInputFormat::BestEffortUS:
if (tryParseDateTime64BestEffortUS(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
break;
2022-07-13 15:57:55 +00:00
}
return false;
}
/// Tries to interpret `field` as a date and then as a date-time, each attempt only if the
/// corresponding setting (try_infer_dates / try_infer_datetimes) is enabled.
/// Returns Nullable(Date), Nullable(DateTime64(9)), or nullptr if neither attempt succeeded.
DataTypePtr tryInferDateOrDateTime(const std::string_view & field, const FormatSettings & settings)
{
    /// Date is checked first, so a value accepted by both checks is reported as Date.
    if (settings.try_infer_dates && tryInferDate(field))
        return makeNullable(std::make_shared<DataTypeDate>());

    if (settings.try_infer_datetimes && tryInferDateTime(field, settings))
        return makeNullable(std::make_shared<DataTypeDateTime64>(9));

    return nullptr;
}
/// Recursively infers the data type of one value written in the "quoted" text representation,
/// starting at the current position of `buf`.
///
/// Recognized forms, in the order they are tried:
///   [a, b, ...]        -> Array of the least supertype of the elements
///   (a, b, ...)        -> Tuple of the element types (an empty tuple is rejected)
///   {k: v, ...}        -> Map of the least supertypes of keys and values
///   '...'              -> Date / DateTime64 if tryInferDateOrDateTime succeeds, otherwise String
///   true / false       -> Bool (case-insensitive)
///   NULL               -> Nothing (case-insensitive)
///   anything that tryReadFloatText accepts -> Int64 (only with try_infer_integers) or Float64
///
/// Returns nullptr if the value cannot be parsed or no common type exists.
/// On failure the position of `buf` is left wherever parsing stopped.
static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBufferFromString & buf, const FormatSettings & settings)
{
    if (buf.eof())
        return nullptr;

    /// Array
    if (checkChar('[', buf))
    {
        skipWhitespaceIfAny(buf);

        DataTypes nested_types;
        bool first = true;
        while (!buf.eof() && *buf.position() != ']')
        {
            if (!first)
            {
                skipWhitespaceIfAny(buf);
                if (!checkChar(',', buf))
                    return nullptr;
                skipWhitespaceIfAny(buf);
            }
            else
                first = false;

            auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings);
            if (!nested_type)
                return nullptr;

            nested_types.push_back(nested_type);
        }

        /// The closing bracket was not found.
        if (buf.eof())
            return nullptr;

        ++buf.position();

        if (nested_types.empty())
            return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());

        transformInferredTypesIfNeeded(nested_types, settings);

        auto least_supertype = tryGetLeastSupertype(nested_types);
        if (!least_supertype)
            return nullptr;

        return std::make_shared<DataTypeArray>(least_supertype);
    }

    /// Tuple
    if (checkChar('(', buf))
    {
        skipWhitespaceIfAny(buf);

        DataTypes nested_types;
        bool first = true;
        while (!buf.eof() && *buf.position() != ')')
        {
            if (!first)
            {
                skipWhitespaceIfAny(buf);
                if (!checkChar(',', buf))
                    return nullptr;
                skipWhitespaceIfAny(buf);
            }
            else
                first = false;

            auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings);
            if (!nested_type)
                return nullptr;

            nested_types.push_back(nested_type);
        }

        /// The closing parenthesis was not found, or the tuple has no elements.
        if (buf.eof() || nested_types.empty())
            return nullptr;

        ++buf.position();

        return std::make_shared<DataTypeTuple>(nested_types);
    }

    /// Map
    if (checkChar('{', buf))
    {
        skipWhitespaceIfAny(buf);

        DataTypes key_types;
        DataTypes value_types;
        bool first = true;
        while (!buf.eof() && *buf.position() != '}')
        {
            if (!first)
            {
                skipWhitespaceIfAny(buf);
                if (!checkChar(',', buf))
                    return nullptr;
                skipWhitespaceIfAny(buf);
            }
            else
                first = false;

            auto key_type = determineDataTypeForSingleFieldImpl(buf, settings);
            if (!key_type)
                return nullptr;

            key_types.push_back(key_type);

            skipWhitespaceIfAny(buf);
            if (!checkChar(':', buf))
                return nullptr;
            skipWhitespaceIfAny(buf);

            auto value_type = determineDataTypeForSingleFieldImpl(buf, settings);
            if (!value_type)
                return nullptr;

            value_types.push_back(value_type);
        }

        /// The closing brace was not found.
        if (buf.eof())
            return nullptr;

        ++buf.position();
        skipWhitespaceIfAny(buf);

        if (key_types.empty())
            return std::make_shared<DataTypeMap>(std::make_shared<DataTypeNothing>(), std::make_shared<DataTypeNothing>());

        transformInferredTypesIfNeeded(key_types, settings);
        transformInferredTypesIfNeeded(value_types, settings);

        auto key_least_supertype = tryGetLeastSupertype(key_types);
        auto value_least_supertype = tryGetLeastSupertype(value_types);
        if (!key_least_supertype || !value_least_supertype)
            return nullptr;

        if (!DataTypeMap::checkKeyType(key_least_supertype))
            return nullptr;

        return std::make_shared<DataTypeMap>(key_least_supertype, value_least_supertype);
    }

    /// String
    if (*buf.position() == '\'')
    {
        ++buf.position();

        /// Raw contents between the quotes (escape sequences are kept as written);
        /// it is only used for the Date/DateTime check below.
        String field;
        while (!buf.eof())
        {
            char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());
            field.append(buf.position(), next_pos);
            buf.position() = next_pos;

            if (!buf.hasPendingData())
                continue;

            if (*buf.position() == '\'')
                break;

            field.push_back(*buf.position());
            if (*buf.position() == '\\')
            {
                ++buf.position();
                /// Consume the escaped character together with the backslash. Otherwise an escaped
                /// quote (\') would be taken for the closing quote on the next iteration.
                if (!buf.eof())
                {
                    field.push_back(*buf.position());
                    ++buf.position();
                }
            }
        }

        /// The closing quote was not found.
        if (buf.eof())
            return nullptr;

        ++buf.position();

        if (auto type = tryInferDateOrDateTime(field, settings))
            return type;

        return std::make_shared<DataTypeString>();
    }

    /// Bool
    if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf))
        return DataTypeFactory::instance().get("Bool");

    /// Null
    if (checkStringCaseInsensitive("NULL", buf))
        return std::make_shared<DataTypeNothing>();

    /// Number
    Float64 tmp;
    auto * pos_before_float = buf.position();
    if (tryReadFloatText(tmp, buf))
    {
        if (settings.try_infer_integers)
        {
            /// Re-read the same text as an integer: it is an Int64 only if the integer parser
            /// consumes exactly the same characters as the float parser did.
            auto * float_end_pos = buf.position();
            buf.position() = pos_before_float;
            Int64 tmp_int;
            if (tryReadIntText(tmp_int, buf) && buf.position() == float_end_pos)
                return std::make_shared<DataTypeInt64>();

            buf.position() = float_end_pos;
        }

        return std::make_shared<DataTypeFloat64>();
    }

    return nullptr;
}
2022-07-13 15:57:55 +00:00
/// Infers the type of one value (see determineDataTypeForSingleFieldImpl) and passes the result
/// through makeNullableRecursivelyAndCheckForNothing.
static DataTypePtr determineDataTypeForSingleField(ReadBufferFromString & buf, const FormatSettings & settings)
{
    return makeNullableRecursivelyAndCheckForNothing(determineDataTypeForSingleFieldImpl(buf, settings));
}
/// Infers the data type of a single text field that was read with the given escaping rule.
///
/// field           - raw text of the field (for CSV it may still include the surrounding quotes).
/// format_settings - inference settings (try_infer_*, bool/null representations, csv/tsv flags).
/// escaping_rule   - Quoted, JSON, CSV, Raw or Escaped. Any other rule throws BAD_ARGUMENTS.
///
/// Returns the inferred type, or nullptr when the type cannot be determined from this field
/// (for example an empty field or the NULL representation).
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
{
    switch (escaping_rule)
    {
        case FormatSettings::EscapingRule::Quoted:
        {
            ReadBufferFromString buf(field);
            auto type = determineDataTypeForSingleField(buf, format_settings);
            /// The whole field must be consumed, otherwise the type is unknown.
            return buf.eof() ? type : nullptr;
        }
        case FormatSettings::EscapingRule::JSON:
            return JSONUtils::getDataTypeFromField(field, format_settings);
        case FormatSettings::EscapingRule::CSV:
        {
            if (!format_settings.csv.use_best_effort_in_schema_inference)
                return makeNullable(std::make_shared<DataTypeString>());

            if (field.empty() || field == format_settings.csv.null_representation)
                return nullptr;

            if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
                return DataTypeFactory::instance().get("Nullable(Bool)");

            if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"')))
            {
                auto data = std::string_view(field.data() + 1, field.size() - 2);
                if (auto date_type = tryInferDateOrDateTime(data, format_settings))
                    return date_type;

                ReadBufferFromString buf(data);
                /// Try to determine the type of value inside quotes
                auto type = determineDataTypeForSingleField(buf, format_settings);

                /// If there is some unread data in the buffer, the quoted value is a plain string.
                /// This is checked before the null check: ordinary text such as "hello" gives no type
                /// AND leaves unread data, and it must be a String (same as in the Escaped branch below).
                if (!buf.eof())
                    return makeNullable(std::make_shared<DataTypeString>());

                if (!type)
                    return nullptr;

                /// If it's a number or tuple in quotes, we determine it as a string.
                if (isNumber(removeNullable(type)) || isTuple(type))
                    return makeNullable(std::make_shared<DataTypeString>());

                return type;
            }

            /// Case when CSV value is not in quotes. Check if it's a number, and if not, determine it as a string.
            if (format_settings.try_infer_integers)
            {
                ReadBufferFromString buf(field);
                Int64 tmp_int;
                if (tryReadIntText(tmp_int, buf) && buf.eof())
                    return makeNullable(std::make_shared<DataTypeInt64>());
            }

            ReadBufferFromString buf(field);
            Float64 tmp;
            if (tryReadFloatText(tmp, buf) && buf.eof())
                return makeNullable(std::make_shared<DataTypeFloat64>());

            return makeNullable(std::make_shared<DataTypeString>());
        }
        case FormatSettings::EscapingRule::Raw: [[fallthrough]];
        case FormatSettings::EscapingRule::Escaped:
        {
            if (!format_settings.tsv.use_best_effort_in_schema_inference)
                return makeNullable(std::make_shared<DataTypeString>());

            if (field.empty() || field == format_settings.tsv.null_representation)
                return nullptr;

            if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
                return DataTypeFactory::instance().get("Nullable(Bool)");

            if (auto date_type = tryInferDateOrDateTime(field, format_settings))
                return date_type;

            ReadBufferFromString buf(field);
            auto type = determineDataTypeForSingleField(buf, format_settings);
            /// Anything that was not consumed completely is a plain string.
            if (!buf.eof())
                return makeNullable(std::make_shared<DataTypeString>());

            return type;
        }
        default:
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule));
    }
}
/// Infers a type for every field independently (see determineDataTypeByEscapingRule).
/// The result has one entry per input field, in the same order; entries may be nullptr.
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
{
    const size_t fields_count = fields.size();

    DataTypes inferred_types;
    inferred_types.reserve(fields_count);

    for (size_t i = 0; i < fields_count; ++i)
        inferred_types.push_back(determineDataTypeByEscapingRule(fields[i], format_settings, escaping_rule));

    return inferred_types;
}
/// Returns the default type for a field with the given escaping rule:
/// Nullable(String) for CSV, Escaped and Raw, and nullptr (no default) for every other rule.
DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule)
{
    using Rule = FormatSettings::EscapingRule;

    const bool has_string_default
        = escaping_rule == Rule::CSV || escaping_rule == Rule::Escaped || escaping_rule == Rule::Raw;

    if (!has_string_default)
        return nullptr;

    return makeNullable(std::make_shared<DataTypeString>());
}
/// Returns the default type (see getDefaultDataTypeForEscapingRule) for every rule in the list.
/// The result has one entry per rule, in the same order; entries may be nullptr.
DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules)
{
    DataTypes data_types;
    /// The final size is known in advance, so allocate once (same as determineDataTypesByEscapingRule).
    data_types.reserve(escaping_rules.size());
    for (const auto & rule : escaping_rules)
        data_types.push_back(getDefaultDataTypeForEscapingRule(rule));
    return data_types;
}
/// Builds a textual "name=value, ..." list of the settings that influence schema inference
/// for the given escaping rule: first the settings common for all text formats, then the
/// rule-specific ones (nothing is added for Quoted, XML and None).
/// NOTE(review): presumably the result is used to tell apart schemas inferred under different
/// settings (e.g. as part of a cache key) - confirm with the callers.
String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
{
    String result;
    /// First, settings that are common for all text formats:
    result = fmt::format(
        "schema_inference_hints={}, try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}, max_rows_to_read_for_schema_inference={}",
        settings.schema_inference_hints,
        settings.try_infer_integers,
        settings.try_infer_dates,
        settings.try_infer_datetimes,
        settings.max_rows_to_read_for_schema_inference);

    /// Second, format-specific settings:
    switch (escaping_rule)
    {
        case FormatSettings::EscapingRule::Escaped:
        case FormatSettings::EscapingRule::Raw:
            result += fmt::format(
                ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={}, null_representation={}",
                settings.tsv.use_best_effort_in_schema_inference,
                settings.bool_true_representation,
                settings.bool_false_representation,
                settings.tsv.null_representation);
            break;
        case FormatSettings::EscapingRule::CSV:
            result += fmt::format(
                ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={},"
                " null_representation={}, delimiter={}, tuple_delimiter={}",
                settings.csv.use_best_effort_in_schema_inference,
                settings.bool_true_representation,
                settings.bool_false_representation,
                settings.csv.null_representation,
                settings.csv.delimiter,
                settings.csv.tuple_delimiter);
            break;
        case FormatSettings::EscapingRule::JSON:
            /// read_numbers_as_strings is listed as well: the type transformation in this file
            /// checks it together with try_infer_numbers_from_strings, so it changes the inferred schema.
            result += fmt::format(
                ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_numbers_as_strings={}",
                settings.json.try_infer_numbers_from_strings,
                settings.json.read_bools_as_numbers,
                settings.json.read_numbers_as_strings);
            break;
        default:
            break;
    }

    return result;
}
/// Validates the delimiter that follows a field. Only the Escaped rule is restricted:
/// after a String field (or a skipped field) the delimiter must start with '\t' or '\n',
/// otherwise a BAD_ARGUMENTS exception is thrown.
void checkSupportedDelimiterAfterField(FormatSettings::EscapingRule escaping_rule, const String & delimiter, const DataTypePtr & type)
{
    if (escaping_rule != FormatSettings::EscapingRule::Escaped)
        return;

    if (!delimiter.empty())
    {
        const char first_char = delimiter.front();
        if (first_char == '\t' || first_char == '\n')
            return;
    }

    /// Nullptr means that field is skipped and it's equivalent to String
    const bool is_string_field = !type || isString(removeNullable(removeLowCardinality(type)));
    if (is_string_field)
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "'Escaped' serialization requires delimiter after String field to start with '\\t' or '\\n'");
}
2021-11-09 13:14:07 +00:00
}