Refactor and improve schema inference for text formats

This commit is contained in:
avogar 2022-12-07 21:19:27 +00:00
parent d4cd53ccea
commit 7375a7d429
47 changed files with 1545 additions and 992 deletions

View File

@ -3434,6 +3434,13 @@ Use schema from cache for URL with last modification time validation (for urls w
Default value: `true`.
## schema_inference_make_columns_nullable {#schema_inference_make_columns_nullable}
Controls making inferred types `Nullable` in schema inference for formats without information about nullability.
The inferred type will be `Nullable` only if the column contains `NULL` in a sample that is parsed during schema inference.
Default value: `true`.
## use_structure_from_insertion_table_in_table_functions {#use_structure_from_insertion_table_in_table_functions}
Use structure from insertion table instead of schema inference from data.

View File

@ -759,6 +759,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \
M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
M(String, schema_inference_hints, "", "The list of column names and types to use in schema inference for formats without column names. The format: 'column_name1 column_type1, column_name2 column_type2, ...'", 0) \
M(Bool, schema_inference_make_columns_nullable, true, "Controls making inferred types Nullable in schema inference for formats without information about nullability.", 0) \
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \
M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \

View File

@ -47,6 +47,7 @@ void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &
bool have_tuple = false;
bool all_tuples = true;
size_t tuple_size = 0;
bool sizes_are_equal = true;
std::vector<DataTypes> nested_types;
@ -62,7 +63,10 @@ void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &
nested_types[elem_idx].reserve(types.size());
}
else if (tuple_size != type_tuple->getElements().size())
return;
{
sizes_are_equal = false;
break;
}
have_tuple = true;
@ -75,7 +79,7 @@ void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &
if (have_tuple)
{
if (all_tuples)
if (all_tuples && sizes_are_equal)
{
std::vector<DataTypes> transposed_nested_types(types.size());
for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx)
@ -168,6 +172,9 @@ void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &
types[i] = nested_types[i];
}
if (transform_complex_types)
transform_complex_types(types);
return;
}
}

View File

@ -1,21 +1,11 @@
#include <Formats/EscapingRuleUtils.h>
#include <Formats/JSONUtils.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/transformTypesRecursively.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
@ -261,542 +251,76 @@ String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule e
return readByEscapingRule<true>(buf, escaping_rule, format_settings);
}
/// Adjusts a group of inferred types in place so that types produced by different
/// inference heuristics agree with each other.
///
/// The work is delegated to transformTypesRecursively, which is given two callbacks:
///  - transform_simple_types: integers vs floats, dates vs datetimes and, for JSON only,
///    numbers vs strings and bools vs numbers;
///  - transform_complex_types: for JSON only, maps vs objects.
///
/// @param types    Types to adjust; modified in place.
/// @param settings Format settings that enable the individual heuristics.
/// @param is_json  Whether JSON-specific adjustments should be applied.
/// @param numbers_parsed_from_json_strings
///                 Optional set of number types that were inferred from JSON strings.
///                 When it is given (and read_numbers_as_strings is off), only numbers from
///                 this set are converted back to String; when it is nullptr, every number is.
void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, bool is_json, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings = nullptr)
{
    /// Do nothing if we didn't try to infer something special.
    if (!settings.try_infer_integers && !settings.try_infer_dates && !settings.try_infer_datetimes && !is_json)
        return;

    auto transform_simple_types = [&](DataTypes & data_types)
    {
        /// If we have floats and integers convert them all to float.
        if (settings.try_infer_integers)
        {
            bool have_floats = false;
            bool have_integers = false;
            for (const auto & type : data_types)
            {
                have_floats |= isFloat(type);
                have_integers |= isInteger(type) && !isBool(type);
            }

            if (have_floats && have_integers)
            {
                for (auto & type : data_types)
                {
                    /// Bool is deliberately left untouched here, exactly as it is excluded from
                    /// have_integers above: bools are turned into numbers only by the
                    /// read_bools_as_numbers step below.
                    if (isInteger(type) && !isBool(type))
                        type = std::make_shared<DataTypeFloat64>();
                }
            }
        }

        /// If we have only dates and datetimes, convert dates to datetime.
        /// If we have dates/datetimes and something else, convert them back to string.
        /// The reason is a special case when we inferred both Date/DateTime and Int64 from Strings,
        /// for example: "arr: ["2020-01-01", "2000"]" -> Tuple(Date, Int64),
        /// so if we have Date/DateTime and something else (not only String) we should
        /// convert Date/DateTime back to String, so then we will be able to
        /// convert Int64 back to String as well.
        if (settings.try_infer_dates || settings.try_infer_datetimes)
        {
            bool have_dates = false;
            bool have_datetimes = false;
            bool all_dates_or_datetimes = true;

            for (const auto & type : data_types)
            {
                have_dates |= isDate(type);
                have_datetimes |= isDateTime64(type);
                all_dates_or_datetimes &= isDate(type) || isDateTime64(type);
            }

            if (!all_dates_or_datetimes && (have_dates || have_datetimes))
            {
                for (auto & type : data_types)
                {
                    if (isDate(type) || isDateTime64(type))
                        type = std::make_shared<DataTypeString>();
                }
            }
            else if (have_dates && have_datetimes)
            {
                for (auto & type : data_types)
                {
                    if (isDate(type))
                        type = std::make_shared<DataTypeDateTime64>(9);
                }
            }
        }

        if (!is_json)
            return;

        /// Check settings specific for JSON formats.

        /// If we have numbers and strings, convert numbers to strings.
        if (settings.json.try_infer_numbers_from_strings || settings.json.read_numbers_as_strings)
        {
            bool have_strings = false;
            bool have_numbers = false;
            for (const auto & type : data_types)
            {
                have_strings |= isString(type);
                have_numbers |= isNumber(type);
            }

            if (have_strings && have_numbers)
            {
                for (auto & type : data_types)
                {
                    if (isNumber(type)
                        && (settings.json.read_numbers_as_strings || !numbers_parsed_from_json_strings
                            || numbers_parsed_from_json_strings->contains(type.get())))
                        type = std::make_shared<DataTypeString>();
                }
            }
        }

        if (settings.json.read_bools_as_numbers)
        {
            /// Note that have_floats and have_integers both cannot be
            /// equal to true as in one of previous checks we convert
            /// integers to floats if we have both.
            bool have_floats = false;
            bool have_integers = false;
            bool have_bools = false;
            for (const auto & type : data_types)
            {
                have_floats |= isFloat(type);
                have_integers |= isInteger(type) && !isBool(type);
                have_bools |= isBool(type);
            }

            if (have_bools && (have_integers || have_floats))
            {
                for (auto & type : data_types)
                {
                    if (isBool(type))
                    {
                        if (have_integers)
                            type = std::make_shared<DataTypeInt64>();
                        else
                            type = std::make_shared<DataTypeFloat64>();
                    }
                }
            }
        }
    };

    auto transform_complex_types = [&](DataTypes & data_types)
    {
        if (!is_json)
            return;

        /// If there are Maps together with Objects, or Maps that differ from each other,
        /// replace every Map with Object('json').
        bool have_maps = false;
        bool have_objects = false;
        bool are_maps_equal = true;
        DataTypePtr first_map_type;
        for (const auto & type : data_types)
        {
            if (isMap(type))
            {
                if (!have_maps)
                {
                    first_map_type = type;
                    have_maps = true;
                }
                else
                {
                    are_maps_equal &= type->equals(*first_map_type);
                }
            }
            else if (isObject(type))
            {
                have_objects = true;
            }
        }

        if (have_maps && (have_objects || !are_maps_equal))
        {
            for (auto & type : data_types)
            {
                if (isMap(type))
                    type = std::make_shared<DataTypeObject>("json", true);
            }
        }
    };

    transformTypesRecursively(types, transform_simple_types, transform_complex_types);
}
/// Adjusts a group of inferred types in place.
/// JSON-specific adjustments are enabled only when the escaping rule is JSON.
void transformInferredTypesIfNeeded(DataTypes & types, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
{
    const bool json_rules_apply = (escaping_rule == FormatSettings::EscapingRule::JSON);
    transformInferredTypesIfNeededImpl(types, settings, json_rules_apply);
}
/// Two-type convenience overload: packs both types into a temporary list,
/// runs the list-based transformation and writes the results back.
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
{
    DataTypes type_pair{first, second};

    transformInferredTypesIfNeeded(type_pair, settings, escaping_rule);

    first = std::move(type_pair[0]);
    second = std::move(type_pair[1]);
}
/// Same as transformInferredTypesIfNeeded, but always applies the JSON-specific adjustments.
/// numbers_parsed_from_json_strings is forwarded unchanged to the implementation.
void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings)
{
    constexpr bool json_rules_apply = true;
    transformInferredTypesIfNeededImpl(types, settings, json_rules_apply, numbers_parsed_from_json_strings);
}
/// Two-type convenience overload of the JSON transformation: packs both types into a
/// temporary list, transforms the list and writes the results back.
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings)
{
    DataTypes type_pair{first, second};

    transformInferredJSONTypesIfNeeded(type_pair, settings);

    first = std::move(type_pair[0]);
    second = std::move(type_pair[1]);
}
bool tryInferDate(const std::string_view & field)
{
ReadBufferFromString buf(field);
DayNum tmp;
return tryReadDateText(tmp, buf) && buf.eof();
}
bool tryInferDateTime(const std::string_view & field, const FormatSettings & settings)
{
if (field.empty())
return false;
ReadBufferFromString buf(field);
Float64 tmp_float;
/// Check if it's just a number, and if so, don't try to infer DateTime from it,
/// because we can interpret this number as a timestamp and it will lead to
/// inferring DateTime instead of simple Int64/Float64 in some cases.
if (tryReadFloatText(tmp_float, buf) && buf.eof())
return false;
buf.seek(0, SEEK_SET); /// Return position to the beginning
DateTime64 tmp;
switch (settings.date_time_input_format)
{
case FormatSettings::DateTimeInputFormat::Basic:
if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof())
return true;
break;
case FormatSettings::DateTimeInputFormat::BestEffort:
if (tryParseDateTime64BestEffort(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
break;
case FormatSettings::DateTimeInputFormat::BestEffortUS:
if (tryParseDateTime64BestEffortUS(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
break;
}
return false;
}
/// Tries to infer Nullable(Date) and then Nullable(DateTime64(9)) from the field,
/// each attempt being made only if the corresponding setting is enabled.
/// Returns nullptr if neither attempt succeeds.
DataTypePtr tryInferDateOrDateTime(const std::string_view & field, const FormatSettings & settings)
{
    const bool is_date = settings.try_infer_dates && tryInferDate(field);
    if (is_date)
        return makeNullable(std::make_shared<DataTypeDate>());

    const bool is_datetime = settings.try_infer_datetimes && tryInferDateTime(field, settings);
    if (is_datetime)
        return makeNullable(std::make_shared<DataTypeDateTime64>(9));

    return nullptr;
}
/// Tries to determine the type of a single value starting at the current buffer position
/// and advances the buffer past the value. Calls itself recursively for nested values.
///
/// Recognized forms, checked in this order:
///  - `[a, b, ...]`      -> Array(least supertype of elements), or Array(Nothing) if empty;
///  - `(a, b, ...)`      -> Tuple(element types); an empty tuple is not accepted;
///  - `{k: v, ...}`      -> Map(least supertype of keys, least supertype of values),
///                          or Map(Nothing, Nothing) if empty;
///  - `'...'`            -> Date/DateTime64 if tryInferDateOrDateTime succeeds, otherwise String;
///  - true/false (case insensitive) -> Bool;
///  - NULL (case insensitive)       -> Nothing;
///  - a number           -> Int64 if try_infer_integers is enabled and the integer parser consumes
///                          exactly the same characters as the float parser, otherwise Float64.
///
/// Returns nullptr if the value cannot be parsed or no common type exists.
/// The caller is responsible for checking that the whole buffer was consumed.
static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBufferFromString & buf, const FormatSettings & settings)
{
    if (buf.eof())
        return nullptr;

    /// Array
    if (checkChar('[', buf))
    {
        skipWhitespaceIfAny(buf);

        DataTypes nested_types;
        bool first = true;
        while (!buf.eof() && *buf.position() != ']')
        {
            if (!first)
            {
                skipWhitespaceIfAny(buf);
                if (!checkChar(',', buf))
                    return nullptr;
                skipWhitespaceIfAny(buf);
            }
            else
                first = false;

            auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings);
            if (!nested_type)
                return nullptr;

            nested_types.push_back(nested_type);

            /// Allow whitespace between the last element and the closing bracket, e.g. "[1, 2 ]".
            skipWhitespaceIfAny(buf);
        }

        if (buf.eof())
            return nullptr;

        ++buf.position();

        if (nested_types.empty())
            return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());

        transformInferredTypesIfNeeded(nested_types, settings);

        auto least_supertype = tryGetLeastSupertype(nested_types);
        if (!least_supertype)
            return nullptr;

        return std::make_shared<DataTypeArray>(least_supertype);
    }

    /// Tuple
    if (checkChar('(', buf))
    {
        skipWhitespaceIfAny(buf);

        DataTypes nested_types;
        bool first = true;
        while (!buf.eof() && *buf.position() != ')')
        {
            if (!first)
            {
                skipWhitespaceIfAny(buf);
                if (!checkChar(',', buf))
                    return nullptr;
                skipWhitespaceIfAny(buf);
            }
            else
                first = false;

            auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings);
            if (!nested_type)
                return nullptr;

            nested_types.push_back(nested_type);

            /// Allow whitespace between the last element and the closing parenthesis.
            skipWhitespaceIfAny(buf);
        }

        if (buf.eof() || nested_types.empty())
            return nullptr;

        ++buf.position();

        return std::make_shared<DataTypeTuple>(nested_types);
    }

    /// Map
    if (checkChar('{', buf))
    {
        skipWhitespaceIfAny(buf);

        DataTypes key_types;
        DataTypes value_types;
        bool first = true;
        while (!buf.eof() && *buf.position() != '}')
        {
            if (!first)
            {
                skipWhitespaceIfAny(buf);
                if (!checkChar(',', buf))
                    return nullptr;
                skipWhitespaceIfAny(buf);
            }
            else
                first = false;

            auto key_type = determineDataTypeForSingleFieldImpl(buf, settings);
            if (!key_type)
                return nullptr;

            key_types.push_back(key_type);

            skipWhitespaceIfAny(buf);
            if (!checkChar(':', buf))
                return nullptr;
            skipWhitespaceIfAny(buf);

            auto value_type = determineDataTypeForSingleFieldImpl(buf, settings);
            if (!value_type)
                return nullptr;

            value_types.push_back(value_type);

            /// Allow whitespace between the last value and the closing brace.
            skipWhitespaceIfAny(buf);
        }

        if (buf.eof())
            return nullptr;

        ++buf.position();
        skipWhitespaceIfAny(buf);

        if (key_types.empty())
            return std::make_shared<DataTypeMap>(std::make_shared<DataTypeNothing>(), std::make_shared<DataTypeNothing>());

        transformInferredTypesIfNeeded(key_types, settings);
        transformInferredTypesIfNeeded(value_types, settings);

        auto key_least_supertype = tryGetLeastSupertype(key_types);
        auto value_least_supertype = tryGetLeastSupertype(value_types);
        if (!key_least_supertype || !value_least_supertype)
            return nullptr;

        if (!DataTypeMap::checkKeyType(key_least_supertype))
            return nullptr;

        return std::make_shared<DataTypeMap>(key_least_supertype, value_least_supertype);
    }

    /// String
    if (*buf.position() == '\'')
    {
        ++buf.position();

        /// Collect the raw contents between the quotes (escape sequences are kept as written);
        /// they are only used to try to infer Date/DateTime below.
        String field;
        while (!buf.eof())
        {
            char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());
            field.append(buf.position(), next_pos);
            buf.position() = next_pos;

            if (!buf.hasPendingData())
                continue;

            if (*buf.position() == '\'')
                break;

            /// Here the current character is a backslash. Consume it together with the
            /// character it escapes, so that an escaped quote (\') is not mistaken
            /// for the end of the string.
            field.push_back(*buf.position());
            ++buf.position();
            if (!buf.eof())
            {
                field.push_back(*buf.position());
                ++buf.position();
            }
        }

        /// No closing quote.
        if (buf.eof())
            return nullptr;

        ++buf.position();

        if (auto type = tryInferDateOrDateTime(field, settings))
            return type;

        return std::make_shared<DataTypeString>();
    }

    /// Bool
    if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf))
        return DataTypeFactory::instance().get("Bool");

    /// Null
    if (checkStringCaseInsensitive("NULL", buf))
        return std::make_shared<DataTypeNothing>();

    /// Number
    Float64 tmp;
    auto * pos_before_float = buf.position();
    if (tryReadFloatText(tmp, buf))
    {
        if (settings.try_infer_integers)
        {
            /// Re-read the same text as an integer: it is an integer only if the integer
            /// parser stops exactly where the float parser stopped.
            auto * float_end_pos = buf.position();
            buf.position() = pos_before_float;
            Int64 tmp_int;
            if (tryReadIntText(tmp_int, buf) && buf.position() == float_end_pos)
                return std::make_shared<DataTypeInt64>();

            buf.position() = float_end_pos;
        }

        return std::make_shared<DataTypeFloat64>();
    }

    return nullptr;
}
/// Infers the type of a single value from the buffer and passes the result through
/// makeNullableRecursivelyAndCheckForNothing.
static DataTypePtr determineDataTypeForSingleField(ReadBufferFromString & buf, const FormatSettings & settings)
{
    auto inferred_type = determineDataTypeForSingleFieldImpl(buf, settings);
    return makeNullableRecursivelyAndCheckForNothing(std::move(inferred_type));
}
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info)
{
switch (escaping_rule)
{
case FormatSettings::EscapingRule::Quoted:
{
ReadBufferFromString buf(field);
auto type = determineDataTypeForSingleField(buf, format_settings);
return buf.eof() ? type : nullptr;
}
return tryInferDataTypeForSingleField(field, format_settings);
case FormatSettings::EscapingRule::JSON:
return JSONUtils::getDataTypeFromField(field, format_settings);
return tryInferDataTypeForSingleJSONField(field, format_settings, json_info);
case FormatSettings::EscapingRule::CSV:
{
if (!format_settings.csv.use_best_effort_in_schema_inference)
return makeNullable(std::make_shared<DataTypeString>());
return std::make_shared<DataTypeString>();
if (field.empty() || field == format_settings.csv.null_representation)
if (field.empty())
return nullptr;
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return DataTypeFactory::instance().get("Nullable(Bool)");
if (field == format_settings.csv.null_representation)
return makeNullable(std::make_shared<DataTypeNothing>());
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return DataTypeFactory::instance().get("Bool");
/// In CSV complex types are serialized in quotes. If we have quotes, we should try to infer type
/// from data inside quotes.
if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"')))
{
auto data = std::string_view(field.data() + 1, field.size() - 2);
if (auto date_type = tryInferDateOrDateTime(data, format_settings))
/// First, try to infer dates and datetimes.
if (auto date_type = tryInferDateOrDateTimeFromString(data, format_settings))
return date_type;
ReadBufferFromString buf(data);
/// Try to determine the type of value inside quotes
auto type = determineDataTypeForSingleField(buf, format_settings);
auto type = tryInferDataTypeForSingleField(data, format_settings);
if (!type)
return nullptr;
/// If it's a number or tuple in quotes or there is some unread data in buffer, we determine it as a string.
if (isNumber(removeNullable(type)) || isTuple(type) || !buf.eof())
return makeNullable(std::make_shared<DataTypeString>());
/// If we couldn't infer any type or it's a number or tuple in quotes, we determine it as a string.
if (!type || isNumber(removeNullable(type)) || isTuple(type))
return std::make_shared<DataTypeString>();
return type;
}
/// Case when CSV value is not in quotes. Check if it's a number, and if not, determine it's as a string.
if (format_settings.try_infer_integers)
{
ReadBufferFromString buf(field);
Int64 tmp_int;
if (tryReadIntText(tmp_int, buf) && buf.eof())
return makeNullable(std::make_shared<DataTypeInt64>());
}
auto type = tryInferNumberFromString(field, format_settings);
ReadBufferFromString buf(field);
Float64 tmp;
if (tryReadFloatText(tmp, buf) && buf.eof())
return makeNullable(std::make_shared<DataTypeFloat64>());
if (!type)
return std::make_shared<DataTypeString>();
return makeNullable(std::make_shared<DataTypeString>());
return type;
}
case FormatSettings::EscapingRule::Raw: [[fallthrough]];
case FormatSettings::EscapingRule::Escaped:
{
if (!format_settings.tsv.use_best_effort_in_schema_inference)
return makeNullable(std::make_shared<DataTypeString>());
return std::make_shared<DataTypeString>();
if (field.empty() || field == format_settings.tsv.null_representation)
if (field.empty())
return nullptr;
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return DataTypeFactory::instance().get("Nullable(Bool)");
if (field == format_settings.tsv.null_representation)
return makeNullable(std::make_shared<DataTypeNothing>());
if (auto date_type = tryInferDateOrDateTime(field, format_settings))
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return DataTypeFactory::instance().get("Bool");
if (auto date_type = tryInferDateOrDateTimeFromString(field, format_settings))
return date_type;
ReadBufferFromString buf(field);
auto type = determineDataTypeForSingleField(buf, format_settings);
if (!buf.eof())
return makeNullable(std::make_shared<DataTypeString>());
auto type = tryInferDataTypeForSingleField(field, format_settings);
if (!type)
return std::make_shared<DataTypeString>();
return type;
}
default:
@ -804,15 +328,34 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe
}
}
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
DataTypes tryInferDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info)
{
DataTypes data_types;
data_types.reserve(fields.size());
for (const auto & field : fields)
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule));
data_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, escaping_rule, json_info));
return data_types;
}
/// Dispatches the transformation of two inferred types according to the escaping rule:
/// JSON uses the JSON-specific transformation (with json_info), the text rules
/// (Escaped, Raw, Quoted, CSV) use the generic one. Any other rule is an error.
void transformInferredTypesByEscapingRuleIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info)
{
    using Rule = FormatSettings::EscapingRule;

    if (escaping_rule == Rule::JSON)
    {
        transformInferredJSONTypesIfNeeded(first, second, settings, json_info);
        return;
    }

    const bool is_text_rule = escaping_rule == Rule::Escaped
        || escaping_rule == Rule::Raw
        || escaping_rule == Rule::Quoted
        || escaping_rule == Rule::CSV;

    if (is_text_rule)
    {
        transformInferredTypesIfNeeded(first, second, settings);
        return;
    }

    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot transform inferred types for value with {} escaping rule", escapingRuleToString(escaping_rule));
}
DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule)
{
switch (escaping_rule)
@ -820,7 +363,7 @@ DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escap
case FormatSettings::EscapingRule::CSV:
case FormatSettings::EscapingRule::Escaped:
case FormatSettings::EscapingRule::Raw:
return makeNullable(std::make_shared<DataTypeString>());
return std::make_shared<DataTypeString>();
default:
return nullptr;
}
@ -837,9 +380,10 @@ DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::E
String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings)
{
return fmt::format(
"schema_inference_hints={}, max_rows_to_read_for_schema_inference={}",
"schema_inference_hints={}, max_rows_to_read_for_schema_inference={}, schema_inference_make_columns_nullable={}",
settings.schema_inference_hints,
settings.max_rows_to_read_for_schema_inference);
settings.max_rows_to_read_for_schema_inference,
settings.schema_inference_make_columns_nullable);
}
String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
@ -876,7 +420,11 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
settings.csv.tuple_delimiter);
break;
case FormatSettings::EscapingRule::JSON:
result += fmt::format(", try_infer_numbers_from_strings={}, read_bools_as_numbers={}", settings.json.try_infer_numbers_from_strings, settings.json.read_bools_as_numbers);
result += fmt::format(
", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, try_infer_objects={}",
settings.json.try_infer_numbers_from_strings,
settings.json.read_bools_as_numbers,
settings.json.try_infer_objects);
break;
default:
break;

View File

@ -1,6 +1,7 @@
#pragma once
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/ISerialization.h>
#include <IO/ReadBuffer.h>
@ -38,45 +39,17 @@ String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule es
/// Try to determine the type of the field written by a specific escaping rule.
/// If cannot, return nullptr.
/// - For Quoted escaping rule we can interpret a single field as a constant
/// expression and get its type by evaluating this expression.
/// - For JSON escaping rule we can use JSON parser to parse a single field
/// and then convert JSON type of this field to ClickHouse type.
/// - For CSV escaping rule we can do the next:
/// - If the field is an unquoted string, then we try to parse it as a number,
/// and if we cannot, treat it as a String.
/// - If the field is a string in quotes, then we try to use some
/// tweaks and heuristics to determine the type inside quotes, and if we can't or
/// the result is a number or tuple (we don't parse numbers in quotes and don't
/// support tuples in CSV) we treat it as a String.
/// - If input_format_csv_use_best_effort_in_schema_inference is disabled, we
/// treat everything as a string.
/// - For TSV and TSVRaw we try to use some tweaks and heuristics to determine the type
/// of value if setting input_format_tsv_use_best_effort_in_schema_inference is enabled,
/// otherwise we treat everything as a string.
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule);
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule);
/// See tryInferDataTypeForSingle(JSON)Field in SchemaInferenceUtils.h
DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info = nullptr);
DataTypes tryInferDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info = nullptr);
/// Check if we need to transform types inferred from data and transform it if necessary.
/// See transformInferred(JSON)TypesIfNeeded in SchemaInferenceUtils.h
void transformInferredTypesByEscapingRuleIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info = nullptr);
DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule);
DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules);
/// Try to infer Date or Datetime from string if corresponding settings are enabled.
DataTypePtr tryInferDateOrDateTime(const std::string_view & field, const FormatSettings & settings);
/// Check if we need to transform types inferred from data and transform it if necessary.
/// It's used when we try to infer some not ordinary types from another types.
/// For example dates from strings, we should check if dates were inferred from all strings
/// in the same way and if not, transform inferred dates back to strings.
/// For example, if we have array of strings and we tried to infer dates from them,
/// to make the result type Array(Date) we should ensure that all strings were
/// successfully parsed as dates and if not, convert all dates back to strings and make result type Array(String).
void transformInferredTypesIfNeeded(DataTypes & types, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule = FormatSettings::EscapingRule::Escaped);
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule = FormatSettings::EscapingRule::Escaped);
/// Same as transformInferredTypesIfNeeded but takes into account settings that are special for JSON formats.
void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings = nullptr);
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings);
String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings);
String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule);

View File

@ -168,6 +168,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference;
format_settings.schema_inference_hints = settings.schema_inference_hints;
format_settings.schema_inference_make_columns_nullable = settings.schema_inference_make_columns_nullable;
format_settings.mysql_dump.table_name = settings.input_format_mysql_dump_table_name;
format_settings.mysql_dump.map_column_names = settings.input_format_mysql_dump_map_column_names;
format_settings.sql_insert.max_batch_size = settings.output_format_sql_insert_max_batch_size;

View File

@ -71,6 +71,8 @@ struct FormatSettings
Raw
};
bool schema_inference_make_columns_nullable = true;
DateTimeOutputFormat date_time_output_format = DateTimeOutputFormat::Simple;
bool input_format_ipv4_default_on_conversion_error = false;

View File

@ -6,19 +6,13 @@
#include <IO/WriteBufferValidUTF8.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeFactory.h>
#include <Common/JSONParsers/SimdJSONParser.h>
#include <Common/JSONParsers/RapidJSONParser.h>
#include <Common/JSONParsers/DummyJSONParser.h>
#include <base/find_symbols.h>
#include <Common/logger_useful.h>
namespace DB
{
@ -122,196 +116,6 @@ namespace JSONUtils
return {loadAtPosition(in, memory, pos), number_of_rows};
}
/// Reads data from `in` into a temporary buffer using the JSONEachRow segmentation helper
/// and returns the buffered bytes as a String.
/// NOTE(review): the trailing arguments (0, 1, 1) presumably limit the helper to a single row —
/// confirm against the signature of fileSegmentationEngineJSONEachRowImpl.
template <const char opening_bracket, const char closing_bracket>
static String readJSONEachRowLineIntoStringImpl(ReadBuffer & in)
{
    Memory row_buffer;
    fileSegmentationEngineJSONEachRowImpl<opening_bracket, closing_bracket>(in, row_buffer, 0, 1, 1);

    const char * begin = row_buffer.data();
    return String(begin, begin + row_buffer.size());
}
/// Infers a data type from a parsed JSON element.
///
///  - null                 -> nullptr (type cannot be determined);
///  - bool                 -> Nullable(Bool);
///  - integer              -> Nullable(Int64) if try_infer_integers is enabled, otherwise Nullable(Float64);
///  - double               -> Nullable(Float64);
///  - string               -> Date/DateTime64 if tryInferDateOrDateTime succeeds; otherwise, if
///                            json.try_infer_numbers_from_strings is enabled and the whole string is
///                            a number, that number type; otherwise Nullable(String);
///  - array                -> Array if all element types are (or can be transformed to be) equal,
///                            otherwise Tuple; nullptr for an empty array or an undeterminable element;
///  - object               -> Map(String, T) if all value types are equal after transformation;
///                            otherwise Object('json') if json.try_infer_objects is enabled, else nullptr.
///
/// @param numbers_parsed_from_json_strings
///        Receives the addresses of number types that were inferred from JSON strings, so that
///        transformInferredJSONTypesIfNeeded can later turn exactly those back into String.
/// @throws Exception(INCORRECT_DATA) if the element is none of the kinds above.
template <class Element>
DataTypePtr getDataTypeFromFieldImpl(const Element & field, const FormatSettings & settings, std::unordered_set<const IDataType *> & numbers_parsed_from_json_strings)
{
    if (field.isNull())
        return nullptr;

    if (field.isBool())
        return DataTypeFactory::instance().get("Nullable(Bool)");

    if (field.isInt64() || field.isUInt64())
    {
        if (settings.try_infer_integers)
            return makeNullable(std::make_shared<DataTypeInt64>());

        return makeNullable(std::make_shared<DataTypeFloat64>());
    }

    if (field.isDouble())
        return makeNullable(std::make_shared<DataTypeFloat64>());

    if (field.isString())
    {
        if (auto date_type = tryInferDateOrDateTime(field.getString(), settings))
            return date_type;

        if (!settings.json.try_infer_numbers_from_strings)
            return makeNullable(std::make_shared<DataTypeString>());

        ReadBufferFromString buf(field.getString());

        if (settings.try_infer_integers)
        {
            Int64 tmp_int;
            if (tryReadIntText(tmp_int, buf) && buf.eof())
            {
                auto type = std::make_shared<DataTypeInt64>();
                numbers_parsed_from_json_strings.insert(type.get());
                return makeNullable(type);
            }

            /// The integer attempt may have consumed a prefix of the string. Rewind, so that the
            /// float attempt below sees the whole string and not just the unread tail
            /// (otherwise e.g. "1-2" would be accepted as a float because its tail "-2" is one).
            buf.position() = buf.buffer().begin();
        }

        Float64 tmp;
        if (tryReadFloatText(tmp, buf) && buf.eof())
        {
            auto type = std::make_shared<DataTypeFloat64>();
            numbers_parsed_from_json_strings.insert(type.get());
            return makeNullable(type);
        }

        return makeNullable(std::make_shared<DataTypeString>());
    }

    if (field.isArray())
    {
        auto array = field.getArray();

        /// Return nullptr in case of empty array because we cannot determine nested type.
        if (array.size() == 0)
            return nullptr;

        DataTypes nested_data_types;
        /// If this array contains fields with different types we will treat it as Tuple.
        bool are_types_the_same = true;
        for (const auto element : array)
        {
            auto type = getDataTypeFromFieldImpl(element, settings, numbers_parsed_from_json_strings);
            if (!type)
                return nullptr;

            if (!nested_data_types.empty() && !type->equals(*nested_data_types.back()))
                are_types_the_same = false;

            nested_data_types.push_back(std::move(type));
        }

        if (!are_types_the_same)
        {
            /// Try to make the types equal on a copy; the transformed types are kept only
            /// if the attempt succeeds, otherwise the original types are used for the Tuple.
            auto nested_types_copy = nested_data_types;
            transformInferredJSONTypesIfNeeded(nested_types_copy, settings, &numbers_parsed_from_json_strings);
            are_types_the_same = true;
            for (size_t i = 1; i < nested_types_copy.size(); ++i)
                are_types_the_same &= nested_types_copy[i]->equals(*nested_types_copy[i - 1]);

            if (are_types_the_same)
                nested_data_types = std::move(nested_types_copy);
        }

        if (!are_types_the_same)
            return std::make_shared<DataTypeTuple>(nested_data_types);

        return std::make_shared<DataTypeArray>(nested_data_types.back());
    }

    if (field.isObject())
    {
        auto object = field.getObject();
        DataTypes value_types;
        for (const auto key_value_pair : object)
        {
            auto type = getDataTypeFromFieldImpl(key_value_pair.second, settings, numbers_parsed_from_json_strings);
            if (!type)
            {
                /// If we couldn't infer nested type and Object type is not enabled,
                /// we can't determine the type of this JSON field.
                if (!settings.json.try_infer_objects)
                    return nullptr;

                continue;
            }

            if (settings.json.try_infer_objects && isObject(type))
                return std::make_shared<DataTypeObject>("json", true);

            value_types.push_back(type);
        }

        if (value_types.empty())
            return nullptr;

        transformInferredJSONTypesIfNeeded(value_types, settings, &numbers_parsed_from_json_strings);

        bool are_types_equal = true;
        for (size_t i = 1; i < value_types.size(); ++i)
            are_types_equal &= value_types[i]->equals(*value_types[0]);

        if (!are_types_equal)
        {
            if (!settings.json.try_infer_objects)
                return nullptr;

            return std::make_shared<DataTypeObject>("json", true);
        }

        return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), value_types[0]);
    }

    throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"};
}
/// Returns a default-constructed (parser, element) pair for the JSON parser selected at
/// compile time: SimdJSON if USE_SIMDJSON is set, otherwise RapidJSON if USE_RAPIDJSON
/// is set, otherwise the dummy parser. Callers unpack it with structured bindings.
auto getJSONParserAndElement()
{
#if USE_SIMDJSON
return std::pair<SimdJSONParser, SimdJSONParser::Element>();
#elif USE_RAPIDJSON
return std::pair<RapidJSONParser, RapidJSONParser::Element>();
#else
return std::pair<DummyJSONParser, DummyJSONParser::Element>();
#endif
}
/// Parse a JSON value given as text and infer its ClickHouse type.
/// Throws INCORRECT_DATA if the text is not valid JSON; the result of the inference
/// (possibly nullptr) comes from getDataTypeFromFieldImpl.
DataTypePtr getDataTypeFromField(const String & field, const FormatSettings & settings)
{
    auto [json_parser, root] = getJSONParserAndElement();
    if (!json_parser.parse(field, root))
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object here: {}", field);

    /// The set is only needed by the implementation while unifying nested types.
    std::unordered_set<const IDataType *> string_number_types;
    return getDataTypeFromFieldImpl(root, settings, string_number_types);
}
/// Read one row delimited by opening_bracket/closing_bracket from the buffer, parse it
/// as JSON, let the extractor split it into fields and infer a type for each field.
/// Throws INCORRECT_DATA if the row is not valid JSON. Entries of the result may be
/// nullptr when the type of a field could not be determined.
template <class Extractor, const char opening_bracket, const char closing_bracket>
static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, const FormatSettings & settings, bool /*json_strings*/, Extractor & extractor)
{
    const String row = readJSONEachRowLineIntoStringImpl<opening_bracket, closing_bracket>(in);

    auto [json_parser, root] = getJSONParserAndElement();
    if (!json_parser.parse(row, root))
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object here: {}", row);

    auto row_fields = extractor.extract(root);

    /// One set shared by all fields of the row.
    std::unordered_set<const IDataType *> numbers_parsed_from_json_strings;

    DataTypes inferred_types;
    inferred_types.reserve(row_fields.size());
    for (const auto & row_field : row_fields)
    {
        auto field_type = getDataTypeFromFieldImpl(row_field, settings, numbers_parsed_from_json_strings);
        inferred_types.push_back(field_type);
    }

    /// TODO: For JSONStringsEachRow/JSONCompactStringsEach all types will be strings.
    ///       Should we try to parse data inside strings somehow in this case?
    return inferred_types;
}
std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows)
{
return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_bytes, 1, max_rows);
@ -323,68 +127,56 @@ namespace JSONUtils
return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_bytes, min_rows, max_rows);
}
struct JSONEachRowFieldsExtractor
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info)
{
template <class Element>
std::vector<Element> extract(const Element & element)
skipWhitespaceIfAny(in);
assertChar('{', in);
bool first = true;
NamesAndTypesList names_and_types;
String field;
while (!in.eof() && *in.position() != '}')
{
/// {..., "<column_name>" : <value>, ...}
if (!first)
skipComma(in);
else
first = false;
if (!element.isObject())
throw Exception(ErrorCodes::INCORRECT_DATA, "Root JSON value is not an object");
auto object = element.getObject();
std::vector<Element> fields;
fields.reserve(object.size());
column_names.reserve(object.size());
for (const auto & key_value_pair : object)
{
column_names.emplace_back(key_value_pair.first);
fields.push_back(key_value_pair.second);
}
return fields;
auto name = readFieldName(in);
auto type = tryInferDataTypeForSingleJSONField(in, settings, inference_info);
names_and_types.emplace_back(name, type);
}
std::vector<String> column_names;
};
if (in.eof())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading JSON object");
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings)
{
JSONEachRowFieldsExtractor extractor;
auto data_types
= determineColumnDataTypesFromJSONEachRowDataImpl<JSONEachRowFieldsExtractor, '{', '}'>(in, settings, json_strings, extractor);
NamesAndTypesList result;
for (size_t i = 0; i != extractor.column_names.size(); ++i)
result.emplace_back(extractor.column_names[i], data_types[i]);
return result;
assertChar('}', in);
return names_and_types;
}
struct JSONCompactEachRowFieldsExtractor
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info)
{
template <class Element>
std::vector<Element> extract(const Element & element)
skipWhitespaceIfAny(in);
assertChar('[', in);
bool first = true;
DataTypes types;
String field;
while (!in.eof() && *in.position() != ']')
{
/// [..., <value>, ...]
if (!element.isArray())
throw Exception(ErrorCodes::INCORRECT_DATA, "Root JSON value is not an array");
auto array = element.getArray();
std::vector<Element> fields;
fields.reserve(array.size());
for (size_t i = 0; i != array.size(); ++i)
fields.push_back(array[i]);
return fields;
if (!first)
skipComma(in);
else
first = false;
auto type = tryInferDataTypeForSingleJSONField(in, settings, inference_info);
types.push_back(type);
}
};
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings)
{
JSONCompactEachRowFieldsExtractor extractor;
return determineColumnDataTypesFromJSONEachRowDataImpl<JSONCompactEachRowFieldsExtractor, '[', ']'>(in, settings, json_strings, extractor);
if (in.eof())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading JSON array");
assertChar(']', in);
return types;
}
bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf)
{
/// For JSONEachRow we can safely skip whitespace characters

View File

@ -13,24 +13,21 @@
namespace DB
{
struct JSONInferenceInfo;
namespace JSONUtils
{
std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows);
std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t min_rows, size_t max_rows);
/// Parse JSON from string and convert its type to ClickHouse type. Make the result type always Nullable.
/// JSON array with different nested types is treated as Tuple.
/// If cannot convert (for example when field contains null), return nullptr.
DataTypePtr getDataTypeFromField(const String & field, const FormatSettings & settings);
/// Read row in JSONEachRow format and try to determine type for each field.
/// Return list of names and types.
/// If cannot determine the type of some field, return nullptr for it.
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings);
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info);
/// Read row in JSONCompactEachRow format and try to determine type for each field.
/// If cannot determine the type of some field, return nullptr for it.
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings);
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info);
bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf);

View File

@ -197,69 +197,6 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o
return readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, retry, context, buf_out);
}
/// Wrap the leaves of a (possibly nested) type into Nullable.
/// Array, Tuple, Map and LowCardinality are rebuilt around their recursively processed
/// nested types; the Map key has its Nullable wrapper removed again.
/// Returns nullptr for a null pointer, for Nothing, and whenever any nested type yields nullptr.
DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type)
{
    if (!type)
        return nullptr;

    WhichDataType kind(type);

    if (kind.isNothing())
        return nullptr;

    if (kind.isNullable())
    {
        /// Process the nested type; the wrapper is added back by the recursive call.
        const auto * as_nullable = assert_cast<const DataTypeNullable *>(type.get());
        return makeNullableRecursivelyAndCheckForNothing(as_nullable->getNestedType());
    }

    if (kind.isArray())
    {
        const auto * as_array = assert_cast<const DataTypeArray *>(type.get());
        auto element_type = makeNullableRecursivelyAndCheckForNothing(as_array->getNestedType());
        if (!element_type)
            return nullptr;
        return std::make_shared<DataTypeArray>(element_type);
    }

    if (kind.isTuple())
    {
        const auto * as_tuple = assert_cast<const DataTypeTuple *>(type.get());
        DataTypes converted_elements;
        for (const auto & tuple_element : as_tuple->getElements())
        {
            auto converted = makeNullableRecursivelyAndCheckForNothing(tuple_element);
            if (!converted)
                return nullptr;
            converted_elements.push_back(converted);
        }
        return std::make_shared<DataTypeTuple>(std::move(converted_elements));
    }

    if (kind.isMap())
    {
        const auto * as_map = assert_cast<const DataTypeMap *>(type.get());
        auto converted_key = makeNullableRecursivelyAndCheckForNothing(as_map->getKeyType());
        auto converted_value = makeNullableRecursivelyAndCheckForNothing(as_map->getValueType());
        if (!converted_key || !converted_value)
            return nullptr;
        return std::make_shared<DataTypeMap>(removeNullable(converted_key), converted_value);
    }

    if (kind.isLowCarnality())
    {
        const auto * as_low_cardinality = assert_cast<const DataTypeLowCardinality *>(type.get());
        auto converted_dictionary = makeNullableRecursivelyAndCheckForNothing(as_low_cardinality->getDictionaryType());
        if (!converted_dictionary)
            return nullptr;
        return std::make_shared<DataTypeLowCardinality>(converted_dictionary);
    }

    return makeNullable(type);
}
/// Build a names-and-types list from the block header where every type went through
/// makeNullableRecursivelyAndCheckForNothing (so a type may be nullptr in the result).
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header)
{
    NamesAndTypesList nullable_columns;
    for (auto & [column_name, column_type] : header.getNamesAndTypesList())
    {
        auto nullable_type = makeNullableRecursivelyAndCheckForNothing(column_type);
        nullable_columns.emplace_back(column_name, nullable_type);
    }
    return nullable_columns;
}
SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context)
{
return getKeysForSchemaCache({source}, format, format_settings, context).front();

View File

@ -35,21 +35,7 @@ ColumnsDescription readSchemaFromFormat(
ContextPtr & context,
std::unique_ptr<ReadBuffer> & buf_out);
/// Make type Nullable recursively:
/// - Type -> Nullable(type)
/// - Array(Type) -> Array(Nullable(Type))
/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN))
/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType))
/// - LowCardinality(Type) -> LowCardinality(Nullable(Type))
/// If type is Nothing or one of the nested types is Nothing, return nullptr.
DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type);
/// Call makeNullableRecursivelyAndCheckForNothing for all types
/// in the block and return names and types.
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header);
SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context);
SchemaCache::Keys getKeysForSchemaCache(const Strings & sources, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context);
void splitSchemaCacheKey(const String & key, String & source, String & format, String & additional_format_info);
}

View File

@ -0,0 +1,966 @@
#include <Formats/SchemaInferenceUtils.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/transformTypesRecursively.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeFactory.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/parseDateTimeBestEffort.h>
#include <IO/PeekableReadBuffer.h>
#include <Core/Block.h>
#include <Common/assert_cast.h>
namespace DB
{
/// Returns true when every type in the list equals the first one.
/// Lists with fewer than two elements are trivially considered equal.
static bool checkIfTypesAreEqual(const DataTypes & types)
{
    size_t index = 1;
    while (index < types.size())
    {
        const bool same_as_first = types[0]->equals(*types[index]);
        if (!same_as_first)
            return false;
        ++index;
    }
    return true;
}
/// Replace every Nothing type by the first type in the list that is not Nothing.
/// The list is left untouched when it has no Nothing or consists only of Nothing.
/// Example: [Nothing, String, Nothing] -> [String, String, String]
static void transformNothingSimpleTypes(DataTypes & data_types)
{
    DataTypePtr replacement;
    bool nothing_seen = false;
    for (const auto & candidate : data_types)
    {
        if (isNothing(candidate))
            nothing_seen = true;
        else if (!replacement)
            replacement = candidate;
    }

    if (!nothing_seen || !replacement)
        return;

    for (auto & slot : data_types)
    {
        if (isNothing(slot))
            slot = replacement;
    }
}
/// If the list contains both integer and float types, convert all integers to Float64.
/// Bool is deliberately not treated as an integer here, neither when detecting integers
/// nor when converting them: mixing Bool with numbers is handled separately by
/// transformBoolsAndNumbersToNumbers and only for JSON with read_bools_as_numbers enabled.
static void transformIntegersAndFloatsToFloats(DataTypes & data_types)
{
    bool have_floats = false;
    bool have_integers = false;
    for (const auto & type : data_types)
    {
        have_floats |= isFloat(type);
        have_integers |= isInteger(type) && !isBool(type);
    }

    if (!have_floats || !have_integers)
        return;

    for (auto & type : data_types)
    {
        /// Same predicate as in the detection loop above, so that Bool is not silently
        /// turned into Float64 just because an integer happens to be present too.
        if (isInteger(type) && !isBool(type))
            type = std::make_shared<DataTypeFloat64>();
    }
}
/// Unify Date and DateTime64 types:
///  - if the list holds anything besides Date/DateTime64, all Date and DateTime64 become String;
///  - if the list holds only Date and DateTime64 and both are present, Date becomes DateTime64(9).
static void transformDatesAndDateTimes(DataTypes & data_types)
{
    bool date_seen = false;
    bool datetime_seen = false;
    bool other_seen = false;
    for (const auto & type : data_types)
    {
        const bool is_date = isDate(type);
        const bool is_datetime = isDateTime64(type);
        date_seen |= is_date;
        datetime_seen |= is_datetime;
        other_seen |= !(is_date || is_datetime);
    }

    if (other_seen)
    {
        if (!date_seen && !datetime_seen)
            return;

        /// Dates mixed with something else can only be represented as strings.
        for (auto & type : data_types)
        {
            if (isDate(type) || isDateTime64(type))
                type = std::make_shared<DataTypeString>();
        }
        return;
    }

    if (date_seen && datetime_seen)
    {
        for (auto & type : data_types)
        {
            if (isDate(type))
                type = std::make_shared<DataTypeDateTime64>(9);
        }
    }
}
/// When the list mixes String with number types, turn a number into String if
/// read_numbers_as_strings is enabled, or no JSON inference info is available,
/// or that very type object was recorded as parsed from a JSON string.
static void transformJSONNumbersBackToString(DataTypes & data_types, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    bool string_seen = false;
    bool number_seen = false;
    for (const auto & type : data_types)
    {
        string_seen |= isString(type);
        number_seen |= isNumber(type);
    }

    if (!(string_seen && number_seen))
        return;

    for (auto & type : data_types)
    {
        if (!isNumber(type))
            continue;

        /// The set lookup is by raw pointer, so it identifies a particular type object.
        const bool convert_to_string = settings.json.read_numbers_as_strings
            || !json_info
            || json_info->numbers_parsed_from_json_strings.contains(type.get());

        if (convert_to_string)
            type = std::make_shared<DataTypeString>();
    }
}
/// When the list mixes Bool with numbers, replace every Bool by a number type:
/// Int64 if at least one (non-Bool) integer is present, Float64 otherwise.
static void transformBoolsAndNumbersToNumbers(DataTypes & data_types)
{
    bool float_seen = false;
    bool integer_seen = false;
    bool bool_seen = false;
    for (const auto & type : data_types)
    {
        const bool is_bool = isBool(type);
        float_seen |= isFloat(type);
        integer_seen |= isInteger(type) && !is_bool;
        bool_seen |= is_bool;
    }

    if (!bool_seen || !(integer_seen || float_seen))
        return;

    for (auto & type : data_types)
    {
        if (!isBool(type))
            continue;

        if (integer_seen)
            type = std::make_shared<DataTypeInt64>();
        else
            type = std::make_shared<DataTypeFloat64>();
    }
}
/// Replace "empty" types by the first non-empty type of the list.
/// A type counts as empty if it is Nothing or, for JSON only, if onlyNull() holds for it
/// (e.g. Nullable(Nothing)).
/// Example: [Nothing, Array(Int64)] -> [Array(Int64), Array(Int64)]; this can happen while
/// transforming complex nested types like [Array(Nothing), Array(Array(Int64))].
template <bool is_json>
static void transformNothingComplexTypes(DataTypes & data_types)
{
    auto is_empty_type = [](const DataTypePtr & type)
    {
        return isNothing(type) || (is_json && type->onlyNull());
    };

    DataTypePtr replacement;
    bool empty_seen = false;
    for (const auto & type : data_types)
    {
        if (is_empty_type(type))
            empty_seen = true;
        else if (!replacement)
            replacement = type;
    }

    if (!empty_seen || !replacement)
        return;

    for (auto & type : data_types)
    {
        if (is_empty_type(type))
            type = replacement;
    }
}
/// If at least one type in the list is Nullable, wrap every type that can be
/// inside Nullable into Nullable as well.
static void transformNullableTypes(DataTypes & data_types)
{
    auto first_nullable = data_types.begin();
    while (first_nullable != data_types.end() && !(*first_nullable)->isNullable())
        ++first_nullable;

    /// No Nullable type found: nothing to unify.
    if (first_nullable == data_types.end())
        return;

    for (auto & type : data_types)
    {
        if (type->canBeInsideNullable())
            type = makeNullable(type);
    }
}
/// Turn every Tuple whose elements all have the same type into an Array of that type,
/// e.g. Tuple(Int64, Int64) -> Array(Int64). Used for JSON values.
/// For example a Tuple(Int64, Nullable(Nothing)) that was transformed to
/// Tuple(Nullable(Int64), Nullable(Int64)) ends up as Array(Nullable(Int64)).
static void transformTuplesWithEqualNestedTypesToArrays(DataTypes & data_types)
{
    for (auto & type : data_types)
    {
        if (!isTuple(type))
            continue;

        const auto & elements = assert_cast<const DataTypeTuple *>(type.get())->getElements();
        if (!checkIfTypesAreEqual(elements))
            continue;

        /// The new Array is fully constructed before `type` (and thus `elements`) is released.
        type = std::make_shared<DataTypeArray>(elements.back());
    }
}
template <bool is_json>
static void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info = nullptr);
/// If we have Tuple and Array types (or Tuples of different sizes), try to convert them
/// all to Array if there is a common type for all nested types.
/// For example, if we have [Tuple(Nullable(Nothing), String), Array(Date), Tuple(Date, String)]
/// it will convert them all to Array(String).
/// Types in the list that are neither Array nor Tuple are ignored and left untouched.
static void transformJSONTuplesAndArraysToArrays(DataTypes & data_types, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    bool have_arrays = false;
    bool have_tuples = false;
    bool tuple_sizes_are_equal = true;
    size_t tuple_size = 0;
    for (const auto & type : data_types)
    {
        if (isArray(type))
        {
            have_arrays = true;
        }
        else if (isTuple(type))
        {
            have_tuples = true;
            const size_t current_tuple_size = assert_cast<const DataTypeTuple &>(*type).getElements().size();
            if (!tuple_size)
                tuple_size = current_tuple_size;
            else
                tuple_sizes_are_equal &= current_tuple_size == tuple_size;
        }
    }

    /// Nothing to do unless tuples are mixed with arrays or with tuples of another size.
    if (!have_tuples || (!have_arrays && tuple_sizes_are_equal))
        return;

    /// Collect the element types of all arrays and tuples into one flat list.
    DataTypes nested_types;
    for (const auto & type : data_types)
    {
        if (isArray(type))
        {
            nested_types.push_back(assert_cast<const DataTypeArray &>(*type).getNestedType());
        }
        else if (isTuple(type))
        {
            /// The isTuple check matters: the list may also contain types that are
            /// neither Array nor Tuple, and casting those to DataTypeTuple is invalid.
            const auto & elements = assert_cast<const DataTypeTuple &>(*type).getElements();
            for (const auto & element : elements)
                nested_types.push_back(element);
        }
    }

    transformInferredTypesIfNeededImpl<true>(nested_types, settings, json_info);
    if (!checkIfTypesAreEqual(nested_types))
        return;

    for (auto & type : data_types)
    {
        /// Only the types that contributed nested types are replaced.
        if (isArray(type) || isTuple(type))
            type = std::make_shared<DataTypeArray>(nested_types.back());
    }
}
/// Convert all Map types to Object('json') when the list also contains an Object type,
/// or when the Map types in the list are not all equal to each other.
static void transformMapsAndObjectsToObjects(DataTypes & data_types)
{
    DataTypePtr reference_map;
    bool object_seen = false;
    bool maps_differ = false;
    for (const auto & type : data_types)
    {
        if (isMap(type))
        {
            /// The first Map becomes the reference every later Map is compared with.
            if (!reference_map)
                reference_map = type;
            else if (!type->equals(*reference_map))
                maps_differ = true;
        }
        else if (isObject(type))
        {
            object_seen = true;
        }
    }

    if (!reference_map || !(object_seen || maps_differ))
        return;

    for (auto & type : data_types)
    {
        if (isMap(type))
            type = std::make_shared<DataTypeObject>("json", true);
    }
}
/// Try to bring a list of inferred types to a common type by applying the transformations
/// defined above. The two lambdas are handed to transformTypesRecursively as the callbacks
/// for simple and for complex types; JSON-specific steps run only when is_json is true.
template <bool is_json>
static void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    auto unify_simple_types = [&](DataTypes & simple_types)
    {
        /// Get rid of Nothing where a real type is known.
        transformNothingSimpleTypes(simple_types);

        /// Int64 + Float64 -> Float64.
        if (settings.try_infer_integers)
            transformIntegersAndFloatsToFloats(simple_types);

        /// Date + DateTime64 -> DateTime64, or both to String when mixed with other types.
        if (settings.try_infer_dates || settings.try_infer_datetimes)
            transformDatesAndDateTimes(simple_types);

        if constexpr (is_json)
        {
            /// Numbers inferred from JSON strings go back to String if needed.
            if (settings.json.try_infer_numbers_from_strings || settings.json.read_numbers_as_strings)
                transformJSONNumbersBackToString(simple_types, settings, json_info);

            /// Bool mixed with numbers becomes a number.
            if (settings.json.read_bools_as_numbers)
                transformBoolsAndNumbersToNumbers(simple_types);
        }
    };

    auto unify_complex_types = [&](DataTypes & complex_types)
    {
        /// If any type is Nullable, make all of them Nullable.
        transformNullableTypes(complex_types);

        /// Nothing here means an empty Array/Map was met during inference;
        /// replace it by a real type if there is one.
        transformNothingComplexTypes<is_json>(complex_types);

        if constexpr (is_json)
        {
            /// Tuples with identical element types are arrays.
            transformTuplesWithEqualNestedTypesToArrays(complex_types);

            /// Mixed tuples and arrays become arrays when a common element type exists.
            transformJSONTuplesAndArraysToArrays(complex_types, settings, json_info);

            /// Maps become Objects if needed.
            if (settings.json.try_infer_objects)
                transformMapsAndObjectsToObjects(complex_types);
        }
    };

    transformTypesRecursively(types, unify_simple_types, unify_complex_types);
}
/// Try to unify two inferred types (non-JSON rules); both arguments are updated in place.
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings)
{
    DataTypes both_types;
    both_types.push_back(first);
    both_types.push_back(second);

    transformInferredTypesIfNeededImpl<false>(both_types, settings, nullptr);

    first = both_types[0];
    second = both_types[1];
}
/// Try to unify two inferred types using the JSON rules; both arguments are updated in place.
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    DataTypes both_types;
    both_types.push_back(first);
    both_types.push_back(second);

    transformInferredTypesIfNeededImpl<true>(both_types, settings, json_info);

    first = both_types[0];
    second = both_types[1];
}
/// If data_type is a Tuple whose element types can be unified by the JSON rules,
/// replace it by an Array of the unified type. Any other type is left as is.
void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    if (!isTuple(data_type))
        return;

    /// Work on a copy of the element types, the tuple itself stays intact.
    DataTypes element_types = assert_cast<const DataTypeTuple *>(data_type.get())->getElements();
    transformInferredTypesIfNeededImpl<true>(element_types, settings, json_info);

    if (!checkIfTypesAreEqual(element_types))
        return;

    data_type = std::make_shared<DataTypeArray>(element_types.back());
}
template <bool is_json>
static DataTypePtr tryInferDataTypeForSingleFieldImpl(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info);
static bool tryInferDate(const std::string_view & field)
{
ReadBufferFromString buf(field);
DayNum tmp;
return tryReadDateText(tmp, buf) && buf.eof();
}
static bool tryInferDateTime(const std::string_view & field, const FormatSettings & settings)
{
if (field.empty())
return false;
ReadBufferFromString buf(field);
Float64 tmp_float;
/// Check if it's just a number, and if so, don't try to infer DateTime from it,
/// because we can interpret this number as a timestamp and it will lead to
/// inferring DateTime instead of simple Int64/Float64 in some cases.
if (tryReadFloatText(tmp_float, buf) && buf.eof())
return false;
buf.seek(0, SEEK_SET); /// Return position to the beginning
DateTime64 tmp;
switch (settings.date_time_input_format)
{
case FormatSettings::DateTimeInputFormat::Basic:
if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof())
return true;
break;
case FormatSettings::DateTimeInputFormat::BestEffort:
if (tryParseDateTime64BestEffort(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
break;
case FormatSettings::DateTimeInputFormat::BestEffortUS:
if (tryParseDateTime64BestEffortUS(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
break;
}
return false;
}
/// Infer Date or DateTime64(9) from a string, honouring the try_infer_dates and
/// try_infer_datetimes settings. Date is tried first. Returns nullptr if neither matches.
DataTypePtr tryInferDateOrDateTimeFromString(const std::string_view & field, const FormatSettings & settings)
{
    const bool looks_like_date = settings.try_infer_dates && tryInferDate(field);
    if (looks_like_date)
        return std::make_shared<DataTypeDate>();

    const bool looks_like_datetime = settings.try_infer_datetimes && tryInferDateTime(field, settings);
    if (looks_like_datetime)
        return std::make_shared<DataTypeDateTime64>(9);

    return nullptr;
}
/// Try to infer the type of an array written as '[' elem ',' elem ... ']' at the current
/// position of the buffer. The opening '[' is required (assertChar); on success the
/// buffer is left after the closing ']' and the whitespace that follows it.
/// Result:
///  - Array(Nothing) for an empty array;
///  - Array(T) if all elements have (or can be transformed to) the same type T;
///  - for JSON, Tuple(...) of the original element types if they cannot be unified
///    or if some element type is not complete yet;
///  - nullptr if an element type cannot be inferred, a delimiter or the closing ']'
///    is missing, or (non-JSON) the element types cannot be unified.
template <bool is_json>
static DataTypePtr tryInferArray(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
assertChar('[', buf);
skipWhitespaceIfAny(buf);
DataTypes nested_types;
bool first = true;
while (!buf.eof() && *buf.position() != ']')
{
if (!first)
{
/// Skip field delimiter between array elements.
skipWhitespaceIfAny(buf);
if (!checkChar(',', buf))
return nullptr;
skipWhitespaceIfAny(buf);
}
else
first = false;
auto nested_type = tryInferDataTypeForSingleFieldImpl<is_json>(buf, settings, json_info);
/// If we couldn't infer element type, array type also cannot be inferred.
if (!nested_type)
return nullptr;
nested_types.push_back(nested_type);
skipWhitespaceIfAny(buf);
}
/// No ']' at the end of array
if (buf.eof())
return nullptr;
assertChar(']', buf);
skipWhitespaceIfAny(buf);
/// Empty array has type Array(Nothing)
if (nested_types.empty())
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());
if (checkIfTypesAreEqual(nested_types))
return std::make_shared<DataTypeArray>(nested_types.back());
/// If element types are not equal, we should try to find common type.
/// If after transformation element types are still different, we return Tuple for JSON and
/// nullptr for other formats (nullptr means we couldn't infer the type).
if constexpr (is_json)
{
/// For JSON if we have not complete types, we should not try to transform them
/// and return it as a Tuple.
/// For example, if we have types [Float64, Nullable(Nothing), Float64]
/// it can be Array(Float64) or Tuple(Float64, <some_type>, Float64) and
/// we can't determine which one it is. But we will be able to do it later
/// when we will have types from other rows for this column.
/// For example, if in the next row we will have types [Nullable(Nothing), String, Float64],
/// we can determine the type for this column as Tuple(Nullable(Float64), Nullable(String), Float64).
for (const auto & type : nested_types)
{
if (!checkIfTypeIsComplete(type))
return std::make_shared<DataTypeTuple>(nested_types);
}
/// Transform a copy, so that the Tuple below keeps the original element types.
auto nested_types_copy = nested_types;
transformInferredTypesIfNeededImpl<is_json>(nested_types_copy, settings, json_info);
if (checkIfTypesAreEqual(nested_types_copy))
return std::make_shared<DataTypeArray>(nested_types_copy.back());
return std::make_shared<DataTypeTuple>(nested_types);
}
else
{
/// json_info is not passed here: the declaration defaults it to nullptr.
transformInferredTypesIfNeededImpl<is_json>(nested_types, settings);
if (checkIfTypesAreEqual(nested_types))
return std::make_shared<DataTypeArray>(nested_types.back());
/// We couldn't determine common type for array element.
return nullptr;
}
}
/// Try to infer the type of a tuple written as '(' elem ',' elem ... ')'.
/// The opening '(' is required (assertChar). Elements are inferred with the non-JSON rules.
/// Returns Tuple(...) of the element types, or nullptr if an element type cannot be
/// inferred, a delimiter or the closing ')' is missing, or the tuple has no elements.
static DataTypePtr tryInferTuple(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    assertChar('(', buf);
    skipWhitespaceIfAny(buf);

    DataTypes element_types;
    bool is_first_element = true;
    while (!buf.eof() && *buf.position() != ')')
    {
        if (is_first_element)
        {
            is_first_element = false;
        }
        else
        {
            /// Elements are separated by a comma, optionally surrounded by whitespace.
            skipWhitespaceIfAny(buf);
            if (!checkChar(',', buf))
                return nullptr;
            skipWhitespaceIfAny(buf);
        }

        auto element_type = tryInferDataTypeForSingleFieldImpl<false>(buf, settings, json_info);
        /// A tuple with an element of unknown type has unknown type itself.
        if (!element_type)
            return nullptr;

        element_types.push_back(element_type);
        skipWhitespaceIfAny(buf);
    }

    /// Either there is no closing ')' or the tuple is empty.
    if (buf.eof() || element_types.empty())
        return nullptr;

    assertChar(')', buf);
    skipWhitespaceIfAny(buf);

    return std::make_shared<DataTypeTuple>(element_types);
}
/// Try to infer Int64 or Float64 from a number that starts at the current position of
/// a string buffer. Returns nullptr if there is no number at this position.
///
/// With check_eof the number must span the rest of the buffer. Without it, the number may
/// be followed by other data and the buffer is left right after the parsed number.
///
/// When integers are inferred, the text is parsed both as an integer and as a float:
/// it is Int64 only if the integer parser consumed exactly as much as the float parser.
/// This way "1.5" or "1e5" is not mistaken for the integer 1 followed by garbage.
template <bool check_eof>
static DataTypePtr tryInferNumberFromStringBuffer(ReadBufferFromString & buf, const FormatSettings & settings)
{
    if (settings.try_infer_integers)
    {
        /// Remember where the number starts to be able to parse it a second time.
        /// It is not necessarily the beginning of the buffer: the number can be
        /// an element in the middle of a bigger string.
        char * const number_start = buf.position();

        Int64 tmp_int;
        const bool read_int = tryReadIntText(tmp_int, buf);

        /// If the integer parser reached the end of data, float parsing cannot consume
        /// more, so the result is already known. It is also the only case when we must
        /// not roll back (the position can be moved back safely only before eof).
        if (buf.eof())
        {
            if (read_int)
                return std::make_shared<DataTypeInt64>();
            return nullptr;
        }

        char * const int_end = buf.position();

        /// We can safely get back to the start of the number, because we read
        /// from a string and we didn't reach eof.
        buf.position() = number_start;

        Float64 tmp_float;
        if (tryReadFloatText(tmp_float, buf) && (!check_eof || buf.eof()))
        {
            /// Float parsing reads no fewer bytes than integer parsing.
            /// If it stopped at the same place, the number is an integer.
            if (read_int && buf.position() == int_end)
                return std::make_shared<DataTypeInt64>();
            return std::make_shared<DataTypeFloat64>();
        }

        return nullptr;
    }

    /// Nothing has been read yet, so the float is parsed from the current position.
    Float64 tmp;
    if (tryReadFloatText(tmp, buf) && (!check_eof || buf.eof()))
        return std::make_shared<DataTypeFloat64>();

    return nullptr;
}
/// Try to infer Int64 or Float64 from a number at the current position of the buffer.
/// The number does not have to span the rest of the buffer.
/// Returns nullptr if the data at the current position cannot be parsed as a number.
static DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings)
{
/// If we read from String, we can do it in a more efficient way.
if (auto * string_buf = dynamic_cast<ReadBufferFromString *>(&buf))
return tryInferNumberFromStringBuffer<false>(*string_buf, settings);
Float64 tmp_float;
if (settings.try_infer_integers)
{
/// We should use PeekableReadBuffer, because we need to
/// rollback to the start of number to parse it as integer first
/// and then as float.
PeekableReadBuffer peekable_buf(buf);
PeekableReadBufferCheckpoint checkpoint(peekable_buf);
Int64 tmp_int;
bool read_int = tryReadIntText(tmp_int, peekable_buf);
/// Remember where integer parsing stopped to compare it with the float parsing below.
auto * int_end = peekable_buf.position();
peekable_buf.rollbackToCheckpoint(true);
if (tryReadFloatText(tmp_float, peekable_buf))
{
/// Float parsing reads no fewer bytes than integer parsing,
/// so position of the buffer is either the same, or further.
/// If it's the same, then it's integer.
if (read_int && peekable_buf.position() == int_end)
return std::make_shared<DataTypeInt64>();
return std::make_shared<DataTypeFloat64>();
}
/// Float parsing failed: fall through to the final "not a number" return.
}
else if (tryReadFloatText(tmp_float, buf))
{
return std::make_shared<DataTypeFloat64>();
}
/// This is not a number.
return nullptr;
}
/// Infer Int64 or Float64 from a string that must consist of the number only.
/// Returns nullptr if the string as a whole is not a number.
DataTypePtr tryInferNumberFromString(const std::string_view & field, const FormatSettings & settings)
{
    /// The whole field has to be consumed, hence the eof check is requested.
    constexpr bool require_whole_field = true;

    ReadBufferFromString field_buf(field);
    return tryInferNumberFromStringBuffer<require_whole_field>(field_buf, settings);
}
/// Read a string (a JSON string for JSON, a quoted string otherwise) and infer its type.
/// Returns nullptr if the string cannot be read. Otherwise returns, in this order:
///  - String, if it is a JSON object key (json_info->is_object_key);
///  - Date/DateTime64, if tryInferDateOrDateTimeFromString recognizes the content;
///  - a number type, for JSON with try_infer_numbers_from_strings (the type is then
///    registered in json_info->numbers_parsed_from_json_strings);
///  - String.
/// For JSON, json_info is dereferenced without a null check.
template <bool is_json>
static DataTypePtr tryInferString(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    String content;
    bool read_ok = false;
    if constexpr (is_json)
        read_ok = tryReadJSONStringInto(content, buf);
    else
        read_ok = tryReadQuotedStringInto(content, buf);

    if (!read_ok)
        return nullptr;

    skipWhitespaceIfAny(buf);

    /// Object keys are always strings, whatever they look like.
    if constexpr (is_json)
    {
        if (json_info->is_object_key)
            return std::make_shared<DataTypeString>();
    }

    if (auto temporal_type = tryInferDateOrDateTimeFromString(content, settings))
        return temporal_type;

    if constexpr (is_json)
    {
        if (settings.json.try_infer_numbers_from_strings)
        {
            if (auto numeric_type = tryInferNumberFromString(content, settings))
            {
                /// Remember that this number came from a string, so that it can be
                /// converted back to String later if needed.
                json_info->numbers_parsed_from_json_strings.insert(numeric_type.get());
                return numeric_type;
            }
        }
    }

    return std::make_shared<DataTypeString>();
}
/// Parse `{ key1 : value1, key2 : value2, ... }` from the buffer and infer a Map type
/// (or, for JSON, possibly an Object type) from the inferred key and value types.
///
/// Returns nullptr when:
///  - the text is malformed (missing ',' between pairs, missing ':' after a key, EOF before '}');
///  - the type of some key or value cannot be inferred;
///  - keys/values cannot be brought to a common type (for JSON values this yields an Object
///    instead when settings.json.try_infer_objects is enabled);
///  - (non-JSON only) the common key type is rejected by DataTypeMap::checkKeyType.
///
/// An empty `{}` gives DataTypeObject("json", true) for JSON with try_infer_objects enabled,
/// and Map(Nothing, Nothing) otherwise.
template <bool is_json>
static DataTypePtr tryInferMapOrObject(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
assertChar('{', buf);
skipWhitespaceIfAny(buf);
DataTypes key_types;
DataTypes value_types;
bool first = true;
while (!buf.eof() && *buf.position() != '}')
{
/// Every pair except the first one must be preceded by a comma.
if (!first)
{
skipWhitespaceIfAny(buf);
if (!checkChar(',', buf))
return nullptr;
skipWhitespaceIfAny(buf);
}
else
first = false;
DataTypePtr key_type;
if constexpr (is_json)
{
/// For JSON key type must be String.
/// The flag makes tryInferString skip date/number inference for the key.
json_info->is_object_key = true;
key_type = tryInferString<is_json>(buf, settings, json_info);
json_info->is_object_key = false;
}
else
{
key_type = tryInferDataTypeForSingleFieldImpl<is_json>(buf, settings, nullptr);
}
/// If we couldn't infer key type, we cannot infer Map/JSON object type.
if (!key_type)
return nullptr;
key_types.push_back(key_type);
skipWhitespaceIfAny(buf);
if (!checkChar(':', buf))
return nullptr;
skipWhitespaceIfAny(buf);
/// If we couldn't infer element type, Map type also cannot be inferred.
auto value_type = tryInferDataTypeForSingleFieldImpl<is_json>(buf, settings, json_info);
if (!value_type)
return nullptr;
value_types.push_back(value_type);
skipWhitespaceIfAny(buf);
}
/// The loop above also stops at EOF; in that case the closing '}' is missing.
if (buf.eof())
return nullptr;
assertChar('}', buf);
skipWhitespaceIfAny(buf);
if (key_types.empty())
{
if constexpr (is_json)
{
if (settings.json.try_infer_objects)
return std::make_shared<DataTypeObject>("json", true);
}
/// Empty Map is Map(Nothing, Nothing)
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeNothing>(), std::make_shared<DataTypeNothing>());
}
if constexpr (is_json)
{
/// If it's JSON field and one of value types is JSON Object, return also JSON Object.
for (const auto & value_type : value_types)
{
if (isObject(value_type))
return std::make_shared<DataTypeObject>("json", true);
}
/// Try to bring all value types to a common type; keys are all inferred via tryInferString with is_object_key set.
transformInferredTypesIfNeededImpl<is_json>(value_types, settings, json_info);
if (!checkIfTypesAreEqual(value_types))
{
/// Values of different types: only an Object can represent them.
if (settings.json.try_infer_objects)
return std::make_shared<DataTypeObject>("json", true);
return nullptr;
}
return std::make_shared<DataTypeMap>(key_types.back(), value_types.back());
}
/// Non-JSON case: both keys and values must end up with a single common type.
if (!checkIfTypesAreEqual(key_types))
transformInferredTypesIfNeededImpl<is_json>(key_types, settings);
if (!checkIfTypesAreEqual(value_types))
transformInferredTypesIfNeededImpl<is_json>(value_types, settings);
if (!checkIfTypesAreEqual(key_types) || !checkIfTypesAreEqual(value_types))
return nullptr;
/// The key type is used without the Nullable wrapper and must be accepted by DataTypeMap.
auto key_type = removeNullable(key_types.back());
if (!DataTypeMap::checkKeyType(key_type))
return nullptr;
return std::make_shared<DataTypeMap>(key_type, value_types.back());
}
/// Infer the type of a single value in the buffer by looking at its first non-whitespace character.
/// The checks are tried in a fixed order: array, tuple (non-JSON only), map/object, string,
/// bool literal, NULL / NaN, and finally a number. Returns nullptr for an empty buffer.
template <bool is_json>
static DataTypePtr tryInferDataTypeForSingleFieldImpl(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    skipWhitespaceIfAny(buf);
    if (buf.eof())
        return nullptr;

    /// JSON strings are double-quoted, all other supported strings are single-quoted.
    constexpr char string_quote = is_json ? '"' : '\'';

    /// Nothing is consumed from the buffer until one of the branches below is taken,
    /// so the first character can be looked up once.
    const char first_char = *buf.position();

    /// Array [field1, field2, ...]
    if (first_char == '[')
        return tryInferArray<is_json>(buf, settings, json_info);

    /// Tuple (field1, field2, ...), if format is not JSON
    if constexpr (!is_json)
    {
        if (first_char == '(')
            return tryInferTuple(buf, settings, json_info);
    }

    /// Map/Object for JSON { key1 : value1, key2 : value2, ...}
    if (first_char == '{')
        return tryInferMapOrObject<is_json>(buf, settings, json_info);

    /// String
    if (first_char == string_quote)
        return tryInferString<is_json>(buf, settings, json_info);

    /// Bool
    const bool is_bool_literal = checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf);
    if (is_bool_literal)
        return DataTypeFactory::instance().get("Bool");

    /// Null or NaN
    if (checkCharCaseInsensitive('n', buf))
    {
        if (checkStringCaseInsensitive("ull", buf))
            return makeNullable(std::make_shared<DataTypeNothing>());
        if (checkStringCaseInsensitive("an", buf))
            return std::make_shared<DataTypeFloat64>();
    }

    /// Number
    return tryInferNumber(buf, settings);
}
/// Infer the type of a single non-JSON value read from the buffer.
/// Forwards to the non-JSON implementation without JSON inference info.
/// Unlike the std::string_view overload, it does not check that the whole buffer was consumed.
DataTypePtr tryInferDataTypeForSingleField(ReadBuffer & buf, const FormatSettings & settings)
{
return tryInferDataTypeForSingleFieldImpl<false>(buf, settings, nullptr);
}
/// Infer the type of a single non-JSON value given as a string.
/// The whole string must be consumed by the inference, otherwise nullptr is returned.
DataTypePtr tryInferDataTypeForSingleField(std::string_view field, const FormatSettings & settings)
{
    ReadBufferFromString field_buf(field);
    auto inferred_type = tryInferDataTypeForSingleFieldImpl<false>(field_buf, settings, nullptr);

    /// Unread data left in the buffer means the field is not a single well-formed value.
    return field_buf.eof() ? inferred_type : nullptr;
}
/// Infer the type of a single JSON value read from the buffer.
/// Forwards to the JSON implementation, passing json_info through.
/// Unlike the std::string_view overload, it does not check that the whole buffer was consumed.
DataTypePtr tryInferDataTypeForSingleJSONField(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
return tryInferDataTypeForSingleFieldImpl<true>(buf, settings, json_info);
}
/// Infer the type of a single JSON value given as a string.
/// The whole string must be consumed by the inference, otherwise nullptr is returned.
DataTypePtr tryInferDataTypeForSingleJSONField(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    ReadBufferFromString field_buf(field);
    auto inferred_type = tryInferDataTypeForSingleFieldImpl<true>(field_buf, settings, json_info);

    /// Unread data left in the buffer means the field is not a single well-formed value.
    return field_buf.eof() ? inferred_type : nullptr;
}
/// Make type Nullable recursively:
///  - nullptr -> nullptr
///  - Nullable(Type) -> returned as is
///  - Array(Type) -> Array(<recursive>(Type))
///  - Tuple(Type1, ..., TypeN) -> Tuple(<recursive>(Type1), ..., <recursive>(TypeN)),
///    explicit element names of a named tuple are preserved
///  - Map(KeyType, ValueType) -> Map(<recursive>(KeyType) without the outer Nullable, <recursive>(ValueType))
///  - LowCardinality(Type) -> LowCardinality(<recursive>(Type))
///  - any other Type -> makeNullable(Type)
/// If any nested conversion yields nullptr, the whole result is nullptr.
DataTypePtr makeNullableRecursively(DataTypePtr type)
{
    if (!type)
        return nullptr;

    WhichDataType which(type);

    if (which.isNullable())
        return type;

    if (which.isArray())
    {
        const auto * array_type = assert_cast<const DataTypeArray *>(type.get());
        auto nested_type = makeNullableRecursively(array_type->getNestedType());
        return nested_type ? std::make_shared<DataTypeArray>(nested_type) : nullptr;
    }

    if (which.isTuple())
    {
        const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());
        const auto & elements = tuple_type->getElements();

        DataTypes nested_types;
        nested_types.reserve(elements.size());
        for (const auto & element : elements)
        {
            auto nested_type = makeNullableRecursively(element);
            if (!nested_type)
                return nullptr;
            nested_types.push_back(nested_type);
        }

        /// A named tuple must stay a named tuple: rebuilding it from element types only
        /// would silently replace the user-visible element names with the default ones.
        if (tuple_type->haveExplicitNames())
            return std::make_shared<DataTypeTuple>(std::move(nested_types), tuple_type->getElementNames());

        return std::make_shared<DataTypeTuple>(std::move(nested_types));
    }

    if (which.isMap())
    {
        const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
        auto key_type = makeNullableRecursively(map_type->getKeyType());
        auto value_type = makeNullableRecursively(map_type->getValueType());
        /// The outer Nullable is stripped from the key type before building the Map.
        return key_type && value_type ? std::make_shared<DataTypeMap>(removeNullable(key_type), value_type) : nullptr;
    }

    if (which.isLowCarnality())
    {
        const auto * lc_type = assert_cast<const DataTypeLowCardinality *>(type.get());
        auto nested_type = makeNullableRecursively(lc_type->getDictionaryType());
        return nested_type ? std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr;
    }

    return makeNullable(type);
}
/// Build a list of (name, type) pairs from the header columns,
/// where each type is passed through makeNullableRecursively.
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header)
{
    NamesAndTypesList nullable_columns;
    for (const auto & [column_name, column_type] : header.getNamesAndTypesList())
        nullable_columns.emplace_back(column_name, makeNullableRecursively(column_type));
    return nullable_columns;
}
/// Returns true if the type is fully determined, i.e. it is not nullptr and there is no Nothing
/// inside it. Nullable, Array, Tuple and Map are inspected recursively;
/// any other type (except Nothing itself) is considered complete.
bool checkIfTypeIsComplete(const DataTypePtr & type)
{
    if (!type)
        return false;

    WhichDataType which(type);

    if (which.isNothing())
        return false;

    if (which.isNullable())
    {
        const auto * nullable_type = assert_cast<const DataTypeNullable *>(type.get());
        return checkIfTypeIsComplete(nullable_type->getNestedType());
    }

    if (which.isArray())
    {
        const auto * array_type = assert_cast<const DataTypeArray *>(type.get());
        return checkIfTypeIsComplete(array_type->getNestedType());
    }

    if (which.isTuple())
    {
        const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());
        bool all_elements_complete = true;
        for (const auto & element_type : tuple_type->getElements())
        {
            if (!checkIfTypeIsComplete(element_type))
            {
                all_elements_complete = false;
                break;
            }
        }
        return all_elements_complete;
    }

    if (which.isMap())
    {
        const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
        return checkIfTypeIsComplete(map_type->getKeyType()) && checkIfTypeIsComplete(map_type->getValueType());
    }

    return true;
}
}

View File

@ -0,0 +1,93 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <IO/ReadBuffer.h>
namespace DB
{
/// Struct with some additional information about inferred types for JSON formats.
struct JSONInferenceInfo
{
/// We store numbers that were parsed from strings.
/// It's used in types transformation to change such numbers back to string if needed.
/// The set holds raw pointers to the inferred type objects (not the types by value).
std::unordered_set<const IDataType *> numbers_parsed_from_json_strings;
/// Indicates if currently we are inferring type for Map/Object key.
bool is_object_key = false;
};
/// Try to determine datatype of the value in buffer/string. If cannot, return nullptr.
/// In general, it tries to parse a type using the following logic:
/// If we see '[', we try to parse an array of values and recursively determine datatype for each element.
/// If we see '(', we try to parse a tuple of values and recursively determine datatype for each element.
/// If we see '{', we try to parse a Map of keys and values and recursively determine datatype for each key/value.
/// If we see a quote '\'', we treat it as a string and read until next quote.
/// If we see true/false (case-insensitive) it returns Bool.
/// If we see NULL it returns Nullable(Nothing), if we see NaN it returns Float64.
/// Otherwise we try to read a number.
/// The std::string_view overload additionally returns nullptr if the field was not consumed completely.
DataTypePtr tryInferDataTypeForSingleField(ReadBuffer & buf, const FormatSettings & settings);
DataTypePtr tryInferDataTypeForSingleField(std::string_view field, const FormatSettings & settings);
/// The same as tryInferDataTypeForSingleField, but for JSON values.
DataTypePtr tryInferDataTypeForSingleJSONField(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info);
DataTypePtr tryInferDataTypeForSingleJSONField(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Try to parse Date or DateTime value from a string.
DataTypePtr tryInferDateOrDateTimeFromString(const std::string_view & field, const FormatSettings & settings);
/// Try to parse a number value from a string. By default, it tries to parse Float64,
/// but if setting try_infer_integers is enabled, it also tries to parse Int64.
DataTypePtr tryInferNumberFromString(const std::string_view & field, const FormatSettings & settings);
/// It takes two types inferred for the same column and tries to transform them to a common type if possible.
/// It's also used when we try to infer some not ordinary types from another types.
/// Example 1:
/// Dates inferred from strings. In this case we should check if dates were inferred from all strings
/// in the same way and if not, transform inferred dates back to strings.
/// For example, when we have Array(Date) (like `['2020-01-01', '2020-02-02']`) and Array(String) (like `['string', 'abc']`)
/// we will convert the first type to Array(String).
/// Example 2:
/// When we have integers and floats for the same value, we should convert all integers to floats.
/// For example, when we have Array(Int64) (like `[123, 456]`) and Array(Float64) (like `[42.42, 4.42]`)
/// we will convert the first type to Array(Float64)
/// Example 3:
/// When we have not complete types like Nullable(Nothing), Array(Nullable(Nothing)) or Tuple(UInt64, Nullable(Nothing)),
/// we try to complete them using the other type.
/// For example, if we have Tuple(UInt64, Nullable(Nothing)) and Tuple(Nullable(Nothing), String) we will convert both
/// types to common type Tuple(Nullable(UInt64), Nullable(String))
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings);
/// The same as transformInferredTypesIfNeeded but uses some specific transformations for JSON.
/// Example 1:
/// When we have numbers inferred from strings and strings, we convert all such numbers back to string.
/// For example, if we have Array(Int64) (like `["123", "456"]`) and Array(String) (like `["str", "abc"]`)
/// we will convert the first type to Array(String). Note that we collect information about numbers inferred
/// from strings in json_info while inference and use it here, so we will know that Array(Int64) contains
/// integer inferred from a string.
/// Example 2:
/// When we have maps with different value types, we convert all types to JSON object type.
/// For example, if we have Map(String, UInt64) (like `{"a" : 123}`) and Map(String, String) (like `{"b" : "abc"}`)
/// we will convert both types to Object('JSON').
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Check if type is Tuple(...), try to transform nested types to find a common type for them and if all nested types
/// are the same after transform, we convert this tuple to an Array with common nested type.
/// For example, if we have Tuple(String, Nullable(Nothing)) we will convert it to Array(String).
/// It's used when all rows were read and we have Tuple in the result type that can be actually an Array.
void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Make type Nullable recursively:
/// - Type -> Nullable(type)
/// - Array(Type) -> Array(Nullable(Type))
/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN))
/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType))
/// - LowCardinality(Type) -> LowCardinality(Nullable(Type))
DataTypePtr makeNullableRecursively(DataTypePtr type);
/// Call makeNullableRecursively for all types
/// in the block and return names and types.
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header);
/// Check that type is complete: returns false if the type is nullptr or contains Nothing
/// (like Array(Tuple(Nullable(Nothing), String))), and true otherwise.
bool checkIfTypeIsComplete(const DataTypePtr & type);
}

View File

@ -317,12 +317,17 @@ template void readStringUntilEOFInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8
/** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters).
* It is assumed that the cursor is located on the `\` symbol
*/
template <typename Vector>
static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
template <typename Vector, typename ReturnType = void>
static ReturnType parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
{
++buf.position();
if (buf.eof())
throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
{
if constexpr (std::is_same_v<ReturnType, void>)
throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
else
return false;
}
char char_after_backslash = *buf.position();
@ -361,6 +366,8 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
s.push_back(decoded_char);
++buf.position();
}
return ReturnType(true);
}
@ -519,14 +526,18 @@ template void readEscapedStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf
* backslash escape sequences are also parsed,
* that could be slightly confusing.
*/
template <char quote, bool enable_sql_style_quoting, typename Vector>
static void readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
template <char quote, bool enable_sql_style_quoting, typename Vector, typename ReturnType = void>
static ReturnType readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
if (buf.eof() || *buf.position() != quote)
{
throw ParsingException(ErrorCodes::CANNOT_PARSE_QUOTED_STRING,
"Cannot parse quoted string: expected opening quote '{}', got '{}'",
std::string{quote}, buf.eof() ? "EOF" : std::string{*buf.position()});
if constexpr (throw_exception)
throw ParsingException(ErrorCodes::CANNOT_PARSE_QUOTED_STRING,
"Cannot parse quoted string: expected opening quote '{}', got '{}'",
std::string{quote}, buf.eof() ? "EOF" : std::string{*buf.position()});
else
return false;
}
++buf.position();
@ -552,15 +563,26 @@ static void readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
continue;
}
return;
return ReturnType(true);
}
if (*buf.position() == '\\')
parseComplexEscapeSequence(s, buf);
{
if constexpr (throw_exception)
parseComplexEscapeSequence<Vector, ReturnType>(s, buf);
else
{
if (!parseComplexEscapeSequence<Vector, ReturnType>(s, buf))
return false;
}
}
}
throw ParsingException("Cannot parse quoted string: expected closing quote",
ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
if constexpr (throw_exception)
throw ParsingException("Cannot parse quoted string: expected closing quote",
ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
else
return false;
}
template <bool enable_sql_style_quoting, typename Vector>
@ -569,6 +591,14 @@ void readQuotedStringInto(Vector & s, ReadBuffer & buf)
readAnyQuotedStringInto<'\'', enable_sql_style_quoting>(s, buf);
}
/// Non-throwing variant of reading a single-quoted string: instantiates readAnyQuotedStringInto
/// with ReturnType = bool (and SQL-style quoting disabled), so parse errors are reported
/// by returning false instead of throwing an exception.
template <typename Vector>
bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf)
{
return readAnyQuotedStringInto<'\'', false, Vector, bool>(s, buf);
}
/// Explicit instantiation for String.
template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf);
template <bool enable_sql_style_quoting, typename Vector>
void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf)
{
@ -829,6 +859,7 @@ template void readJSONStringInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UIn
template bool readJSONStringInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template void readJSONStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
template void readJSONStringInto<String>(String & s, ReadBuffer & buf);
template bool readJSONStringInto<String, bool>(String & s, ReadBuffer & buf);
template <typename Vector, typename ReturnType>
ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf)
@ -1396,6 +1427,39 @@ static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_fu
peekable_buf.position() = end;
}
/// Copy a single-quoted string field from the buffer into `s` as is:
/// both quotes are kept and backslash escape sequences are copied without being decoded
/// (the character following a backslash is copied unconditionally, so an escaped quote
/// does not terminate the field).
/// The buffer must be positioned on the opening quote (checked by assertChar).
/// If the buffer ends before the closing quote is found, everything read so far is kept in `s`
/// and no closing quote is appended.
template <typename Vector>
static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf)
{
    assertChar('\'', buf);
    s.push_back('\'');
    while (!buf.eof())
    {
        char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());

        s.append(buf.position(), next_pos);
        buf.position() = next_pos;

        /// Nothing interesting in the rest of the current chunk, go on with the next one.
        if (!buf.hasPendingData())
            continue;

        if (*buf.position() == '\'')
            break;

        s.push_back(*buf.position());
        if (*buf.position() == '\\')
        {
            ++buf.position();
            if (!buf.eof())
            {
                s.push_back(*buf.position());
                ++buf.position();
            }
        }
    }

    /// The loop can also finish because the data ended before the closing quote.
    /// In that case there is no quote to skip: advancing the position would move it
    /// past the end of the buffer, and appending a quote would invent data that was never read.
    if (buf.eof())
        return;

    ++buf.position();
    s.push_back('\'');
}
template <char opening_bracket, char closing_bracket, typename Vector>
static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf)
{
@ -1413,20 +1477,19 @@ static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf)
if (!buf.hasPendingData())
continue;
s.push_back(*buf.position());
if (*buf.position() == '\'')
{
readQuotedStringInto<false>(s, buf);
s.push_back('\'');
readQuotedStringFieldInto(s, buf);
}
else if (*buf.position() == opening_bracket)
{
s.push_back(opening_bracket);
++balance;
++buf.position();
}
else if (*buf.position() == closing_bracket)
{
s.push_back(closing_bracket);
--balance;
++buf.position();
}
@ -1449,11 +1512,7 @@ void readQuotedFieldInto(Vector & s, ReadBuffer & buf)
/// - Number: integer, float, decimal.
if (*buf.position() == '\'')
{
s.push_back('\'');
readQuotedStringInto<false>(s, buf);
s.push_back('\'');
}
readQuotedStringFieldInto(s, buf);
else if (*buf.position() == '[')
readQuotedFieldInBracketsInto<'[', ']'>(s, buf);
else if (*buf.position() == '(')

View File

@ -605,6 +605,9 @@ bool tryReadJSONStringInto(Vector & s, ReadBuffer & buf)
return readJSONStringInto<Vector, bool>(s, buf);
}
/// Try to read a single-quoted string into `s`.
/// Returns false instead of throwing if the string cannot be parsed.
template <typename Vector>
bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf);
/// Reads chunk of data between {} in that way,
/// that it has balanced parentheses sequence of {}.
/// So, it may form a JSON object, but it can be incorrenct.

View File

@ -1,6 +1,5 @@
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <DataTypes/DataTypeString.h>
#include <Interpreters/parseColumnsListForTableFunction.h>
#include <boost/algorithm/string.hpp>
@ -17,10 +16,11 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
}
template <class SchemaReader>
void chooseResultColumnType(
SchemaReader & schema_reader,
DataTypePtr & type,
DataTypePtr & new_type,
std::function<void(DataTypePtr &, DataTypePtr &)> transform_types_if_needed,
const DataTypePtr & default_type,
const String & column_name,
size_t row)
@ -34,7 +34,7 @@ void chooseResultColumnType(
if (!new_type || type->equals(*new_type))
return;
transform_types_if_needed(type, new_type);
schema_reader.transformTypesIfNeeded(type, new_type);
if (type->equals(*new_type))
return;
@ -55,9 +55,9 @@ void chooseResultColumnType(
}
}
void checkResultColumnTypeAndAppend(NamesAndTypesList & result, DataTypePtr & type, const String & name, const DataTypePtr & default_type, size_t rows_read)
void checkResultColumnTypeAndAppend(NamesAndTypesList & result, DataTypePtr & type, const String & name, const FormatSettings & settings, const DataTypePtr & default_type, size_t rows_read)
{
if (!type)
if (!checkIfTypeIsComplete(type))
{
if (!default_type)
throw Exception(
@ -69,6 +69,10 @@ void checkResultColumnTypeAndAppend(NamesAndTypesList & result, DataTypePtr & ty
type = default_type;
}
if (settings.schema_inference_make_columns_nullable)
type = makeNullableRecursively(type);
result.emplace_back(name, type);
}
@ -88,6 +92,11 @@ void IIRowSchemaReader::setContext(ContextPtr & context)
}
}
/// Default transformation of two types inferred for the same column:
/// delegates to transformInferredTypesIfNeeded with this reader's format settings.
/// Both arguments are passed by reference and may be modified by the callee.
void IIRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings);
}
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: IIRowSchemaReader(in_, format_settings_), column_names(splitColumnNames(format_settings.column_names_for_schema_inference))
{
@ -160,23 +169,23 @@ NamesAndTypesList IRowSchemaReader::readSchema()
if (new_data_types.size() != data_types.size())
throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values");
for (size_t i = 0; i != data_types.size(); ++i)
for (field_index = 0; field_index != data_types.size(); ++field_index)
{
/// Check if we couldn't determine the type of this column in a new row
/// or the type for this column was taken from hints.
if (!new_data_types[i] || hints.contains(column_names[i]))
if (!new_data_types[field_index] || hints.contains(column_names[field_index]))
continue;
auto transform_types_if_needed = [&](DataTypePtr & type, DataTypePtr & new_type){ transformTypesIfNeeded(type, new_type, i); };
chooseResultColumnType(data_types[i], new_data_types[i], transform_types_if_needed, getDefaultType(i), std::to_string(i + 1), rows_read);
chooseResultColumnType(*this, data_types[field_index], new_data_types[field_index], getDefaultType(field_index), std::to_string(field_index + 1), rows_read);
}
}
NamesAndTypesList result;
for (size_t i = 0; i != data_types.size(); ++i)
for (field_index = 0; field_index != data_types.size(); ++field_index)
{
transformFinalTypeIfNeeded(data_types[field_index]);
/// Check that we could determine the type of this column.
checkResultColumnTypeAndAppend(result, data_types[i], column_names[i], getDefaultType(i), rows_read);
checkResultColumnTypeAndAppend(result, data_types[field_index], column_names[field_index], format_settings, getDefaultType(field_index), rows_read);
}
return result;
@ -208,11 +217,6 @@ DataTypePtr IRowSchemaReader::getDefaultType(size_t column) const
return nullptr;
}
void IRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t)
{
transformInferredTypesIfNeeded(type, new_type, format_settings);
}
IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
: IIRowSchemaReader(in_, format_settings_, default_type_)
{
@ -245,7 +249,6 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
names_order.push_back(name);
}
auto transform_types_if_needed = [&](DataTypePtr & type, DataTypePtr & new_type){ transformTypesIfNeeded(type, new_type); };
for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read)
{
auto new_names_and_types = readRowAndGetNamesAndDataTypes(eof);
@ -277,7 +280,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
continue;
auto & type = it->second;
chooseResultColumnType(type, new_type, transform_types_if_needed, default_type, name, rows_read);
chooseResultColumnType(*this, type, new_type, default_type, name, rows_read);
}
}
@ -289,16 +292,12 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
for (auto & name : names_order)
{
auto & type = names_to_types[name];
transformFinalTypeIfNeeded(type);
/// Check that we could determine the type of this column.
checkResultColumnTypeAndAppend(result, type, name, default_type, rows_read);
checkResultColumnTypeAndAppend(result, type, name, format_settings, default_type, rows_read);
}
return result;
}
void IRowWithNamesSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings);
}
}

View File

@ -45,10 +45,14 @@ public:
bool needContext() const override { return !hints_str.empty(); }
void setContext(ContextPtr & context) override;
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type);
protected:
void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; }
size_t getNumRowsRead() const override { return rows_read; }
virtual void transformFinalTypeIfNeeded(DataTypePtr &) {}
size_t max_rows_to_read;
size_t rows_read = 0;
DataTypePtr default_type;
@ -82,7 +86,7 @@ protected:
void setColumnNames(const std::vector<String> & names) { column_names = names; }
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t index);
size_t field_index;
private:
DataTypePtr getDefaultType(size_t column) const;
@ -110,8 +114,6 @@ protected:
/// If it's impossible to determine the type for some column, return nullptr for it.
/// Set eof = true if can't read more data.
virtual NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) = 0;
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type);
};
/// Base class for schema inference for formats that don't need any data to
@ -125,16 +127,17 @@ public:
virtual ~IExternalSchemaReader() = default;
};
template <class SchemaReader>
void chooseResultColumnType(
SchemaReader & schema_reader,
DataTypePtr & type,
DataTypePtr & new_type,
std::function<void(DataTypePtr &, DataTypePtr &)> transform_types_if_needed,
const DataTypePtr & default_type,
const String & column_name,
size_t row);
void checkResultColumnTypeAndAppend(
NamesAndTypesList & result, DataTypePtr & type, const String & name, const DataTypePtr & default_type, size_t rows_read);
NamesAndTypesList & result, DataTypePtr & type, const String & name, const FormatSettings & settings, const DataTypePtr & default_type, size_t rows_read);
Strings splitColumnNames(const String & column_names_str);

View File

@ -3,7 +3,7 @@
#if USE_ARROW
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>

View File

@ -274,15 +274,15 @@ void CSVFormatReader::skipPrefixBeforeHeader()
}
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_)
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(
in_,
format_setting_,
format_settings_,
with_names_,
with_types_,
&reader,
getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV))
, reader(in_, format_setting_)
, reader(in_, format_settings_)
{
}
@ -293,7 +293,7 @@ DataTypes CSVSchemaReader::readRowAndGetDataTypes()
return {};
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV);
return tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV);
}

View File

@ -75,7 +75,7 @@ public:
class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_);
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
private:
DataTypes readRowAndGetDataTypes() override;

View File

@ -1,6 +1,7 @@
#include <Processors/Formats/Impl/CustomSeparatedRowInputFormat.h>
#include <Processors/Formats/Impl/TemplateRowInputFormat.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <IO/Operators.h>
@ -328,12 +329,12 @@ DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes()
first_row = false;
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule());
return tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), &json_inference_info);
}
void CustomSeparatedSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t)
void CustomSeparatedSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings, reader.getEscapingRule());
transformInferredTypesByEscapingRuleIfNeeded(type, new_type, format_settings, reader.getEscapingRule(), &json_inference_info);
}
void registerInputFormatCustomSeparated(FormatFactory & factory)

View File

@ -2,6 +2,7 @@
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Formats/ParsedTemplateFormatString.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/PeekableReadBuffer.h>
#include <IO/ReadHelpers.h>
@ -98,11 +99,12 @@ public:
private:
DataTypes readRowAndGetDataTypes() override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
PeekableReadBuffer buf;
CustomSeparatedFormatReader reader;
bool first_row = true;
JSONInferenceInfo json_inference_info;
};
}

View File

@ -2,6 +2,7 @@
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/JSONUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/ReadHelpers.h>
#include <base/find_symbols.h>
@ -175,14 +176,9 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase(
{
}
void JSONColumnsSchemaReaderBase::chooseResulType(DataTypePtr & type, DataTypePtr & new_type, const String & column_name, size_t row) const
void JSONColumnsSchemaReaderBase::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
auto convert_types_if_needed = [&](DataTypePtr & first, DataTypePtr & second)
{
DataTypes types = {first, second};
transformInferredJSONTypesIfNeeded(types, format_settings);
};
chooseResultColumnType(type, new_type, convert_types_if_needed, nullptr, column_name, row);
transformInferredJSONTypesIfNeeded(type, new_type, format_settings, &inference_info);
}
NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
@ -222,7 +218,8 @@ NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
rows_in_block = 0;
auto column_type = readColumnAndGetDataType(column_name, rows_in_block, format_settings.max_rows_to_read_for_schema_inference - total_rows_read);
chooseResulType(names_to_types[column_name], column_type, column_name, total_rows_read + 1);
chooseResultColumnType(*this, names_to_types[column_name], column_type, nullptr, column_name, total_rows_read + 1);
++iteration;
}
while (!reader->checkChunkEndOrSkipColumnDelimiter());
@ -237,8 +234,9 @@ NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
for (auto & name : names_order)
{
auto & type = names_to_types[name];
transformJSONTupleToArrayIfPossible(type, format_settings, &inference_info);
/// Check that we could determine the type of this column.
checkResultColumnTypeAndAppend(result, type, name, nullptr, format_settings.max_rows_to_read_for_schema_inference);
checkResultColumnTypeAndAppend(result, type, name, format_settings, nullptr, format_settings.max_rows_to_read_for_schema_inference);
}
return result;
@ -262,8 +260,8 @@ DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String &
}
readJSONField(field, in);
DataTypePtr field_type = JSONUtils::getDataTypeFromField(field, format_settings);
chooseResulType(column_type, field_type, column_name, rows_read);
DataTypePtr field_type = tryInferDataTypeForSingleJSONField(field, format_settings, &inference_info);
chooseResultColumnType(*this, column_type, field_type, nullptr, column_name, rows_read);
++rows_read;
}
while (!reader->checkColumnEndOrSkipFieldDelimiter());

View File

@ -1,6 +1,7 @@
#pragma once
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Processors/Formats/IInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
@ -76,18 +77,18 @@ class JSONColumnsSchemaReaderBase : public ISchemaReader
public:
JSONColumnsSchemaReaderBase(ReadBuffer & in_, const FormatSettings & format_settings_, std::unique_ptr<JSONColumnsReaderBase> reader_);
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type);
private:
NamesAndTypesList readSchema() override;
/// Read whole column in the block (up to max_rows_to_read rows) and extract the data type.
DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read);
/// Choose result type for column from two inferred types from different rows.
void chooseResulType(DataTypePtr & type, DataTypePtr & new_type, const String & column_name, size_t row) const;
const FormatSettings format_settings;
std::unique_ptr<JSONColumnsReaderBase> reader;
Names column_names_from_settings;
JSONInferenceInfo inference_info;
};
}

View File

@ -7,6 +7,7 @@
#include <Formats/verbosePrintString.h>
#include <Formats/JSONUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
@ -202,12 +203,17 @@ DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes()
if (in.eof())
return {};
return JSONUtils::readRowAndGetDataTypesForJSONCompactEachRow(in, format_settings, reader.yieldStrings());
return JSONUtils::readRowAndGetDataTypesForJSONCompactEachRow(in, format_settings, &inference_info);
}
/// Reconciles two types inferred for the same column by delegating to
/// transformInferredJSONTypesIfNeeded with this reader's format_settings.
/// Both `type` and `new_type` are passed by non-const reference, so the helper may rewrite either.
/// NOTE(review): two variants of the signature and of the call are listed back to back
/// (with / without the unnamed size_t parameter and the `&inference_info` argument);
/// presumably these are the pre- and post-change lines of a diff — confirm against the real file.
void JSONCompactEachRowRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t)
void JSONCompactEachRowRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredJSONTypesIfNeeded(type, new_type, format_settings);
transformInferredJSONTypesIfNeeded(type, new_type, format_settings, &inference_info);
}
/// Post-processing hook for a column type: hands `type` (by reference, so it may be rewritten)
/// to transformJSONTupleToArrayIfPossible together with this reader's format settings and
/// a pointer to its `inference_info`.
/// NOTE(review): the name suggests it runs once on the final merged type — confirm in the base reader.
void JSONCompactEachRowRowSchemaReader::transformFinalTypeIfNeeded(DataTypePtr & type)
{
    auto * json_info = &inference_info;
    transformJSONTupleToArrayIfPossible(
        type,
        format_settings,
        json_info);
}
void registerInputFormatJSONCompactEachRow(FormatFactory & factory)

View File

@ -4,6 +4,7 @@
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Common/HashTable/HashMap.h>
namespace DB
@ -80,10 +81,12 @@ public:
private:
DataTypes readRowAndGetDataTypes() override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
void transformFinalTypeIfNeeded(DataTypePtr & type) override;
JSONCompactEachRowFormatReader reader;
bool first_row = true;
JSONInferenceInfo inference_info;
};
}

View File

@ -4,6 +4,7 @@
#include <Processors/Formats/Impl/JSONEachRowRowInputFormat.h>
#include <Formats/JSONUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Formats/FormatFactory.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
@ -300,9 +301,8 @@ void JSONEachRowRowInputFormat::readSuffix()
assertEOF(*in);
}
/// Constructs the schema reader by forwarding the input buffer and format settings
/// to the IRowWithNamesSchemaReader base; the body itself is empty.
/// NOTE(review): two signatures are listed back to back. The one taking `json_strings_`
/// pairs with the `json_strings(json_strings_)` initializer; the one without it presumably is the
/// post-change version of a diff, in which that initializer no longer applies — confirm against the real file.
JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings_)
JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: IRowWithNamesSchemaReader(in_, format_settings_)
, json_strings(json_strings_)
{
}
@ -336,12 +336,17 @@ NamesAndTypesList JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool &
return {};
}
return JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, format_settings, json_strings);
return JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, format_settings, &inference_info);
}
/// Reconciles two types inferred for the same column by delegating to
/// transformInferredJSONTypesIfNeeded with this reader's format_settings.
/// Both arguments are non-const references, so the helper may rewrite either type.
/// NOTE(review): the call appears twice (without and with `&inference_info`); presumably the
/// pre- and post-change lines of a diff — only one of them should exist in the real file.
void JSONEachRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredJSONTypesIfNeeded(type, new_type, format_settings);
transformInferredJSONTypesIfNeeded(type, new_type, format_settings, &inference_info);
}
/// Post-processing hook for a column type: passes `type` (by reference, so it may be rewritten)
/// to transformJSONTupleToArrayIfPossible along with this reader's format settings and
/// a pointer to its `inference_info`.
/// NOTE(review): the name suggests it runs once on the final merged type — confirm in the base reader.
void JSONEachRowSchemaReader::transformFinalTypeIfNeeded(DataTypePtr & type)
{
    auto * json_info = &inference_info;
    transformJSONTupleToArrayIfPossible(
        type,
        format_settings,
        json_info);
}
void registerInputFormatJSONEachRow(FormatFactory & factory)
@ -391,11 +396,11 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory
void registerJSONEachRowSchemaReader(FormatFactory & factory)
{
auto register_schema_reader = [&](const String & format_name, bool json_strings)
auto register_schema_reader = [&](const String & format_name)
{
factory.registerSchemaReader(format_name, [json_strings](ReadBuffer & buf, const FormatSettings & settings)
factory.registerSchemaReader(format_name, [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_unique<JSONEachRowSchemaReader>(buf, json_strings, settings);
return std::make_unique<JSONEachRowSchemaReader>(buf, settings);
});
factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [](const FormatSettings & settings)
{
@ -403,10 +408,10 @@ void registerJSONEachRowSchemaReader(FormatFactory & factory)
});
};
register_schema_reader("JSONEachRow", false);
register_schema_reader("JSONLines", false);
register_schema_reader("NDJSON", false);
register_schema_reader("JSONStringsEachRow", true);
register_schema_reader("JSONEachRow");
register_schema_reader("JSONLines");
register_schema_reader("NDJSON");
register_schema_reader("JSONStringsEachRow");
}
}

View File

@ -4,6 +4,7 @@
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Common/HashTable/HashMap.h>
@ -94,15 +95,16 @@ protected:
/// Schema reader for JSONEachRow-style input, built on IRowWithNamesSchemaReader.
/// NOTE(review): this listing interleaves two versions of the class (two constructor
/// declarations, and a `json_strings` member used only by the first of them); presumably
/// the pre- and post-change lines of a diff — confirm against the real header.
class JSONEachRowSchemaReader : public IRowWithNamesSchemaReader
{
public:
/// Variant that additionally takes a `json_strings` flag.
JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings_);
/// Variant that takes only the input buffer and the format settings.
JSONEachRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
private:
/// Reads one row and returns the column names with their inferred types; `eof` is an output flag.
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
/// Reconciles two types inferred for the same column (both may be modified in place).
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
/// Post-processing hook applied to a column type (may modify it in place).
void transformFinalTypeIfNeeded(DataTypePtr & type) override;
/// Flag stored from the `json_strings` constructor argument.
bool json_strings;
/// Starts as true; presumably cleared after the first row is read — confirm in the .cpp.
bool first_row = true;
/// Starts as false; presumably set when the whole input is wrapped in [ ... ] — confirm in the .cpp.
bool data_in_square_brackets = false;
/// JSON inference state whose address is passed to the JSON inference helpers.
JSONInferenceInfo inference_info;
};
}

View File

@ -2,6 +2,7 @@
#include <Formats/JSONUtils.h>
#include <Formats/FormatFactory.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <DataTypes/DataTypeString.h>
namespace DB
@ -85,7 +86,7 @@ NamesAndTypesList JSONObjectEachRowSchemaReader::readRowAndGetNamesAndDataTypes(
JSONUtils::skipComma(in);
JSONUtils::readFieldName(in);
auto names_and_types = JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, format_settings, false);
auto names_and_types = JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, format_settings, &inference_info);
if (!format_settings.json_object_each_row.column_for_object_name.empty())
names_and_types.emplace_front(format_settings.json_object_each_row.column_for_object_name, std::make_shared<DataTypeString>());
return names_and_types;
@ -93,7 +94,12 @@ NamesAndTypesList JSONObjectEachRowSchemaReader::readRowAndGetNamesAndDataTypes(
/// Reconciles two types inferred for the same column by delegating to
/// transformInferredJSONTypesIfNeeded with this reader's format_settings.
/// Both arguments are non-const references, so the helper may rewrite either type.
/// NOTE(review): the call appears twice (without and with `&inference_info`); presumably the
/// pre- and post-change lines of a diff — only one of them should exist in the real file.
void JSONObjectEachRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredJSONTypesIfNeeded(type, new_type, format_settings);
transformInferredJSONTypesIfNeeded(type, new_type, format_settings, &inference_info);
}
/// Post-processing hook for a column type: passes `type` (by reference, so it may be rewritten)
/// to transformJSONTupleToArrayIfPossible along with this reader's format settings and
/// a pointer to its `inference_info`.
/// NOTE(review): the name suggests it runs once on the final merged type — confirm in the base reader.
void JSONObjectEachRowSchemaReader::transformFinalTypeIfNeeded(DataTypePtr & type)
{
    auto * json_info = &inference_info;
    transformJSONTupleToArrayIfPossible(
        type,
        format_settings,
        json_info);
}
void registerInputFormatJSONObjectEachRow(FormatFactory & factory)

View File

@ -4,6 +4,7 @@
#include <Processors/Formats/Impl/JSONEachRowRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Common/HashTable/HashMap.h>
@ -42,8 +43,10 @@ public:
private:
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
void transformFinalTypeIfNeeded(DataTypePtr & type) override;
bool first_row = true;
JSONInferenceInfo inference_info;
};
std::optional<size_t> getColumnIndexForJSONObjectEachRowObjectName(const Block & header, const FormatSettings & settings);

View File

@ -435,7 +435,7 @@ DataTypes MySQLDumpSchemaReader::readRowAndGetDataTypes()
skipFieldDelimiter(in);
readQuotedField(value, in);
auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
auto type = tryInferDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
data_types.push_back(std::move(type));
}
skipEndOfRow(in, table_name);

View File

@ -3,7 +3,7 @@
#if USE_ORC
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>

View File

@ -4,7 +4,7 @@
#if USE_PARQUET
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/copyData.h>
#include <arrow/api.h>

View File

@ -3,6 +3,7 @@
#include <Processors/Formats/Impl/RegexpRowInputFormat.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Formats/newLineSegmentationEngine.h>
#include <IO/ReadHelpers.h>
@ -155,15 +156,15 @@ DataTypes RegexpSchemaReader::readRowAndGetDataTypes()
for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i)
{
String field(field_extractor.getField(i));
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule));
data_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, &json_inference_info));
}
return data_types;
}
/// Reconciles two types inferred for the same column, using the escaping rule configured
/// for the Regexp format (format_settings.regexp.escaping_rule).
/// Both arguments are non-const references, so the helper may rewrite either type.
/// NOTE(review): two variants of the signature and of the call are listed back to back
/// (transformInferredTypesIfNeeded vs. transformInferredTypesByEscapingRuleIfNeeded with
/// `&json_inference_info`); presumably the pre- and post-change lines of a diff — confirm against the real file.
void RegexpSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t)
void RegexpSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings, format_settings.regexp.escaping_rule);
transformInferredTypesByEscapingRuleIfNeeded(type, new_type, format_settings, format_settings.regexp.escaping_rule, &json_inference_info);
}

View File

@ -5,12 +5,13 @@
#include <string>
#include <vector>
#include <Core/Block.h>
#include <IO/PeekableReadBuffer.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/FormatFactory.h>
#include <IO/PeekableReadBuffer.h>
#include <Formats/ParsedTemplateFormatString.h>
#include <Formats/SchemaInferenceUtils.h>
namespace DB
@ -81,12 +82,13 @@ public:
private:
DataTypes readRowAndGetDataTypes() override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
using EscapingRule = FormatSettings::EscapingRule;
RegexpFieldExtractor field_extractor;
PeekableReadBuffer buf;
JSONInferenceInfo json_inference_info;
};
}

View File

@ -249,7 +249,7 @@ NamesAndTypesList TSKVSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
if (has_value)
{
readEscapedString(value, in);
names_and_types.emplace_back(std::move(name), determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped));
names_and_types.emplace_back(std::move(name), tryInferDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped));
}
else
{

View File

@ -268,7 +268,7 @@ DataTypes TabSeparatedSchemaReader::readRowAndGetDataTypes()
return {};
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule());
return tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule());
}
void registerInputFormatTabSeparated(FormatFactory & factory)

View File

@ -2,6 +2,7 @@
#include <Formats/FormatFactory.h>
#include <Formats/verbosePrintString.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/Operators.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/Serializations/SerializationNullable.h>
@ -493,16 +494,16 @@ DataTypes TemplateSchemaReader::readRowAndGetDataTypes()
format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front();
field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings);
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i]));
data_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], &json_inference_info));
}
format_reader.skipRowEndDelimiter();
return data_types;
}
/// Reconciles two types inferred for the same column, using the escaping rule that the
/// template row format assigns to that column (row_format.escaping_rules[...]).
/// Both arguments are non-const references, so the helper may rewrite either type.
/// NOTE(review): two variants of the signature and of the call are listed back to back;
/// presumably the pre- and post-change lines of a diff — confirm against the real file.
/// The first variant indexes the rules with its `column_idx` parameter; the second uses
/// `field_index`, which is not declared in this view — presumably a member maintained by
/// the base schema reader; verify it is set before this hook is called.
void TemplateSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t column_idx)
void TemplateSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings, row_format.escaping_rules[column_idx]);
transformInferredTypesByEscapingRuleIfNeeded(type, new_type, format_settings, row_format.escaping_rules[field_index], &json_inference_info);
}
static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings)

View File

@ -5,6 +5,7 @@
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/ParsedTemplateFormatString.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/ReadHelpers.h>
#include <IO/PeekableReadBuffer.h>
#include <Interpreters/Context.h>
@ -121,13 +122,14 @@ public:
DataTypes readRowAndGetDataTypes() override;
private:
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t column_idx) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
PeekableReadBuffer buf;
const ParsedTemplateFormatString format;
const ParsedTemplateFormatString row_format;
TemplateFormatReader format_reader;
bool first_row = true;
JSONInferenceInfo json_inference_info;
};
bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces);

View File

@ -599,7 +599,7 @@ DataTypes ValuesSchemaReader::readRowAndGetDataTypes()
}
readQuotedField(value, buf);
auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
auto type = tryInferDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
data_types.push_back(std::move(type));
}

View File

@ -0,0 +1,23 @@
<test>
<!-- Performance test for DESC on a file, run once per text format listed in the "format" substitution. -->
<substitutions>
<substitution>
<name>format</name>
<values>
<value>TabSeparated</value>
<value>CSV</value>
<value>Values</value>
<value>JSONEachRow</value>
<value>JSONCompactEachRow</value>
</values>
</substitution>
</substitutions>
<!-- Setup: write 25000 rows of five columns from test.hits into data.{format}, truncating any previous file content. -->
<fill_query>INSERT INTO function file(data.{format}) SELECT WatchID, Title, EventTime, RefererCategories, RefererRegions FROM test.hits LIMIT 25000 SETTINGS engine_file_truncate_on_insert=1</fill_query>
<!-- Measured query: DESC on the file with schema_inference_use_cache_for_file=0 (presumably so every run performs real inference instead of a cache hit). -->
<query>DESC file(data.{format}) SETTINGS schema_inference_use_cache_for_file=0</query>
<!-- Cleanup: inserting zero rows with truncate-on-insert leaves the data file empty. -->
<drop_query>INSERT INTO FUNCTION file(data.{format}) SELECT * FROM numbers(0) SETTINGS engine_file_truncate_on_insert=1</drop_query>
</test>

View File

@ -0,0 +1,48 @@
JSONEachRow
x Array(Nullable(Int64))
x Array(Nullable(Int64))
x Array(Nullable(Int64))
x Array(Nullable(Int64))
x Tuple(Nullable(String), Nullable(Int64))
x Tuple(Nullable(String), Nullable(Int64))
x Map(String, Nullable(Int64))
x Map(String, Nullable(Int64))
x Array(Nullable(Int64))
x Array(Array(Nullable(Int64)))
x Array(Map(String, Nullable(Int64)))
x Array(Array(Nullable(String)))
x Array(Int64)
x Array(Nullable(Int64))
x Array(Int64)
x Array(Nullable(Int64))
JSONCompactEachRow
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Tuple(Nullable(String), Nullable(Int64))
c1 Tuple(Nullable(String), Nullable(Int64))
c1 Map(String, Nullable(Int64))
c1 Map(String, Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Array(Nullable(Int64)))
c1 Array(Map(String, Nullable(Int64)))
c1 Array(Array(Nullable(String)))
c1 Array(Int64)
c1 Array(Nullable(Int64))
c1 Array(Int64)
c1 Array(Nullable(Int64))
CSV
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Map(String, Nullable(Int64))
c1 Map(String, Nullable(Int64))
c1 Array(Array(Nullable(Int64)))
c1 Array(Map(String, Nullable(Int64)))
c1 Array(Array(Nullable(String)))
c1 Array(Int64)
c1 Array(Nullable(Int64))
c1 Array(Int64)
c1 Array(Nullable(Int64))

View File

@ -0,0 +1,63 @@
-- JSONEachRow section: the selected label separates this section's output in the reference file.
select 'JSONEachRow';
-- Part 1: schema_inference_make_columns_nullable enabled.
set schema_inference_make_columns_nullable=1;
-- A number in one row and a non-numeric string in another is expected to fail with TYPE_MISMATCH.
desc format(JSONEachRow, '{"x" : 1234}, {"x" : "String"}') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH }
-- Arrays containing nulls, empty arrays and mixed element types, within one row and across two rows.
desc format(JSONEachRow, '{"x" : [null, 1]}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : []}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : [null]}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : [1, null]}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : ["abc", 1]}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : ["abc", null]}');
-- Objects: empty object vs. populated object, and null values under different keys.
desc format(JSONEachRow, '{"x" : {}}, {"x" : {"a" : 1}}');
desc format(JSONEachRow, '{"x" : {"a" : null}}, {"x" : {"b" : 1}}');
-- A top-level null in one row combined with an array in another.
desc format(JSONEachRow, '{"x" : null}, {"x" : [1, 2]}');
-- Nested arrays / arrays of objects containing empty and null elements.
desc format(JSONEachRow, '{"x" : [[], [null], [1, 2, 3]]}');
desc format(JSONEachRow, '{"x" : [{"a" : null}, {"b" : 1}]}');
desc format(JSONEachRow, '{"x" : [["2020-01-01", null, "1234"], ["abcd"]]}');
-- Part 2: setting disabled; the same array shapes with and without a null in the sample.
set schema_inference_make_columns_nullable=0;
desc format(JSONEachRow, '{"x" : [1, 2]}');
desc format(JSONEachRow, '{"x" : [null, 1]}');
desc format(JSONEachRow, '{"x" : [1, 2]}, {"x" : [3]}');
desc format(JSONEachRow, '{"x" : [1, 2]}, {"x" : [null]}');
-- JSONCompactEachRow section: same scenarios as for JSONEachRow, with each row written as a one-element array.
select 'JSONCompactEachRow';
-- Part 1: schema_inference_make_columns_nullable enabled.
set schema_inference_make_columns_nullable=1;
-- A number in one row and a non-numeric string in another is expected to fail with TYPE_MISMATCH.
desc format(JSONCompactEachRow, '[1234], ["String"]') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH }
-- Arrays containing nulls, empty arrays and mixed element types, within one row and across two rows.
desc format(JSONCompactEachRow, '[[null, 1]]');
desc format(JSONCompactEachRow, '[[null, 1]], [[]]');
desc format(JSONCompactEachRow, '[[null, 1]], [[null]]');
desc format(JSONCompactEachRow, '[[null, 1]], [[1, null]]');
desc format(JSONCompactEachRow, '[[null, 1]], [["abc", 1]]');
desc format(JSONCompactEachRow, '[[null, 1]], [["abc", null]]');
-- Objects: empty object vs. populated object, and null values under different keys.
desc format(JSONCompactEachRow, '[{}], [{"a" : 1}]');
desc format(JSONCompactEachRow, '[{"a" : null}], [{"b" : 1}]');
-- A top-level null in one row combined with an array in another.
desc format(JSONCompactEachRow, '[null], [[1, 2]]');
-- Nested arrays / arrays of objects containing empty and null elements.
desc format(JSONCompactEachRow, '[[[], [null], [1, 2, 3]]]');
desc format(JSONCompactEachRow, '[[{"a" : null}, {"b" : 1}]]');
desc format(JSONCompactEachRow, '[[["2020-01-01", null, "1234"], ["abcd"]]]');
-- Part 2: setting disabled; the same array shapes with and without a null in the sample.
set schema_inference_make_columns_nullable=0;
desc format(JSONCompactEachRow, '[[1, 2]]');
desc format(JSONCompactEachRow, '[[null, 1]]');
desc format(JSONCompactEachRow, '[[1, 2]], [[3]]');
desc format(JSONCompactEachRow, '[[1, 2]], [[null]]');
-- CSV section: each row is a single double-quoted CSV field holding an array or map literal.
select 'CSV';
-- Part 1: schema_inference_make_columns_nullable enabled.
set schema_inference_make_columns_nullable=1;
-- Arrays containing nulls and empty arrays, within one row and across two rows.
desc format(CSV, '"[null, 1]"');
desc format(CSV, '"[null, 1]"\n"[]"');
desc format(CSV, '"[null, 1]"\n"[null]"');
desc format(CSV, '"[null, 1]"\n"[1, null]"');
-- Maps: empty map vs. populated map, and null values under different keys.
desc format(CSV, '"{}"\n"{\'a\' : 1}"');
desc format(CSV, '"{\'a\' : null}"\n"{\'b\' : 1}"');
-- Nested arrays / arrays of maps containing empty and null elements.
desc format(CSV, '"[[], [null], [1, 2, 3]]"');
desc format(CSV, '"[{\'a\' : null}, {\'b\' : 1}]"');
desc format(CSV, '"[[\'2020-01-01\', null, \'1234\'], [\'abcd\']]"');
-- Part 2: setting disabled; arrays with and without a null in the sample (null written in both upper and lower case).
set schema_inference_make_columns_nullable=0;
desc format(CSV, '"[1,2]"');
desc format(CSV, '"[NULL, 1]"');
desc format(CSV, '"[1, 2]"\n"[3]"');
desc format(CSV, '"[1, 2]"\n"[null]"');

View File

@ -0,0 +1 @@
s Nullable(String)

View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Resolve the directory of this script so the shared test configuration can be sourced from its parent.
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# Produce one JSONEachRow row whose column `s` is randomString(100), pipe it into a second
# clickhouse-local instance as table `test`, and print the structure inferred for that input.
# Presumably guards against arbitrary bytes in a JSON string breaking schema inference — confirm with the test's history.
$CLICKHOUSE_LOCAL -q "select randomString(100) as s format JSONEachRow" | $CLICKHOUSE_LOCAL -q "desc test" --table='test' --input-format='JSONEachRow'

View File

@ -0,0 +1,2 @@
c1 Array(Nullable(String))
c1 Nullable(String)

View File

@ -0,0 +1,2 @@
-- Schema inference on string values that end with a backslash-escaped single quote:
-- first inside an array literal in a quoted CSV field, then as a plain quoted value in the Values format.
desc format(CSV, '"[\'abc\\\'\']"');
desc format(Values, '(\'abc\\\'\')');