// Mirror of https://github.com/ClickHouse/ClickHouse.git
// Synced 2024-12-04 21:42:39 +00:00
// C++ source, 1039 lines, 37 KiB
#include <Formats/SchemaInferenceUtils.h>
|
|
#include <DataTypes/DataTypeNullable.h>
|
|
#include <DataTypes/DataTypesNumber.h>
|
|
#include <DataTypes/DataTypeString.h>
|
|
#include <DataTypes/DataTypeDateTime64.h>
|
|
#include <DataTypes/DataTypeDateTime.h>
|
|
#include <DataTypes/DataTypeDate.h>
|
|
#include <DataTypes/DataTypeArray.h>
|
|
#include <DataTypes/DataTypeTuple.h>
|
|
#include <DataTypes/DataTypeMap.h>
|
|
#include <DataTypes/DataTypeLowCardinality.h>
|
|
#include <DataTypes/DataTypeNothing.h>
|
|
#include <DataTypes/transformTypesRecursively.h>
|
|
#include <DataTypes/DataTypeObject.h>
|
|
#include <DataTypes/DataTypeFactory.h>
|
|
#include <IO/ReadBufferFromString.h>
|
|
#include <IO/ReadHelpers.h>
|
|
#include <IO/parseDateTimeBestEffort.h>
|
|
#include <IO/PeekableReadBuffer.h>
|
|
|
|
#include <Core/Block.h>
|
|
#include <Common/assert_cast.h>
|
|
|
|
namespace DB
|
|
{
|
|
|
|
/// Error codes used by this file. `extern` means the constant is only declared here.
namespace ErrorCodes
|
|
{
|
|
extern const int TOO_DEEP_RECURSION;
|
|
}
|
|
|
|
namespace
|
|
{
|
|
bool checkIfTypesAreEqual(const DataTypes & types)
|
|
{
|
|
if (types.empty())
|
|
return true;
|
|
|
|
for (size_t i = 1; i < types.size(); ++i)
|
|
{
|
|
if (!types[0]->equals(*types[i]))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/// If we have both Nothing and non Nothing types, convert all Nothing types to the first non Nothing.
|
|
/// For example if we have types [Nothing, String, Nothing] we change it to [String, String, String]
|
|
void transformNothingSimpleTypes(DataTypes & data_types, const TypeIndexesSet & type_indexes)
|
|
{
|
|
/// Check if we have both Nothing and non Nothing types.
|
|
if (!type_indexes.contains(TypeIndex::Nothing) || type_indexes.size() <= 1)
|
|
return;
|
|
|
|
DataTypePtr not_nothing_type = nullptr;
|
|
for (const auto & type : data_types)
|
|
{
|
|
if (!isNothing(type))
|
|
{
|
|
not_nothing_type = type;
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isNothing(type))
|
|
type = not_nothing_type;
|
|
}
|
|
}
|
|
|
|
/// If we have both Int64 and Float64 types, convert all Int64 to Float64.
|
|
void transformIntegersAndFloatsToFloats(DataTypes & data_types, const TypeIndexesSet & type_indexes)
|
|
{
|
|
if (!type_indexes.contains(TypeIndex::Int64) || !type_indexes.contains(TypeIndex::Float64))
|
|
return;
|
|
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isInteger(type))
|
|
type = std::make_shared<DataTypeFloat64>();
|
|
}
|
|
}
|
|
|
|
/// If we have only Date and DateTime types, convert Date to DateTime,
|
|
/// otherwise, convert all Date and DateTime to String.
|
|
void transformDatesAndDateTimes(DataTypes & data_types, const TypeIndexesSet & type_indexes)
|
|
{
|
|
bool have_dates = type_indexes.contains(TypeIndex::Date);
|
|
bool have_datetimes = type_indexes.contains(TypeIndex::DateTime64);
|
|
bool all_dates_or_datetimes = (type_indexes.size() == (static_cast<size_t>(have_dates) + static_cast<size_t>(have_datetimes)));
|
|
|
|
if (!all_dates_or_datetimes && (have_dates || have_datetimes))
|
|
{
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isDate(type) || isDateTime64(type))
|
|
type = std::make_shared<DataTypeString>();
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
if (have_dates && have_datetimes)
|
|
{
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isDate(type))
|
|
type = std::make_shared<DataTypeDateTime64>(9);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// If we have numbers (Int64/Float64) and String types and numbers were parsed from String,
|
|
/// convert all numbers to String.
|
|
void transformJSONNumbersBackToString(
|
|
DataTypes & data_types, const FormatSettings & settings, const TypeIndexesSet & type_indexes, JSONInferenceInfo * json_info)
|
|
{
|
|
bool have_strings = type_indexes.contains(TypeIndex::String);
|
|
bool have_numbers = type_indexes.contains(TypeIndex::Int64) || type_indexes.contains(TypeIndex::Float64);
|
|
if (!have_strings || !have_numbers)
|
|
return;
|
|
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isNumber(type)
|
|
&& (settings.json.read_numbers_as_strings || !json_info
|
|
|| json_info->numbers_parsed_from_json_strings.contains(type.get())))
|
|
type = std::make_shared<DataTypeString>();
|
|
}
|
|
}
|
|
|
|
/// If we have both Bool and number (Int64/Float64) types,
|
|
/// convert all Bool to Int64/Float64.
|
|
void transformBoolsAndNumbersToNumbers(DataTypes & data_types, const TypeIndexesSet & type_indexes)
|
|
{
|
|
bool have_floats = type_indexes.contains(TypeIndex::Float64);
|
|
bool have_integers = type_indexes.contains(TypeIndex::Int64);
|
|
bool have_bools = type_indexes.contains(TypeIndex::UInt8);
|
|
/// Check if we have both Bool and Integer/Float.
|
|
if (!have_bools || (!have_integers && !have_floats))
|
|
return;
|
|
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isBool(type))
|
|
{
|
|
if (have_integers)
|
|
type = std::make_shared<DataTypeInt64>();
|
|
else
|
|
type = std::make_shared<DataTypeFloat64>();
|
|
}
|
|
}
|
|
}
|
|
|
|
/// If we have type Nothing/Nullable(Nothing) and some other non Nothing types,
|
|
/// convert all Nothing/Nullable(Nothing) types to the first non Nothing.
|
|
/// For example, when we have [Nothing, Array(Int64)] it will convert it to [Array(Int64), Array(Int64)]
|
|
/// (it can happen when transforming complex nested types like [Array(Nothing), Array(Array(Int64))])
|
|
void transformNothingComplexTypes(DataTypes & data_types)
|
|
{
|
|
bool have_nothing = false;
|
|
DataTypePtr not_nothing_type = nullptr;
|
|
for (const auto & type : data_types)
|
|
{
|
|
if (isNothing(removeNullable(type)))
|
|
have_nothing = true;
|
|
else
|
|
not_nothing_type = type;
|
|
}
|
|
|
|
if (!have_nothing || !not_nothing_type)
|
|
return;
|
|
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isNothing(removeNullable(type)))
|
|
type = not_nothing_type;
|
|
}
|
|
}
|
|
|
|
/// If we have both Nullable and non Nullable types, make all types Nullable
|
|
void transformNullableTypes(DataTypes & data_types, const TypeIndexesSet & type_indexes)
|
|
{
|
|
if (!type_indexes.contains(TypeIndex::Nullable))
|
|
return;
|
|
|
|
for (auto & type : data_types)
|
|
{
|
|
if (type->canBeInsideNullable())
|
|
type = makeNullable(type);
|
|
}
|
|
}
|
|
|
|
/// If we have Tuple with the same nested types like Tuple(Int64, Int64),
|
|
/// convert it to Array(Int64). It's used for JSON values.
|
|
/// For example when we had type Tuple(Int64, Nullable(Nothing)) and we
|
|
/// transformed it to Tuple(Nullable(Int64), Nullable(Int64)) we will
|
|
/// also transform it to Array(Nullable(Int64))
|
|
void transformTuplesWithEqualNestedTypesToArrays(DataTypes & data_types, const TypeIndexesSet & type_indexes)
|
|
{
|
|
if (!type_indexes.contains(TypeIndex::Tuple))
|
|
return;
|
|
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isTuple(type))
|
|
{
|
|
const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());
|
|
if (checkIfTypesAreEqual(tuple_type->getElements()))
|
|
type = std::make_shared<DataTypeArray>(tuple_type->getElements().back());
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Forward declaration: transformJSONTuplesAndArraysToArrays below calls this function
/// before its definition, which appears later in this file.
/// `json_info` defaults to nullptr, so callers without JSON inference info may omit it.
template <bool is_json>
|
|
void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info = nullptr);
|
|
|
|
/// If we have Tuple and Array types, try to convert them all to Array
|
|
/// if there is a common type for all nested types.
|
|
/// For example, if we have [Tuple(Nullable(Nothing), String), Array(Date), Tuple(Date, String)]
|
|
/// it will convert them all to Array(String)
|
|
void transformJSONTuplesAndArraysToArrays(
|
|
DataTypes & data_types, const FormatSettings & settings, const TypeIndexesSet & type_indexes, JSONInferenceInfo * json_info)
|
|
{
|
|
if (!type_indexes.contains(TypeIndex::Tuple))
|
|
return;
|
|
|
|
bool have_arrays = type_indexes.contains(TypeIndex::Array);
|
|
bool tuple_sizes_are_equal = true;
|
|
size_t tuple_size = 0;
|
|
for (const auto & type : data_types)
|
|
{
|
|
if (isTuple(type))
|
|
{
|
|
const auto & current_tuple_size = assert_cast<const DataTypeTuple &>(*type).getElements().size();
|
|
if (!tuple_size)
|
|
tuple_size = current_tuple_size;
|
|
else
|
|
tuple_sizes_are_equal &= current_tuple_size == tuple_size;
|
|
}
|
|
}
|
|
|
|
/// Check if we have arrays and tuples with same size.
|
|
if (!have_arrays && !tuple_sizes_are_equal)
|
|
return;
|
|
|
|
DataTypes nested_types;
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isArray(type))
|
|
nested_types.push_back(assert_cast<const DataTypeArray &>(*type).getNestedType());
|
|
else if (isTuple(type))
|
|
{
|
|
const auto & elements = assert_cast<const DataTypeTuple &>(*type).getElements();
|
|
for (const auto & element : elements)
|
|
nested_types.push_back(element);
|
|
}
|
|
}
|
|
|
|
transformInferredTypesIfNeededImpl<true>(nested_types, settings, json_info);
|
|
if (checkIfTypesAreEqual(nested_types))
|
|
{
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isArray(type) || isTuple(type))
|
|
type = std::make_shared<DataTypeArray>(nested_types.back());
|
|
}
|
|
}
|
|
}
|
|
|
|
/// If we have Map and Object(JSON) types, convert all Map types to Object(JSON).
|
|
/// If we have Map types with different value types, convert all Map types to Object(JSON)
|
|
void transformMapsAndObjectsToObjects(DataTypes & data_types, const TypeIndexesSet & type_indexes)
|
|
{
|
|
if (!type_indexes.contains(TypeIndex::Map))
|
|
return;
|
|
|
|
bool have_objects = type_indexes.contains(TypeIndex::Object);
|
|
bool maps_are_equal = true;
|
|
DataTypePtr first_map_type = nullptr;
|
|
for (const auto & type : data_types)
|
|
{
|
|
if (isMap(type))
|
|
{
|
|
if (!first_map_type)
|
|
first_map_type = type;
|
|
else
|
|
maps_are_equal &= type->equals(*first_map_type);
|
|
}
|
|
}
|
|
|
|
if (!have_objects && maps_are_equal)
|
|
return;
|
|
|
|
for (auto & type : data_types)
|
|
{
|
|
if (isMap(type))
|
|
type = std::make_shared<DataTypeObject>("json", true);
|
|
}
|
|
}
|
|
|
|
/// When String is mixed with Map and/or Object types, every Map and Object becomes String.
void transformMapsObjectsAndStringsToStrings(DataTypes & data_types, const TypeIndexesSet & type_indexes)
{
    if (!type_indexes.contains(TypeIndex::String))
        return;

    const bool have_maps_or_objects = type_indexes.contains(TypeIndex::Map) || type_indexes.contains(TypeIndex::Object);
    if (!have_maps_or_objects)
        return;

    for (auto & type : data_types)
    {
        if (!isMap(type) && !isObject(type))
            continue;

        type = std::make_shared<DataTypeString>();
    }
}
|
|
|
|
template <bool is_json>
|
|
void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
|
{
|
|
auto transform_simple_types = [&](DataTypes & data_types, const TypeIndexesSet & type_indexes)
|
|
{
|
|
/// Remove all Nothing type if possible.
|
|
transformNothingSimpleTypes(data_types, type_indexes);
|
|
|
|
/// Transform integers to floats if needed.
|
|
if (settings.try_infer_integers)
|
|
transformIntegersAndFloatsToFloats(data_types, type_indexes);
|
|
|
|
/// Transform Date to DateTime or both to String if needed.
|
|
if (settings.try_infer_dates || settings.try_infer_datetimes)
|
|
transformDatesAndDateTimes(data_types, type_indexes);
|
|
|
|
if constexpr (!is_json)
|
|
return;
|
|
|
|
/// Check settings specific for JSON formats.
|
|
|
|
/// Convert numbers inferred from strings back to strings if needed.
|
|
if (settings.json.try_infer_numbers_from_strings || settings.json.read_numbers_as_strings)
|
|
transformJSONNumbersBackToString(data_types, settings, type_indexes, json_info);
|
|
|
|
/// Convert Bool to number (Int64/Float64) if needed.
|
|
if (settings.json.read_bools_as_numbers)
|
|
transformBoolsAndNumbersToNumbers(data_types, type_indexes);
|
|
};
|
|
|
|
auto transform_complex_types = [&](DataTypes & data_types, const TypeIndexesSet & type_indexes)
|
|
{
|
|
/// Make types Nullable if needed.
|
|
transformNullableTypes(data_types, type_indexes);
|
|
|
|
/// If we have type Nothing, it means that we had empty Array/Map while inference.
|
|
/// If there is at least one non Nothing type, change all Nothing types to it.
|
|
transformNothingComplexTypes(data_types);
|
|
|
|
if constexpr (!is_json)
|
|
return;
|
|
|
|
/// Convert JSON tuples with same nested types to arrays.
|
|
transformTuplesWithEqualNestedTypesToArrays(data_types, type_indexes);
|
|
|
|
/// Convert JSON tuples and arrays to arrays if possible.
|
|
transformJSONTuplesAndArraysToArrays(data_types, settings, type_indexes, json_info);
|
|
|
|
/// Convert Maps to Objects if needed.
|
|
if (settings.json.try_infer_objects)
|
|
transformMapsAndObjectsToObjects(data_types, type_indexes);
|
|
|
|
if (settings.json.read_objects_as_strings)
|
|
transformMapsObjectsAndStringsToStrings(data_types, type_indexes);
|
|
};
|
|
|
|
transformTypesRecursively(types, transform_simple_types, transform_complex_types);
|
|
}
|
|
|
|
/// Forward declaration: the array, tuple and map helpers below call this function
/// recursively before its definition. `depth` defaults to 1 for top-level calls.
template <bool is_json>
|
|
DataTypePtr tryInferDataTypeForSingleFieldImpl(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth = 1);
|
|
|
|
bool tryInferDate(std::string_view field)
|
|
{
|
|
ReadBufferFromString buf(field);
|
|
DayNum tmp;
|
|
return tryReadDateText(tmp, buf) && buf.eof();
|
|
}
|
|
|
|
bool tryInferDateTime(std::string_view field, const FormatSettings & settings)
|
|
{
|
|
if (field.empty())
|
|
return false;
|
|
|
|
ReadBufferFromString buf(field);
|
|
Float64 tmp_float;
|
|
/// Check if it's just a number, and if so, don't try to infer DateTime from it,
|
|
/// because we can interpret this number as a timestamp and it will lead to
|
|
/// inferring DateTime instead of simple Int64/Float64 in some cases.
|
|
if (tryReadFloatText(tmp_float, buf) && buf.eof())
|
|
return false;
|
|
|
|
buf.seek(0, SEEK_SET); /// Return position to the beginning
|
|
DateTime64 tmp;
|
|
switch (settings.date_time_input_format)
|
|
{
|
|
case FormatSettings::DateTimeInputFormat::Basic:
|
|
if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof())
|
|
return true;
|
|
break;
|
|
case FormatSettings::DateTimeInputFormat::BestEffort:
|
|
if (tryParseDateTime64BestEffort(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
|
|
return true;
|
|
break;
|
|
case FormatSettings::DateTimeInputFormat::BestEffortUS:
|
|
if (tryParseDateTime64BestEffortUS(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
|
|
return true;
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
template <bool is_json>
|
|
DataTypePtr tryInferArray(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth)
|
|
{
|
|
assertChar('[', buf);
|
|
skipWhitespaceIfAny(buf);
|
|
|
|
DataTypes nested_types;
|
|
bool first = true;
|
|
bool have_invalid_nested_type = false;
|
|
while (!buf.eof() && *buf.position() != ']')
|
|
{
|
|
if (!first)
|
|
{
|
|
/// Skip field delimiter between array elements.
|
|
if (!checkChar(',', buf))
|
|
return nullptr;
|
|
skipWhitespaceIfAny(buf);
|
|
}
|
|
else
|
|
first = false;
|
|
|
|
auto nested_type = tryInferDataTypeForSingleFieldImpl<is_json>(buf, settings, json_info, depth + 2);
|
|
|
|
if (nested_type)
|
|
nested_types.push_back(nested_type);
|
|
else
|
|
have_invalid_nested_type = true;
|
|
|
|
skipWhitespaceIfAny(buf);
|
|
}
|
|
|
|
/// No ']' at the end.
|
|
if (buf.eof())
|
|
return nullptr;
|
|
|
|
assertChar(']', buf);
|
|
skipWhitespaceIfAny(buf);
|
|
|
|
/// Nested data is invalid.
|
|
if (have_invalid_nested_type)
|
|
return nullptr;
|
|
|
|
/// Empty array has type Array(Nothing)
|
|
if (nested_types.empty())
|
|
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());
|
|
|
|
if (checkIfTypesAreEqual(nested_types))
|
|
return std::make_shared<DataTypeArray>(std::move(nested_types.back()));
|
|
|
|
/// If element types are not equal, we should try to find common type.
|
|
/// If after transformation element types are still different, we return Tuple for JSON and
|
|
/// nullptr for other formats (nullptr means we couldn't infer the type).
|
|
if constexpr (is_json)
|
|
{
|
|
/// For JSON if we have not complete types, we should not try to transform them
|
|
/// and return it as a Tuple.
|
|
/// For example, if we have types [Float64, Nullable(Nothing), Float64]
|
|
/// it can be Array(Float64) or Tuple(Float64, <some_type>, Float64) and
|
|
/// we can't determine which one it is. But we will be able to do it later
|
|
/// when we will have types from other rows for this column.
|
|
/// For example, if in the next row we will have types [Nullable(Nothing), String, Float64],
|
|
/// we can determine the type for this column as Tuple(Nullable(Float64), Nullable(String), Float64).
|
|
for (const auto & type : nested_types)
|
|
{
|
|
if (!checkIfTypeIsComplete(type))
|
|
return std::make_shared<DataTypeTuple>(nested_types);
|
|
}
|
|
|
|
auto nested_types_copy = nested_types;
|
|
transformInferredTypesIfNeededImpl<is_json>(nested_types_copy, settings, json_info);
|
|
|
|
if (checkIfTypesAreEqual(nested_types_copy))
|
|
return std::make_shared<DataTypeArray>(nested_types_copy.back());
|
|
|
|
return std::make_shared<DataTypeTuple>(nested_types);
|
|
}
|
|
else
|
|
{
|
|
transformInferredTypesIfNeededImpl<is_json>(nested_types, settings);
|
|
if (checkIfTypesAreEqual(nested_types))
|
|
return std::make_shared<DataTypeArray>(nested_types.back());
|
|
|
|
/// We couldn't determine common type for array element.
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
/// Infers the type of a parenthesised list `(elem1, elem2, ...)` as Tuple(...);
/// the buffer must be positioned on '('. Elements are always inferred with the non-JSON rules.
/// Returns nullptr if the list is malformed, empty, or contains an element that could not be inferred.
DataTypePtr tryInferTuple(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth)
{
    assertChar('(', buf);
    skipWhitespaceIfAny(buf);

    DataTypes element_types;
    bool has_invalid_element = false;
    for (bool is_first_element = true; !buf.eof() && *buf.position() != ')'; is_first_element = false)
    {
        if (!is_first_element)
        {
            /// Elements after the first one must be preceded by a comma.
            if (!checkChar(',', buf))
                return nullptr;
            skipWhitespaceIfAny(buf);
        }

        auto element_type = tryInferDataTypeForSingleFieldImpl<false>(buf, settings, json_info, depth + 1);

        /// An invalid element does not stop the loop: the rest of the list is still read.
        if (!element_type)
            has_invalid_element = true;
        else
            element_types.push_back(element_type);

        skipWhitespaceIfAny(buf);
    }

    /// The input ended before the closing ')'.
    if (buf.eof())
        return nullptr;

    assertChar(')', buf);
    skipWhitespaceIfAny(buf);

    const bool unusable = has_invalid_element || element_types.empty();
    if (unusable)
        return nullptr;

    return std::make_shared<DataTypeTuple>(element_types);
}
|
|
|
|
/// Infers Int64 or Float64 from the number at the current buffer position.
/// When settings.try_infer_integers is off, every number is Float64.
/// Otherwise the text is parsed both as an integer and as a float: if the float parser stops
/// exactly where the integer parser stopped, the number is an Int64, else a Float64.
/// Returns nullptr if the buffer is exhausted or the text is not a number.
DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings)
{
    if (buf.eof())
        return nullptr;

    Float64 float_value;

    if (!settings.try_infer_integers)
    {
        if (tryReadFloatText(float_value, buf))
            return std::make_shared<DataTypeFloat64>();

        /// This is not a number.
        return nullptr;
    }

    /// Reading from a string lets us rewind by simply restoring the position pointer.
    if (dynamic_cast<ReadBufferFromString *>(&buf) != nullptr)
    {
        /// Start of the number, to come back to it for the float attempt.
        char * const number_start = buf.position();
        Int64 int_value;
        const bool int_parsed = tryReadIntText(int_value, buf);

        /// At eof it cannot be a float: a float needs no less data than an integer.
        if (buf.eof())
        {
            if (int_parsed)
                return std::make_shared<DataTypeInt64>();
            return nullptr;
        }

        char * const int_end = buf.position();

        /// Rewinding is safe: the data lives in a string and eof was not reached.
        buf.position() = number_start;
        if (!tryReadFloatText(float_value, buf))
            return nullptr;

        if (int_parsed && buf.position() == int_end)
            return std::make_shared<DataTypeInt64>();

        return std::make_shared<DataTypeFloat64>();
    }

    /// For any other buffer a PeekableReadBuffer with a checkpoint is required to be able
    /// to roll back to the start of the number after the integer attempt.
    PeekableReadBuffer peekable_buf(buf);
    PeekableReadBufferCheckpoint checkpoint(peekable_buf);
    Int64 int_value;
    const bool int_parsed = tryReadIntText(int_value, peekable_buf);
    auto * int_end = peekable_buf.position();
    peekable_buf.rollbackToCheckpoint(true);

    if (!tryReadFloatText(float_value, peekable_buf))
    {
        /// This is not a number.
        return nullptr;
    }

    /// Float parsing reads no fewer bytes than integer parsing, so the position is either
    /// the same (it is an integer) or further (it is a float).
    if (int_parsed && peekable_buf.position() == int_end)
        return std::make_shared<DataTypeInt64>();

    return std::make_shared<DataTypeFloat64>();
}
|
|
|
|
template <bool is_json>
|
|
DataTypePtr tryInferString(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
|
|
{
|
|
String field;
|
|
bool ok = true;
|
|
if constexpr (is_json)
|
|
ok = tryReadJSONStringInto(field, buf);
|
|
else
|
|
ok = tryReadQuotedStringInto(field, buf);
|
|
|
|
if (!ok)
|
|
return nullptr;
|
|
|
|
skipWhitespaceIfAny(buf);
|
|
|
|
/// If it's object key, we should just return String type.
|
|
if constexpr (is_json)
|
|
{
|
|
if (json_info->is_object_key)
|
|
return std::make_shared<DataTypeString>();
|
|
}
|
|
|
|
if (auto type = tryInferDateOrDateTimeFromString(field, settings))
|
|
return type;
|
|
|
|
if constexpr (is_json)
|
|
{
|
|
if (settings.json.try_infer_numbers_from_strings)
|
|
{
|
|
if (auto number_type = tryInferNumberFromString(field, settings))
|
|
{
|
|
json_info->numbers_parsed_from_json_strings.insert(number_type.get());
|
|
return number_type;
|
|
}
|
|
}
|
|
}
|
|
|
|
return std::make_shared<DataTypeString>();
|
|
}
|
|
|
|
template <bool is_json>
|
|
DataTypePtr tryInferMapOrObject(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth)
|
|
{
|
|
assertChar('{', buf);
|
|
skipWhitespaceIfAny(buf);
|
|
|
|
DataTypes key_types;
|
|
DataTypes value_types;
|
|
bool first = true;
|
|
bool have_invalid_nested_type = false;
|
|
while (!buf.eof() && *buf.position() != '}')
|
|
{
|
|
if (!first)
|
|
{
|
|
if (!checkChar(',', buf))
|
|
return nullptr;
|
|
skipWhitespaceIfAny(buf);
|
|
}
|
|
else
|
|
first = false;
|
|
|
|
DataTypePtr key_type;
|
|
if constexpr (is_json)
|
|
{
|
|
/// For JSON key type must be String.
|
|
json_info->is_object_key = true;
|
|
key_type = tryInferString<is_json>(buf, settings, json_info);
|
|
json_info->is_object_key = false;
|
|
}
|
|
else
|
|
{
|
|
key_type = tryInferDataTypeForSingleFieldImpl<is_json>(buf, settings, nullptr, depth + 1);
|
|
}
|
|
|
|
if (key_type)
|
|
key_types.push_back(key_type);
|
|
else
|
|
have_invalid_nested_type = true;
|
|
|
|
skipWhitespaceIfAny(buf);
|
|
if (!checkChar(':', buf))
|
|
return nullptr;
|
|
skipWhitespaceIfAny(buf);
|
|
|
|
auto value_type = tryInferDataTypeForSingleFieldImpl<is_json>(buf, settings, json_info, depth + 1);
|
|
if (value_type)
|
|
value_types.push_back(value_type);
|
|
else
|
|
have_invalid_nested_type = true;
|
|
skipWhitespaceIfAny(buf);
|
|
}
|
|
|
|
/// No '}' at the end.
|
|
if (buf.eof())
|
|
return nullptr;
|
|
|
|
assertChar('}', buf);
|
|
skipWhitespaceIfAny(buf);
|
|
|
|
/// Nested data is invalid.
|
|
if (have_invalid_nested_type)
|
|
return nullptr;
|
|
|
|
if (key_types.empty())
|
|
{
|
|
if constexpr (is_json)
|
|
{
|
|
if (settings.json.try_infer_objects)
|
|
return std::make_shared<DataTypeObject>("json", true);
|
|
}
|
|
/// Empty Map is Map(Nothing, Nothing)
|
|
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeNothing>(), std::make_shared<DataTypeNothing>());
|
|
}
|
|
|
|
if constexpr (is_json)
|
|
{
|
|
/// If it's JSON field and one of value types is JSON Object, return also JSON Object.
|
|
for (const auto & value_type : value_types)
|
|
{
|
|
if (isObject(value_type))
|
|
return std::make_shared<DataTypeObject>("json", true);
|
|
}
|
|
|
|
transformInferredTypesIfNeededImpl<is_json>(value_types, settings, json_info);
|
|
if (!checkIfTypesAreEqual(value_types))
|
|
{
|
|
if (settings.json.try_infer_objects)
|
|
return std::make_shared<DataTypeObject>("json", true);
|
|
if (settings.json.read_objects_as_strings)
|
|
return makeNullable(std::make_shared<DataTypeString>());
|
|
return nullptr;
|
|
}
|
|
|
|
return std::make_shared<DataTypeMap>(key_types.back(), value_types.back());
|
|
}
|
|
|
|
if (!checkIfTypesAreEqual(key_types))
|
|
transformInferredTypesIfNeededImpl<is_json>(key_types, settings);
|
|
if (!checkIfTypesAreEqual(value_types))
|
|
transformInferredTypesIfNeededImpl<is_json>(value_types, settings);
|
|
|
|
if (!checkIfTypesAreEqual(key_types) || !checkIfTypesAreEqual(value_types))
|
|
return nullptr;
|
|
|
|
auto key_type = removeNullable(key_types.back());
|
|
if (!DataTypeMap::checkKeyType(key_type))
|
|
return nullptr;
|
|
|
|
return std::make_shared<DataTypeMap>(key_type, value_types.back());
|
|
}
|
|
|
|
template <bool is_json>
|
|
DataTypePtr tryInferDataTypeForSingleFieldImpl(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth)
|
|
{
|
|
if (depth > settings.max_parser_depth)
|
|
throw Exception(ErrorCodes::TOO_DEEP_RECURSION,
|
|
"Maximum parse depth ({}) exceeded. Consider rising max_parser_depth setting.", settings.max_parser_depth);
|
|
|
|
skipWhitespaceIfAny(buf);
|
|
|
|
if (buf.eof())
|
|
return nullptr;
|
|
|
|
/// Array [field1, field2, ...]
|
|
if (*buf.position() == '[')
|
|
return tryInferArray<is_json>(buf, settings, json_info, depth);
|
|
|
|
/// Tuple (field1, field2, ...), if format is not JSON
|
|
if constexpr (!is_json)
|
|
{
|
|
if (*buf.position() == '(')
|
|
return tryInferTuple(buf, settings, json_info, depth);
|
|
}
|
|
|
|
/// Map/Object for JSON { key1 : value1, key2 : value2, ...}
|
|
if (*buf.position() == '{')
|
|
return tryInferMapOrObject<is_json>(buf, settings, json_info, depth);
|
|
|
|
/// String
|
|
char quote = is_json ? '"' : '\'';
|
|
if (*buf.position() == quote)
|
|
return tryInferString<is_json>(buf, settings, json_info);
|
|
|
|
/// Bool
|
|
if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf))
|
|
return DataTypeFactory::instance().get("Bool");
|
|
|
|
/// Null or NaN
|
|
if (checkCharCaseInsensitive('n', buf))
|
|
{
|
|
if (checkStringCaseInsensitive("ull", buf))
|
|
return makeNullable(std::make_shared<DataTypeNothing>());
|
|
else if (checkStringCaseInsensitive("an", buf))
|
|
return std::make_shared<DataTypeFloat64>();
|
|
}
|
|
|
|
/// Number
|
|
return tryInferNumber(buf, settings);
|
|
}
|
|
}
|
|
|
|
/// Applies the generic (non-JSON) type transformations to a pair of types in place.
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings)
{
    DataTypes type_pair{first, second};
    transformInferredTypesIfNeededImpl<false>(type_pair, settings, nullptr);

    /// Hand the (possibly replaced) types back to the caller.
    first = std::move(type_pair.front());
    second = std::move(type_pair.back());
}
|
|
|
|
/// Applies the generic and the JSON-specific type transformations to a pair of types in place.
void transformInferredJSONTypesIfNeeded(
    DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    DataTypes type_pair{first, second};
    transformInferredTypesIfNeededImpl<true>(type_pair, settings, json_info);

    /// Hand the (possibly replaced) types back to the caller.
    first = std::move(type_pair.front());
    second = std::move(type_pair.back());
}
|
|
|
|
/// Recursively replaces Tuples by Arrays wherever all tuple elements can be brought
/// to a common type; `data_type` is updated in place.
///  - Array(T): T is processed recursively.
///  - Map(K, V): V is processed recursively, K is kept as is.
///  - Tuple(...): elements are processed recursively; if the JSON transformations give them a
///    common type T, the tuple becomes Array(T), otherwise it stays a Tuple of the processed elements.
/// Other types and a null `data_type` are left untouched.
void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    if (!data_type)
        return;

    if (const auto * array_type = typeid_cast<const DataTypeArray *>(data_type.get()))
    {
        auto nested_type = array_type->getNestedType();
        transformJSONTupleToArrayIfPossible(nested_type, settings, json_info);
        data_type = std::make_shared<DataTypeArray>(nested_type);
        return;
    }

    if (const auto * map_type = typeid_cast<const DataTypeMap *>(data_type.get()))
    {
        auto value_type = map_type->getValueType();
        transformJSONTupleToArrayIfPossible(value_type, settings, json_info);
        data_type = std::make_shared<DataTypeMap>(map_type->getKeyType(), value_type);
        return;
    }

    if (const auto * tuple_type = typeid_cast<const DataTypeTuple *>(data_type.get()))
    {
        auto nested_types = tuple_type->getElements();
        for (auto & nested_type : nested_types)
            transformJSONTupleToArrayIfPossible(nested_type, settings, json_info);

        /// Transform a copy: the processed elements are still needed if the tuple has to stay a Tuple.
        auto nested_types_copy = nested_types;
        transformInferredTypesIfNeededImpl<true>(nested_types_copy, settings, json_info);

        /// checkIfTypesAreEqual() returns true for an empty list, so emptiness is checked
        /// explicitly: calling back() on an empty vector is undefined behaviour.
        if (!nested_types_copy.empty() && checkIfTypesAreEqual(nested_types_copy))
            data_type = std::make_shared<DataTypeArray>(nested_types_copy.back());
        else
            data_type = std::make_shared<DataTypeTuple>(nested_types);

        return;
    }
}
|
|
|
|
/// Infers a number type from a whole string field:
/// Int64 if integers are inferred and the entire field parses as an integer,
/// Float64 if the entire field parses as a float, nullptr otherwise.
DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings)
{
    ReadBufferFromString buf(field);

    if (settings.try_infer_integers)
    {
        Int64 int_value;
        const bool is_whole_integer = tryReadIntText(int_value, buf) && buf.eof();
        if (is_whole_integer)
            return std::make_shared<DataTypeInt64>();
    }

    /// Rewind to the start of the field; this is possible because the data lives in a string.
    buf.position() = buf.buffer().begin();

    Float64 float_value;
    if (!tryReadFloatText(float_value, buf))
        return nullptr;

    /// Trailing characters mean the field is not a plain number.
    if (!buf.eof())
        return nullptr;

    return std::make_shared<DataTypeFloat64>();
}
|
|
|
|
/// Infers Date or DateTime64(9) from a string field, honouring settings.try_infer_dates and
/// settings.try_infer_datetimes. Date is tried first. Returns nullptr if neither applies.
DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings)
{
    const bool is_date = settings.try_infer_dates && tryInferDate(field);
    if (is_date)
        return std::make_shared<DataTypeDate>();

    const bool is_datetime = settings.try_infer_datetimes && tryInferDateTime(field, settings);
    if (is_datetime)
        return std::make_shared<DataTypeDateTime64>(9);

    return nullptr;
}
|
|
|
|
/// Infers the type of a single non-JSON field read from `buf`.
/// Data left in the buffer after the field is not checked here.
DataTypePtr tryInferDataTypeForSingleField(ReadBuffer & buf, const FormatSettings & settings)
{
    auto inferred_type = tryInferDataTypeForSingleFieldImpl<false>(buf, settings, nullptr);
    return inferred_type;
}
|
|
|
|
/// Infers the type of a single non-JSON field given as a string.
/// The whole string must be consumed by the inference, otherwise nullptr is returned.
DataTypePtr tryInferDataTypeForSingleField(std::string_view field, const FormatSettings & settings)
{
    ReadBufferFromString in(field);
    auto inferred_type = tryInferDataTypeForSingleFieldImpl<false>(in, settings, nullptr);

    /// Unread data means the field as a whole does not have the inferred type.
    if (in.eof())
        return inferred_type;

    return nullptr;
}
|
|
|
|
/// Infers the type of a single JSON field read from `buf`.
/// Data left in the buffer after the field is not checked here.
DataTypePtr tryInferDataTypeForSingleJSONField(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    auto inferred_type = tryInferDataTypeForSingleFieldImpl<true>(buf, settings, json_info);
    return inferred_type;
}
|
|
|
|
/// Infers the type of a single JSON field given as a string.
/// The whole string must be consumed by the inference, otherwise nullptr is returned.
DataTypePtr tryInferDataTypeForSingleJSONField(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
    ReadBufferFromString in(field);
    auto inferred_type = tryInferDataTypeForSingleFieldImpl<true>(in, settings, json_info);

    /// Unread data means the field as a whole does not have the inferred type.
    if (in.eof())
        return inferred_type;

    return nullptr;
}
|
|
|
|
/// Returns a copy of the type in which Nullable is applied at every level where it is handled:
///  - Nullable types are returned as is;
///  - Array, Tuple (element names are preserved), Map and LowCardinality are rebuilt from
///    their recursively processed nested types; the Map key has its Nullable wrapper removed;
///  - Object is returned as is if it already has nullable subcolumns, otherwise it is recreated
///    with the same schema format and nullable subcolumns enabled;
///  - any other type is wrapped with makeNullable.
/// Returns nullptr for a null input or when a nested type turns out to be null.
DataTypePtr makeNullableRecursively(DataTypePtr type)
{
    if (!type)
        return nullptr;

    WhichDataType which(type);

    if (which.isNullable())
        return type;

    if (which.isArray())
    {
        const auto & array_type = assert_cast<const DataTypeArray &>(*type);
        auto nullable_nested = makeNullableRecursively(array_type.getNestedType());
        if (!nullable_nested)
            return nullptr;

        return std::make_shared<DataTypeArray>(nullable_nested);
    }

    if (which.isTuple())
    {
        const auto & tuple_type = assert_cast<const DataTypeTuple &>(*type);

        DataTypes nullable_elements;
        for (const auto & element : tuple_type.getElements())
        {
            auto nullable_element = makeNullableRecursively(element);
            if (!nullable_element)
                return nullptr;

            nullable_elements.push_back(nullable_element);
        }

        /// Keep explicit element names, if the tuple has them.
        if (tuple_type.haveExplicitNames())
            return std::make_shared<DataTypeTuple>(std::move(nullable_elements), tuple_type.getElementNames());

        return std::make_shared<DataTypeTuple>(std::move(nullable_elements));
    }

    if (which.isMap())
    {
        const auto & map_type = assert_cast<const DataTypeMap &>(*type);
        auto nullable_key = makeNullableRecursively(map_type.getKeyType());
        auto nullable_value = makeNullableRecursively(map_type.getValueType());
        if (!nullable_key || !nullable_value)
            return nullptr;

        /// The key itself is stored without the Nullable wrapper.
        return std::make_shared<DataTypeMap>(removeNullable(nullable_key), nullable_value);
    }

    if (which.isLowCarnality())
    {
        const auto & lc_type = assert_cast<const DataTypeLowCardinality &>(*type);
        auto nullable_dictionary = makeNullableRecursively(lc_type.getDictionaryType());
        if (!nullable_dictionary)
            return nullptr;

        return std::make_shared<DataTypeLowCardinality>(nullable_dictionary);
    }

    if (which.isObject())
    {
        const auto & object_type = assert_cast<const DataTypeObject &>(*type);
        if (object_type.hasNullableSubcolumns())
            return type;

        return std::make_shared<DataTypeObject>(object_type.getSchemaFormat(), true);
    }

    return makeNullable(type);
}
|
|
|
|
/// Builds the list of the header's columns where every type is replaced
/// with the result of makeNullableRecursively.
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header)
{
    NamesAndTypesList names_and_nullable_types;
    for (const auto & [column_name, column_type] : header.getNamesAndTypesList())
    {
        auto nullable_type = makeNullableRecursively(column_type);
        names_and_nullable_types.emplace_back(column_name, nullable_type);
    }

    return names_and_nullable_types;
}
|
|
|
|
bool checkIfTypeIsComplete(const DataTypePtr & type)
|
|
{
|
|
if (!type)
|
|
return false;
|
|
|
|
WhichDataType which(type);
|
|
|
|
if (which.isNothing())
|
|
return false;
|
|
|
|
if (which.isNullable())
|
|
return checkIfTypeIsComplete(assert_cast<const DataTypeNullable *>(type.get())->getNestedType());
|
|
|
|
if (which.isArray())
|
|
return checkIfTypeIsComplete(assert_cast<const DataTypeArray *>(type.get())->getNestedType());
|
|
|
|
if (which.isTuple())
|
|
{
|
|
const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());
|
|
for (const auto & element : tuple_type->getElements())
|
|
{
|
|
if (!checkIfTypeIsComplete(element))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
if (which.isMap())
|
|
{
|
|
const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
|
|
if (!checkIfTypeIsComplete(map_type->getKeyType()))
|
|
return false;
|
|
return checkIfTypeIsComplete(map_type->getValueType());
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
}
|