mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 07:01:59 +00:00
Merge pull request #35582 from Avogar/improve-schema-inference
Improve schema inference and add some fixes
This commit is contained in:
commit
d7b88d7683
@ -641,6 +641,12 @@ class IColumn;
|
||||
M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
|
||||
M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \
|
||||
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \
|
||||
M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
|
||||
M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
|
||||
M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Parquet", 0) \
|
||||
M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format ORC", 0) \
|
||||
M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Arrow", 0) \
|
||||
M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
|
||||
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
|
||||
\
|
||||
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \
|
||||
|
@ -45,22 +45,7 @@ DataTypeMap::DataTypeMap(const DataTypePtr & key_type_, const DataTypePtr & valu
|
||||
|
||||
void DataTypeMap::assertKeyType() const
|
||||
{
|
||||
bool type_error = false;
|
||||
if (key_type->getTypeId() == TypeIndex::LowCardinality)
|
||||
{
|
||||
const auto & low_cardinality_data_type = assert_cast<const DataTypeLowCardinality &>(*key_type);
|
||||
if (!isStringOrFixedString(*(low_cardinality_data_type.getDictionaryType())))
|
||||
type_error = true;
|
||||
}
|
||||
else if (!key_type->isValueRepresentedByInteger()
|
||||
&& !isStringOrFixedString(*key_type)
|
||||
&& !WhichDataType(key_type).isNothing()
|
||||
&& !WhichDataType(key_type).isUUID())
|
||||
{
|
||||
type_error = true;
|
||||
}
|
||||
|
||||
if (type_error)
|
||||
if (!checkKeyType(key_type))
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Type of Map key must be a type, that can be represented by integer or String or FixedString (possibly LowCardinality) or UUID,"
|
||||
" but {} given", key_type->getName());
|
||||
@ -102,6 +87,25 @@ bool DataTypeMap::equals(const IDataType & rhs) const
|
||||
return nested->equals(*rhs_map.nested);
|
||||
}
|
||||
|
||||
/// Returns true if the given type may be used as a Map key:
///  - LowCardinality over String or FixedString;
///  - any type whose value is represented by an integer;
///  - String or FixedString;
///  - Nothing;
///  - UUID.
/// Any other type is rejected.
bool DataTypeMap::checkKeyType(DataTypePtr key_type)
{
    /// For LowCardinality only the dictionary type matters.
    if (key_type->getTypeId() == TypeIndex::LowCardinality)
    {
        const auto & lc_type = assert_cast<const DataTypeLowCardinality &>(*key_type);
        return isStringOrFixedString(*(lc_type.getDictionaryType()));
    }

    return key_type->isValueRepresentedByInteger()
        || isStringOrFixedString(*key_type)
        || WhichDataType(key_type).isNothing()
        || WhichDataType(key_type).isUUID();
}
|
||||
|
||||
static DataTypePtr create(const ASTPtr & arguments)
|
||||
{
|
||||
if (!arguments || arguments->children.size() != 2)
|
||||
|
@ -48,6 +48,8 @@ public:
|
||||
|
||||
SerializationPtr doGetDefaultSerialization() const override;
|
||||
|
||||
static bool checkKeyType(DataTypePtr key_type);
|
||||
|
||||
private:
|
||||
void assertKeyType() const;
|
||||
};
|
||||
|
@ -5,12 +5,17 @@
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/DataTypeFactory.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeNothing.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/getLeastSupertype.h>
|
||||
#include <DataTypes/DataTypeMap.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Poco/JSON/Parser.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <Parsers/TokenIterator.h>
|
||||
#include <Parsers/ExpressionListParsers.h>
|
||||
#include <Interpreters/evaluateConstantExpression.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -18,7 +23,6 @@ namespace DB
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule)
|
||||
@ -138,7 +142,8 @@ bool deserializeFieldByEscapingRule(
|
||||
serialization->deserializeTextRaw(column, buf, format_settings);
|
||||
break;
|
||||
default:
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule));
|
||||
throw Exception(
|
||||
ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule));
|
||||
}
|
||||
return read;
|
||||
}
|
||||
@ -176,7 +181,8 @@ void serializeFieldByEscapingRule(
|
||||
}
|
||||
}
|
||||
|
||||
void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
|
||||
void writeStringByEscapingRule(
|
||||
const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
|
||||
{
|
||||
switch (escaping_rule)
|
||||
{
|
||||
@ -249,85 +255,269 @@ String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule e
|
||||
return readByEscapingRule<true>(buf, escaping_rule, format_settings);
|
||||
}
|
||||
|
||||
static bool evaluateConstantExpressionFromString(const StringRef & field, DataTypePtr & type, ContextPtr context)
|
||||
/// Tries to infer the type of a single value written in text form and consumes it from buf.
/// The following forms are tried in this order:
///  - Array:  [v1, v2, ...]    -> Array(least supertype of elements), Array(Nothing) if empty;
///  - Tuple:  (v1, v2, ...)    -> Tuple(types of elements), an empty tuple is not accepted;
///  - Map:    {k1 : v1, ...}   -> Map(least supertype of keys, least supertype of values),
///                                Map(Nothing, Nothing) if empty;
///  - String: '...'            -> String;
///  - Bool:   true/false (case insensitive) -> Bool;
///  - NULL (case insensitive)  -> Nothing;
///  - Number                   -> Float64.
/// Returns nullptr if the buffer is empty, the value is malformed, nested types have no
/// common supertype or the inferred Map key type is not allowed by DataTypeMap::checkKeyType.
/// When nullptr is returned the buffer may be left in the middle of the value.
static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf)
{
    if (buf.eof())
        return nullptr;

    /// Array
    if (checkChar('[', buf))
    {
        skipWhitespaceIfAny(buf);

        DataTypes nested_types;
        bool first = true;
        while (!buf.eof() && *buf.position() != ']')
        {
            if (!first)
            {
                /// Every element except the first one must be preceded by a comma.
                skipWhitespaceIfAny(buf);
                if (!checkChar(',', buf))
                    return nullptr;
                skipWhitespaceIfAny(buf);
            }
            else
                first = false;

            auto nested_type = determineDataTypeForSingleFieldImpl(buf);
            if (!nested_type)
                return nullptr;

            nested_types.push_back(nested_type);
        }

        /// The closing bracket was not found.
        if (buf.eof())
            return nullptr;

        ++buf.position();

        if (nested_types.empty())
            return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());

        auto least_supertype = tryGetLeastSupertype(nested_types);
        if (!least_supertype)
            return nullptr;

        return std::make_shared<DataTypeArray>(least_supertype);
    }

    /// Tuple
    if (checkChar('(', buf))
    {
        skipWhitespaceIfAny(buf);

        DataTypes nested_types;
        bool first = true;
        while (!buf.eof() && *buf.position() != ')')
        {
            if (!first)
            {
                skipWhitespaceIfAny(buf);
                if (!checkChar(',', buf))
                    return nullptr;
                skipWhitespaceIfAny(buf);
            }
            else
                first = false;

            auto nested_type = determineDataTypeForSingleFieldImpl(buf);
            if (!nested_type)
                return nullptr;

            nested_types.push_back(nested_type);
        }

        /// The closing bracket was not found or the tuple is empty.
        if (buf.eof() || nested_types.empty())
            return nullptr;

        ++buf.position();

        return std::make_shared<DataTypeTuple>(nested_types);
    }

    /// Map
    if (checkChar('{', buf))
    {
        skipWhitespaceIfAny(buf);

        DataTypes key_types;
        DataTypes value_types;
        bool first = true;
        while (!buf.eof() && *buf.position() != '}')
        {
            if (!first)
            {
                skipWhitespaceIfAny(buf);
                if (!checkChar(',', buf))
                    return nullptr;
                skipWhitespaceIfAny(buf);
            }
            else
                first = false;

            auto key_type = determineDataTypeForSingleFieldImpl(buf);
            if (!key_type)
                return nullptr;

            key_types.push_back(key_type);

            /// Key and value are separated by a colon.
            skipWhitespaceIfAny(buf);
            if (!checkChar(':', buf))
                return nullptr;
            skipWhitespaceIfAny(buf);

            auto value_type = determineDataTypeForSingleFieldImpl(buf);
            if (!value_type)
                return nullptr;

            value_types.push_back(value_type);
        }

        /// The closing bracket was not found.
        if (buf.eof())
            return nullptr;

        ++buf.position();
        skipWhitespaceIfAny(buf);

        if (key_types.empty())
            return std::make_shared<DataTypeMap>(std::make_shared<DataTypeNothing>(), std::make_shared<DataTypeNothing>());

        auto key_least_supertype = tryGetLeastSupertype(key_types);

        auto value_least_supertype = tryGetLeastSupertype(value_types);
        if (!key_least_supertype || !value_least_supertype)
            return nullptr;

        if (!DataTypeMap::checkKeyType(key_least_supertype))
            return nullptr;

        return std::make_shared<DataTypeMap>(key_least_supertype, value_least_supertype);
    }

    /// String
    if (*buf.position() == '\'')
    {
        ++buf.position();
        while (!buf.eof())
        {
            char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());
            buf.position() = next_pos;

            /// Nothing interesting in the current piece of data, try the next one.
            if (!buf.hasPendingData())
                continue;

            if (*buf.position() == '\'')
                break;

            if (*buf.position() == '\\')
            {
                /// Skip the backslash together with the character it escapes,
                /// so that an escaped quote (\') is not taken for the end of the string.
                ++buf.position();
                if (!buf.eof())
                    ++buf.position();
            }
        }

        /// The closing quote was not found.
        if (buf.eof())
            return nullptr;

        ++buf.position();
        return std::make_shared<DataTypeString>();
    }

    /// Bool
    if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf))
        return DataTypeFactory::instance().get("Bool");

    /// Null
    if (checkStringCaseInsensitive("NULL", buf))
        return std::make_shared<DataTypeNothing>();

    /// Number
    Float64 tmp;
    if (tryReadFloatText(tmp, buf))
        return std::make_shared<DataTypeFloat64>();

    return nullptr;
}
|
||||
|
||||
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context)
|
||||
/// Infers the type of a single value from buf with determineDataTypeForSingleFieldImpl
/// and passes the result through makeNullableRecursivelyAndCheckForNothing.
static DataTypePtr determineDataTypeForSingleField(ReadBuffer & buf)
{
    auto inferred_type = determineDataTypeForSingleFieldImpl(buf);
    return makeNullableRecursivelyAndCheckForNothing(inferred_type);
}
|
||||
|
||||
/// Tries to determine the data type of a single field according to its escaping rule.
/// Returns nullptr when the type cannot be determined from this field.
///  - Quoted: the whole field must be consumed by determineDataTypeForSingleField, otherwise nullptr.
///  - JSON: the type is taken from the parsed JSON field.
///  - CSV: see the comments in the corresponding branch.
///  - Raw and Escaped (TSV): see the comments in the corresponding branch.
/// Throws BAD_ARGUMENTS for any other escaping rule.
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
{
    switch (escaping_rule)
    {
        case FormatSettings::EscapingRule::Quoted:
        {
            ReadBufferFromString buf(field);
            auto type = determineDataTypeForSingleField(buf);
            return buf.eof() ? type : nullptr;
        }
        case FormatSettings::EscapingRule::JSON:
            return getDataTypeFromJSONField(field);
        case FormatSettings::EscapingRule::CSV:
        {
            /// Without best effort everything is a string.
            if (!format_settings.csv.input_format_use_best_effort_in_schema_inference)
                return makeNullable(std::make_shared<DataTypeString>());

            if (field.empty() || field == format_settings.csv.null_representation)
                return nullptr;

            if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
                return DataTypeFactory::instance().get("Nullable(Bool)");

            if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"')))
            {
                ReadBufferFromString buf(std::string_view(field.data() + 1, field.size() - 2));
                /// Try to determine the type of value inside quotes
                auto type = determineDataTypeForSingleField(buf);

                if (!type)
                    return nullptr;

                /// If it's a number or tuple in quotes or there is some unread data in buffer, we determine it as a string.
                if (isNumber(removeNullable(type)) || isTuple(type) || !buf.eof())
                    return makeNullable(std::make_shared<DataTypeString>());

                return type;
            }

            /// Case when CSV value is not in quotes. Check if it's a number, and if not, determine it as a string.
            ReadBufferFromString buf(field);
            Float64 tmp;
            if (tryReadFloatText(tmp, buf) && buf.eof())
                return makeNullable(std::make_shared<DataTypeFloat64>());

            return makeNullable(std::make_shared<DataTypeString>());
        }
        case FormatSettings::EscapingRule::Raw: [[fallthrough]];
        case FormatSettings::EscapingRule::Escaped:
        {
            /// Without best effort everything is a string.
            if (!format_settings.tsv.input_format_use_best_effort_in_schema_inference)
                return makeNullable(std::make_shared<DataTypeString>());

            if (field.empty() || field == format_settings.tsv.null_representation)
                return nullptr;

            if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
                return DataTypeFactory::instance().get("Nullable(Bool)");

            ReadBufferFromString buf(field);
            auto type = determineDataTypeForSingleField(buf);
            /// If the field was not consumed entirely, determine it as a string.
            if (!buf.eof())
                return makeNullable(std::make_shared<DataTypeString>());

            return type;
        }
        default:
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule));
    }
}
|
||||
|
||||
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context)
|
||||
/// Determines the data type of every field with determineDataTypeByEscapingRule.
/// The result has one element per field, in the same order as the fields.
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
{
    DataTypes data_types;
    data_types.reserve(fields.size());
    for (const auto & field : fields)
        data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule));
    return data_types;
}
|
||||
|
||||
@ -344,4 +534,12 @@ DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escap
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the result of getDefaultDataTypeForEscapingRule for every given escaping rule.
/// The result has one element per rule, in the same order as the rules.
DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules)
{
    DataTypes data_types;
    /// The final size is known in advance, so allocate once.
    data_types.reserve(escaping_rules.size());
    for (const auto & rule : escaping_rules)
        data_types.push_back(getDefaultDataTypeForEscapingRule(rule));
    return data_types;
}
|
||||
|
||||
}
|
||||
|
@ -43,15 +43,21 @@ String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule es
|
||||
/// - For JSON escaping rule we can use JSON parser to parse a single field
|
||||
/// and then convert JSON type of this field to ClickHouse type.
|
||||
/// - For CSV escaping rule we can do the next:
|
||||
/// - If the field is an unquoted string, then we could try to evaluate it
|
||||
/// as a constant expression, and if it fails, treat it as a String.
|
||||
/// - If the field is a string in quotes, then we can try to evaluate
|
||||
/// expression inside quotes as a constant expression, and if it fails or
|
||||
/// the result is a number (we don't parse numbers in quotes) we treat it as a String.
|
||||
/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here)
|
||||
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr);
|
||||
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr);
|
||||
/// - If the field is an unquoted string, then we try to parse it as a number,
|
||||
/// and if we cannot, treat it as a String.
|
||||
/// - If the field is a string in quotes, then we try to use some
|
||||
/// tweaks and heuristics to determine the type inside quotes, and if we can't or
|
||||
/// the result is a number or tuple (we don't parse numbers in quotes and don't
|
||||
/// support tuples in CSV) we treat it as a String.
|
||||
/// - If input_format_csv_use_best_effort_in_schema_inference is disabled, we
|
||||
/// treat everything as a string.
|
||||
/// - For TSV and TSVRaw we try to use some tweaks and heuristics to determine the type
|
||||
/// of value if setting input_format_tsv_use_best_effort_in_schema_inference is enabled,
|
||||
/// otherwise we treat everything as a string.
|
||||
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule);
|
||||
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule);
|
||||
|
||||
DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule);
|
||||
DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules);
|
||||
|
||||
}
|
||||
|
@ -65,6 +65,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
|
||||
format_settings.csv.null_representation = settings.format_csv_null_representation;
|
||||
format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
|
||||
format_settings.csv.input_format_use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
|
||||
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
||||
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
||||
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
|
||||
@ -97,6 +98,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.parquet.import_nested = settings.input_format_parquet_import_nested;
|
||||
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
|
||||
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
|
||||
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
|
||||
format_settings.pretty.color = settings.output_format_pretty_color;
|
||||
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
|
||||
@ -117,6 +119,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default;
|
||||
format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
|
||||
format_settings.tsv.null_representation = settings.format_tsv_null_representation;
|
||||
format_settings.tsv.input_format_use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
|
||||
format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
|
||||
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
|
||||
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
|
||||
@ -126,10 +129,17 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary;
|
||||
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
|
||||
format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns;
|
||||
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
||||
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
||||
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
|
||||
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
||||
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
||||
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
|
||||
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
|
||||
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
|
||||
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
|
||||
@ -137,6 +147,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns;
|
||||
format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation;
|
||||
format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
|
||||
format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference;
|
||||
|
||||
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
|
||||
if (format_settings.schema.is_server)
|
||||
@ -371,7 +382,7 @@ SchemaReaderPtr FormatFactory::getSchemaReader(
|
||||
throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
|
||||
return schema_reader_creator(buf, format_settings, context);
|
||||
return schema_reader_creator(buf, format_settings);
|
||||
}
|
||||
|
||||
ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader(
|
||||
|
@ -97,7 +97,7 @@ private:
|
||||
/// The checker should return true if format support append.
|
||||
using AppendSupportChecker = std::function<bool(const FormatSettings & settings)>;
|
||||
|
||||
using SchemaReaderCreator = std::function<SchemaReaderPtr(ReadBuffer & in, const FormatSettings & settings, ContextPtr context)>;
|
||||
using SchemaReaderCreator = std::function<SchemaReaderPtr(ReadBuffer & in, const FormatSettings & settings)>;
|
||||
using ExternalSchemaReaderCreator = std::function<ExternalSchemaReaderPtr(const FormatSettings & settings)>;
|
||||
|
||||
struct Creators
|
||||
|
@ -36,6 +36,8 @@ struct FormatSettings
|
||||
bool seekable_read = true;
|
||||
UInt64 max_rows_to_read_for_schema_inference = 100;
|
||||
|
||||
String column_names_for_schema_inference = "";
|
||||
|
||||
enum class DateTimeInputFormat
|
||||
{
|
||||
Basic, /// Default format for fast parsing: YYYY-MM-DD hh:mm:ss (ISO-8601 without fractional part and timezone) or NNNNNNNNNN unix timestamp.
|
||||
@ -77,6 +79,7 @@ struct FormatSettings
|
||||
bool low_cardinality_as_dictionary = false;
|
||||
bool import_nested = false;
|
||||
bool allow_missing_columns = false;
|
||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||
bool case_insensitive_column_matching = false;
|
||||
} arrow;
|
||||
|
||||
@ -104,6 +107,7 @@ struct FormatSettings
|
||||
bool input_format_arrays_as_nested_csv = false;
|
||||
String null_representation = "\\N";
|
||||
char tuple_delimiter = ',';
|
||||
bool input_format_use_best_effort_in_schema_inference = true;
|
||||
} csv;
|
||||
|
||||
struct HiveText
|
||||
@ -141,6 +145,7 @@ struct FormatSettings
|
||||
UInt64 row_group_size = 1000000;
|
||||
bool import_nested = false;
|
||||
bool allow_missing_columns = false;
|
||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||
bool case_insensitive_column_matching = false;
|
||||
std::unordered_set<int> skip_row_groups = {};
|
||||
} parquet;
|
||||
@ -209,6 +214,7 @@ struct FormatSettings
|
||||
bool crlf_end_of_line = false;
|
||||
String null_representation = "\\N";
|
||||
bool input_format_enum_as_number = false;
|
||||
bool input_format_use_best_effort_in_schema_inference = true;
|
||||
} tsv;
|
||||
|
||||
struct
|
||||
@ -223,6 +229,7 @@ struct FormatSettings
|
||||
bool import_nested = false;
|
||||
bool allow_missing_columns = false;
|
||||
int64_t row_batch_size = 100'000;
|
||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||
bool case_insensitive_column_matching = false;
|
||||
std::unordered_set<int> skip_stripes = {};
|
||||
} orc;
|
||||
|
@ -105,8 +105,11 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o
|
||||
return readSchemaFromFormat(format_name, format_settings, read_buffer_creator, context, buf_out);
|
||||
}
|
||||
|
||||
DataTypePtr generalizeDataType(DataTypePtr type)
|
||||
DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type)
|
||||
{
|
||||
if (!type)
|
||||
return nullptr;
|
||||
|
||||
WhichDataType which(type);
|
||||
|
||||
if (which.isNothing())
|
||||
@ -115,16 +118,13 @@ DataTypePtr generalizeDataType(DataTypePtr type)
|
||||
if (which.isNullable())
|
||||
{
|
||||
const auto * nullable_type = assert_cast<const DataTypeNullable *>(type.get());
|
||||
return generalizeDataType(nullable_type->getNestedType());
|
||||
return makeNullableRecursivelyAndCheckForNothing(nullable_type->getNestedType());
|
||||
}
|
||||
|
||||
if (isNumber(type))
|
||||
return makeNullable(std::make_shared<DataTypeFloat64>());
|
||||
|
||||
if (which.isArray())
|
||||
{
|
||||
const auto * array_type = assert_cast<const DataTypeArray *>(type.get());
|
||||
auto nested_type = generalizeDataType(array_type->getNestedType());
|
||||
auto nested_type = makeNullableRecursivelyAndCheckForNothing(array_type->getNestedType());
|
||||
return nested_type ? std::make_shared<DataTypeArray>(nested_type) : nullptr;
|
||||
}
|
||||
|
||||
@ -134,7 +134,7 @@ DataTypePtr generalizeDataType(DataTypePtr type)
|
||||
DataTypes nested_types;
|
||||
for (const auto & element : tuple_type->getElements())
|
||||
{
|
||||
auto nested_type = generalizeDataType(element);
|
||||
auto nested_type = makeNullableRecursivelyAndCheckForNothing(element);
|
||||
if (!nested_type)
|
||||
return nullptr;
|
||||
nested_types.push_back(nested_type);
|
||||
@ -145,19 +145,27 @@ DataTypePtr generalizeDataType(DataTypePtr type)
|
||||
if (which.isMap())
|
||||
{
|
||||
const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
|
||||
auto key_type = removeNullable(generalizeDataType(map_type->getKeyType()));
|
||||
auto value_type = generalizeDataType(map_type->getValueType());
|
||||
return key_type && value_type ? std::make_shared<DataTypeMap>(key_type, value_type) : nullptr;
|
||||
auto key_type = makeNullableRecursivelyAndCheckForNothing(map_type->getKeyType());
|
||||
auto value_type = makeNullableRecursivelyAndCheckForNothing(map_type->getValueType());
|
||||
return key_type && value_type ? std::make_shared<DataTypeMap>(removeNullable(key_type), value_type) : nullptr;
|
||||
}
|
||||
|
||||
if (which.isLowCarnality())
|
||||
{
|
||||
const auto * lc_type = assert_cast<const DataTypeLowCardinality *>(type.get());
|
||||
auto nested_type = generalizeDataType(lc_type->getDictionaryType());
|
||||
auto nested_type = makeNullableRecursivelyAndCheckForNothing(lc_type->getDictionaryType());
|
||||
return nested_type ? std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr;
|
||||
}
|
||||
|
||||
return makeNullable(type);
|
||||
}
|
||||
|
||||
/// Collect (name, type) pairs for every column of the header, passing each type
/// through makeNullableRecursivelyAndCheckForNothing before storing it.
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header)
{
    NamesAndTypesList names_and_types;
    for (auto & [column_name, column_type] : header.getNamesAndTypesList())
    {
        auto nullable_type = makeNullableRecursivelyAndCheckForNothing(column_type);
        names_and_types.emplace_back(column_name, nullable_type);
    }
    return names_and_types;
}
|
||||
|
||||
}
|
||||
|
@ -29,14 +29,16 @@ ColumnsDescription readSchemaFromFormat(
|
||||
ContextPtr context,
|
||||
std::unique_ptr<ReadBuffer> & buf_out);
|
||||
|
||||
/// Convert type to the most general type:
|
||||
/// - IntN, UIntN, FloatN, Decimal -> Float64
|
||||
/// Make type Nullable recursively:
|
||||
/// - Type -> Nullable(type)
|
||||
/// - Array(Type) -> Array(Nullable(Type))
|
||||
/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN))
|
||||
/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType))
|
||||
/// - LowCardinality(Type) -> LowCardinality(Nullable(Type))
|
||||
/// If type is Nothing or one of the nested types is Nothing, return nullptr.
|
||||
DataTypePtr generalizeDataType(DataTypePtr type);
|
||||
DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type);
|
||||
|
||||
/// Call makeNullableRecursivelyAndCheckForNothing for all types
|
||||
/// in the block and return names and types.
|
||||
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header);
|
||||
}
|
||||
|
@ -1366,6 +1366,7 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
|
||||
/// - Tuples: (...)
|
||||
/// - Maps: {...}
|
||||
/// - NULL
|
||||
/// - Bool: true/false
|
||||
/// - Number: integer, float, decimal.
|
||||
|
||||
if (*buf.position() == '\'')
|
||||
@ -1394,6 +1395,16 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
|
||||
s.append("NaN");
|
||||
}
|
||||
}
|
||||
else if (checkCharCaseInsensitive('t', buf))
|
||||
{
|
||||
assertStringCaseInsensitive("rue", buf);
|
||||
s.append("true");
|
||||
}
|
||||
else if (checkCharCaseInsensitive('f', buf))
|
||||
{
|
||||
assertStringCaseInsensitive("alse", buf);
|
||||
s.append("false");
|
||||
}
|
||||
else
|
||||
{
|
||||
/// It's an integer, float or decimal. They all can be parsed as float.
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <Formats/ReadSchemaUtils.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -66,9 +67,32 @@ static void checkTypeAndAppend(NamesAndTypesList & result, DataTypePtr & type, c
|
||||
result.emplace_back(name, type);
|
||||
}
|
||||
|
||||
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_, bool allow_bools_as_numbers_)
|
||||
: ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_), allow_bools_as_numbers(allow_bools_as_numbers_)
|
||||
/// Basic constructor: takes the row limit from format_settings and, when
/// column_names_for_schema_inference is non-empty, fills column_names from it.
IRowSchemaReader::IRowSchemaReader(
    ReadBuffer & in_,
    const FormatSettings & format_settings,
    bool allow_bools_as_numbers_)
    : ISchemaReader(in_)
    , max_rows_to_read(format_settings.max_rows_to_read_for_schema_inference)
    , allow_bools_as_numbers(allow_bools_as_numbers_)
{
    const auto & names_setting = format_settings.column_names_for_schema_inference;
    if (names_setting.empty())
        return;

    /// The setting is a comma-separated list: 'column1,column2,column3,...'
    boost::split(column_names, names_setting, boost::is_any_of(","));
    for (auto & name : column_names)
    {
        /// Use the trimmed name, but keep the original piece if trimming leaves nothing.
        std::string trimmed = boost::trim_copy(name);
        if (trimmed.empty())
            continue;
        name = trimmed;
    }
}
|
||||
|
||||
/// Delegates to the basic constructor, then stores a single default type
/// that getDefaultType() returns for every column.
IRowSchemaReader::IRowSchemaReader(
    ReadBuffer & in_,
    const FormatSettings & format_settings,
    DataTypePtr default_type_,
    bool allow_bools_as_numbers_)
    : IRowSchemaReader(in_, format_settings, allow_bools_as_numbers_)
{
    this->default_type = default_type_;
}
|
||||
|
||||
/// Delegates to the basic constructor, then stores per-column default types
/// that getDefaultType() consults when no common default type is set.
IRowSchemaReader::IRowSchemaReader(
    ReadBuffer & in_,
    const FormatSettings & format_settings,
    const DataTypes & default_types_,
    bool allow_bools_as_numbers_)
    : IRowSchemaReader(in_, format_settings, allow_bools_as_numbers_)
{
    this->default_types = default_types_;
}
|
||||
|
||||
NamesAndTypesList IRowSchemaReader::readSchema()
|
||||
@ -90,7 +114,7 @@ NamesAndTypesList IRowSchemaReader::readSchema()
|
||||
if (!new_data_types[i])
|
||||
continue;
|
||||
|
||||
chooseResultType(data_types[i], new_data_types[i], allow_bools_as_numbers, default_type, std::to_string(i + 1), row);
|
||||
chooseResultType(data_types[i], new_data_types[i], allow_bools_as_numbers, getDefaultType(i), std::to_string(i + 1), row);
|
||||
}
|
||||
}
|
||||
|
||||
@ -115,12 +139,21 @@ NamesAndTypesList IRowSchemaReader::readSchema()
|
||||
for (size_t i = 0; i != data_types.size(); ++i)
|
||||
{
|
||||
/// Check that we could determine the type of this column.
|
||||
checkTypeAndAppend(result, data_types[i], column_names[i], default_type, max_rows_to_read);
|
||||
checkTypeAndAppend(result, data_types[i], column_names[i], getDefaultType(i), max_rows_to_read);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Return the default type to use for the given column index:
/// the common default_type if it is set, otherwise the entry of default_types
/// for this column, or nullptr when there is no such entry or it is empty.
DataTypePtr IRowSchemaReader::getDefaultType(size_t column) const
{
    /// The common default type takes precedence over per-column defaults.
    if (default_type)
        return default_type;

    /// No per-column default was provided for this index.
    if (column >= default_types.size())
        return nullptr;

    /// May itself be empty, which the caller sees as nullptr.
    return default_types[column];
}
|
||||
|
||||
IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_, bool allow_bools_as_numbers_)
|
||||
: ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_), allow_bools_as_numbers(allow_bools_as_numbers_)
|
||||
{
|
||||
|
@ -31,12 +31,17 @@ protected:
|
||||
/// Base class for schema inference for formats that read data row by row.
|
||||
/// It reads data row by row (up to max_rows_to_read), determines types of columns
|
||||
/// for each row and compares them with types from the previous rows. If some column
|
||||
/// contains values with different types in different rows, the default type will be
|
||||
/// used for this column or the exception will be thrown (if default type is not set).
|
||||
/// contains values with different types in different rows, the default type
|
||||
/// (from argument default_type_) will be used for this column or an exception
|
||||
/// will be thrown (if default type is not set). If different columns have different
|
||||
/// default types, you can provide them via the default_types_ argument.
|
||||
class IRowSchemaReader : public ISchemaReader
|
||||
{
|
||||
public:
|
||||
IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr, bool allow_bools_as_numbers_ = false);
|
||||
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, bool allow_bools_as_numbers_ = false);
|
||||
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_, bool allow_bools_as_numbers_ = false);
|
||||
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, const DataTypes & default_types_, bool allow_bools_as_numbers_ = false);
|
||||
|
||||
NamesAndTypesList readSchema() override;
|
||||
|
||||
protected:
|
||||
@ -49,8 +54,11 @@ protected:
|
||||
void setColumnNames(const std::vector<String> & names) { column_names = names; }
|
||||
|
||||
private:
|
||||
|
||||
DataTypePtr getDefaultType(size_t column) const;
|
||||
size_t max_rows_to_read;
|
||||
DataTypePtr default_type;
|
||||
DataTypes default_types;
|
||||
bool allow_bools_as_numbers;
|
||||
std::vector<String> column_names;
|
||||
};
|
||||
|
@ -3,6 +3,7 @@
|
||||
#if USE_ARROW
|
||||
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <Formats/ReadSchemaUtils.h>
|
||||
#include <IO/ReadBufferFromMemory.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/copyData.h>
|
||||
@ -171,8 +172,9 @@ NamesAndTypesList ArrowSchemaReader::readSchema()
|
||||
schema = createFileReader(in, format_settings, is_stopped)->schema();
|
||||
}
|
||||
|
||||
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, stream ? "ArrowStream" : "Arrow");
|
||||
return header.getNamesAndTypesList();
|
||||
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
*schema, stream ? "ArrowStream" : "Arrow", format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference);
|
||||
return getNamesAndRecursivelyNullableTypes(header);
|
||||
}
|
||||
|
||||
void registerInputFormatArrow(FormatFactory & factory)
|
||||
@ -202,13 +204,13 @@ void registerArrowSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader(
|
||||
"Arrow",
|
||||
[](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
[](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<ArrowSchemaReader>(buf, false, settings);
|
||||
});
|
||||
factory.registerSchemaReader(
|
||||
"ArrowStream",
|
||||
[](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
[](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<ArrowSchemaReader>(buf, true, settings);
|
||||
});}
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include <DataTypes/DataTypeDate.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <DataTypes/DataTypeDateTime64.h>
|
||||
#include <DataTypes/DataTypeNothing.h>
|
||||
#include <Common/DateLUTImpl.h>
|
||||
#include <base/types.h>
|
||||
#include <Processors/Chunk.h>
|
||||
@ -26,11 +27,13 @@
|
||||
#include <Columns/ColumnUnique.h>
|
||||
#include <Columns/ColumnMap.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Columns/ColumnNothing.h>
|
||||
#include <Interpreters/castColumn.h>
|
||||
#include <Common/quoteString.h>
|
||||
#include <algorithm>
|
||||
#include <arrow/builder.h>
|
||||
#include <arrow/array.h>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
|
||||
/// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn.
|
||||
@ -329,12 +332,17 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
const std::string & format_name,
|
||||
bool is_nullable,
|
||||
std::unordered_map<String, std::shared_ptr<ColumnWithTypeAndName>> & dictionary_values,
|
||||
bool read_ints_as_dates)
|
||||
bool read_ints_as_dates,
|
||||
bool allow_null_type,
|
||||
bool skip_columns_with_unsupported_types,
|
||||
bool & skipped)
|
||||
{
|
||||
if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST
|
||||
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT)
|
||||
{
|
||||
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates);
|
||||
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
|
||||
if (skipped)
|
||||
return {};
|
||||
auto nullmap_column = readByteMapFromArrowColumn(arrow_column);
|
||||
auto nullable_type = std::make_shared<DataTypeNullable>(std::move(nested_column.type));
|
||||
auto nullable_column = ColumnNullable::create(nested_column.column, nullmap_column);
|
||||
@ -379,7 +387,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
case arrow::Type::MAP:
|
||||
{
|
||||
auto arrow_nested_column = getNestedArrowColumn(arrow_column);
|
||||
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates);
|
||||
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
|
||||
if (skipped)
|
||||
return {};
|
||||
|
||||
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
|
||||
|
||||
const auto * tuple_column = assert_cast<const ColumnTuple *>(nested_column.column.get());
|
||||
@ -391,7 +402,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
case arrow::Type::LIST:
|
||||
{
|
||||
auto arrow_nested_column = getNestedArrowColumn(arrow_column);
|
||||
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates);
|
||||
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
|
||||
if (skipped)
|
||||
return {};
|
||||
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
|
||||
auto array_column = ColumnArray::create(nested_column.column, offsets_column);
|
||||
auto array_type = std::make_shared<DataTypeArray>(nested_column.type);
|
||||
@ -416,7 +429,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
for (int i = 0; i != arrow_struct_type->num_fields(); ++i)
|
||||
{
|
||||
auto nested_arrow_column = std::make_shared<arrow::ChunkedArray>(nested_arrow_columns[i]);
|
||||
auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates);
|
||||
auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
|
||||
if (skipped)
|
||||
return {};
|
||||
tuple_elements.emplace_back(std::move(element.column));
|
||||
tuple_types.emplace_back(std::move(element.type));
|
||||
tuple_names.emplace_back(std::move(element.name));
|
||||
@ -439,7 +454,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
dict_array.emplace_back(dict_chunk.dictionary());
|
||||
}
|
||||
auto arrow_dict_column = std::make_shared<arrow::ChunkedArray>(dict_array);
|
||||
auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates);
|
||||
auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
|
||||
|
||||
/// We should convert read column to ColumnUnique.
|
||||
auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn();
|
||||
@ -469,9 +484,33 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
# undef DISPATCH
|
||||
// TODO: read JSON as a string?
|
||||
// TODO: read UUID as a string?
|
||||
case arrow::Type::NA:
|
||||
{
|
||||
if (allow_null_type)
|
||||
{
|
||||
auto type = std::make_shared<DataTypeNothing>();
|
||||
auto column = ColumnNothing::create(arrow_column->length());
|
||||
return {std::move(column), type, column_name};
|
||||
}
|
||||
[[fallthrough]];
|
||||
}
|
||||
default:
|
||||
throw Exception(ErrorCodes::UNKNOWN_TYPE,
|
||||
"Unsupported {} type '{}' of an input column '{}'.", format_name, arrow_column->type()->name(), column_name);
|
||||
{
|
||||
if (skip_columns_with_unsupported_types)
|
||||
{
|
||||
skipped = true;
|
||||
return {};
|
||||
}
|
||||
|
||||
throw Exception(
|
||||
ErrorCodes::UNKNOWN_TYPE,
|
||||
"Unsupported {} type '{}' of an input column '{}'. If it happens during schema inference and you want to skip columns with "
|
||||
"unsupported types, you can enable setting input_format_{}_skip_columns_with_unsupported_types_in_schema_inference",
|
||||
format_name,
|
||||
arrow_column->type()->name(),
|
||||
column_name,
|
||||
boost::algorithm::to_lower_copy(format_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -485,8 +524,9 @@ static void checkStatus(const arrow::Status & status, const String & column_name
|
||||
throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()};
|
||||
}
|
||||
|
||||
|
||||
Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
const arrow::Schema & schema, const std::string & format_name, const Block * hint_header, bool ignore_case)
|
||||
const arrow::Schema & schema, const std::string & format_name, bool skip_columns_with_unsupported_types, const Block * hint_header, bool ignore_case)
|
||||
{
|
||||
ColumnsWithTypeAndName sample_columns;
|
||||
std::unordered_set<String> nested_table_names;
|
||||
@ -512,9 +552,14 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
arrow::ArrayVector array_vector = {arrow_array};
|
||||
auto arrow_column = std::make_shared<arrow::ChunkedArray>(array_vector);
|
||||
std::unordered_map<std::string, std::shared_ptr<ColumnWithTypeAndName>> dict_values;
|
||||
ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false);
|
||||
|
||||
sample_columns.emplace_back(std::move(sample_column));
|
||||
bool skipped = false;
|
||||
bool allow_null_type = false;
|
||||
if (hint_header && hint_header->has(field->name()) && hint_header->getByName(field->name()).type->isNullable())
|
||||
allow_null_type = true;
|
||||
ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(
|
||||
arrow_column, field->name(), format_name, false, dict_values, false, allow_null_type, skip_columns_with_unsupported_types, skipped);
|
||||
if (!skipped)
|
||||
sample_columns.emplace_back(std::move(sample_column));
|
||||
}
|
||||
return Block(std::move(sample_columns));
|
||||
}
|
||||
@ -559,6 +604,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
|
||||
UInt64 num_rows = name_to_column_ptr.begin()->second->length();
|
||||
columns_list.reserve(header.rows());
|
||||
std::unordered_map<String, BlockPtr> nested_tables;
|
||||
bool skipped = false;
|
||||
for (size_t column_i = 0, columns = header.columns(); column_i < columns; ++column_i)
|
||||
{
|
||||
const ColumnWithTypeAndName & header_column = header.getByPosition(column_i);
|
||||
@ -582,7 +628,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
|
||||
{
|
||||
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[search_nested_table_name];
|
||||
ColumnsWithTypeAndName cols
|
||||
= {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)};
|
||||
= {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true, true, false, skipped)};
|
||||
Block block(cols);
|
||||
nested_tables[search_nested_table_name] = std::make_shared<Block>(Nested::flatten(block));
|
||||
}
|
||||
@ -615,7 +661,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
|
||||
else
|
||||
{
|
||||
auto arrow_column = name_to_column_ptr[search_column_name];
|
||||
column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true);
|
||||
column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true, true, false, skipped);
|
||||
}
|
||||
|
||||
try
|
||||
@ -642,7 +688,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
|
||||
std::vector<size_t> ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema & schema) const
|
||||
{
|
||||
std::vector<size_t> missing_columns;
|
||||
auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header, case_insensitive_matching);
|
||||
auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, false, &header, case_insensitive_matching);
|
||||
auto flatten_block_from_arrow = Nested::flatten(block_from_arrow);
|
||||
|
||||
for (size_t i = 0, columns = header.columns(); i < columns; ++i)
|
||||
|
@ -38,7 +38,11 @@ public:
|
||||
/// Transform arrow schema to ClickHouse header. If hint_header is provided,
|
||||
/// we will skip columns in schema that are not in hint_header.
|
||||
static Block arrowSchemaToCHHeader(
|
||||
const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr, bool ignore_case = false);
|
||||
const arrow::Schema & schema,
|
||||
const std::string & format_name,
|
||||
bool skip_columns_with_unsupported_types = false,
|
||||
const Block * hint_header = nullptr,
|
||||
bool ignore_case = false);
|
||||
|
||||
private:
|
||||
const Block & header;
|
||||
|
@ -924,12 +924,12 @@ void registerInputFormatAvro(FormatFactory & factory)
|
||||
|
||||
void registerAvroSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<AvroSchemaReader>(buf, false, settings);
|
||||
});
|
||||
|
||||
factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<AvroSchemaReader>(buf, true, settings);
|
||||
});
|
||||
|
@ -95,7 +95,7 @@ void BinaryFormatReader::skipField(size_t file_column)
|
||||
}
|
||||
|
||||
BinaryWithNamesAndTypesSchemaReader::BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
|
||||
: FormatWithNamesAndTypesSchemaReader(in_, 0, true, true, &reader), reader(in_, format_settings_)
|
||||
: FormatWithNamesAndTypesSchemaReader(in_, format_settings_, true, true, &reader), reader(in_, format_settings_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -119,7 +119,7 @@ void registerInputFormatRowBinary(FormatFactory & factory)
|
||||
|
||||
void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<BinaryWithNamesAndTypesSchemaReader>(buf, settings);
|
||||
});
|
||||
|
@ -9,7 +9,6 @@
|
||||
#include <Formats/EscapingRuleUtils.h>
|
||||
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
|
||||
#include <DataTypes/Serializations/SerializationNullable.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
|
||||
|
||||
@ -259,16 +258,15 @@ bool CSVFormatReader::readField(
|
||||
}
|
||||
|
||||
|
||||
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_)
|
||||
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_)
|
||||
: FormatWithNamesAndTypesSchemaReader(
|
||||
in_,
|
||||
format_setting_.max_rows_to_read_for_schema_inference,
|
||||
format_setting_,
|
||||
with_names_,
|
||||
with_types_,
|
||||
&reader,
|
||||
getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV))
|
||||
, reader(in_, format_setting_)
|
||||
, context(context_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -279,7 +277,7 @@ DataTypes CSVSchemaReader::readRowAndGetDataTypes()
|
||||
return {};
|
||||
|
||||
auto fields = reader.readRow();
|
||||
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV, context);
|
||||
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV);
|
||||
}
|
||||
|
||||
|
||||
@ -382,9 +380,9 @@ void registerCSVSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
|
||||
{
|
||||
factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
|
||||
factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<CSVSchemaReader>(buf, with_names, with_types, settings, context);
|
||||
return std::make_shared<CSVSchemaReader>(buf, with_names, with_types, settings);
|
||||
});
|
||||
};
|
||||
|
||||
|
@ -74,13 +74,12 @@ public:
|
||||
class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader
|
||||
{
|
||||
public:
|
||||
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_);
|
||||
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_);
|
||||
|
||||
private:
|
||||
DataTypes readRowAndGetDataTypes() override;
|
||||
|
||||
CSVFormatReader reader;
|
||||
ContextPtr context;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -289,17 +289,16 @@ void CustomSeparatedFormatReader::setReadBuffer(ReadBuffer & in_)
|
||||
}
|
||||
|
||||
CustomSeparatedSchemaReader::CustomSeparatedSchemaReader(
|
||||
ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_)
|
||||
ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_)
|
||||
: FormatWithNamesAndTypesSchemaReader(
|
||||
buf,
|
||||
format_setting_.max_rows_to_read_for_schema_inference,
|
||||
format_setting_,
|
||||
with_names_,
|
||||
with_types_,
|
||||
&reader,
|
||||
getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule))
|
||||
, buf(in_)
|
||||
, reader(buf, ignore_spaces_, updateFormatSettings(format_setting_))
|
||||
, context(context_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -315,7 +314,7 @@ DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes()
|
||||
first_row = false;
|
||||
|
||||
auto fields = reader.readRow();
|
||||
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), context);
|
||||
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule());
|
||||
}
|
||||
|
||||
void registerInputFormatCustomSeparated(FormatFactory & factory)
|
||||
@ -343,9 +342,9 @@ void registerCustomSeparatedSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
|
||||
{
|
||||
factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
|
||||
factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<CustomSeparatedSchemaReader>(buf, with_names, with_types, ignore_spaces, settings, context);
|
||||
return std::make_shared<CustomSeparatedSchemaReader>(buf, with_names, with_types, ignore_spaces, settings);
|
||||
});
|
||||
};
|
||||
|
||||
|
@ -92,14 +92,13 @@ private:
|
||||
class CustomSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader
|
||||
{
|
||||
public:
|
||||
CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_);
|
||||
CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_);
|
||||
|
||||
private:
|
||||
DataTypes readRowAndGetDataTypes() override;
|
||||
|
||||
PeekableReadBuffer buf;
|
||||
CustomSeparatedFormatReader reader;
|
||||
ContextPtr context;
|
||||
bool first_row = true;
|
||||
};
|
||||
|
||||
|
@ -181,15 +181,10 @@ bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer &
|
||||
return true;
|
||||
}
|
||||
|
||||
JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_)
|
||||
JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(
|
||||
ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_)
|
||||
: FormatWithNamesAndTypesSchemaReader(
|
||||
in_,
|
||||
format_settings_.max_rows_to_read_for_schema_inference,
|
||||
with_names_,
|
||||
with_types_,
|
||||
&reader,
|
||||
nullptr,
|
||||
format_settings_.json.read_bools_as_numbers)
|
||||
in_, format_settings_, with_names_, with_types_, &reader, nullptr, format_settings_.json.read_bools_as_numbers)
|
||||
, reader(in_, yield_strings_, format_settings_)
|
||||
{
|
||||
}
|
||||
@ -239,7 +234,7 @@ void registerJSONCompactEachRowSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
|
||||
{
|
||||
factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<JSONCompactEachRowRowSchemaReader>(buf, with_names, with_types, json_strings, settings);
|
||||
});
|
||||
|
@ -387,12 +387,12 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory
|
||||
|
||||
void registerJSONEachRowSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_unique<JSONEachRowSchemaReader>(buf, false, settings);
|
||||
});
|
||||
|
||||
factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_unique<JSONEachRowSchemaReader>(buf, true, settings);
|
||||
});
|
||||
|
@ -414,7 +414,7 @@ void MsgPackRowInputFormat::setReadBuffer(ReadBuffer & in_)
|
||||
}
|
||||
|
||||
MsgPackSchemaReader::MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
|
||||
: IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns)
|
||||
: IRowSchemaReader(buf, format_settings_), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns)
|
||||
{
|
||||
if (!number_of_columns)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "You must specify setting input_format_msgpack_number_of_columns to extract table schema from MsgPack data");
|
||||
@ -535,7 +535,7 @@ void registerInputFormatMsgPack(FormatFactory & factory)
|
||||
|
||||
void registerMsgPackSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<MsgPackSchemaReader>(buf, settings);
|
||||
});
|
||||
|
@ -133,7 +133,7 @@ void registerOutputFormatNative(FormatFactory & factory)
|
||||
|
||||
void registerNativeSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &, ContextPtr)
|
||||
factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &)
|
||||
{
|
||||
return std::make_shared<NativeSchemaReader>(buf);
|
||||
});
|
||||
|
@ -3,6 +3,7 @@
|
||||
#if USE_ORC
|
||||
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <Formats/ReadSchemaUtils.h>
|
||||
#include <IO/ReadBufferFromMemory.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/copyData.h>
|
||||
@ -183,8 +184,9 @@ NamesAndTypesList ORCSchemaReader::readSchema()
|
||||
std::shared_ptr<arrow::Schema> schema;
|
||||
std::atomic<int> is_stopped = 0;
|
||||
getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped);
|
||||
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "ORC");
|
||||
return header.getNamesAndTypesList();
|
||||
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
*schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference);
|
||||
return getNamesAndRecursivelyNullableTypes(header);
|
||||
}
|
||||
|
||||
void registerInputFormatORC(FormatFactory & factory)
|
||||
@ -205,7 +207,7 @@ void registerORCSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader(
|
||||
"ORC",
|
||||
[](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
[](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<ORCSchemaReader>(buf, settings);
|
||||
}
|
||||
|
@ -4,6 +4,7 @@
|
||||
#if USE_PARQUET
|
||||
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <Formats/ReadSchemaUtils.h>
|
||||
#include <IO/ReadBufferFromMemory.h>
|
||||
#include <IO/copyData.h>
|
||||
#include <arrow/api.h>
|
||||
@ -176,8 +177,9 @@ NamesAndTypesList ParquetSchemaReader::readSchema()
|
||||
std::shared_ptr<arrow::Schema> schema;
|
||||
std::atomic<int> is_stopped = 0;
|
||||
getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped);
|
||||
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "Parquet");
|
||||
return header.getNamesAndTypesList();
|
||||
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
*schema, "Parquet", format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference);
|
||||
return getNamesAndRecursivelyNullableTypes(header);
|
||||
}
|
||||
|
||||
void registerInputFormatParquet(FormatFactory & factory)
|
||||
@ -198,7 +200,7 @@ void registerParquetSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader(
|
||||
"Parquet",
|
||||
[](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
[](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<ParquetSchemaReader>(buf, settings);
|
||||
}
|
||||
|
@ -128,15 +128,14 @@ void RegexpRowInputFormat::setReadBuffer(ReadBuffer & in_)
|
||||
IInputFormat::setReadBuffer(*buf);
|
||||
}
|
||||
|
||||
RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_)
|
||||
RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
|
||||
: IRowSchemaReader(
|
||||
buf,
|
||||
format_settings_.max_rows_to_read_for_schema_inference,
|
||||
format_settings_,
|
||||
getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule))
|
||||
, format_settings(format_settings_)
|
||||
, field_extractor(format_settings)
|
||||
, buf(in_)
|
||||
, context(context_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -152,7 +151,7 @@ DataTypes RegexpSchemaReader::readRowAndGetDataTypes()
|
||||
for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i)
|
||||
{
|
||||
String field(field_extractor.getField(i));
|
||||
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, context));
|
||||
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule));
|
||||
}
|
||||
|
||||
return data_types;
|
||||
@ -203,9 +202,9 @@ void registerFileSegmentationEngineRegexp(FormatFactory & factory)
|
||||
|
||||
void registerRegexpSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
|
||||
factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<RegexpSchemaReader>(buf, settings, context);
|
||||
return std::make_shared<RegexpSchemaReader>(buf, settings);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -76,7 +76,7 @@ private:
|
||||
class RegexpSchemaReader : public IRowSchemaReader
|
||||
{
|
||||
public:
|
||||
RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_);
|
||||
RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings);
|
||||
|
||||
private:
|
||||
DataTypes readRowAndGetDataTypes() override;
|
||||
@ -85,7 +85,6 @@ private:
|
||||
const FormatSettings format_settings;
|
||||
RegexpFieldExtractor field_extractor;
|
||||
PeekableReadBuffer buf;
|
||||
ContextPtr context;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -283,7 +283,7 @@ void registerInputFormatTSKV(FormatFactory & factory)
|
||||
}
|
||||
void registerTSKVSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<TSKVSchemaReader>(buf, settings);
|
||||
});
|
||||
|
@ -235,7 +235,7 @@ TabSeparatedSchemaReader::TabSeparatedSchemaReader(
|
||||
ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_)
|
||||
: FormatWithNamesAndTypesSchemaReader(
|
||||
in_,
|
||||
format_settings_.max_rows_to_read_for_schema_inference,
|
||||
format_settings_,
|
||||
with_names_,
|
||||
with_types_,
|
||||
&reader,
|
||||
@ -280,7 +280,7 @@ void registerTSVSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
|
||||
{
|
||||
factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
|
||||
factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<TabSeparatedSchemaReader>(buf, with_names, with_types, is_raw, settings);
|
||||
});
|
||||
|
@ -453,14 +453,12 @@ TemplateSchemaReader::TemplateSchemaReader(
|
||||
const ParsedTemplateFormatString & format_,
|
||||
const ParsedTemplateFormatString & row_format_,
|
||||
std::string row_between_delimiter,
|
||||
const FormatSettings & format_settings_,
|
||||
ContextPtr context_)
|
||||
: IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference)
|
||||
const FormatSettings & format_settings_)
|
||||
: IRowSchemaReader(buf, format_settings_, getDefaultDataTypeForEscapingRules(row_format_.escaping_rules))
|
||||
, buf(in_)
|
||||
, format(format_)
|
||||
, row_format(row_format_)
|
||||
, format_settings(format_settings_)
|
||||
, context(context_)
|
||||
, format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings)
|
||||
{
|
||||
setColumnNames(row_format.column_names);
|
||||
@ -489,7 +487,7 @@ DataTypes TemplateSchemaReader::readRowAndGetDataTypes()
|
||||
format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front();
|
||||
|
||||
field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings);
|
||||
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], context));
|
||||
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i]));
|
||||
}
|
||||
|
||||
format_reader.skipRowEndDelimiter();
|
||||
@ -564,12 +562,12 @@ void registerTemplateSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
for (bool ignore_spaces : {false, true})
|
||||
{
|
||||
factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
|
||||
factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
size_t index = 0;
|
||||
auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
|
||||
auto row_format = fillRowFormat(settings, idx_getter, false);
|
||||
return std::make_shared<TemplateSchemaReader>(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings, context);
|
||||
return std::make_shared<TemplateSchemaReader>(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
@ -116,8 +116,7 @@ public:
|
||||
const ParsedTemplateFormatString & format_,
|
||||
const ParsedTemplateFormatString & row_format_,
|
||||
std::string row_between_delimiter,
|
||||
const FormatSettings & format_settings_,
|
||||
ContextPtr context_);
|
||||
const FormatSettings & format_settings_);
|
||||
|
||||
DataTypes readRowAndGetDataTypes() override;
|
||||
|
||||
@ -126,7 +125,6 @@ private:
|
||||
const ParsedTemplateFormatString format;
|
||||
const ParsedTemplateFormatString row_format;
|
||||
FormatSettings format_settings;
|
||||
ContextPtr context;
|
||||
TemplateFormatReader format_reader;
|
||||
bool first_row = true;
|
||||
};
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <Formats/ReadSchemaUtils.h>
|
||||
#include <Formats/EscapingRuleUtils.h>
|
||||
#include <Core/Block.h>
|
||||
#include <base/find_symbols.h>
|
||||
#include <Common/typeid_cast.h>
|
||||
@ -571,8 +572,8 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_)
|
||||
IInputFormat::setReadBuffer(*buf);
|
||||
}
|
||||
|
||||
ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_)
|
||||
: IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), context(context_)
|
||||
ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
|
||||
: IRowSchemaReader(buf, format_settings_), buf(in_), format_settings(format_settings_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -589,38 +590,25 @@ DataTypes ValuesSchemaReader::readRowAndGetDataTypes()
|
||||
return {};
|
||||
|
||||
assertChar('(', buf);
|
||||
PeekableReadBufferCheckpoint checkpoint(buf);
|
||||
skipToNextRow(&buf, 0, 1);
|
||||
buf.makeContinuousMemoryFromCheckpointToPos();
|
||||
buf.rollbackToCheckpoint();
|
||||
|
||||
Tokens tokens(buf.position(), buf.buffer().end());
|
||||
IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth);
|
||||
|
||||
skipWhitespaceIfAny(buf);
|
||||
DataTypes data_types;
|
||||
bool finish = false;
|
||||
while (!finish)
|
||||
String value;
|
||||
while (!buf.eof() && *buf.position() != ')')
|
||||
{
|
||||
Expected expected;
|
||||
ASTPtr ast;
|
||||
if (!data_types.empty())
|
||||
{
|
||||
skipWhitespaceIfAny(buf);
|
||||
assertChar(',', buf);
|
||||
skipWhitespaceIfAny(buf);
|
||||
}
|
||||
|
||||
bool parsed = parser.parse(token_iterator, ast, expected);
|
||||
/// Consider delimiter after value (',' or ')') as part of expression
|
||||
parsed &= token_iterator->type == TokenType::Comma || token_iterator->type == TokenType::ClosingRoundBracket;
|
||||
|
||||
if (!parsed)
|
||||
throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse expression here: {}, token: {}",
|
||||
String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())), String(token_iterator.get().begin, token_iterator.get().end));
|
||||
|
||||
std::pair<Field, DataTypePtr> result = evaluateConstantExpression(ast, context);
|
||||
data_types.push_back(generalizeDataType(result.second));
|
||||
|
||||
if (token_iterator->type == TokenType::ClosingRoundBracket)
|
||||
finish = true;
|
||||
++token_iterator;
|
||||
buf.position() = const_cast<char *>(token_iterator->begin);
|
||||
readQuotedFieldIntoString(value, buf);
|
||||
auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
|
||||
data_types.push_back(std::move(type));
|
||||
}
|
||||
|
||||
assertChar(')', buf);
|
||||
|
||||
skipWhitespaceIfAny(buf);
|
||||
if (!buf.eof() && *buf.position() == ',')
|
||||
++buf.position();
|
||||
@ -642,9 +630,9 @@ void registerInputFormatValues(FormatFactory & factory)
|
||||
|
||||
void registerValuesSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
|
||||
factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings)
|
||||
{
|
||||
return std::make_shared<ValuesSchemaReader>(buf, settings, context);
|
||||
return std::make_shared<ValuesSchemaReader>(buf, settings);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -97,13 +97,13 @@ private:
|
||||
class ValuesSchemaReader : public IRowSchemaReader
|
||||
{
|
||||
public:
|
||||
ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_);
|
||||
ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings);
|
||||
|
||||
private:
|
||||
DataTypes readRowAndGetDataTypes() override;
|
||||
|
||||
PeekableReadBuffer buf;
|
||||
ContextPtr context;
|
||||
const FormatSettings format_settings;
|
||||
ParserExpression parser;
|
||||
bool first_row = true;
|
||||
};
|
||||
|
@ -293,13 +293,13 @@ void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_)
|
||||
|
||||
FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader(
|
||||
ReadBuffer & in_,
|
||||
size_t max_rows_to_read_,
|
||||
const FormatSettings & format_settings,
|
||||
bool with_names_,
|
||||
bool with_types_,
|
||||
FormatWithNamesAndTypesReader * format_reader_,
|
||||
DataTypePtr default_type_,
|
||||
bool allow_bools_as_numbers_)
|
||||
: IRowSchemaReader(in_, max_rows_to_read_, default_type_, allow_bools_as_numbers_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_)
|
||||
: IRowSchemaReader(in_, format_settings, default_type_, allow_bools_as_numbers_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -124,7 +124,7 @@ class FormatWithNamesAndTypesSchemaReader : public IRowSchemaReader
|
||||
public:
|
||||
FormatWithNamesAndTypesSchemaReader(
|
||||
ReadBuffer & in,
|
||||
size_t max_rows_to_read_,
|
||||
const FormatSettings & format_settings,
|
||||
bool with_names_,
|
||||
bool with_types_,
|
||||
FormatWithNamesAndTypesReader * format_reader_,
|
||||
|
@ -2,30 +2,6 @@
|
||||
0 0 0
|
||||
0 0 0
|
||||
1 2 3
|
||||
10 11 12
|
||||
13 14 15
|
||||
16 17 18
|
||||
20 21 22
|
||||
23 24 25
|
||||
26 27 28
|
||||
4 5 6
|
||||
7 8 9
|
||||
0 0 0
|
||||
0 0 0
|
||||
0 0 0
|
||||
1 2 3
|
||||
10 11 12
|
||||
13 14 15
|
||||
16 17 18
|
||||
20 21 22
|
||||
23 24 25
|
||||
26 27 28
|
||||
4 5 6
|
||||
7 8 9
|
||||
0 0 0
|
||||
0 0 0
|
||||
0 0 0
|
||||
1 2 3
|
||||
4 5 6
|
||||
7 8 9
|
||||
10 11 12
|
||||
@ -38,14 +14,26 @@
|
||||
0 0 0
|
||||
0 0 0
|
||||
1 2 3
|
||||
4 5 6
|
||||
7 8 9
|
||||
10 11 12
|
||||
13 14 15
|
||||
16 17 18
|
||||
20 21 22
|
||||
23 24 25
|
||||
26 27 28
|
||||
0 0 0
|
||||
0 0 0
|
||||
0 0 0
|
||||
1 2 3
|
||||
4 5 6
|
||||
7 8 9
|
||||
10 11 12
|
||||
13 14 15
|
||||
16 17 18
|
||||
20 21 22
|
||||
23 24 25
|
||||
26 27 28
|
||||
0 0 0
|
||||
0 0 0
|
||||
0 0 0
|
||||
@ -62,14 +50,26 @@
|
||||
0 0 0
|
||||
0 0 0
|
||||
1 2 3
|
||||
4 5 6
|
||||
7 8 9
|
||||
10 11 12
|
||||
13 14 15
|
||||
16 17 18
|
||||
20 21 22
|
||||
23 24 25
|
||||
26 27 28
|
||||
0 0 0
|
||||
0 0 0
|
||||
0 0 0
|
||||
1 2 3
|
||||
4 5 6
|
||||
7 8 9
|
||||
10 11 12
|
||||
13 14 15
|
||||
16 17 18
|
||||
20 21 22
|
||||
23 24 25
|
||||
26 27 28
|
||||
0 0 0
|
||||
0 0 0
|
||||
0 0 0
|
||||
|
@ -1,17 +1,17 @@
|
||||
TSV
|
||||
c1 Nullable(String)
|
||||
c1 Nullable(Float64)
|
||||
c2 Nullable(String)
|
||||
c3 Nullable(String)
|
||||
c4 Nullable(String)
|
||||
42 Some string [1, 2, 3, 4] (1, 2, 3)
|
||||
42 abcd [] (4, 5, 6)
|
||||
c3 Array(Nullable(Float64))
|
||||
c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))
|
||||
42 Some string [1,2,3,4] (1,2,3)
|
||||
42 abcd [] (4,5,6)
|
||||
TSVWithNames
|
||||
number Nullable(String)
|
||||
number Nullable(Float64)
|
||||
string Nullable(String)
|
||||
array Nullable(String)
|
||||
tuple Nullable(String)
|
||||
42 Some string [1, 2, 3, 4] (1, 2, 3)
|
||||
42 abcd [] (4, 5, 6)
|
||||
array Array(Nullable(Float64))
|
||||
tuple Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))
|
||||
42 Some string [1,2,3,4] (1,2,3)
|
||||
42 abcd [] (4,5,6)
|
||||
CSV
|
||||
c1 Nullable(Float64)
|
||||
c2 Nullable(String)
|
||||
@ -73,13 +73,13 @@ c Array(Nullable(Float64))
|
||||
\N \N []
|
||||
\N \N [3]
|
||||
TSKV
|
||||
a Nullable(String)
|
||||
a Nullable(Float64)
|
||||
b Nullable(String)
|
||||
c Nullable(String)
|
||||
1 s1 \N
|
||||
c Array(Nullable(Float64))
|
||||
1 s1 []
|
||||
2 } [2]
|
||||
\N \N \N
|
||||
\N \N \N
|
||||
\N \N []
|
||||
\N \N []
|
||||
\N \N [3]
|
||||
Values
|
||||
c1 Nullable(Float64)
|
||||
@ -96,7 +96,7 @@ c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(Strin
|
||||
42.42 \N [1,NULL,3] (1,NULL) ([1,2],[(3,'4'),(5,'6')])
|
||||
\N Some string [10] (1,2) ([],[])
|
||||
Regexp
|
||||
c1 Nullable(String)
|
||||
c1 Nullable(Float64)
|
||||
c2 Nullable(String)
|
||||
c3 Nullable(String)
|
||||
42 Some string 1 [([1, 2, 3], String 1), ([], String 1)]
|
||||
|
@ -1,137 +1,137 @@
|
||||
Arrow
|
||||
int8 Int8
|
||||
uint8 UInt8
|
||||
int16 Int16
|
||||
uint16 UInt16
|
||||
int32 Int32
|
||||
uint32 UInt32
|
||||
int64 Int64
|
||||
uint64 UInt64
|
||||
int8 Nullable(Int8)
|
||||
uint8 Nullable(UInt8)
|
||||
int16 Nullable(Int16)
|
||||
uint16 Nullable(UInt16)
|
||||
int32 Nullable(Int32)
|
||||
uint32 Nullable(UInt32)
|
||||
int64 Nullable(Int64)
|
||||
uint64 Nullable(UInt64)
|
||||
0 0 0 0 0 0 0 0
|
||||
-1 1 -1 1 -1 1 -1 1
|
||||
float32 Float32
|
||||
float64 Float64
|
||||
decimal32 Decimal(9, 5)
|
||||
decimal64 Decimal(18, 5)
|
||||
float32 Nullable(Float32)
|
||||
float64 Nullable(Float64)
|
||||
decimal32 Nullable(Decimal(9, 5))
|
||||
decimal64 Nullable(Decimal(18, 5))
|
||||
0 0 0 0
|
||||
1.2 0.7692307692307692 3.33333 333.33333
|
||||
date UInt16
|
||||
date32 Date32
|
||||
date Nullable(UInt16)
|
||||
date32 Nullable(Date32)
|
||||
0 1970-01-01
|
||||
1 1970-01-02
|
||||
str String
|
||||
fixed_string String
|
||||
str Nullable(String)
|
||||
fixed_string Nullable(String)
|
||||
Str: 0 100
|
||||
Str: 1 200
|
||||
array Array(UInt64)
|
||||
tuple Tuple(`tuple.0` UInt64, `tuple.1` String)
|
||||
map Map(String, UInt64)
|
||||
array Array(Nullable(UInt64))
|
||||
tuple Tuple(Nullable(UInt64), Nullable(String))
|
||||
map Map(String, Nullable(UInt64))
|
||||
[0,1] (0,'0') {'0':0}
|
||||
[1,2] (1,'1') {'1':1}
|
||||
nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64)))
|
||||
nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8)
|
||||
nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64))))
|
||||
nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8))
|
||||
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
|
||||
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
|
||||
ArrowStream
|
||||
int8 Int8
|
||||
uint8 UInt8
|
||||
int16 Int16
|
||||
uint16 UInt16
|
||||
int32 Int32
|
||||
uint32 UInt32
|
||||
int64 Int64
|
||||
uint64 UInt64
|
||||
int8 Nullable(Int8)
|
||||
uint8 Nullable(UInt8)
|
||||
int16 Nullable(Int16)
|
||||
uint16 Nullable(UInt16)
|
||||
int32 Nullable(Int32)
|
||||
uint32 Nullable(UInt32)
|
||||
int64 Nullable(Int64)
|
||||
uint64 Nullable(UInt64)
|
||||
0 0 0 0 0 0 0 0
|
||||
-1 1 -1 1 -1 1 -1 1
|
||||
float32 Float32
|
||||
float64 Float64
|
||||
decimal32 Decimal(9, 5)
|
||||
decimal64 Decimal(18, 5)
|
||||
float32 Nullable(Float32)
|
||||
float64 Nullable(Float64)
|
||||
decimal32 Nullable(Decimal(9, 5))
|
||||
decimal64 Nullable(Decimal(18, 5))
|
||||
0 0 0 0
|
||||
1.2 0.7692307692307692 3.33333 333.33333
|
||||
date UInt16
|
||||
date32 Date32
|
||||
date Nullable(UInt16)
|
||||
date32 Nullable(Date32)
|
||||
0 1970-01-01
|
||||
1 1970-01-02
|
||||
str String
|
||||
fixed_string String
|
||||
str Nullable(String)
|
||||
fixed_string Nullable(String)
|
||||
Str: 0 100
|
||||
Str: 1 200
|
||||
array Array(UInt64)
|
||||
tuple Tuple(`tuple.0` UInt64, `tuple.1` String)
|
||||
map Map(String, UInt64)
|
||||
array Array(Nullable(UInt64))
|
||||
tuple Tuple(Nullable(UInt64), Nullable(String))
|
||||
map Map(String, Nullable(UInt64))
|
||||
[0,1] (0,'0') {'0':0}
|
||||
[1,2] (1,'1') {'1':1}
|
||||
nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64)))
|
||||
nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8)
|
||||
nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64))))
|
||||
nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8))
|
||||
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
|
||||
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
|
||||
Parquet
|
||||
int8 Int8
|
||||
uint8 UInt8
|
||||
int16 Int16
|
||||
uint16 UInt16
|
||||
int32 Int32
|
||||
uint32 Int64
|
||||
int64 Int64
|
||||
uint64 UInt64
|
||||
int8 Nullable(Int8)
|
||||
uint8 Nullable(UInt8)
|
||||
int16 Nullable(Int16)
|
||||
uint16 Nullable(UInt16)
|
||||
int32 Nullable(Int32)
|
||||
uint32 Nullable(Int64)
|
||||
int64 Nullable(Int64)
|
||||
uint64 Nullable(UInt64)
|
||||
0 0 0 0 0 0 0 0
|
||||
-1 1 -1 1 -1 1 -1 1
|
||||
float32 Float32
|
||||
float64 Float64
|
||||
decimal32 Decimal(9, 5)
|
||||
decimal64 Decimal(18, 5)
|
||||
float32 Nullable(Float32)
|
||||
float64 Nullable(Float64)
|
||||
decimal32 Nullable(Decimal(9, 5))
|
||||
decimal64 Nullable(Decimal(18, 5))
|
||||
0 0 0 0
|
||||
1.2 0.7692307692307692 3.33333 333.33333
|
||||
date UInt16
|
||||
date32 Date32
|
||||
date Nullable(UInt16)
|
||||
date32 Nullable(Date32)
|
||||
0 1970-01-01
|
||||
1 1970-01-02
|
||||
str String
|
||||
fixed_string String
|
||||
str Nullable(String)
|
||||
fixed_string Nullable(String)
|
||||
Str: 0 100
|
||||
Str: 1 200
|
||||
array Array(UInt64)
|
||||
tuple Tuple(`tuple.0` UInt64, `tuple.1` String)
|
||||
map Map(String, UInt64)
|
||||
array Array(Nullable(UInt64))
|
||||
tuple Tuple(Nullable(UInt64), Nullable(String))
|
||||
map Map(String, Nullable(UInt64))
|
||||
[0,1] (0,'0') {'0':0}
|
||||
[1,2] (1,'1') {'1':1}
|
||||
nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64)))
|
||||
nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8)
|
||||
nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64))))
|
||||
nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8))
|
||||
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
|
||||
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
|
||||
ORC
|
||||
int8 Int8
|
||||
uint8 Int8
|
||||
int16 Int16
|
||||
uint16 Int16
|
||||
int32 Int32
|
||||
uint32 Int32
|
||||
int64 Int64
|
||||
uint64 Int64
|
||||
int8 Nullable(Int8)
|
||||
uint8 Nullable(Int8)
|
||||
int16 Nullable(Int16)
|
||||
uint16 Nullable(Int16)
|
||||
int32 Nullable(Int32)
|
||||
uint32 Nullable(Int32)
|
||||
int64 Nullable(Int64)
|
||||
uint64 Nullable(Int64)
|
||||
0 0 0 0 0 0 0 0
|
||||
-1 1 -1 1 -1 1 -1 1
|
||||
float32 Float32
|
||||
float64 Float64
|
||||
decimal32 Decimal(9, 5)
|
||||
decimal64 Decimal(18, 5)
|
||||
float32 Nullable(Float32)
|
||||
float64 Nullable(Float64)
|
||||
decimal32 Nullable(Decimal(9, 5))
|
||||
decimal64 Nullable(Decimal(18, 5))
|
||||
0 0 0 0
|
||||
1.2 0.7692307692307692 3.33333 333.33333
|
||||
date Date32
|
||||
date32 Date32
|
||||
date Nullable(Date32)
|
||||
date32 Nullable(Date32)
|
||||
1970-01-01 1970-01-01
|
||||
1970-01-02 1970-01-02
|
||||
str String
|
||||
fixed_string String
|
||||
str Nullable(String)
|
||||
fixed_string Nullable(String)
|
||||
Str: 0 100
|
||||
Str: 1 200
|
||||
array Array(Int64)
|
||||
tuple Tuple(`tuple.0` Int64, `tuple.1` String)
|
||||
map Map(String, Int64)
|
||||
array Array(Nullable(Int64))
|
||||
tuple Tuple(Nullable(Int64), Nullable(String))
|
||||
map Map(String, Nullable(Int64))
|
||||
[0,1] (0,'0') {'0':0}
|
||||
[1,2] (1,'1') {'1':1}
|
||||
nested1 Array(Tuple(`nested1.0` Array(Int64), `nested1.1` Map(String, Int64)))
|
||||
nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(Int64)), `nested2.0.1` Map(Int64, Array(Tuple(`nested2.0.1.0` Int64, `nested2.0.1.1` String)))), `nested2.1` Int8)
|
||||
nested1 Array(Tuple(Array(Nullable(Int64)), Map(String, Nullable(Int64))))
|
||||
nested2 Tuple(Tuple(Array(Array(Nullable(Int64))), Map(Int64, Array(Tuple(Nullable(Int64), Nullable(String))))), Nullable(Int8))
|
||||
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
|
||||
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
|
||||
Native
|
||||
|
@ -1 +1 @@
|
||||
x LowCardinality(UInt64)
|
||||
x LowCardinality(Nullable(UInt64))
|
||||
|
@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "insert into table function file('arrow.dict', 'Arrow', 'x LowCardinality(UInt64)') select number from numbers(10) settings output_format_arrow_low_cardinality_as_dictionary=1"
|
||||
$CLICKHOUSE_CLIENT -q "insert into table function file('arrow.dict', 'Arrow', 'x LowCardinality(UInt64)') select number from numbers(10) settings output_format_arrow_low_cardinality_as_dictionary=1, engine_file_truncate_on_insert=1"
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('arrow.dict', 'Arrow')"
|
||||
|
||||
|
@ -9,7 +9,7 @@ x Nullable(Float64)
|
||||
7
|
||||
8
|
||||
9
|
||||
c1 Nullable(String)
|
||||
c2 Nullable(String)
|
||||
c3 Nullable(String)
|
||||
c1 Nullable(Float64)
|
||||
c2 Nullable(Float64)
|
||||
c3 Nullable(Float64)
|
||||
1 2 3
|
||||
|
@ -1,8 +1,8 @@
|
||||
a Nullable(String)
|
||||
a Nullable(Float64)
|
||||
b Nullable(String)
|
||||
c Nullable(String)
|
||||
1 s1 \N
|
||||
c Array(Nullable(Float64))
|
||||
1 s1 []
|
||||
2 } [2]
|
||||
\N \N \N
|
||||
\N \N \N
|
||||
\N \N []
|
||||
\N \N []
|
||||
\N \N [3]
|
||||
|
@ -0,0 +1,40 @@
|
||||
Arrow
|
||||
x Nullable(UInt64)
|
||||
arr1 Array(Nullable(UInt64))
|
||||
arr2 Array(Array(Nullable(String)))
|
||||
arr3 Array(Tuple(Nullable(String), Nullable(UInt64)))
|
||||
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
|
||||
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
|
||||
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
|
||||
\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)]
|
||||
4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)]
|
||||
ArrowStream
|
||||
x Nullable(UInt64)
|
||||
arr1 Array(Nullable(UInt64))
|
||||
arr2 Array(Array(Nullable(String)))
|
||||
arr3 Array(Tuple(Nullable(String), Nullable(UInt64)))
|
||||
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
|
||||
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
|
||||
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
|
||||
\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)]
|
||||
4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)]
|
||||
Parquet
|
||||
x Nullable(UInt64)
|
||||
arr1 Array(Nullable(UInt64))
|
||||
arr2 Array(Array(Nullable(String)))
|
||||
arr3 Array(Tuple(Nullable(String), Nullable(UInt64)))
|
||||
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
|
||||
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
|
||||
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
|
||||
\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)]
|
||||
4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)]
|
||||
ORC
|
||||
x Nullable(Int64)
|
||||
arr1 Array(Nullable(Int64))
|
||||
arr2 Array(Array(Nullable(String)))
|
||||
arr3 Array(Tuple(Nullable(String), Nullable(Int64)))
|
||||
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
|
||||
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
|
||||
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
|
||||
\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)]
|
||||
4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)]
|
@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-parallel, no-fasttest
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
|
||||
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
|
||||
FILE_NAME=test_02242.data
|
||||
DATA_FILE=$USER_FILES_PATH/$FILE_NAME
|
||||
|
||||
for format in Arrow ArrowStream Parquet ORC
|
||||
do
|
||||
echo $format
|
||||
$CLICKHOUSE_CLIENT -q "select number % 2 ? NULL : number as x, [number % 2 ? NULL : number, number + 1] as arr1, [[NULL, 'String'], [NULL], []] as arr2, [(NULL, NULL), ('String', NULL), (NULL, number)] as arr3 from numbers(5) format $format" > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')"
|
||||
done
|
||||
|
||||
rm $DATA_FILE
|
@ -0,0 +1 @@
|
||||
10
|
28
tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.sh
Executable file
28
tests/queries/0_stateless/02243_arrow_read_null_type_to_nullable_column.sh
Executable file
@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-fasttest
|
||||
|
||||
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "drop table if exists test_02243"
|
||||
$CLICKHOUSE_CLIENT -q "create table test_02243 (image_path Nullable(String),
|
||||
caption Nullable(String),
|
||||
NSFW Nullable(String),
|
||||
similarity Nullable(Float64),
|
||||
LICENSE Nullable(String),
|
||||
url Nullable(String),
|
||||
key Nullable(UInt64),
|
||||
shard_id Nullable(UInt64),
|
||||
status Nullable(String),
|
||||
error_message Nullable(String),
|
||||
width Nullable(UInt32),
|
||||
height Nullable(UInt32),
|
||||
exif Nullable(String),
|
||||
original_width Nullable(UInt32),
|
||||
original_height Nullable(UInt32)) engine=Memory"
|
||||
|
||||
$CLICKHOUSE_CLIENT --stacktrace -q "insert into test_02243 format Parquet" < "$CUR_DIR"/data_parquet_bad_column/metadata_0.parquet # feed the file on stdin directly; path quoted against word splitting
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select count() from test_02243"
|
||||
$CLICKHOUSE_CLIENT -q "drop table test_02243"
|
@ -0,0 +1,8 @@
|
||||
x Nullable(String)
|
||||
y Nullable(Float64)
|
||||
x Nullable(String)
|
||||
y Nullable(Float64)
|
||||
x Nullable(String)
|
||||
y Nullable(Float64)
|
||||
x Nullable(String)
|
||||
y Nullable(Float64)
|
@ -0,0 +1,14 @@
|
||||
-- Tags: no-fasttest, no-parallel
|
||||
|
||||
insert into function file('test_02244', 'TSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1;
|
||||
desc file('test_02244', 'TSV') settings column_names_for_schema_inference='x,y';
|
||||
|
||||
insert into function file('test_02244', 'CSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1;
|
||||
desc file('test_02244', 'CSV') settings column_names_for_schema_inference='x,y';
|
||||
|
||||
insert into function file('test_02244', 'JSONCompactEachRow', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1;
|
||||
desc file('test_02244', 'JSONCompactEachRow') settings column_names_for_schema_inference='x,y';
|
||||
|
||||
insert into function file('test_02244', 'Values', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1;
|
||||
desc file('test_02244', 'Values') settings column_names_for_schema_inference='x,y';
|
||||
|
@ -0,0 +1,16 @@
|
||||
OK
|
||||
image_path Nullable(String)
|
||||
caption Nullable(String)
|
||||
NSFW Nullable(String)
|
||||
similarity Nullable(Float64)
|
||||
LICENSE Nullable(String)
|
||||
url Nullable(String)
|
||||
key Nullable(Int64)
|
||||
shard_id Nullable(Int64)
|
||||
status Nullable(String)
|
||||
width Nullable(Int64)
|
||||
height Nullable(Int64)
|
||||
exif Nullable(String)
|
||||
original_width Nullable(Int64)
|
||||
original_height Nullable(Int64)
|
||||
10
|
18
tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh
Executable file
18
tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-fasttest
|
||||
|
||||
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
|
||||
FILE_NAME=test_02245.parquet
|
||||
DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME # abort if the user_files path lookup produced nothing, instead of copying to "/"
|
||||
|
||||
cp "$CUR_DIR"/data_parquet_bad_column/metadata_0.parquet "$DATA_FILE" # both paths quoted against word splitting/globbing
|
||||
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet)" 2>&1 | grep -qF "CANNOT_EXTRACT_TABLE_STRUCTURE" && echo "OK" || echo "FAIL"
|
||||
$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1"
|
||||
$CLICKHOUSE_CLIENT -q "select count(*) from file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1"
|
||||
|
@ -1,21 +1,21 @@
|
||||
c1 Nullable(String)
|
||||
c2 Nullable(String)
|
||||
c3 Nullable(String)
|
||||
c1 Nullable(String)
|
||||
c2 Nullable(String)
|
||||
c3 Nullable(String)
|
||||
c1 Nullable(Float64)
|
||||
c2 Nullable(Float64)
|
||||
c3 Nullable(Float64)
|
||||
c1 Nullable(Float64)
|
||||
c2 Nullable(Float64)
|
||||
c3 Nullable(Float64)
|
||||
c1 UInt64
|
||||
c2 UInt64
|
||||
c3 UInt64
|
||||
c1 Nullable(String)
|
||||
c2 Nullable(String)
|
||||
c3 Nullable(String)
|
||||
c1 Nullable(Float64)
|
||||
c2 Nullable(Float64)
|
||||
c3 Nullable(Float64)
|
||||
c1 UInt64
|
||||
c2 UInt64
|
||||
c3 UInt64
|
||||
c1 Nullable(String)
|
||||
c2 Nullable(String)
|
||||
c3 Nullable(String)
|
||||
c1 Nullable(Float64)
|
||||
c2 Nullable(Float64)
|
||||
c3 Nullable(Float64)
|
||||
c1 UInt64
|
||||
c2 UInt64
|
||||
c3 UInt64
|
||||
|
@ -10,4 +10,5 @@ desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test
|
||||
desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64');
|
||||
desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto');
|
||||
|
||||
|
||||
SELECT * FROM s3(decodeURLComponent(NULL), [NULL]); --{serverError 170}
|
||||
|
@ -0,0 +1,107 @@
|
||||
TSV
|
||||
c1 Nullable(Float64)
|
||||
c2 Nullable(String)
|
||||
c3 Array(Nullable(Float64))
|
||||
c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))
|
||||
42 Some string [1,2,3,4] (1,2,3)
|
||||
42 abcd [] (4,5,6)
|
||||
c1 Nullable(String)
|
||||
[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)]
|
||||
[]
|
||||
[({}, [], 0)]
|
||||
[({}, [NULL], NULL)]
|
||||
[({}, [\'String3\'], NULL)]
|
||||
[({\'key3\': NULL}, []), NULL]
|
||||
c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64)))
|
||||
[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)]
|
||||
[]
|
||||
[({},[],0)]
|
||||
[({},[NULL],NULL)]
|
||||
[({},['String3'],NULL)]
|
||||
[({'key3':NULL},[],NULL)]
|
||||
c1 Nullable(Bool)
|
||||
true
|
||||
false
|
||||
\N
|
||||
c1 Array(Nullable(Bool))
|
||||
[true,NULL]
|
||||
[]
|
||||
[NULL]
|
||||
[false]
|
||||
c1 Nullable(String)
|
||||
[]
|
||||
c1 Nullable(String)
|
||||
{}
|
||||
c1 Nullable(String)
|
||||
()
|
||||
c1 Nullable(String)
|
||||
[1, 2, 3
|
||||
c1 Nullable(String)
|
||||
[(1, 2, 3 4)]
|
||||
c1 Nullable(String)
|
||||
[1, 2, 3 + 4]
|
||||
c1 Nullable(String)
|
||||
(1, 2,
|
||||
c1 Nullable(String)
|
||||
[1, Some trash, 42.2]
|
||||
c1 Nullable(String)
|
||||
[1, \'String\', {\'key\' : 2}]
|
||||
c1 Nullable(String)
|
||||
{\'key\' : 1, [1] : 10}
|
||||
c1 Nullable(String)
|
||||
{}{}
|
||||
c1 Nullable(String)
|
||||
[1, 2, 3
|
||||
c1 Nullable(String)
|
||||
[abc, def]
|
||||
c1 Array(Nullable(String))
|
||||
['abc','def']
|
||||
c1 Nullable(String)
|
||||
[\'string]
|
||||
c1 Nullable(String)
|
||||
\'string
|
||||
c1 Nullable(Float64)
|
||||
42.42
|
||||
c1 Nullable(String)
|
||||
42.42sometrash
|
||||
c1 Nullable(String)
|
||||
[42.42sometrash, 42.42]
|
||||
|
||||
CSV
|
||||
c1 Nullable(String)
|
||||
c2 Nullable(String)
|
||||
c3 Array(Nullable(Float64))
|
||||
c4 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)))
|
||||
42 Some string [1,2,3,4] [(1,2,3)]
|
||||
42\\ abcd [] [(4,5,6)]
|
||||
c1 Nullable(String)
|
||||
[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)]
|
||||
[]
|
||||
[({}, [], 0)]
|
||||
[({}, [NULL], NULL)]
|
||||
[({}, [\'String3\'], NULL)]
|
||||
[({\'key3\': NULL}, []), NULL]
|
||||
c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64)))
|
||||
[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)]
|
||||
[]
|
||||
[({},[],0)]
|
||||
[({},[NULL],NULL)]
|
||||
[({},['String3'],NULL)]
|
||||
[({'key3':NULL},[],NULL)]
|
||||
c1 Nullable(Bool)
|
||||
true
|
||||
false
|
||||
\N
|
||||
c1 Array(Nullable(Bool))
|
||||
[true,NULL]
|
||||
[]
|
||||
[NULL]
|
||||
[false]
|
||||
c1 Nullable(String)
|
||||
(1, 2, 3)
|
||||
c1 Nullable(String)
|
||||
123.123
|
||||
c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)))
|
||||
[(1,2,3)]
|
||||
c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)))
|
||||
[(1,2,3)]
|
220
tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.sh
Executable file
220
tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.sh
Executable file
@ -0,0 +1,220 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-parallel, no-fasttest
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
|
||||
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
|
||||
FILE_NAME=test_02246.data # named after this test (02246) so it cannot clash with files left by other tests
|
||||
DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME
|
||||
|
||||
touch $DATA_FILE
|
||||
|
||||
echo "TSV"
|
||||
|
||||
echo -e "42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3)
|
||||
42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]
|
||||
[]
|
||||
[({}, [], 0)]
|
||||
[({}, [NULL], NULL)]
|
||||
[({}, ['String3'], NULL)]
|
||||
[({'key3': NULL}, []), NULL]"> $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV') settings input_format_tsv_use_best_effort_in_schema_inference=false"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV') settings input_format_tsv_use_best_effort_in_schema_inference=false"
|
||||
|
||||
|
||||
echo -e "[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]
|
||||
[]
|
||||
[({}, [], 0)]
|
||||
[({}, [NULL], NULL)]
|
||||
[({}, ['String3'], NULL)]
|
||||
[({'key3': NULL}, [], NULL)]"> $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "true
|
||||
false
|
||||
\N" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[true, NULL]
|
||||
[]
|
||||
[NULL]
|
||||
[false]" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[]" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "{}" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "()" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[1, 2, 3" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[(1, 2, 3 4)]" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[1, 2, 3 + 4]" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "(1, 2," > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[1, Some trash, 42.2]" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[1, 'String', {'key' : 2}]" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "{'key' : 1, [1] : 10}" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "{}{}" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[1, 2, 3" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[abc, def]" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "['abc', 'def']" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "['string]" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "'string" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "42.42" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "42.42sometrash" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
echo -e "[42.42sometrash, 42.42]" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')"
|
||||
|
||||
|
||||
echo
|
||||
echo "CSV"
|
||||
|
||||
echo -e "42,Some string,'[1, 2, 3, 4]','[(1, 2, 3)]'
|
||||
42\,abcd,'[]','[(4, 5, 6)]'" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')"
|
||||
|
||||
echo -e "\"[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]\"
|
||||
'[]'
|
||||
'[({}, [], 0)]'
|
||||
'[({}, [NULL], NULL)]'
|
||||
\"[({}, ['String3'], NULL)]\"
|
||||
\"[({'key3': NULL}, []), NULL]\""> $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV') settings input_format_csv_use_best_effort_in_schema_inference=false"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV') settings input_format_csv_use_best_effort_in_schema_inference=false"
|
||||
|
||||
echo -e "\"[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]\"
|
||||
'[]'
|
||||
'[({}, [], 0)]'
|
||||
'[({}, [NULL], NULL)]'
|
||||
\"[({}, ['String3'], NULL)]\"
|
||||
\"[({'key3': NULL}, [], NULL)]\""> $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')"
|
||||
|
||||
echo -e "true
|
||||
false
|
||||
\N" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')"
|
||||
|
||||
echo -e "'[true, NULL]'
|
||||
'[]'
|
||||
'[NULL]'
|
||||
'[false]'" > $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')"
|
||||
|
||||
|
||||
echo -e "'(1, 2, 3)'"> $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')"
|
||||
|
||||
echo -e "'123.123'"> $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')"
|
||||
|
||||
echo -e "'[(1, 2, 3)]'"> $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')"
|
||||
|
||||
echo -e "\"[(1, 2, 3)]\""> $DATA_FILE
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')"
|
||||
|
||||
|
@ -1,15 +1,15 @@
|
||||
a Nullable(String)
|
||||
a Nullable(Float64)
|
||||
b Nullable(String)
|
||||
c Nullable(String)
|
||||
1 s1 \N
|
||||
c Array(Nullable(Float64))
|
||||
1 s1 []
|
||||
2 } [2]
|
||||
\N \N \N
|
||||
\N \N \N
|
||||
\N \N []
|
||||
\N \N []
|
||||
\N \N [3]
|
||||
b Nullable(String)
|
||||
a Nullable(String)
|
||||
c Nullable(String)
|
||||
e Nullable(String)
|
||||
b Nullable(Float64)
|
||||
a Nullable(Float64)
|
||||
c Nullable(Float64)
|
||||
e Nullable(Float64)
|
||||
1 \N \N \N
|
||||
\N 2 3 \N
|
||||
\N \N \N \N
|
||||
|
@ -10,3 +10,4 @@ ths
|
||||
offsett
|
||||
numer
|
||||
ue
|
||||
alse
|
||||
|
Loading…
Reference in New Issue
Block a user