mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-28 10:31:57 +00:00
Allow to specify structure hints in schema inference
This commit is contained in:
parent
110470809b
commit
e1ff996ec3
@ -3470,6 +3470,24 @@ Default value: `25'000`.
|
|||||||
|
|
||||||
The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'
|
The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'
|
||||||
|
|
||||||
|
## schema_inference_hints {#schema_inference_hints}
|
||||||
|
|
||||||
|
The list of column names and types to use as hints in schema inference for formats without schema.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
Query:
|
||||||
|
```sql
|
||||||
|
desc format(JSONEachRow, '{"x" : 1, "y" : "String", "z" : "0.0.0.0" }') settings schema_inference_hints='x UInt8, z IPv4';
|
||||||
|
```
|
||||||
|
|
||||||
|
Result:
|
||||||
|
```sql
|
||||||
|
x UInt8
|
||||||
|
y Nullable(String)
|
||||||
|
z IPv4
|
||||||
|
```
|
||||||
|
|
||||||
## date_time_input_format {#date_time_input_format}
|
## date_time_input_format {#date_time_input_format}
|
||||||
|
|
||||||
Allows choosing a parser of the text representation of date and time.
|
Allows choosing a parser of the text representation of date and time.
|
||||||
|
@ -704,6 +704,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
|
|||||||
M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format ORC", 0) \
|
M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format ORC", 0) \
|
||||||
M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \
|
M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \
|
||||||
M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
|
M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
|
||||||
|
M(String, schema_inference_hints, "", "The list of column names and types to use as hints in schema inference for formats without schema. The format: 'column_name1 column_type1, column_name2 column_type2, ...'", 0) \
|
||||||
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
|
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
|
||||||
M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \
|
M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \
|
||||||
M(Bool, input_format_try_infer_integers, true, "Try to infer numbers from string fields while schema inference in text formats", 0) \
|
M(Bool, input_format_try_infer_integers, true, "Try to infer numbers from string fields while schema inference in text formats", 0) \
|
||||||
|
@ -159,6 +159,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
|||||||
format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation;
|
format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation;
|
||||||
format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
|
format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
|
||||||
format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference;
|
format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference;
|
||||||
|
format_settings.schema_inference_hints = settings.schema_inference_hints;
|
||||||
format_settings.mysql_dump.table_name = settings.input_format_mysql_dump_table_name;
|
format_settings.mysql_dump.table_name = settings.input_format_mysql_dump_table_name;
|
||||||
format_settings.mysql_dump.map_column_names = settings.input_format_mysql_dump_map_column_names;
|
format_settings.mysql_dump.map_column_names = settings.input_format_mysql_dump_map_column_names;
|
||||||
format_settings.sql_insert.max_batch_size = settings.output_format_sql_insert_max_batch_size;
|
format_settings.sql_insert.max_batch_size = settings.output_format_sql_insert_max_batch_size;
|
||||||
@ -402,7 +403,10 @@ SchemaReaderPtr FormatFactory::getSchemaReader(
|
|||||||
throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);
|
throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
|
||||||
auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
|
auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
|
||||||
return schema_reader_creator(buf, format_settings);
|
auto schema_reader = schema_reader_creator(buf, format_settings);
|
||||||
|
if (schema_reader->needContext())
|
||||||
|
schema_reader->setContext(context);
|
||||||
|
return schema_reader;
|
||||||
}
|
}
|
||||||
|
|
||||||
ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader(
|
ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader(
|
||||||
|
@ -38,6 +38,8 @@ struct FormatSettings
|
|||||||
UInt64 max_rows_to_read_for_schema_inference = 100;
|
UInt64 max_rows_to_read_for_schema_inference = 100;
|
||||||
|
|
||||||
String column_names_for_schema_inference;
|
String column_names_for_schema_inference;
|
||||||
|
String schema_inference_hints;
|
||||||
|
|
||||||
bool try_infer_integers = false;
|
bool try_infer_integers = false;
|
||||||
bool try_infer_dates = false;
|
bool try_infer_dates = false;
|
||||||
bool try_infer_datetimes = false;
|
bool try_infer_datetimes = false;
|
||||||
|
@ -44,8 +44,15 @@ bool tryParseColumnsListFromString(const std::string & structure, ColumnsDescrip
|
|||||||
if (!columns_list)
|
if (!columns_list)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
columns = InterpreterCreateQuery::getColumnsDescription(*columns_list, context, false);
|
try
|
||||||
return true;
|
{
|
||||||
|
columns = InterpreterCreateQuery::getColumnsDescription(*columns_list, context, false);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
catch (...)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -2,6 +2,7 @@
|
|||||||
#include <Formats/ReadSchemaUtils.h>
|
#include <Formats/ReadSchemaUtils.h>
|
||||||
#include <Formats/EscapingRuleUtils.h>
|
#include <Formats/EscapingRuleUtils.h>
|
||||||
#include <DataTypes/DataTypeString.h>
|
#include <DataTypes/DataTypeString.h>
|
||||||
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
@ -45,7 +46,8 @@ void chooseResultColumnType(
|
|||||||
{
|
{
|
||||||
throw Exception(
|
throw Exception(
|
||||||
ErrorCodes::TYPE_MISMATCH,
|
ErrorCodes::TYPE_MISMATCH,
|
||||||
"Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}",
|
"Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}. "
|
||||||
|
"You can specify the type for this column using setting schema_inference_hints",
|
||||||
type->getName(),
|
type->getName(),
|
||||||
column_name,
|
column_name,
|
||||||
row,
|
row,
|
||||||
@ -60,33 +62,42 @@ void checkResultColumnTypeAndAppend(NamesAndTypesList & result, DataTypePtr & ty
|
|||||||
if (!default_type)
|
if (!default_type)
|
||||||
throw Exception(
|
throw Exception(
|
||||||
ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA,
|
ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA,
|
||||||
"Cannot determine table structure by first {} rows of data, because some columns contain only Nulls", rows_read);
|
"Cannot determine type for column {} by first {} rows of data, most likely this column contains only Nulls or empty "
|
||||||
|
"Arrays/Maps. You can specify the type for this column using setting schema_inference_hints",
|
||||||
|
name,
|
||||||
|
rows_read);
|
||||||
|
|
||||||
type = default_type;
|
type = default_type;
|
||||||
}
|
}
|
||||||
result.emplace_back(name, type);
|
result.emplace_back(name, type);
|
||||||
}
|
}
|
||||||
|
|
||||||
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
|
IIRowSchemaReader::IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
|
||||||
: ISchemaReader(in_), format_settings(format_settings_)
|
: ISchemaReader(in_), default_type(default_type_), hints_str(format_settings_.schema_inference_hints), format_settings(format_settings_)
|
||||||
{
|
{
|
||||||
if (!format_settings.column_names_for_schema_inference.empty())
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void IIRowSchemaReader::setContext(ContextPtr & context)
|
||||||
|
{
|
||||||
|
ColumnsDescription columns;
|
||||||
|
if (tryParseColumnsListFromString(hints_str, columns, context))
|
||||||
{
|
{
|
||||||
/// column_names_for_schema_inference is a string in format 'column1,column2,column3,...'
|
for (const auto & [name, type] : columns.getAll())
|
||||||
boost::split(column_names, format_settings.column_names_for_schema_inference, boost::is_any_of(","));
|
hints[name] = type;
|
||||||
for (auto & column_name : column_names)
|
|
||||||
{
|
|
||||||
std::string col_name_trimmed = boost::trim_copy(column_name);
|
|
||||||
if (!col_name_trimmed.empty())
|
|
||||||
column_name = col_name_trimmed;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
|
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
|
||||||
: IRowSchemaReader(in_, format_settings_)
|
: IIRowSchemaReader(in_, format_settings_)
|
||||||
{
|
{
|
||||||
default_type = default_type_;
|
initColumnNames(format_settings.column_names_for_schema_inference);
|
||||||
|
}
|
||||||
|
|
||||||
|
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
|
||||||
|
: IIRowSchemaReader(in_, format_settings_, default_type_)
|
||||||
|
{
|
||||||
|
initColumnNames(format_settings.column_names_for_schema_inference);
|
||||||
}
|
}
|
||||||
|
|
||||||
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, const DataTypes & default_types_)
|
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, const DataTypes & default_types_)
|
||||||
@ -104,31 +115,6 @@ NamesAndTypesList IRowSchemaReader::readSchema()
|
|||||||
"Most likely setting input_format_max_rows_to_read_for_schema_inference is set to 0");
|
"Most likely setting input_format_max_rows_to_read_for_schema_inference is set to 0");
|
||||||
|
|
||||||
DataTypes data_types = readRowAndGetDataTypes();
|
DataTypes data_types = readRowAndGetDataTypes();
|
||||||
for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read)
|
|
||||||
{
|
|
||||||
DataTypes new_data_types = readRowAndGetDataTypes();
|
|
||||||
if (new_data_types.empty())
|
|
||||||
/// We reached eof.
|
|
||||||
break;
|
|
||||||
|
|
||||||
if (new_data_types.size() != data_types.size())
|
|
||||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values");
|
|
||||||
|
|
||||||
for (size_t i = 0; i != data_types.size(); ++i)
|
|
||||||
{
|
|
||||||
/// We couldn't determine the type of this column in a new row, just skip it.
|
|
||||||
if (!new_data_types[i])
|
|
||||||
continue;
|
|
||||||
|
|
||||||
auto transform_types_if_needed = [&](DataTypePtr & type, DataTypePtr & new_type){ transformTypesIfNeeded(type, new_type, i); };
|
|
||||||
chooseResultColumnType(data_types[i], new_data_types[i], transform_types_if_needed, getDefaultType(i), std::to_string(i + 1), rows_read);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check that we read at list one column.
|
|
||||||
if (data_types.empty())
|
|
||||||
throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot read rows from the data");
|
|
||||||
|
|
||||||
/// If column names weren't set, use default names 'c1', 'c2', ...
|
/// If column names weren't set, use default names 'c1', 'c2', ...
|
||||||
if (column_names.empty())
|
if (column_names.empty())
|
||||||
{
|
{
|
||||||
@ -142,6 +128,39 @@ NamesAndTypesList IRowSchemaReader::readSchema()
|
|||||||
ErrorCodes::INCORRECT_DATA,
|
ErrorCodes::INCORRECT_DATA,
|
||||||
"The number of column names {} differs with the number of types {}", column_names.size(), data_types.size());
|
"The number of column names {} differs with the number of types {}", column_names.size(), data_types.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i != column_names.size(); ++i)
|
||||||
|
{
|
||||||
|
auto hint_it = hints.find(column_names[i]);
|
||||||
|
if (hint_it != hints.end())
|
||||||
|
data_types[i] = hint_it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read)
|
||||||
|
{
|
||||||
|
DataTypes new_data_types = readRowAndGetDataTypes();
|
||||||
|
if (new_data_types.empty())
|
||||||
|
/// We reached eof.
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (new_data_types.size() != data_types.size())
|
||||||
|
throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values");
|
||||||
|
|
||||||
|
for (size_t i = 0; i != data_types.size(); ++i)
|
||||||
|
{
|
||||||
|
/// Check if we couldn't determine the type of this column in a new row
|
||||||
|
/// or the type for this column was taken from hints.
|
||||||
|
if (!new_data_types[i] || hints.contains(column_names[i]))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
auto transform_types_if_needed = [&](DataTypePtr & type, DataTypePtr & new_type){ transformTypesIfNeeded(type, new_type, i); };
|
||||||
|
chooseResultColumnType(data_types[i], new_data_types[i], transform_types_if_needed, getDefaultType(i), std::to_string(i + 1), rows_read);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check that we read at least one column.
|
||||||
|
if (data_types.empty())
|
||||||
|
throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot read rows from the data");
|
||||||
|
|
||||||
NamesAndTypesList result;
|
NamesAndTypesList result;
|
||||||
for (size_t i = 0; i != data_types.size(); ++i)
|
for (size_t i = 0; i != data_types.size(); ++i)
|
||||||
{
|
{
|
||||||
@ -152,6 +171,21 @@ NamesAndTypesList IRowSchemaReader::readSchema()
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void IRowSchemaReader::initColumnNames(const String & column_names_str)
|
||||||
|
{
|
||||||
|
if (column_names_str.empty())
|
||||||
|
return;
|
||||||
|
|
||||||
|
/// column_names_for_schema_inference is a string in format 'column1,column2,column3,...'
|
||||||
|
boost::split(column_names, column_names_str, boost::is_any_of(","));
|
||||||
|
for (auto & column_name : column_names)
|
||||||
|
{
|
||||||
|
std::string col_name_trimmed = boost::trim_copy(column_name);
|
||||||
|
if (!col_name_trimmed.empty())
|
||||||
|
column_name = col_name_trimmed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
DataTypePtr IRowSchemaReader::getDefaultType(size_t column) const
|
DataTypePtr IRowSchemaReader::getDefaultType(size_t column) const
|
||||||
{
|
{
|
||||||
if (default_type)
|
if (default_type)
|
||||||
@ -167,7 +201,7 @@ void IRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr &
|
|||||||
}
|
}
|
||||||
|
|
||||||
IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
|
IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
|
||||||
: ISchemaReader(in_), format_settings(format_settings_), default_type(default_type_)
|
: IIRowSchemaReader(in_, format_settings_, default_type_)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -187,7 +221,11 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
|
|||||||
names_order.reserve(names_and_types.size());
|
names_order.reserve(names_and_types.size());
|
||||||
for (const auto & [name, type] : names_and_types)
|
for (const auto & [name, type] : names_and_types)
|
||||||
{
|
{
|
||||||
names_to_types[name] = type;
|
auto hint_it = hints.find(name);
|
||||||
|
if (hint_it != hints.end())
|
||||||
|
names_to_types[name] = hint_it->second;
|
||||||
|
else
|
||||||
|
names_to_types[name] = type;
|
||||||
names_order.push_back(name);
|
names_order.push_back(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -205,11 +243,18 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
|
|||||||
/// If we didn't see this column before, just add it.
|
/// If we didn't see this column before, just add it.
|
||||||
if (it == names_to_types.end())
|
if (it == names_to_types.end())
|
||||||
{
|
{
|
||||||
names_to_types[name] = new_type;
|
auto hint_it = hints.find(name);
|
||||||
|
if (hint_it != hints.end())
|
||||||
|
names_to_types[name] = hint_it->second;
|
||||||
|
else
|
||||||
|
names_to_types[name] = new_type;
|
||||||
names_order.push_back(name);
|
names_order.push_back(name);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (hints.contains(name))
|
||||||
|
continue;
|
||||||
|
|
||||||
auto & type = it->second;
|
auto & type = it->second;
|
||||||
chooseResultColumnType(type, new_type, transform_types_if_needed, default_type, name, rows_read);
|
chooseResultColumnType(type, new_type, transform_types_if_needed, default_type, name, rows_read);
|
||||||
}
|
}
|
||||||
|
@ -37,6 +37,27 @@ protected:
|
|||||||
|
|
||||||
using CommonDataTypeChecker = std::function<DataTypePtr(const DataTypePtr &, const DataTypePtr &)>;
|
using CommonDataTypeChecker = std::function<DataTypePtr(const DataTypePtr &, const DataTypePtr &)>;
|
||||||
|
|
||||||
|
class IIRowSchemaReader : public ISchemaReader
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_ = nullptr);
|
||||||
|
|
||||||
|
bool needContext() const override { return !hints_str.empty(); }
|
||||||
|
void setContext(ContextPtr & context) override;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; }
|
||||||
|
size_t getNumRowsRead() const override { return rows_read; }
|
||||||
|
|
||||||
|
size_t max_rows_to_read;
|
||||||
|
size_t rows_read = 0;
|
||||||
|
CommonDataTypeChecker common_type_checker;
|
||||||
|
DataTypePtr default_type;
|
||||||
|
String hints_str;
|
||||||
|
FormatSettings format_settings;
|
||||||
|
std::unordered_map<String, DataTypePtr> hints;
|
||||||
|
};
|
||||||
|
|
||||||
/// Base class for schema inference for formats that read data row by row.
|
/// Base class for schema inference for formats that read data row by row.
|
||||||
/// It reads data row by row (up to max_rows_to_read), determines types of columns
|
/// It reads data row by row (up to max_rows_to_read), determines types of columns
|
||||||
/// for each row and compare them with types from the previous rows. If some column
|
/// for each row and compare them with types from the previous rows. If some column
|
||||||
@ -44,12 +65,12 @@ using CommonDataTypeChecker = std::function<DataTypePtr(const DataTypePtr &, con
|
|||||||
/// (from argument default_type_) will be used for this column or the exception
|
/// (from argument default_type_) will be used for this column or the exception
|
||||||
/// will be thrown (if default type is not set). If different columns have different
|
/// will be thrown (if default type is not set). If different columns have different
|
||||||
/// default types, you can provide them by default_types_ argument.
|
/// default types, you can provide them by default_types_ argument.
|
||||||
class IRowSchemaReader : public ISchemaReader
|
class IRowSchemaReader : public IIRowSchemaReader
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings);
|
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
|
||||||
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_);
|
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_);
|
||||||
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, const DataTypes & default_types_);
|
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, const DataTypes & default_types_);
|
||||||
|
|
||||||
NamesAndTypesList readSchema() override;
|
NamesAndTypesList readSchema() override;
|
||||||
|
|
||||||
@ -62,19 +83,12 @@ protected:
|
|||||||
|
|
||||||
void setColumnNames(const std::vector<String> & names) { column_names = names; }
|
void setColumnNames(const std::vector<String> & names) { column_names = names; }
|
||||||
|
|
||||||
void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; }
|
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t index);
|
||||||
size_t getNumRowsRead() const override { return rows_read; }
|
|
||||||
|
|
||||||
FormatSettings format_settings;
|
|
||||||
|
|
||||||
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t column_idx);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
DataTypePtr getDefaultType(size_t column) const;
|
DataTypePtr getDefaultType(size_t column) const;
|
||||||
size_t max_rows_to_read;
|
void initColumnNames(const String & column_names_str);
|
||||||
size_t rows_read = 0;
|
|
||||||
DataTypePtr default_type;
|
|
||||||
DataTypes default_types;
|
DataTypes default_types;
|
||||||
std::vector<String> column_names;
|
std::vector<String> column_names;
|
||||||
};
|
};
|
||||||
@ -84,7 +98,7 @@ private:
|
|||||||
/// Differ from IRowSchemaReader in that after reading a row we get
|
/// Differ from IRowSchemaReader in that after reading a row we get
|
||||||
/// a map {column_name : type} and some columns may be missed in a single row
|
/// a map {column_name : type} and some columns may be missed in a single row
|
||||||
/// (in this case we will use types from the previous rows for missed columns).
|
/// (in this case we will use types from the previous rows for missed columns).
|
||||||
class IRowWithNamesSchemaReader : public ISchemaReader
|
class IRowWithNamesSchemaReader : public IIRowSchemaReader
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
IRowWithNamesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_ = nullptr);
|
IRowWithNamesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_ = nullptr);
|
||||||
@ -98,17 +112,7 @@ protected:
|
|||||||
/// Set eof = true if can't read more data.
|
/// Set eof = true if can't read more data.
|
||||||
virtual NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) = 0;
|
virtual NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) = 0;
|
||||||
|
|
||||||
void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; }
|
|
||||||
size_t getNumRowsRead() const override { return rows_read; }
|
|
||||||
|
|
||||||
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type);
|
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type);
|
||||||
|
|
||||||
FormatSettings format_settings;
|
|
||||||
|
|
||||||
private:
|
|
||||||
size_t max_rows_to_read;
|
|
||||||
size_t rows_read = 0;
|
|
||||||
DataTypePtr default_type;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Base class for schema inference for formats that don't need any data to
|
/// Base class for schema inference for formats that don't need any data to
|
||||||
|
@ -88,7 +88,7 @@ private:
|
|||||||
class JSONEachRowSchemaReader : public IRowWithNamesSchemaReader
|
class JSONEachRowSchemaReader : public IRowWithNamesSchemaReader
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings);
|
JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings_);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
|
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
#include <Storages/Hive/StorageHive.h>
|
#include <Storages/Hive/StorageHive.h>
|
||||||
#include <Storages/checkAndGetLiteralArgument.h>
|
#include <Storages/checkAndGetLiteralArgument.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <Common/logger_useful.h>
|
#include <Common/logger_useful.h>
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#include <TableFunctions/ITableFunctionFileLike.h>
|
#include <TableFunctions/ITableFunctionFileLike.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
|
|
||||||
#include <Parsers/ASTFunction.h>
|
#include <Parsers/ASTFunction.h>
|
||||||
#include <Parsers/ASTLiteral.h>
|
#include <Parsers/ASTLiteral.h>
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
#include <Common/Exception.h>
|
#include <Common/Exception.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <Parsers/ASTFunction.h>
|
#include <Parsers/ASTFunction.h>
|
||||||
#include <Parsers/ASTIdentifier.h>
|
#include <Parsers/ASTIdentifier.h>
|
||||||
#include <Parsers/ASTSelectWithUnionQuery.h>
|
#include <Parsers/ASTSelectWithUnionQuery.h>
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#include <TableFunctions/TableFunctionFile.h>
|
#include <TableFunctions/TableFunctionFile.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
|
|
||||||
#include "Parsers/IAST_fwd.h"
|
#include "Parsers/IAST_fwd.h"
|
||||||
#include "registerTableFunctions.h"
|
#include "registerTableFunctions.h"
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
#include <TableFunctions/ITableFunction.h>
|
#include <TableFunctions/ITableFunction.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/TableFunctionGenerateRandom.h>
|
#include <TableFunctions/TableFunctionGenerateRandom.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
|
|
||||||
#include "registerTableFunctions.h"
|
#include "registerTableFunctions.h"
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
#include <Storages/ColumnsDescription.h>
|
#include <Storages/ColumnsDescription.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/TableFunctionHDFS.h>
|
#include <TableFunctions/TableFunctionHDFS.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/TableFunctionHDFS.h>
|
#include <TableFunctions/TableFunctionHDFS.h>
|
||||||
#include <TableFunctions/TableFunctionHDFSCluster.h>
|
#include <TableFunctions/TableFunctionHDFSCluster.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <Parsers/ASTLiteral.h>
|
#include <Parsers/ASTLiteral.h>
|
||||||
#include <Parsers/IAST_fwd.h>
|
#include <Parsers/IAST_fwd.h>
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#include <TableFunctions/TableFunctionInput.h>
|
#include <TableFunctions/TableFunctionInput.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <Parsers/ASTFunction.h>
|
#include <Parsers/ASTFunction.h>
|
||||||
#include <Parsers/ASTLiteral.h>
|
#include <Parsers/ASTLiteral.h>
|
||||||
#include <Common/Exception.h>
|
#include <Common/Exception.h>
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
#include <TableFunctions/TableFunctionMongoDB.h>
|
#include <TableFunctions/TableFunctionMongoDB.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <TableFunctions/registerTableFunctions.h>
|
#include <TableFunctions/registerTableFunctions.h>
|
||||||
#include <Storages/checkAndGetLiteralArgument.h>
|
#include <Storages/checkAndGetLiteralArgument.h>
|
||||||
#include <Storages/ColumnsDescription.h>
|
#include <Storages/ColumnsDescription.h>
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
#include <Parsers/ASTFunction.h>
|
#include <Parsers/ASTFunction.h>
|
||||||
#include <Storages/checkAndGetLiteralArgument.h>
|
#include <Storages/checkAndGetLiteralArgument.h>
|
||||||
#include <Storages/StorageNull.h>
|
#include <Storages/StorageNull.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/TableFunctionNull.h>
|
#include <TableFunctions/TableFunctionNull.h>
|
||||||
#include <Interpreters/evaluateConstantExpression.h>
|
#include <Interpreters/evaluateConstantExpression.h>
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
#include <Interpreters/Context.h>
|
#include <Interpreters/Context.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/TableFunctionS3.h>
|
#include <TableFunctions/TableFunctionS3.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <Parsers/ASTLiteral.h>
|
#include <Parsers/ASTLiteral.h>
|
||||||
#include <Storages/checkAndGetLiteralArgument.h>
|
#include <Storages/checkAndGetLiteralArgument.h>
|
||||||
#include <Storages/StorageS3.h>
|
#include <Storages/StorageS3.h>
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/TableFunctionS3.h>
|
#include <TableFunctions/TableFunctionS3.h>
|
||||||
#include <TableFunctions/TableFunctionS3Cluster.h>
|
#include <TableFunctions/TableFunctionS3Cluster.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <Parsers/ASTLiteral.h>
|
#include <Parsers/ASTLiteral.h>
|
||||||
#include <Parsers/ASTExpressionList.h>
|
#include <Parsers/ASTExpressionList.h>
|
||||||
#include <Parsers/IAST_fwd.h>
|
#include <Parsers/IAST_fwd.h>
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
#include <Storages/StorageURL.h>
|
#include <Storages/StorageURL.h>
|
||||||
#include <Storages/StorageExternalDistributed.h>
|
#include <Storages/StorageExternalDistributed.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <Formats/FormatFactory.h>
|
#include <Formats/FormatFactory.h>
|
||||||
|
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
|
|
||||||
#include <TableFunctions/TableFunctionValues.h>
|
#include <TableFunctions/TableFunctionValues.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
|
|
||||||
#include <Interpreters/convertFieldToType.h>
|
#include <Interpreters/convertFieldToType.h>
|
||||||
#include <Interpreters/evaluateConstantExpression.h>
|
#include <Interpreters/evaluateConstantExpression.h>
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
#include <TableFunctions/ITableFunction.h>
|
#include <TableFunctions/ITableFunction.h>
|
||||||
#include <TableFunctions/TableFunctionFactory.h>
|
#include <TableFunctions/TableFunctionFactory.h>
|
||||||
#include <TableFunctions/TableFunctionViewIfPermitted.h>
|
#include <TableFunctions/TableFunctionViewIfPermitted.h>
|
||||||
#include <TableFunctions/parseColumnsListForTableFunction.h>
|
#include <Interpreters/parseColumnsListForTableFunction.h>
|
||||||
#include <Interpreters/evaluateConstantExpression.h>
|
#include <Interpreters/evaluateConstantExpression.h>
|
||||||
#include "registerTableFunctions.h"
|
#include "registerTableFunctions.h"
|
||||||
|
|
||||||
|
@ -0,0 +1,16 @@
|
|||||||
|
x UInt8
|
||||||
|
y Nullable(String)
|
||||||
|
z IPv4
|
||||||
|
x UInt8
|
||||||
|
y Nullable(String)
|
||||||
|
z IPv4
|
||||||
|
x Nullable(UInt32)
|
||||||
|
x Array(UInt32)
|
||||||
|
x Map(String, String)
|
||||||
|
c1 UInt8
|
||||||
|
c2 Nullable(String)
|
||||||
|
c3 IPv4
|
||||||
|
x UInt8
|
||||||
|
y Nullable(String)
|
||||||
|
z IPv4
|
||||||
|
x Nullable(UInt32)
|
10
tests/queries/0_stateless/02383_schema_inference_hints.sql
Normal file
10
tests/queries/0_stateless/02383_schema_inference_hints.sql
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
-- Tags: no-fasttest
|
||||||
|
desc format(JSONEachRow, '{"x" : 1, "y" : "String", "z" : "0.0.0.0" }') settings schema_inference_hints='x UInt8, z IPv4';
|
||||||
|
desc format(JSONEachRow, '{"x" : 1, "y" : "String"}\n{"z" : "0.0.0.0", "y" : "String2"}\n{"x" : 2}') settings schema_inference_hints='x UInt8, z IPv4';
|
||||||
|
desc format(JSONEachRow, '{"x" : null}') settings schema_inference_hints='x Nullable(UInt32)';
|
||||||
|
desc format(JSONEachRow, '{"x" : []}') settings schema_inference_hints='x Array(UInt32)';
|
||||||
|
desc format(JSONEachRow, '{"x" : {}}') settings schema_inference_hints='x Map(String, String)';
|
||||||
|
|
||||||
|
desc format(CSV, '1,"String","0.0.0.0"') settings schema_inference_hints='c1 UInt8, c3 IPv4';
|
||||||
|
desc format(CSV, '1,"String","0.0.0.0"') settings schema_inference_hints='x UInt8, z IPv4', column_names_for_schema_inference='x, y, z';
|
||||||
|
desc format(CSV, '\\N') settings schema_inference_hints='x Nullable(UInt32)', column_names_for_schema_inference='x';
|
Loading…
Reference in New Issue
Block a user