// Mirror of https://github.com/ClickHouse/ClickHouse.git, synced 2024-12-02 12:32:04 +00:00 (271 lines, 11 KiB, C++).
#include <DataTypes/DataTypeNullable.h>
|
|
#include <DataTypes/DataTypeArray.h>
|
|
#include <DataTypes/DataTypeTuple.h>
|
|
#include <DataTypes/DataTypeMap.h>
|
|
#include <DataTypes/DataTypeLowCardinality.h>
|
|
#include <Formats/ReadSchemaUtils.h>
|
|
#include <Processors/Formats/ISchemaReader.h>
|
|
#include <Common/assert_cast.h>
|
|
#include <Interpreters/Context.h>
|
|
#include <Storages/IStorage.h>
|
|
|
|
namespace DB
|
|
{
|
|
|
|
/// Error codes used by the schema inference helpers in this file.
/// Only declarations are given here (`extern`); the definitions are provided elsewhere.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
    extern const int EMPTY_DATA_PASSED;
    extern const int ONLY_NULLS_WHILE_READING_SCHEMA;
}
|
|
|
|
static std::optional<NamesAndTypesList> getOrderedColumnsList(
|
|
const NamesAndTypesList & columns_list, const Names & columns_order_hint)
|
|
{
|
|
if (columns_list.size() != columns_order_hint.size())
|
|
return {};
|
|
|
|
std::unordered_map<String, DataTypePtr> available_columns;
|
|
for (const auto & [name, type] : columns_list)
|
|
available_columns.emplace(name, type);
|
|
|
|
NamesAndTypesList res;
|
|
for (const auto & name : columns_order_hint)
|
|
{
|
|
auto it = available_columns.find(name);
|
|
if (it == available_columns.end())
|
|
return {};
|
|
|
|
res.emplace_back(name, it->second);
|
|
}
|
|
return res;
|
|
}
|
|
|
|
bool isRetryableSchemaInferenceError(int code)
|
|
{
|
|
return code == ErrorCodes::EMPTY_DATA_PASSED || code == ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA;
|
|
}
|
|
|
|
/// Determines the table structure for data in format `format_name`.
///
///  - If the format has an external schema reader, the schema is taken from it.
///  - Else, if the format has a schema reader, buffers are requested one by one from
///    `read_buffer_iterator` and the schema is read from the first buffer that allows it.
///    The iterator may also fill the cached columns passed to it; if it returns no buffer
///    and the cached columns are not empty, they are returned as is.
///  - Otherwise BAD_ARGUMENTS is thrown.
///
/// `retry` allows to continue with the next buffer after an empty file or after an error
/// accepted by isRetryableSchemaInferenceError. The total number of rows read over all attempts
/// is limited by max_rows_to_read_for_schema_inference.
/// `buf` is assigned the buffer last returned by `read_buffer_iterator` (it may be null).
///
/// Throws CANNOT_EXTRACT_TABLE_STRUCTURE if the structure cannot be determined.
ColumnsDescription readSchemaFromFormat(
    const String & format_name,
    const std::optional<FormatSettings> & format_settings,
    ReadBufferIterator & read_buffer_iterator,
    bool retry,
    ContextPtr & context,
    std::unique_ptr<ReadBuffer> & buf)
{
    NamesAndTypesList names_and_types;
    if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name))
    {
        auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings);
        try
        {
            names_and_types = external_schema_reader->readSchema();
        }
        catch (const DB::Exception & e)
        {
            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}. You can specify the structure manually", format_name, e.message());
        }
    }
    else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name))
    {
        std::string exception_messages;
        SchemaReaderPtr schema_reader;
        size_t max_rows_to_read = format_settings ? format_settings->max_rows_to_read_for_schema_inference : context->getSettingsRef().input_format_max_rows_to_read_for_schema_inference;
        size_t iterations = 0;
        ColumnsDescription cached_columns;
        while (true)
        {
            /// Forget the reader of the previous attempt before its buffer is replaced.
            /// Otherwise, if creating the new reader throws, the catch block below would
            /// account the rows of the previous attempt once more.
            schema_reader = nullptr;

            bool is_eof = false;
            try
            {
                buf = read_buffer_iterator(cached_columns);
                if (!buf)
                    break;
                is_eof = buf->eof();
            }
            catch (...)
            {
                auto exception_message = getCurrentExceptionMessage(false);
                throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file: {}. You can specify the structure manually", format_name, exception_message);
            }
            ++iterations;

            if (is_eof)
            {
                auto exception_message = fmt::format("Cannot extract table structure from {} format file, file is empty", format_name);

                if (!retry)
                    throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "{}. You can specify the structure manually", exception_message);

                exception_messages += "\n" + exception_message;
                continue;
            }

            try
            {
                schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf, context, format_settings);
                schema_reader->setMaxRowsToRead(max_rows_to_read);
                names_and_types = schema_reader->readSchema();
                break;
            }
            catch (...)
            {
                auto exception_message = getCurrentExceptionMessage(false);
                if (schema_reader)
                {
                    /// Rows read by the failed attempt are subtracted from the remaining budget.
                    size_t rows_read = schema_reader->getNumRowsRead();
                    assert(rows_read <= max_rows_to_read);
                    /// Clamp at zero so that the budget cannot wrap around when asserts are disabled.
                    max_rows_to_read -= (rows_read < max_rows_to_read) ? rows_read : max_rows_to_read;
                    if (rows_read != 0 && max_rows_to_read == 0)
                    {
                        exception_message += "\nTo increase the maximum number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference";
                        if (iterations > 1)
                        {
                            exception_messages += "\n" + exception_message;
                            break;
                        }
                        retry = false;
                    }
                }

                if (!retry || !isRetryableSchemaInferenceError(getCurrentExceptionCode()))
                    throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}. You can specify the structure manually", format_name, exception_message);

                exception_messages += "\n" + exception_message;
            }
        }

        if (!cached_columns.empty())
            return cached_columns;

        if (names_and_types.empty())
            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "All attempts to extract table structure from files failed. Errors:{}\nYou can specify the structure manually", exception_messages);

        /// If we have "INSERT SELECT" query then try to order
        /// columns as they are ordered in table schema for formats
        /// without strict column order (like JSON and TSKV).
        /// It will allow to execute simple data loading with query
        /// "INSERT INTO table SELECT * FROM ..."
        ///
        /// Note: names_and_types is filled only by a successful readSchema() call,
        /// after which the loop is left immediately, so schema_reader is set here.
        const auto & insertion_table = context->getInsertionTable();
        if (!schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty())
        {
            auto storage = DatabaseCatalog::instance().getTable(insertion_table, context);
            auto metadata = storage->getInMemoryMetadataPtr();
            auto names_in_storage = metadata->getColumns().getNamesOfPhysical();
            auto ordered_list = getOrderedColumnsList(names_and_types, names_in_storage);
            if (ordered_list)
                names_and_types = *ordered_list;
        }
    }
    else
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference. You must specify the structure manually", format_name);

    return ColumnsDescription(names_and_types);
}
|
|
|
|
/// Overload for callers that do not need the read buffer used during schema inference:
/// a local buffer holder is passed to the main overload and dropped on return.
ColumnsDescription readSchemaFromFormat(
    const String & format_name,
    const std::optional<FormatSettings> & format_settings,
    ReadBufferIterator & read_buffer_iterator,
    bool retry,
    ContextPtr & context)
{
    std::unique_ptr<ReadBuffer> unused_buffer;
    return readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, retry, context, unused_buffer);
}
|
|
|
|
/// Rebuilds `type` so that its innermost types are passed through makeNullable.
/// Array, Tuple, Map and LowCardinality are recreated around their converted nested types,
/// an existing Nullable is replaced by the conversion of its nested type,
/// and Map keys are passed through removeNullable after conversion.
/// Returns nullptr if `type` is null or if Nothing is found anywhere inside it.
DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type)
{
    if (!type)
        return nullptr;

    WhichDataType which(type);

    if (which.isNothing())
        return nullptr;

    if (which.isNullable())
    {
        const auto * nullable_type = assert_cast<const DataTypeNullable *>(type.get());
        return makeNullableRecursivelyAndCheckForNothing(nullable_type->getNestedType());
    }

    if (which.isArray())
    {
        const auto * array_type = assert_cast<const DataTypeArray *>(type.get());
        auto nested_type = makeNullableRecursivelyAndCheckForNothing(array_type->getNestedType());
        return nested_type ? std::make_shared<DataTypeArray>(nested_type) : nullptr;
    }

    if (which.isTuple())
    {
        const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());
        const auto & elements = tuple_type->getElements();
        DataTypes nested_types;
        nested_types.reserve(elements.size());
        for (const auto & element : elements)
        {
            auto nested_type = makeNullableRecursivelyAndCheckForNothing(element);
            if (!nested_type)
                return nullptr;
            nested_types.push_back(nested_type);
        }

        /// Keep the element names of a named tuple; constructing the tuple
        /// from types only would silently turn it into an unnamed one.
        if (tuple_type->haveExplicitNames())
            return std::make_shared<DataTypeTuple>(std::move(nested_types), tuple_type->getElementNames());

        return std::make_shared<DataTypeTuple>(std::move(nested_types));
    }

    if (which.isMap())
    {
        const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
        auto key_type = makeNullableRecursivelyAndCheckForNothing(map_type->getKeyType());
        auto value_type = makeNullableRecursivelyAndCheckForNothing(map_type->getValueType());
        /// The key is still converted (to detect Nothing inside it), but Nullable is stripped from it afterwards.
        return key_type && value_type ? std::make_shared<DataTypeMap>(removeNullable(key_type), value_type) : nullptr;
    }

    if (which.isLowCarnality())
    {
        const auto * lc_type = assert_cast<const DataTypeLowCardinality *>(type.get());
        auto nested_type = makeNullableRecursivelyAndCheckForNothing(lc_type->getDictionaryType());
        return nested_type ? std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr;
    }

    return makeNullable(type);
}
|
|
|
|
/// Builds a list with the same column names as `header`, where every type is replaced by
/// the result of makeNullableRecursivelyAndCheckForNothing (which may be nullptr).
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header)
{
    NamesAndTypesList nullable_columns;
    for (const auto & [column_name, column_type] : header.getNamesAndTypesList())
        nullable_columns.emplace_back(column_name, makeNullableRecursivelyAndCheckForNothing(column_type));

    return nullable_columns;
}
|
|
|
|
/// Returns the schema cache key for a single `source`.
/// Delegates to getKeysForSchemaCache with a one-element list and takes its only key.
String getKeyForSchemaCache(
    const String & source,
    const String & format,
    const std::optional<FormatSettings> & format_settings,
    const ContextPtr & context)
{
    const Strings keys = getKeysForSchemaCache({source}, format, format_settings, context);
    return keys.front();
}
|
|
|
|
/// Composes a schema cache key of the form "<source>@@<format>@@<additional_format_info>".
static String makeSchemaCacheKey(const String & source, const String & format, const String & additional_format_info)
{
    String key = source;
    key += "@@";
    key += format;
    key += "@@";
    key += additional_format_info;
    return key;
}
|
|
|
|
/// Splits a key of the form "<source>@@<format>@@<additional_format_info>" into its parts.
/// The key is parsed from the right, so `source` itself may contain "@@".
/// If the key does not contain two separators, it is returned as `source`
/// and `format` and `additional_format_info` are left empty.
void splitSchemaCacheKey(const String & key, String & source, String & format, String & additional_format_info)
{
    constexpr size_t separator_size = 2; /// Length of "@@".

    const size_t info_separator_pos = key.rfind("@@");

    /// The format separator must end before the info separator starts, so the search begins
    /// `separator_size` characters to the left. If there is no room for it
    /// (or no info separator at all), the key is malformed.
    size_t format_separator_pos = String::npos;
    if (info_separator_pos != String::npos && info_separator_pos >= separator_size)
        format_separator_pos = key.rfind("@@", info_separator_pos - separator_size);

    if (format_separator_pos == String::npos)
    {
        source = key;
        format.clear();
        additional_format_info.clear();
        return;
    }

    additional_format_info = key.substr(info_separator_pos + separator_size);
    format = key.substr(format_separator_pos + separator_size, info_separator_pos - format_separator_pos - separator_size);
    source = key.substr(0, format_separator_pos);
}
|
|
|
|
/// Returns one schema cache key per element of `sources`, in the same order.
Strings getKeysForSchemaCache(
    const Strings & sources,
    const String & format,
    const std::optional<FormatSettings> & format_settings,
    const ContextPtr & context)
{
    /// For some formats the data schema depends on settings, so two queries to the same
    /// source can get two different schemas. To handle this, format-specific additional
    /// information is added to the cache key. For example, for the Protobuf format it is
    /// the path to the schema and the message name.
    const String additional_format_info = FormatFactory::instance().getAdditionalInfoForSchemaCache(format, context, format_settings);

    Strings cache_keys;
    cache_keys.reserve(sources.size());
    for (const auto & source : sources)
        cache_keys.push_back(makeSchemaCacheKey(source, format, additional_format_info));

    return cache_keys;
}
|
|
|
|
}
|