Fixes due to review

yariks5s 2024-07-10 17:44:06 +00:00
parent d7f08ffdb7
commit a751719a33
25 changed files with 152 additions and 66 deletions

View File

@@ -5591,3 +5591,9 @@ Default value: `10000000`.
 Minimal size of block to compress in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached.
 
 Default value: `1GiB`.
+
+## use_hive_partitioning
+
+Allows the usage of Hive-style partitioning in queries. When enabled, ClickHouse interprets and maintains table partitions in a way that is consistent with the Hive partitioning scheme, which is commonly used in Hadoop ecosystems.
+
+Default value: `0`.
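
For illustration, a minimal usage sketch of the new setting (the `city=Berlin` layout and file names below are hypothetical, not part of this commit):

``` sql
SET use_hive_partitioning = 1;

-- `_city` is a virtual column inferred from the `city=...` path segment;
-- its value is filled in per file from that file's own path.
SELECT _city, count(*)
FROM file('data/city=Berlin/*.parquet')
GROUP BY _city;
```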

View File

@@ -198,6 +198,23 @@ SELECT count(*) FROM file('big_dir/**/file002', 'CSV', 'name String, value UInt3
 - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`.
 - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`.
+
+## Hive-style partitioning {#hive-style-partitioning}
+
+When the setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns based on Hive-style partitioning if the path has the appropriate structure.
+
+**Example**
+
+Use a virtual column created with Hive-style partitioning:
+
+``` sql
+SET use_hive_partitioning = 1;
+SELECT _specified_column FROM file('/specified_column=specified_data/file.txt');
+```
+
+``` reference
+specified_data
+```
 
 ## Settings {#settings}
 
 - [engine_file_empty_if_not_exists](/docs/en/operations/settings/settings.md#engine-file-empty_if-not-exists) - allows to select empty data from a file that doesn't exist. Disabled by default.

View File

@@ -99,6 +99,23 @@ FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name Strin
 - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`.
 - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`.
+
+## Hive-style partitioning {#hive-style-partitioning}
+
+When the setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns based on Hive-style partitioning if the path has the appropriate structure.
+
+**Example**
+
+Use a virtual column created with Hive-style partitioning:
+
+``` sql
+SET use_hive_partitioning = 1;
+SELECT _specified_column FROM hdfs('hdfs://hdfs1:9000/specified_column=specified_data/file.txt');
+```
+
+``` reference
+specified_data
+```
 
 ## Storage Settings {#storage-settings}
 
 - [hdfs_truncate_on_insert](/docs/en/operations/settings/settings.md#hdfs_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default.

View File

@@ -274,6 +274,23 @@ FROM s3(
 - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. In case of archive shows uncompressed file size of the file inside the archive.
 - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`.
+
+## Hive-style partitioning {#hive-style-partitioning}
+
+When the setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns based on Hive-style partitioning if the path has the appropriate structure.
+
+**Example**
+
+Use a virtual column created with Hive-style partitioning:
+
+``` sql
+SET use_hive_partitioning = 1;
+SELECT _specified_column FROM s3('s3://bucket/specified_column=specified_data/file.txt');
+```
+
+``` reference
+specified_data
+```
 
 ## Storage Settings {#storage-settings}
 
 - [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default.

View File

@@ -1308,8 +1308,7 @@ try
     SingleReadBufferIterator read_buffer_iterator(std::move(file));
-    std::string sample_string;
-    schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, sample_string, context_const);
+    schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, context_const);
 }
 else
 {

View File

@@ -94,7 +94,6 @@ std::pair<ColumnsDescription, String> readSchemaFromFormatImpl(
     std::optional<String> format_name,
     const std::optional<FormatSettings> & format_settings,
     IReadBufferIterator & read_buffer_iterator,
-    std::string & sample_path,
     const ContextPtr & context)
 try
 {
@@ -144,10 +143,6 @@ try
     {
         iterator_data = read_buffer_iterator.next();
 
-        /// Extracting the File path for hive-style partitioning
-        if (sample_path.empty())
-            sample_path = read_buffer_iterator.getLastFilePath();
-
         /// Read buffer iterator can determine the data format if it's unknown.
         /// For example by scanning schema cache or by finding new file with format extension.
         if (!format_name && iterator_data.format_name)
@@ -541,19 +536,17 @@ ColumnsDescription readSchemaFromFormat(
     const String & format_name,
     const std::optional<FormatSettings> & format_settings,
     IReadBufferIterator & read_buffer_iterator,
-    std::string & sample_path,
     const ContextPtr & context)
 {
-    return readSchemaFromFormatImpl(format_name, format_settings, read_buffer_iterator, sample_path, context).first;
+    return readSchemaFromFormatImpl(format_name, format_settings, read_buffer_iterator, context).first;
 }
 
 std::pair<ColumnsDescription, String> detectFormatAndReadSchema(
     const std::optional<FormatSettings> & format_settings,
     IReadBufferIterator & read_buffer_iterator,
-    std::string & sample_path,
     const ContextPtr & context)
 {
-    return readSchemaFromFormatImpl(std::nullopt, format_settings, read_buffer_iterator, sample_path, context);
+    return readSchemaFromFormatImpl(std::nullopt, format_settings, read_buffer_iterator, context);
 }
 
 SchemaCache::Key getKeyForSchemaCache(

View File

@@ -122,7 +122,6 @@ ColumnsDescription readSchemaFromFormat(
     const String & format_name,
     const std::optional<FormatSettings> & format_settings,
     IReadBufferIterator & read_buffer_iterator,
-    std::string & sample_path,
     const ContextPtr & context);
 
 /// Try to detect the format of the data and it's schema.
@@ -132,7 +131,6 @@ ColumnsDescription readSchemaFromFormat(
 std::pair<ColumnsDescription, String> detectFormatAndReadSchema(
     const std::optional<FormatSettings> & format_settings,
     IReadBufferIterator & read_buffer_iterator,
-    std::string & sample_path,
     const ContextPtr & context);
 
 SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context);

View File

@@ -445,7 +445,7 @@ StorageHive::StorageHive(
     storage_metadata.partition_key = KeyDescription::getKeyFromAST(partition_by_ast, storage_metadata.columns, getContext());
 
     setInMemoryMetadata(storage_metadata);
-    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns()));
+    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), getContext()));
 }
 
 void StorageHive::lazyInitialize()

View File

@@ -89,9 +89,9 @@ public:
     {
         ConfigurationPtr configuration = base_configuration->clone();
         configuration->setPaths(metadata->getDataFiles());
-        std::string sample_string;
+        std::string sample_path;
         return Storage::resolveSchemaFromData(
-            object_storage_, configuration, format_settings_, sample_string, local_context);
+            object_storage_, configuration, format_settings_, sample_path, local_context);
     }
 }

View File

@@ -43,7 +43,8 @@ std::string StorageObjectStorage::getPathSample(StorageInMemoryMetadata metadata
         {}, // predicate
         metadata.getColumns().getAll(), // virtual_columns
         nullptr, // read_keys
-        {} // file_progress_callback
+        {}, // file_progress_callback
+        true // override_settings_for_hive_partitioning
     );
 
     if (auto file = file_iterator->next(0))
@@ -86,7 +87,7 @@ StorageObjectStorage::StorageObjectStorage(
     else if (!context->getSettings().use_hive_partitioning)
         sample_path = "";
 
-    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), sample_path));
+    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context, sample_path));
     setInMemoryMetadata(metadata);
 }
@@ -396,7 +397,8 @@ ColumnsDescription StorageObjectStorage::resolveSchemaFromData(
 {
     ObjectInfos read_keys;
     auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context);
-    return readSchemaFromFormat(configuration->format, format_settings, *iterator, sample_path, context);
+    sample_path = iterator->getLastFilePath();
+    return readSchemaFromFormat(configuration->format, format_settings, *iterator, context);
 }
 
 std::string StorageObjectStorage::resolveFormatFromData(
@@ -408,7 +410,8 @@ std::string StorageObjectStorage::resolveFormatFromData(
 {
     ObjectInfos read_keys;
     auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context);
-    return detectFormatAndReadSchema(format_settings, *iterator, sample_path, context).second;
+    sample_path = iterator->getLastFilePath();
+    return detectFormatAndReadSchema(format_settings, *iterator, context).second;
 }
 
 std::pair<ColumnsDescription, std::string> StorageObjectStorage::resolveSchemaAndFormatFromData(
@@ -420,7 +423,8 @@ std::pair<ColumnsDescription, std::string> StorageObjectStorage::resolveSchemaAn
 {
     ObjectInfos read_keys;
     auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context);
-    auto [columns, format] = detectFormatAndReadSchema(format_settings, *iterator, sample_path, context);
+    sample_path = iterator->getLastFilePath();
+    auto [columns, format] = detectFormatAndReadSchema(format_settings, *iterator, context);
     configuration->format = format;
     return std::pair(columns, format);
 }

View File

@@ -41,7 +41,7 @@ StorageObjectStorageCluster::StorageObjectStorageCluster(
     metadata.setColumns(columns);
     metadata.setConstraints(constraints_);
 
-    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns()));
+    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context_));
     setInMemoryMetadata(metadata);
 }

View File

@@ -105,7 +105,8 @@ std::shared_ptr<StorageObjectStorageSource::IIterator> StorageObjectStorageSourc
     const ActionsDAG::Node * predicate,
     const NamesAndTypesList & virtual_columns,
     ObjectInfos * read_keys,
-    std::function<void(FileProgress)> file_progress_callback)
+    std::function<void(FileProgress)> file_progress_callback,
+    bool override_settings_for_hive_partitioning)
 {
     if (distributed_processing)
         return std::make_shared<ReadTaskIterator>(
@@ -122,11 +123,14 @@
     std::unique_ptr<IIterator> iterator;
 
     if (configuration->isPathWithGlobs())
     {
+        bool throw_on_zero_files_match = settings.throw_on_zero_files_match;
+        if (override_settings_for_hive_partitioning)
+            throw_on_zero_files_match = false;
         /// Iterate through disclosed globs and make a source for each file
         iterator = std::make_unique<GlobIterator>(
             object_storage, configuration, predicate, virtual_columns,
             local_context, is_archive ? nullptr : read_keys, settings.list_object_keys_size,
-            settings.throw_on_zero_files_match, file_progress_callback);
+            throw_on_zero_files_match, file_progress_callback);
     }
     else
     {
@@ -204,7 +208,8 @@ Chunk StorageObjectStorageSource::generate()
             .size = object_info->isArchive() ? object_info->fileSizeInArchive() : object_info->metadata->size_bytes,
             .filename = &filename,
             .last_modified = object_info->metadata->last_modified,
-        }, object_info->getPath());
+            .hive_partitioning_path = object_info->getPath(),
+        });
 
     const auto & partition_columns = configuration->getPartitionColumns();
     if (!partition_columns.empty() && chunk_size && chunk.hasColumns())

View File

@@ -58,7 +58,8 @@ public:
         const ActionsDAG::Node * predicate,
         const NamesAndTypesList & virtual_columns,
         ObjectInfos * read_keys,
-        std::function<void(FileProgress)> file_progress_callback = {});
+        std::function<void(FileProgress)> file_progress_callback = {},
+        bool override_settings_for_hive_partitioning = false);
 
     static std::string getUniqueStoragePathIdentifier(
         const Configuration & configuration,

View File

@@ -168,7 +168,7 @@ StorageObjectStorageQueue::StorageObjectStorageQueue(
     storage_metadata.setColumns(columns);
     storage_metadata.setConstraints(constraints_);
     storage_metadata.setComment(comment);
-    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns()));
+    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_));
     setInMemoryMetadata(storage_metadata);
 
     LOG_INFO(log, "Using zookeeper path: {}", zk_path.string());

View File

@@ -52,6 +52,7 @@
 #include <Common/logger_useful.h>
 #include <Common/ProfileEvents.h>
 #include <Common/re2.h>
+#include "Formats/FormatSettings.h"
 #include <Formats/SchemaInferenceUtils.h>
 #include <QueryPipeline/Pipe.h>
@@ -880,11 +881,10 @@ std::pair<ColumnsDescription, String> StorageFile::getTableStructureAndFormatFro
     auto read_buffer_iterator = SingleReadBufferIterator(std::move(read_buf));
 
     ColumnsDescription columns;
-    std::string sample_path;
     if (format)
-        columns = readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context);
+        columns = readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context);
     else
-        std::tie(columns, format) = detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context);
+        std::tie(columns, format) = detectFormatAndReadSchema(format_settings, read_buffer_iterator, context);
 
     peekable_read_buffer_from_fd = read_buffer_iterator.releaseBuffer();
     if (peekable_read_buffer_from_fd)
@@ -929,21 +929,20 @@ std::pair<ColumnsDescription, String> StorageFile::getTableStructureAndFormatFro
     }
 
-    std::string sample_path;
     if (archive_info)
     {
         ReadBufferFromArchiveIterator read_buffer_iterator(*archive_info, format, format_settings, context);
         if (format)
-            return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context), *format};
+            return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format};
 
-        return detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context);
+        return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context);
     }
 
     ReadBufferFromFileIterator read_buffer_iterator(paths, format, compression_method, format_settings, context);
     if (format)
-        return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context), *format};
+        return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format};
 
-    return detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context);
+    return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context);
 }
 
 ColumnsDescription StorageFile::getTableStructureFromFile(
@@ -1102,7 +1101,7 @@ void StorageFile::setStorageMetadata(CommonArguments args)
     std::string path_for_virtuals;
     if (args.getContext()->getSettingsRef().use_hive_partitioning && !paths.empty())
         path_for_virtuals = paths[0];
-    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), path_for_virtuals, format_settings.value_or(FormatSettings{})));
+    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), args.getContext(), path_for_virtuals, format_settings.value_or(FormatSettings{})));
 }
@@ -1456,7 +1455,8 @@ Chunk StorageFileSource::generate()
         .size = current_file_size,
         .filename = (filename_override.has_value() ? &filename_override.value() : nullptr),
         .last_modified = current_file_last_modified,
-    }, hive_partitioning_path);
+        .hive_partitioning_path = hive_partitioning_path,
+    });
 
     return chunk;
 }

View File

@@ -61,7 +61,7 @@ StorageFileCluster::StorageFileCluster(
     storage_metadata.setConstraints(constraints_);
     setInMemoryMetadata(storage_metadata);
 
-    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns()));
+    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context));
 }
 
 void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context)

View File

@@ -99,6 +99,17 @@ static ConnectionTimeouts getHTTPTimeouts(ContextPtr context)
     return ConnectionTimeouts::getHTTPTimeouts(context->getSettingsRef(), context->getServerSettings().keep_alive_timeout);
 }
 
+String getSampleURI(String uri, ContextPtr context)
+{
+    if (urlWithGlobs(uri))
+    {
+        auto uris = parseRemoteDescription(uri, 0, uri.size(), ',', context->getSettingsRef().glob_expansion_max_elements);
+        if (!uris.empty())
+            return uris[0];
+    }
+    return uri;
+}
+
 IStorageURLBase::IStorageURLBase(
     const String & uri_,
     const ContextPtr & context_,
@@ -155,8 +166,8 @@ IStorageURLBase::IStorageURLBase(
     std::string uri_for_partitioning;
     if (context_->getSettingsRef().use_hive_partitioning)
-        uri_for_partitioning = uri;
+        uri_for_partitioning = getSampleURI(uri, context_);
 
-    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), uri_for_partitioning, format_settings.value_or(FormatSettings{})));
+    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_, uri_for_partitioning, format_settings.value_or(FormatSettings{})));
 }
@@ -425,7 +436,8 @@ Chunk StorageURLSource::generate()
     {
         .path = curr_uri.getPath(),
         .size = current_file_size,
-    }, hive_partitioning_path);
+        .hive_partitioning_path = hive_partitioning_path,
+    });
     return chunk;
 }
@@ -959,10 +971,9 @@ std::pair<ColumnsDescription, String> IStorageURLBase::getTableStructureAndForma
         urls_to_check = {uri};
 
     ReadBufferIterator read_buffer_iterator(urls_to_check, format, compression_method, headers, format_settings, context);
-    std::string sample_path;
     if (format)
-        return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context), *format};
+        return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format};
 
-    return detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context);
+    return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context);
 }
 
 ColumnsDescription IStorageURLBase::getTableStructureFromData(
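
The `getSampleURI` helper added above means that for globbed URLs the virtual partition columns are derived from the first expanded URI instead of the raw glob string. A hedged sketch of the intended behavior (URL and layout hypothetical):

``` sql
SET use_hive_partitioning = 1;

-- The {eu,us} glob expands to two URLs; the `_region` virtual column is
-- discovered from the first expansion, and each row's value comes from
-- the path of the file it was actually read from.
SELECT _region, count(*)
FROM url('http://localhost:11111/data/region={eu,us}/part-0.parquet')
GROUP BY _region;
```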

View File

@@ -75,7 +75,7 @@ StorageURLCluster::StorageURLCluster(
     storage_metadata.setConstraints(constraints_);
     setInMemoryMetadata(storage_metadata);
 
-    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns()));
+    setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context));
 }
 
 void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context)

View File

@@ -39,10 +39,13 @@
 #include <Common/re2.h>
 #include <Common/typeid_cast.h>
 #include <Formats/SchemaInferenceUtils.h>
+#include <Formats/EscapingRuleUtils.h>
+#include <Formats/FormatFactory.h>
 #include "Functions/FunctionsLogical.h"
 #include "Functions/IFunction.h"
 #include "Functions/IFunctionAdaptors.h"
 #include "Functions/indexHint.h"
+#include <Interpreters/convertFieldToType.h>
 #include <Parsers/makeASTForLogicalFunction.h>
 #include <Columns/ColumnSet.h>
 #include <Functions/FunctionHelpers.h>
@@ -116,7 +119,7 @@ NameSet getVirtualNamesForFileLikeStorage()
     return {"_path", "_file", "_size", "_time"};
 }
 
-std::map<std::string, std::string> parseFromPath(const std::string& path)
+std::map<std::string, std::string> parseHivePartitioningKeysAndValues(const std::string& path)
 {
     std::string pattern = "/([^/]+)=([^/]+)";
     re2::StringPiece input_piece(path);
@@ -128,7 +131,7 @@ std::map<std::string, std::string> parseFromPath(const std::string& path)
     return key_values;
 }
 
-VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, std::string path, FormatSettings settings)
+VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, const ContextPtr & context, std::string path, std::optional<FormatSettings> format_settings_)
 {
     VirtualColumnsDescription desc;
@@ -145,13 +148,17 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription
     add_virtual("_size", makeNullable(std::make_shared<DataTypeUInt64>()));
     add_virtual("_time", makeNullable(std::make_shared<DataTypeDateTime>()));
 
-    auto map = parseFromPath(path);
-    for (const auto& item : map)
+    auto map = parseHivePartitioningKeysAndValues(path);
+    for (auto& item : map)
     {
-        auto type = tryInferDataTypeForSingleField(item.second, settings);
+        auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context);
+        auto type = tryInferDataTypeByEscapingRule(item.second, format_settings, FormatSettings::EscapingRule::Raw);
         if (type == nullptr)
             type = std::make_shared<DataTypeString>();
-        add_virtual(item.first, std::make_shared<DataTypeLowCardinality>(type));
+        if (type->canBeInsideLowCardinality())
+            add_virtual(item.first, std::make_shared<DataTypeLowCardinality>(type));
+        else
+            add_virtual(item.first, type);
     }
 
     return desc;
@@ -215,9 +222,9 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const
 void addRequestedFileLikeStorageVirtualsToChunk(
     Chunk & chunk, const NamesAndTypesList & requested_virtual_columns,
-    VirtualsForFileLikeStorage virtual_values, const std::string & hive_partitioning_path)
+    VirtualsForFileLikeStorage virtual_values)
 {
-    auto hive_map = parseFromPath(hive_partitioning_path);
+    auto hive_map = parseHivePartitioningKeysAndValues(virtual_values.hive_partitioning_path);
     for (const auto & virtual_column : requested_virtual_columns)
     {
         if (virtual_column.name == "_path")
@@ -265,7 +272,7 @@ void addRequestedFileLikeStorageVirtualsToChunk(
         auto it = hive_map.find(virtual_column.getNameInStorage());
         if (it != hive_map.end())
         {
-            chunk.addColumn(virtual_column.getTypeInStorage()->createColumnConst(chunk.getNumRows(), it->second)->convertToFullColumnIfConst());
+            chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), convertFieldToType(Field(it->second), *virtual_column.type))->convertToFullColumnIfConst());
             hive_map.erase(it);
         }
     }
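
The inference changes above (escaping-rule-based type inference plus the `canBeInsideLowCardinality()` guard) are what the updated test below exercises; as a sketch against the test's data layout:

``` sql
SET use_hive_partitioning = 1;

-- Scalar partition values are wrapped in LowCardinality; types that cannot
-- be inside LowCardinality, such as Array, are left unwrapped.
SELECT toTypeName(_array), toTypeName(_float)
FROM file('data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet');
-- Expected: Array(Int64), LowCardinality(Float64)
```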

View File

@@ -50,7 +50,11 @@ auto extractSingleValueFromBlock(const Block & block, const String & name)
 }
 
 NameSet getVirtualNamesForFileLikeStorage();
-VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, std::string path = "", FormatSettings settings = FormatSettings());
+VirtualColumnsDescription getVirtualsForFileLikeStorage(
+    const ColumnsDescription & storage_columns,
+    const ContextPtr & context,
+    std::string sample_path = "",
+    std::optional<FormatSettings> format_settings_ = std::nullopt);
 
 ActionsDAGPtr createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns);
@@ -77,13 +81,14 @@ struct VirtualsForFileLikeStorage
     std::optional<size_t> size { std::nullopt };
     const String * filename { nullptr };
     std::optional<Poco::Timestamp> last_modified { std::nullopt };
+    const String & hive_partitioning_path = "";
 };
 
-std::map<std::string, std::string> parseFromPath(const std::string& path);
-
 void addRequestedFileLikeStorageVirtualsToChunk(
     Chunk & chunk, const NamesAndTypesList & requested_virtual_columns,
-    VirtualsForFileLikeStorage virtual_values, const std::string & hive_partitioning_path = "");
+    VirtualsForFileLikeStorage virtual_values);
 }
 }

View File

@@ -85,10 +85,9 @@ ColumnsDescription TableFunctionFormat::getActualTableStructure(ContextPtr conte
     if (structure == "auto")
     {
         SingleReadBufferIterator read_buffer_iterator(std::make_unique<ReadBufferFromString>(data));
-        std::string sample_path;
         if (format == "auto")
-            return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, sample_path, context).first;
+            return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context).first;
 
-        return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, sample_path, context);
+        return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context);
     }
     return parseColumnsListFromString(structure, context);
 }
@@ -132,12 +131,11 @@ StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, Con
     String format_name = format;
     if (structure == "auto")
     {
-        std::string sample_path;
         SingleReadBufferIterator read_buffer_iterator(std::make_unique<ReadBufferFromString>(data));
         if (format_name == "auto")
-            std::tie(columns, format_name) = detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, sample_path, context);
+            std::tie(columns, format_name) = detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context);
         else
-            columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, sample_path, context);
+            columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context);
     }
     else
     {

View File

@@ -60,7 +60,10 @@ Stanley Gibson Elizabeth
 Eugenia Greer Elizabeth
 Jeffery Delgado Elizabeth
 Clara Cross Elizabeth
-Elizabeth Gordon Elizabeth
+42 2020-01-01
+[1,2,3] 42.42
+Array(Int64) LowCardinality(Float64)
+101
 1
 TESTING THE S3 PARTITIONING
 first last Elizabeth

View File

@@ -28,7 +28,13 @@ SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=E
 SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
 SELECT *, _non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
-SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;"""
+SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;
+SELECT _number, _date FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') LIMIT 1;
+SELECT _array, _float FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
+SELECT toTypeName(_array), toTypeName(_float) FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1;
+SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') WHERE _number = 42;
+"""
 
 $CLICKHOUSE_LOCAL -n -q """
 set use_hive_partitioning = 0;
@@ -59,8 +65,7 @@ SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/colum
 SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
 SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1;
-SELECT *, _non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;
-SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = _column0;"""
+SELECT *, _non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;"""
 
 $CLICKHOUSE_LOCAL -n -q """
 set use_hive_partitioning = 0;