mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 00:22:29 +00:00
Merge pull request #49536 from ClickHouse/propa
Make input_format_parquet_preserve_order imply !parallelize_output_from_storages
This commit is contained in:
commit
c23600fb56
@ -809,6 +809,14 @@ bool FormatFactory::checkIfOutputFormatPrefersLargeBlocks(const String & name) c
|
||||
return target.prefers_large_blocks;
|
||||
}
|
||||
|
||||
bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const
|
||||
{
|
||||
if (name == "Parquet" && context->getSettingsRef().input_format_parquet_preserve_order)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void FormatFactory::checkFormatName(const String & name) const
|
||||
{
|
||||
auto it = dict.find(name);
|
||||
|
@ -250,6 +250,8 @@ public:
|
||||
bool checkIfFormatHasAnySchemaReader(const String & name) const;
|
||||
bool checkIfOutputFormatPrefersLargeBlocks(const String & name) const;
|
||||
|
||||
bool checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const;
|
||||
|
||||
void registerAdditionalInfoForSchemaCacheGetter(const String & name, AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter);
|
||||
String getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional<FormatSettings> & format_settings_ = std::nullopt);
|
||||
|
||||
|
@ -137,7 +137,7 @@ void IStorage::read(
|
||||
/// parallelize processing if not yet
|
||||
const size_t output_ports = pipe.numOutputPorts();
|
||||
const bool parallelize_output = context->getSettingsRef().parallelize_output_from_storages;
|
||||
if (parallelize_output && parallelizeOutputAfterReading() && output_ports > 0 && output_ports < num_streams)
|
||||
if (parallelize_output && parallelizeOutputAfterReading(context) && output_ports > 0 && output_ports < num_streams)
|
||||
pipe.resize(num_streams);
|
||||
|
||||
readFromPipe(query_plan, std::move(pipe), column_names, storage_snapshot, query_info, context, getName());
|
||||
|
@ -377,7 +377,7 @@ private:
|
||||
/// This is enabled by default, but in some cases shouldn't be done.
|
||||
/// For example, when you read from system.numbers instead of system.numbers_mt,
|
||||
/// you still expect the data to be processed sequentially.
|
||||
virtual bool parallelizeOutputAfterReading() const { return true; }
|
||||
virtual bool parallelizeOutputAfterReading(ContextPtr) const { return true; }
|
||||
|
||||
public:
|
||||
/// Other version of read which adds reading step to query plan.
|
||||
|
@ -76,7 +76,7 @@ public:
|
||||
|
||||
/// FIXME: processing after reading from dictionaries are not parallelized due to some bug:
|
||||
/// count() can return wrong result, see test_dictionaries_redis/test_long.py::test_redis_dict_long
|
||||
bool parallelizeOutputAfterReading() const override { return false; }
|
||||
bool parallelizeOutputAfterReading(ContextPtr) const override { return false; }
|
||||
|
||||
std::shared_ptr<const IDictionary> getDictionary() const;
|
||||
|
||||
|
@ -412,6 +412,11 @@ bool StorageFile::prefersLargeBlocks() const
|
||||
return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(format_name);
|
||||
}
|
||||
|
||||
bool StorageFile::parallelizeOutputAfterReading(ContextPtr context) const
|
||||
{
|
||||
return FormatFactory::instance().checkParallelizeOutputAfterReading(format_name, context);
|
||||
}
|
||||
|
||||
StorageFile::StorageFile(int table_fd_, CommonArguments args)
|
||||
: StorageFile(args)
|
||||
{
|
||||
|
@ -75,6 +75,8 @@ public:
|
||||
|
||||
bool prefersLargeBlocks() const override;
|
||||
|
||||
bool parallelizeOutputAfterReading(ContextPtr context) const override;
|
||||
|
||||
bool supportsPartitionBy() const override { return true; }
|
||||
|
||||
ColumnsDescription getTableStructureFromFileDescriptor(ContextPtr context);
|
||||
|
@ -42,7 +42,7 @@ public:
|
||||
std::make_shared<NullSource>(storage_snapshot->getSampleBlockForColumns(column_names)));
|
||||
}
|
||||
|
||||
bool parallelizeOutputAfterReading() const override { return false; }
|
||||
bool parallelizeOutputAfterReading(ContextPtr) const override { return false; }
|
||||
|
||||
bool supportsParallelInsert() const override { return true; }
|
||||
|
||||
|
@ -1008,6 +1008,11 @@ bool StorageS3::prefersLargeBlocks() const
|
||||
return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(configuration.format);
|
||||
}
|
||||
|
||||
bool StorageS3::parallelizeOutputAfterReading(ContextPtr context) const
|
||||
{
|
||||
return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration.format, context);
|
||||
}
|
||||
|
||||
Pipe StorageS3::read(
|
||||
const Names & column_names,
|
||||
const StorageSnapshotPtr & storage_snapshot,
|
||||
|
@ -238,7 +238,7 @@ private:
|
||||
* It sends HTTP GET to server when select is called and
|
||||
* HTTP PUT when insert is called.
|
||||
*/
|
||||
class StorageS3 : public IStorage, WithContext
|
||||
class StorageS3 : public IStorage
|
||||
{
|
||||
public:
|
||||
struct Configuration : public StatelessTableEngineConfiguration
|
||||
@ -366,6 +366,8 @@ private:
|
||||
|
||||
bool prefersLargeBlocks() const override;
|
||||
|
||||
bool parallelizeOutputAfterReading(ContextPtr context) const override;
|
||||
|
||||
static std::optional<ColumnsDescription> tryGetColumnsFromCache(
|
||||
const KeysWithInfo::const_iterator & begin,
|
||||
const KeysWithInfo::const_iterator & end,
|
||||
|
@ -600,6 +600,11 @@ bool IStorageURLBase::prefersLargeBlocks() const
|
||||
return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(format_name);
|
||||
}
|
||||
|
||||
bool IStorageURLBase::parallelizeOutputAfterReading(ContextPtr context) const
|
||||
{
|
||||
return FormatFactory::instance().checkParallelizeOutputAfterReading(format_name, context);
|
||||
}
|
||||
|
||||
Pipe IStorageURLBase::read(
|
||||
const Names & column_names,
|
||||
const StorageSnapshotPtr & storage_snapshot,
|
||||
|
@ -102,6 +102,8 @@ protected:
|
||||
|
||||
bool prefersLargeBlocks() const override;
|
||||
|
||||
bool parallelizeOutputAfterReading(ContextPtr context) const override;
|
||||
|
||||
private:
|
||||
virtual Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0;
|
||||
|
||||
|
@ -40,7 +40,7 @@ public:
|
||||
size_t max_block_size,
|
||||
size_t num_streams) override;
|
||||
|
||||
bool parallelizeOutputAfterReading() const override { return false; }
|
||||
bool parallelizeOutputAfterReading(ContextPtr) const override { return false; }
|
||||
|
||||
bool hasEvenlyDistributedRead() const override { return true; }
|
||||
bool isSystemStorage() const override { return true; }
|
||||
|
@ -30,7 +30,7 @@ public:
|
||||
size_t max_block_size,
|
||||
size_t num_streams) override;
|
||||
|
||||
bool parallelizeOutputAfterReading() const override { return false; }
|
||||
bool parallelizeOutputAfterReading(ContextPtr) const override { return false; }
|
||||
|
||||
bool isSystemStorage() const override { return true; }
|
||||
|
||||
|
@ -31,7 +31,7 @@ public:
|
||||
size_t max_block_size,
|
||||
size_t num_streams) override;
|
||||
|
||||
bool parallelizeOutputAfterReading() const override { return false; }
|
||||
bool parallelizeOutputAfterReading(ContextPtr) const override { return false; }
|
||||
|
||||
bool hasEvenlyDistributedRead() const override { return true; }
|
||||
bool isSystemStorage() const override { return true; }
|
||||
|
@ -0,0 +1,12 @@
|
||||
0
|
||||
1
|
||||
2
|
||||
(Expression)
|
||||
ExpressionTransform
|
||||
(ReadFromStorage)
|
||||
File 0 → 1
|
||||
(Expression)
|
||||
ExpressionTransform × 2
|
||||
(ReadFromStorage)
|
||||
Resize 1 → 2
|
||||
File 0 → 1
|
16
tests/queries/0_stateless/02725_parquet_preserve_order.sh
Executable file
16
tests/queries/0_stateless/02725_parquet_preserve_order.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-fasttest
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
|
||||
# This file has a row group with 2 rows, then a row group with 1 row.
|
||||
# It'll be read into two blocks. The first block will sleep 2x longer than the second.
|
||||
# So reordering is very likely if the order-preservation doesn't work.
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "select number+sleepEachRow(3) from file('$CURDIR/data_parquet/02725_data.parquet') settings input_format_parquet_preserve_order=1"
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "explain pipeline select number+sleepEachRow(3) from file('$CURDIR/data_parquet/02725_data.parquet') settings input_format_parquet_preserve_order=1, max_threads=2"
|
||||
$CLICKHOUSE_LOCAL -q "explain pipeline select number+sleepEachRow(3) from file('$CURDIR/data_parquet/02725_data.parquet') settings input_format_parquet_preserve_order=0, parallelize_output_from_storages=1, max_threads=2"
|
BIN
tests/queries/0_stateless/data_parquet/02725_data.parquet
Normal file
BIN
tests/queries/0_stateless/data_parquet/02725_data.parquet
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user