2020-04-28 13:55:50 +00:00
|
|
|
#include <Core/SortCursor.h>
|
|
|
|
#include <Interpreters/SortedBlocksWriter.h>
|
2021-10-16 14:03:50 +00:00
|
|
|
#include <QueryPipeline/QueryPipelineBuilder.h>
|
2021-07-22 16:05:52 +00:00
|
|
|
#include <Processors/Executors/PullingPipelineExecutor.h>
|
|
|
|
#include <Processors/Sources/SourceFromSingleChunk.h>
|
|
|
|
#include <Processors/Merges/MergingSortedTransform.h>
|
2021-10-15 20:18:20 +00:00
|
|
|
#include <Processors/Sources/TemporaryFileLazySource.h>
|
2022-09-21 12:51:46 +00:00
|
|
|
#include <Formats/TemporaryFileStreamLegacy.h>
|
2021-01-23 15:20:15 +00:00
|
|
|
#include <Disks/IVolume.h>
|
2022-08-23 10:52:38 +00:00
|
|
|
#include <Disks/TemporaryFileOnDisk.h>
|
2020-04-28 13:55:50 +00:00
|
|
|
|
2021-07-17 18:06:46 +00:00
|
|
|
|
2022-08-15 18:04:25 +00:00
|
|
|
namespace ProfileEvents
|
|
|
|
{
|
|
|
|
extern const Event ExternalJoinWritePart;
|
|
|
|
extern const Event ExternalJoinMerge;
|
|
|
|
extern const Event ExternalJoinCompressedBytes;
|
|
|
|
extern const Event ExternalJoinUncompressedBytes;
|
|
|
|
extern const Event ExternalProcessingCompressedBytesTotal;
|
|
|
|
extern const Event ExternalProcessingUncompressedBytesTotal;
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace CurrentMetrics
|
|
|
|
{
|
|
|
|
extern const Metric TemporaryFilesForJoin;
|
|
|
|
}
|
|
|
|
|
2020-04-28 13:55:50 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
Fix SIGSEGV in SortedBlocksWriter in case of empty block
CI found one issue [1].
Here is the stack trace for invalid read:
<details>
<summary>stack trace</summary>
```
0: DB::TemporaryFileLazySource::TemporaryFileLazySource(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, DB::Block const&) [inlined] std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__is_long(this="") const at string:1445:22
1: DB::TemporaryFileLazySource::TemporaryFileLazySource(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, DB::Block const&) [inlined] std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::basic_string(this="", __str="") at string:1927
2: DB::TemporaryFileLazySource::TemporaryFileLazySource(this=0x00007f3aec105f58, path_="", header_=0x00007f38ffd93b40) at TemporaryFileLazySource.cpp:11
3: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr<Poco::TemporaryFile, std::__1::default_delete<Poco::TemporaryFile> > const&) const [inlined] DB::TemporaryFileLazySource* std::__1::construct_at<DB::TemporaryFileLazySource, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, DB::Block, DB::TemporaryFileLazySource*>(__args=0x00007f38ffd91560) at construct_at.h:38:50
4: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr<Poco::TemporaryFile, std::__1::default_delete<Poco::TemporaryFile> > const&) const [inlined] void std::__1::allocator_traits<std::__1::allocator<DB::TemporaryFileLazySource> >::construct<DB::TemporaryFileLazySource, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, DB::Block, void, void>(__args=0x00007f38ffd91560) at allocator_traits.h:298
5: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr<Poco::TemporaryFile, std::__1::default_delete<Poco::TemporaryFile> > const&) const [inlined] std::__1::__shared_ptr_emplace<DB::TemporaryFileLazySource, std::__1::allocator<DB::TemporaryFileLazySource> >::__shared_ptr_emplace<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, DB::Block>(this=0x00007f3aec105f40, __args=0x00007f38ffd91560) at shared_ptr.h:293
6: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr<Poco::TemporaryFile, std::__1::default_delete<Poco::TemporaryFile> > const&) const [inlined] std::__1::shared_ptr<DB::TemporaryFileLazySource> std::__1::allocate_shared<DB::TemporaryFileLazySource, std::__1::allocator<DB::TemporaryFileLazySource>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, DB::Block, void>(__args=<unavailable>, __args=<unavailable>) at shared_ptr.h:954
7: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr<Poco::TemporaryFile, std::__1::default_delete<Poco::TemporaryFile> > const&) const [inlined] std::__1::shared_ptr<DB::TemporaryFileLazySource> std::__1::make_shared<DB::TemporaryFileLazySource, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, DB::Block, void>(__args=<unavailable>, __args=<unavailable>) at shared_ptr.h:963
8: DB::SortedBlocksWriter::streamFromFile(this=<unavailable>, file=<unavailable>) const at SortedBlocksWriter.cpp:238
9: DB::SortedBlocksWriter::premerge(this=<unavailable>) at SortedBlocksWriter.cpp:209:32
```
</details>
[1]: https://s3.amazonaws.com/clickhouse-test-reports/41046/adea92f847373d1fcfd733d8979c63024f9b80bf/stress_test__asan_.html
So the problem here is that there was an empty unique_ptr<> reference to the
temporary file, because of an empty block that was accepted by
SortedBlocksWriter::insert(). However, insert() itself is not the problem; the
problem is premerge(), which steals blocks from insert() and does not check
that they contain any rows. Such a check exists in
SortedBlocksWriter::flush(), and in that case the temporary file is not
created.
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-09-09 17:23:45 +00:00
|
|
|
/// Error codes thrown from this file; they are only declared here.
namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}
|
|
|
|
|
2020-04-28 13:55:50 +00:00
|
|
|
namespace
|
|
|
|
{
|
2022-08-23 11:19:34 +00:00
|
|
|
|
2022-08-23 10:52:38 +00:00
|
|
|
/// Creates a temporary file on `disk`, passes `pipeline` together with `header` and `codec`
/// to TemporaryFileStreamLegacy::write() for that file's path, and accounts the reported
/// sizes in the external-processing and external-join profile events.
/// Returns the holder of the temporary file.
TemporaryFileOnDiskHolder flushToFile(const DiskPtr & disk, const Block & header, QueryPipelineBuilder pipeline, const String & codec)
{
    auto holder = std::make_unique<TemporaryFileOnDisk>(disk, CurrentMetrics::TemporaryFilesForJoin);

    const auto stat = TemporaryFileStreamLegacy::write(holder->getPath(), header, std::move(pipeline), codec);
    const auto compressed = stat.compressed_bytes;
    const auto uncompressed = stat.uncompressed_bytes;

    /// Totals shared with other kinds of external processing.
    ProfileEvents::increment(ProfileEvents::ExternalProcessingCompressedBytesTotal, compressed);
    ProfileEvents::increment(ProfileEvents::ExternalProcessingUncompressedBytesTotal, uncompressed);

    /// Join-specific counters: bytes written and one more written part.
    ProfileEvents::increment(ProfileEvents::ExternalJoinCompressedBytes, compressed);
    ProfileEvents::increment(ProfileEvents::ExternalJoinUncompressedBytes, uncompressed);
    ProfileEvents::increment(ProfileEvents::ExternalJoinWritePart);

    return holder;
}
|
|
|
|
|
2022-08-15 16:42:50 +00:00
|
|
|
/// Executes `builder` and stores every non-empty block it produces in its own temporary file.
/// `callback` is invoked with each non-empty block before that block is written.
/// Returns the created files in the order the blocks were pulled.
SortedBlocksWriter::SortedFiles flushToManyFiles(const DiskPtr & disk, const Block & header, QueryPipelineBuilder builder,
                                                 const String & codec, std::function<void(const Block &)> callback = [](const Block &){})
{
    std::vector<TemporaryFileOnDiskHolder> written;

    auto merged_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder));
    PullingPipelineExecutor puller(merged_pipeline);

    for (Block current; puller.pull(current);)
    {
        /// Blocks without rows are skipped: no callback and no file for them.
        if (current.rows() == 0)
            continue;

        callback(current);

        /// Wrap the single block into a pipeline of its own so that flushToFile() can consume it.
        Chunk single_chunk(current.getColumns(), current.rows());
        QueryPipelineBuilder single_block_pipeline;
        single_block_pipeline.init(Pipe(std::make_shared<SourceFromSingleChunk>(current.cloneEmpty(), std::move(single_chunk))));

        written.emplace_back(flushToFile(disk, header, std::move(single_block_pipeline), codec));
    }

    return written;
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Adds `block` to the in-memory list of inserted blocks.
/// If the accumulated blocks (scaled by num_streams) no longer pass the soft size check,
/// they are taken out of the shared list and flushed to a temporary file by the calling thread.
/// If nothing is flushed and the total size (inserted plus in flush) exceeds the limit,
/// the call blocks on flush_condvar.
void SortedBlocksWriter::insert(Block && block)
{
    /// Blocks without rows are ignored.
    if (block.rows() == 0)
        return;

    bool within_limits = false;
    bool must_flush = false;

    BlocksList to_flush;
    size_t rows_to_flush = 0;
    size_t bytes_to_flush = 0;
    size_t observed_flush_number = 0;

    {
        std::lock_guard lock{insert_mutex};

        /// The shared list is only modified under the lock.
        inserted_blocks.insert(std::move(block));

        const size_t total_row_count = inserted_blocks.row_count + row_count_in_flush;
        const size_t total_bytes = inserted_blocks.bytes + bytes_in_flush;

        within_limits = size_limits.softCheck(total_row_count, total_bytes);
        must_flush = !size_limits.softCheck(inserted_blocks.row_count * num_streams, inserted_blocks.bytes * num_streams);

        if (must_flush)
        {
            /// Take ownership of everything accumulated so far and account it as "in flush".
            ++flush_inflight;
            to_flush.swap(inserted_blocks.blocks);
            row_count_in_flush = total_row_count;
            bytes_in_flush = total_bytes;

            rows_to_flush = inserted_blocks.row_count;
            bytes_to_flush = inserted_blocks.bytes;
            inserted_blocks.clear();
        }
        else if (within_limits)
        {
            /// NOTE(review): the flush number is remembered only on this branch; when the limit
            /// is exceeded the value stays 0 and is compared against flush_number below - confirm intended.
            observed_flush_number = flush_number;
        }
    }

    if (!must_flush)
    {
        if (within_limits)
            return;

        /// Out of limit and nothing to flush here: wait until flush_number exceeds the observed value.
        std::unique_lock lock{insert_mutex};
        flush_condvar.wait(lock, [&]{ return observed_flush_number < flush_number; });
        return;
    }

    /// The flush itself runs without the lock.
    auto flushed = flush(to_flush);
    to_flush.clear();

    std::lock_guard lock{insert_mutex};

    sorted_files.emplace_back(std::move(flushed));
    row_count_in_flush -= rows_to_flush;
    bytes_in_flush -= bytes_to_flush;

    /// flush_number wakes up blocked inserts, flush_inflight wakes up the merge.
    ++flush_number;
    --flush_inflight;
    flush_condvar.notify_all();
}
|
|
|
|
|
|
|
|
/// Writes the non-empty blocks of `blocks` into one temporary file.
/// When there is more than one source, they are combined with MergingSortedTransform
/// (using sort_description and rows_in_block) before writing.
/// Throws LOGICAL_ERROR if no block in `blocks` has rows.
SortedBlocksWriter::TmpFilePtr SortedBlocksWriter::flush(const BlocksList & blocks) const
{
    Pipes sources;
    sources.reserve(blocks.size());

    for (const auto & block : blocks)
    {
        const auto rows = block.rows();
        if (rows == 0)
            continue;

        sources.emplace_back(std::make_shared<SourceFromSingleChunk>(block.cloneEmpty(), Chunk(block.getColumns(), rows)));
    }

    if (sources.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty block");

    QueryPipelineBuilder builder;
    builder.init(Pipe::unitePipes(std::move(sources)));

    /// A single stream needs no merging.
    if (builder.getNumStreams() > 1)
    {
        builder.addTransform(std::make_shared<MergingSortedTransform>(
            builder.getHeader(),
            builder.getNumStreams(),
            sort_description,
            rows_in_block,
            SortingQueueStrategy::Default));
    }

    return flushToFile(volume->getDisk(), sample_block, std::move(builder), codec);
}
|
|
|
|
|
2020-06-16 20:13:18 +00:00
|
|
|
/// Collects everything written so far and reduces it to at most num_files_for_merge files.
///
/// Steps:
///  1. wait until no flush is in flight, then take sorted_files and the not yet flushed blocks;
///  2. flush the remaining in-memory blocks (if any) to one more file;
///  3. while there are more than num_files_for_merge files, merge them in groups of
///     num_files_for_merge into new temporary files;
///  4. return the remaining files together with a pipe that reads all of them.
///
/// NOTE(review): the loop in step 3 assumes num_files_for_merge >= 2 (0 divides by zero,
/// 1 never reduces the number of files) - confirm that the value is validated where it is set.
SortedBlocksWriter::PremergedFiles SortedBlocksWriter::premerge()
{
    SortedFiles files;
    BlocksList blocks;

    {
        std::unique_lock lock{insert_mutex};

        /// Wait for other flushes first and only then collect the results.
        /// The wait releases the mutex, so a flush that finishes meanwhile appends its file to
        /// sorted_files; collecting before the wait would leave such files out of the merge.
        flush_condvar.wait(lock, [&]{ return !flush_inflight; });

        files.swap(sorted_files);
        blocks.swap(inserted_blocks.blocks);
        inserted_blocks.clear();
    }

    /// flush not flushed
    if (!blocks.empty())
        files.emplace_back(flush(blocks));

    Pipes pipes;
    pipes.reserve(num_files_for_merge);

    /// Merge by parts to save memory. It's possible to exchange disk I/O and memory by num_files_for_merge.
    {
        SortedFiles new_files;
        new_files.reserve(files.size() / num_files_for_merge + 1);

        while (files.size() > num_files_for_merge)
        {
            for (const auto & file : files)
            {
                pipes.emplace_back(streamFromFile(file));

                /// A group is complete when it is full or when the last file has been added.
                if (pipes.size() == num_files_for_merge || &file == &files.back())
                {
                    QueryPipelineBuilder pipeline;
                    pipeline.init(Pipe::unitePipes(std::move(pipes)));
                    pipes = Pipes();

                    if (pipeline.getNumStreams() > 1)
                    {
                        auto transform = std::make_shared<MergingSortedTransform>(
                            pipeline.getHeader(),
                            pipeline.getNumStreams(),
                            sort_description,
                            rows_in_block,
                            SortingQueueStrategy::Default);

                        pipeline.addTransform(std::move(transform));
                    }

                    new_files.emplace_back(flushToFile(volume->getDisk(), sample_block, std::move(pipeline), codec));
                }
            }

            /// The source files of this round are dropped; the merged ones become the next input.
            files.clear();
            files.swap(new_files);
        }

        for (const auto & file : files)
            pipes.emplace_back(streamFromFile(file));
    }

    return PremergedFiles{std::move(files), Pipe::unitePipes(std::move(pipes))};
}
|
2020-04-28 13:55:50 +00:00
|
|
|
|
2020-06-16 20:13:18 +00:00
|
|
|
/// Runs premerge(), merges the resulting streams into one sorted stream when there is more
/// than one, and writes every non-empty block of the result into a separate temporary file.
/// `callback` is forwarded to flushToManyFiles().
SortedBlocksWriter::SortedFiles SortedBlocksWriter::finishMerge(std::function<void(const Block &)> callback)
{
    /// Kept as a local so that the premerged files stay alive until the final files are written.
    PremergedFiles premerged = premerge();

    QueryPipelineBuilder builder;
    builder.init(std::move(premerged.pipe));

    /// The merge event is counted only when a merging step is actually added.
    if (builder.getNumStreams() > 1)
    {
        ProfileEvents::increment(ProfileEvents::ExternalJoinMerge);

        builder.addTransform(std::make_shared<MergingSortedTransform>(
            builder.getHeader(),
            builder.getNumStreams(),
            sort_description,
            rows_in_block,
            SortingQueueStrategy::Default));
    }

    return flushToManyFiles(volume->getDisk(), sample_block, std::move(builder), codec, callback);
}
|
|
|
|
|
2021-07-22 16:05:52 +00:00
|
|
|
/// Returns a pipe with a TemporaryFileLazySource built from the path of `file`
/// and the materialized sample block as header.
Pipe SortedBlocksWriter::streamFromFile(const TmpFilePtr & file) const
{
    auto lazy_source = std::make_shared<TemporaryFileLazySource>(file->path(), materializeBlock(sample_block));
    return Pipe(std::move(lazy_source));
}
|
|
|
|
|
2020-06-16 20:13:18 +00:00
|
|
|
|
|
|
|
/// Stores a non-empty `block` in the buffer.
/// While the buffered size stays below max_bytes, an empty block with the same structure is returned.
/// Otherwise (or when `block` is empty) the whole buffer is taken out and returned as one block:
/// the only buffered block, the merged result of several, or a default-constructed Block
/// if nothing was buffered.
Block SortedBlocksBuffer::exchange(Block && block)
{
    static constexpr const double reserve_coefficient = 1.2;

    Blocks drained;
    Block same_structure = block.cloneEmpty();

    {
        std::lock_guard lock(mutex);

        if (block)
        {
            current_bytes += block.bytes();
            buffer.emplace_back(std::move(block));

            /// Still below the limit: the block is kept and the caller gets the empty one.
            if (current_bytes < max_bytes)
                return same_structure;
        }

        /// Take everything out and pre-size the buffer for the next round.
        drained.swap(buffer);
        buffer.reserve(static_cast<size_t>(drained.size() * reserve_coefficient));
        current_bytes = 0;
    }

    /// The merge runs outside of the lock.
    const size_t drained_count = drained.size();

    if (drained_count == 0)
        return {};

    if (drained_count == 1)
        return drained[0];

    return mergeBlocks(std::move(drained));
}
|
|
|
|
|
|
|
|
/// Merges `blocks` according to sort_description and returns the result as one block.
/// The total row count of the input is passed to MergingSortedTransform in the position
/// where the other merges in this file pass rows_in_block.
Block SortedBlocksBuffer::mergeBlocks(Blocks && blocks) const
{
    size_t total_rows = 0;

    { /// Merge sort blocks
        Pipes sources;
        sources.reserve(blocks.size());

        for (const auto & input : blocks)
        {
            const size_t rows = input.rows();
            total_rows += rows;

            Chunk chunk(input.getColumns(), rows);
            sources.emplace_back(std::make_shared<SourceFromSingleChunk>(input.cloneEmpty(), std::move(chunk)));
        }

        Blocks merged;

        QueryPipelineBuilder builder;
        builder.init(Pipe::unitePipes(std::move(sources)));

        /// A single stream needs no merging.
        if (builder.getNumStreams() > 1)
        {
            builder.addTransform(std::make_shared<MergingSortedTransform>(
                builder.getHeader(),
                builder.getNumStreams(),
                sort_description,
                total_rows,
                SortingQueueStrategy::Default));
        }

        auto pipeline = QueryPipelineBuilder::getPipeline(std::move(builder));
        PullingPipelineExecutor puller(pipeline);

        for (Block pulled; puller.pull(pulled);)
            merged.push_back(pulled);

        /// From here on `blocks` holds the merged output instead of the input.
        blocks.swap(merged);
    }

    if (blocks.size() == 1)
        return blocks[0];

    return concatenateBlocks(blocks);
}
|
|
|
|
|
2020-04-28 13:55:50 +00:00
|
|
|
}
|