2017-04-01 09:19:00 +00:00
|
|
|
#include <Storages/MergeTree/MergedBlockOutputStream.h>
|
2020-05-20 20:16:32 +00:00
|
|
|
#include <Interpreters/Context.h>
|
2017-01-21 04:24:28 +00:00
|
|
|
#include <Poco/File.h>
|
|
|
|
|
2016-07-21 16:22:24 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2019-03-14 23:10:51 +00:00
|
|
|
/// Error codes referenced by this file. These are declarations only (extern);
/// the definitions are not part of this block.
namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int NOT_IMPLEMENTED;
}
|
|
|
|
|
2016-07-21 16:22:24 +00:00
|
|
|
|
|
|
|
MergedBlockOutputStream::MergedBlockOutputStream(
|
2019-11-07 11:11:38 +00:00
|
|
|
const MergeTreeDataPartPtr & data_part,
|
2017-12-25 21:57:29 +00:00
|
|
|
const NamesAndTypesList & columns_list_,
|
2020-04-10 13:36:51 +00:00
|
|
|
const MergeTreeIndices & skip_indices,
|
2019-11-07 11:11:38 +00:00
|
|
|
CompressionCodecPtr default_codec,
|
|
|
|
bool blocks_are_granules_size)
|
2020-02-04 13:34:57 +00:00
|
|
|
: MergedBlockOutputStream(
|
2020-04-10 13:36:51 +00:00
|
|
|
data_part, columns_list_, skip_indices, default_codec, {},
|
2020-02-04 13:34:57 +00:00
|
|
|
data_part->storage.global_context.getSettings().min_bytes_to_use_direct_io,
|
|
|
|
blocks_are_granules_size)
|
2019-12-18 16:41:11 +00:00
|
|
|
{
|
2016-07-21 16:22:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Full constructor.
///
/// Steps performed:
///  1. Builds the writer settings from the storage's global settings, the adaptive-granularity
///     flag of the storage, `aio_threshold` and `blocks_are_granules_size`.
///  2. If `aio_threshold` is non-zero and `merged_column_to_size` is not empty, sums the sizes
///     recorded for the columns of `columns_list` into `writer_settings.estimated_size`.
///     Columns missing from the map contribute nothing.
///  3. Creates the directories for `part_path` on the volume's disk.
///  4. Obtains the writer from the data part and initialises its primary and skip indices.
MergedBlockOutputStream::MergedBlockOutputStream(
    const MergeTreeDataPartPtr & data_part,
    const NamesAndTypesList & columns_list_,
    const MergeTreeIndices & skip_indices,
    CompressionCodecPtr default_codec,
    const MergeTreeData::DataPart::ColumnToSize & merged_column_to_size,
    size_t aio_threshold,
    bool blocks_are_granules_size)
    : IMergedBlockOutputStream(data_part)
    , columns_list(columns_list_)
{
    const auto & part_storage = data_part->storage;

    MergeTreeWriterSettings writer_settings(
        part_storage.global_context.getSettings(),
        part_storage.canUseAdaptiveGranularity(),
        aio_threshold,
        blocks_are_granules_size);

    /// The size estimate is only accumulated when a threshold is set and sizes are known.
    const bool should_estimate_size = aio_threshold != 0 && !merged_column_to_size.empty();
    if (should_estimate_size)
    {
        const auto not_found = merged_column_to_size.end();
        for (const auto & name_and_type : columns_list)
        {
            const auto found = merged_column_to_size.find(name_and_type.name);
            if (found == not_found)
                continue;

            writer_settings.estimated_size += found->second;
        }
    }

    volume->getDisk()->createDirectories(part_path);

    writer = data_part->getWriter(columns_list, skip_indices, default_codec, writer_settings);
    writer->initPrimaryIndex();
    writer->initSkipIndices();
}
|
|
|
|
|
2018-05-07 02:01:11 +00:00
|
|
|
/// If data is pre-sorted.
|
2016-07-21 16:22:24 +00:00
|
|
|
void MergedBlockOutputStream::write(const Block & block)
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
writeImpl(block, nullptr);
|
2016-07-21 16:22:24 +00:00
|
|
|
}
|
|
|
|
|
2017-03-12 19:18:07 +00:00
|
|
|
/** If the data is not sorted, but we pre-calculated the permutation, after which they will be sorted.
|
2017-04-01 07:20:54 +00:00
|
|
|
* This method is used to save RAM, since you do not need to keep two blocks at once - the source and the sorted.
|
|
|
|
*/
|
2016-07-21 16:22:24 +00:00
|
|
|
void MergedBlockOutputStream::writeWithPermutation(const Block & block, const IColumn::Permutation * permutation)
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
writeImpl(block, permutation);
|
2016-07-21 16:22:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void MergedBlockOutputStream::writeSuffix()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
throw Exception("Method writeSuffix is not supported by MergedBlockOutputStream", ErrorCodes::NOT_IMPLEMENTED);
|
2016-07-21 16:22:24 +00:00
|
|
|
}
|
|
|
|
|
2017-08-30 19:03:19 +00:00
|
|
|
void MergedBlockOutputStream::writeSuffixAndFinalizePart(
|
|
|
|
MergeTreeData::MutableDataPartPtr & new_part,
|
2020-03-09 02:55:28 +00:00
|
|
|
const NamesAndTypesList * total_columns_list,
|
2017-08-30 19:03:19 +00:00
|
|
|
MergeTreeData::DataPart::Checksums * additional_column_checksums)
|
2016-07-21 16:22:24 +00:00
|
|
|
{
|
2019-10-21 17:23:06 +00:00
|
|
|
/// Finish write and get checksums.
|
|
|
|
MergeTreeData::DataPart::Checksums checksums;
|
|
|
|
|
2019-10-28 11:00:29 +00:00
|
|
|
if (additional_column_checksums)
|
|
|
|
checksums = std::move(*additional_column_checksums);
|
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
/// Finish columns serialization.
|
2019-11-18 15:18:50 +00:00
|
|
|
writer->finishDataSerialization(checksums);
|
|
|
|
writer->finishPrimaryIndexSerialization(checksums);
|
2019-11-05 11:53:22 +00:00
|
|
|
writer->finishSkipIndicesSerialization(checksums);
|
2019-06-18 12:54:27 +00:00
|
|
|
|
2020-03-13 15:09:55 +00:00
|
|
|
NamesAndTypesList part_columns;
|
2020-03-09 02:55:28 +00:00
|
|
|
if (!total_columns_list)
|
2020-03-13 15:09:55 +00:00
|
|
|
part_columns = columns_list;
|
|
|
|
else
|
|
|
|
part_columns = *total_columns_list;
|
2017-08-30 19:03:19 +00:00
|
|
|
|
2020-01-22 14:10:35 +00:00
|
|
|
if (storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING || isCompactPart(new_part))
|
2017-10-19 18:20:41 +00:00
|
|
|
{
|
2020-05-11 17:20:26 +00:00
|
|
|
new_part->partition.store(storage, volume->getDisk(), part_path, checksums);
|
2018-05-23 19:34:37 +00:00
|
|
|
if (new_part->minmax_idx.initialized)
|
2020-05-11 17:20:26 +00:00
|
|
|
new_part->minmax_idx.store(storage, volume->getDisk(), part_path, checksums);
|
2018-08-06 16:42:43 +00:00
|
|
|
else if (rows_count)
|
2018-08-06 16:53:34 +00:00
|
|
|
throw Exception("MinMax index was not initialized for new non-empty part " + new_part->name
|
2018-08-06 16:42:43 +00:00
|
|
|
+ ". It is a bug.", ErrorCodes::LOGICAL_ERROR);
|
2017-10-24 14:11:53 +00:00
|
|
|
|
2020-05-11 17:20:26 +00:00
|
|
|
auto count_out = volume->getDisk()->writeFile(part_path + "count.txt", 4096);
|
2020-02-27 16:47:40 +00:00
|
|
|
HashingWriteBuffer count_out_hashing(*count_out);
|
2017-10-24 14:11:53 +00:00
|
|
|
writeIntText(rows_count, count_out_hashing);
|
|
|
|
count_out_hashing.next();
|
|
|
|
checksums.files["count.txt"].file_size = count_out_hashing.count();
|
|
|
|
checksums.files["count.txt"].file_hash = count_out_hashing.getHash();
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2019-10-17 18:55:07 +00:00
|
|
|
if (!new_part->ttl_infos.empty())
|
2019-04-15 09:30:45 +00:00
|
|
|
{
|
|
|
|
/// Write a file with ttl infos in json format.
|
2020-05-11 17:20:26 +00:00
|
|
|
auto out = volume->getDisk()->writeFile(part_path + "ttl.txt", 4096);
|
2020-02-27 16:47:40 +00:00
|
|
|
HashingWriteBuffer out_hashing(*out);
|
2019-04-15 09:30:45 +00:00
|
|
|
new_part->ttl_infos.write(out_hashing);
|
|
|
|
checksums.files["ttl.txt"].file_size = out_hashing.count();
|
|
|
|
checksums.files["ttl.txt"].file_hash = out_hashing.getHash();
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-03-13 15:09:55 +00:00
|
|
|
removeEmptyColumnsFromPart(new_part, part_columns, checksums);
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
/// Write a file with a description of columns.
|
2020-05-11 17:20:26 +00:00
|
|
|
auto out = volume->getDisk()->writeFile(part_path + "columns.txt", 4096);
|
2020-03-13 15:09:55 +00:00
|
|
|
part_columns.writeText(*out);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
/// Write file with checksums.
|
2020-05-11 17:20:26 +00:00
|
|
|
auto out = volume->getDisk()->writeFile(part_path + "checksums.txt", 4096);
|
2020-02-27 16:47:40 +00:00
|
|
|
checksums.write(*out);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2020-03-13 15:09:55 +00:00
|
|
|
new_part->setColumns(part_columns);
|
2017-10-24 14:11:53 +00:00
|
|
|
new_part->rows_count = rows_count;
|
2017-08-30 19:03:19 +00:00
|
|
|
new_part->modification_time = time(nullptr);
|
2019-11-18 15:18:50 +00:00
|
|
|
new_part->index = writer->releaseIndexColumns();
|
2017-08-30 19:03:19 +00:00
|
|
|
new_part->checksums = checksums;
|
2020-03-23 13:32:02 +00:00
|
|
|
new_part->setBytesOnDisk(checksums.getTotalSizeOnDisk());
|
2019-11-07 11:11:38 +00:00
|
|
|
new_part->index_granularity = writer->getIndexGranularity();
|
2020-03-23 12:19:43 +00:00
|
|
|
new_part->calculateColumnsSizesOnDisk();
|
2016-07-21 16:22:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/** Common implementation behind write() and writeWithPermutation().
  *
  * @param block        Block to write; its row-count consistency is checked first.
  * @param permutation  Optional row permutation (may be null); passed on to getBlockAndPermute
  *                     and to the writer.
  *
  * A block with zero rows is ignored. Otherwise the block is handed to the writer, skip indices
  * and the primary index are calculated and serialized, and `rows_count` is increased.
  */
void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Permutation * permutation)
{
    block.checkNumberOfRows();

    const size_t rows = block.rows();
    if (rows == 0)
        return;

    /// Gather the distinct names of all columns used by any skip index of the storage.
    std::unordered_set<String> unique_index_columns;
    for (const auto & index : storage.getIndices())
    {
        for (const auto & column_name : index.column_names)
            unique_index_columns.insert(column_name);
    }
    Names skip_indexes_column_names(unique_index_columns.begin(), unique_index_columns.end());

    Block primary_key_block = getBlockAndPermute(block, storage.getPrimaryKeyColumns(), permutation);
    Block skip_indexes_block = getBlockAndPermute(block, skip_indexes_column_names, permutation);

    writer->write(block, permutation, primary_key_block, skip_indexes_block);
    writer->calculateAndSerializeSkipIndices(skip_indexes_block, rows);
    writer->calculateAndSerializePrimaryIndex(primary_key_block, rows);
    writer->next();

    rows_count += rows;
}
|
|
|
|
|
|
|
|
}
|