2013-04-24 10:31:32 +00:00
|
|
|
#pragma once
|
|
|
|
|
2019-03-30 13:44:23 +00:00
|
|
|
#include <Storages/MergeTree/MergeTreeIndexGranularity.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <IO/WriteBufferFromFile.h>
|
2018-12-28 18:15:26 +00:00
|
|
|
#include <Compression/CompressedWriteBuffer.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <IO/HashingWriteBuffer.h>
|
|
|
|
#include <Storages/MergeTree/MergeTreeData.h>
|
|
|
|
#include <DataStreams/IBlockOutputStream.h>
|
2013-04-24 10:31:32 +00:00
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Columns/ColumnArray.h>
|
2016-11-03 12:00:44 +00:00
|
|
|
|
2013-09-15 01:10:16 +00:00
|
|
|
|
2013-04-24 10:31:32 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2015-08-14 02:45:40 +00:00
|
|
|
|
|
|
|
|
2014-03-04 11:30:50 +00:00
|
|
|
class IMergedBlockOutputStream : public IBlockOutputStream
|
2013-04-24 10:31:32 +00:00
|
|
|
{
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
IMergedBlockOutputStream(
|
|
|
|
MergeTreeData & storage_,
|
|
|
|
size_t min_compress_block_size_,
|
|
|
|
size_t max_compress_block_size_,
|
2018-12-21 12:17:30 +00:00
|
|
|
CompressionCodecPtr default_codec_,
|
2018-11-29 13:50:34 +00:00
|
|
|
size_t aio_threshold_,
|
2018-11-30 15:36:10 +00:00
|
|
|
bool blocks_are_granules_size_,
|
2019-03-30 13:44:23 +00:00
|
|
|
const MergeTreeIndexGranularity & index_granularity_);
|
2013-08-24 08:01:19 +00:00
|
|
|
|
2018-10-16 21:22:41 +00:00
|
|
|
using WrittenOffsetColumns = std::set<std::string>;
|
|
|
|
|
2014-03-04 11:30:50 +00:00
|
|
|
protected:
|
2018-06-07 18:14:37 +00:00
|
|
|
using SerializationState = IDataType::SerializeBinaryBulkStatePtr;
|
|
|
|
using SerializationStates = std::vector<SerializationState>;
|
2015-03-14 02:36:39 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
struct ColumnStream
|
|
|
|
{
|
|
|
|
ColumnStream(
|
|
|
|
const String & escaped_column_name_,
|
|
|
|
const String & data_path,
|
|
|
|
const std::string & data_file_extension_,
|
|
|
|
const std::string & marks_path,
|
|
|
|
const std::string & marks_file_extension_,
|
2018-10-11 02:57:48 +00:00
|
|
|
const CompressionCodecPtr & compression_codec,
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t max_compress_block_size,
|
|
|
|
size_t estimated_size,
|
|
|
|
size_t aio_threshold);
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
String escaped_column_name;
|
|
|
|
std::string data_file_extension;
|
|
|
|
std::string marks_file_extension;
|
2014-04-14 13:08:26 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
/// compressed -> compressed_buf -> plain_hashing -> plain_file
|
|
|
|
std::unique_ptr<WriteBufferFromFileBase> plain_file;
|
|
|
|
HashingWriteBuffer plain_hashing;
|
|
|
|
CompressedWriteBuffer compressed_buf;
|
|
|
|
HashingWriteBuffer compressed;
|
2014-04-14 13:08:26 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
/// marks -> marks_file
|
|
|
|
WriteBufferFromFile marks_file;
|
|
|
|
HashingWriteBuffer marks;
|
2013-09-15 01:10:16 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
void finalize();
|
2016-07-21 16:22:24 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
void sync();
|
2016-07-21 16:22:24 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
void addToChecksums(MergeTreeData::DataPart::Checksums & checksums);
|
|
|
|
};
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
using ColumnStreams = std::map<String, std::unique_ptr<ColumnStream>>;
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2018-10-11 02:57:48 +00:00
|
|
|
void addStreams(const String & path, const String & name, const IDataType & type,
|
|
|
|
const CompressionCodecPtr & codec, size_t estimated_size, bool skip_offsets);
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-10-16 21:22:41 +00:00
|
|
|
IDataType::OutputStreamGetter createStreamGetter(const String & name, WrittenOffsetColumns & offset_columns, bool skip_offsets);
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Write data of one column.
|
2018-11-30 15:36:10 +00:00
|
|
|
/// Return how many marks were written and
|
|
|
|
/// how many rows were written for last mark
|
|
|
|
std::pair<size_t, size_t> writeColumn(
|
|
|
|
const String & name,
|
|
|
|
const IDataType & type,
|
|
|
|
const IColumn & column,
|
|
|
|
WrittenOffsetColumns & offset_columns,
|
|
|
|
bool skip_offsets,
|
|
|
|
IDataType::SerializeBinaryBulkStatePtr & serialization_state,
|
|
|
|
size_t from_mark
|
|
|
|
);
|
|
|
|
|
2019-04-01 11:40:13 +00:00
|
|
|
/// Write single granule of one column (rows between 2 marks)
|
2018-11-30 15:36:10 +00:00
|
|
|
size_t writeSingleGranule(
|
|
|
|
const String & name,
|
|
|
|
const IDataType & type,
|
|
|
|
const IColumn & column,
|
|
|
|
WrittenOffsetColumns & offset_columns,
|
|
|
|
bool skip_offsets,
|
|
|
|
IDataType::SerializeBinaryBulkStatePtr & serialization_state,
|
|
|
|
IDataType::SerializeBinaryBulkSettings & serialize_settings,
|
|
|
|
size_t from_row,
|
|
|
|
size_t number_of_rows,
|
|
|
|
bool write_marks);
|
|
|
|
|
2019-04-01 11:40:13 +00:00
|
|
|
/// Count index_granularity for block and store in `index_granularity`
|
2018-11-30 15:36:10 +00:00
|
|
|
void fillIndexGranularity(const Block & block);
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
MergeTreeData & storage;
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
ColumnStreams column_streams;
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// The offset to the first row of the block for which you want to write the index.
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t index_offset = 0;
|
2014-04-08 15:29:12 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t min_compress_block_size;
|
|
|
|
size_t max_compress_block_size;
|
2015-03-14 02:36:39 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t aio_threshold;
|
2015-04-08 16:48:47 +00:00
|
|
|
|
2018-11-30 15:36:10 +00:00
|
|
|
size_t current_mark = 0;
|
2018-11-15 14:06:54 +00:00
|
|
|
|
2018-11-30 15:36:10 +00:00
|
|
|
const std::string marks_file_extension;
|
|
|
|
const size_t mark_size_in_bytes;
|
|
|
|
const bool blocks_are_granules_size;
|
|
|
|
|
2019-03-30 13:44:23 +00:00
|
|
|
MergeTreeIndexGranularity index_granularity;
|
2018-11-29 13:50:34 +00:00
|
|
|
|
2018-11-30 15:36:10 +00:00
|
|
|
const bool compute_granularity;
|
2018-12-21 12:17:30 +00:00
|
|
|
CompressionCodecPtr codec;
|
2014-03-04 11:30:50 +00:00
|
|
|
};
|
|
|
|
|
2015-08-14 02:45:40 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** To write one part.
|
2017-08-14 18:16:11 +00:00
|
|
|
* The data refers to one partition, and is written in one part.
|
2014-03-04 11:30:50 +00:00
|
|
|
*/
|
2017-11-20 02:15:15 +00:00
|
|
|
class MergedBlockOutputStream final : public IMergedBlockOutputStream
|
2014-03-04 11:30:50 +00:00
|
|
|
{
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
MergedBlockOutputStream(
|
|
|
|
MergeTreeData & storage_,
|
|
|
|
String part_path_,
|
2017-12-25 21:57:29 +00:00
|
|
|
const NamesAndTypesList & columns_list_,
|
2019-03-18 12:02:33 +00:00
|
|
|
CompressionCodecPtr default_codec_,
|
2019-03-25 13:55:24 +00:00
|
|
|
bool blocks_are_granules_size_ = false);
|
2014-06-26 00:58:14 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
MergedBlockOutputStream(
|
|
|
|
MergeTreeData & storage_,
|
|
|
|
String part_path_,
|
2017-12-25 21:57:29 +00:00
|
|
|
const NamesAndTypesList & columns_list_,
|
2018-12-21 12:17:30 +00:00
|
|
|
CompressionCodecPtr default_codec_,
|
2017-04-01 07:20:54 +00:00
|
|
|
const MergeTreeData::DataPart::ColumnToSize & merged_column_to_size_,
|
2018-11-30 15:36:10 +00:00
|
|
|
size_t aio_threshold_,
|
2019-03-25 13:55:24 +00:00
|
|
|
bool blocks_are_granules_size_ = false);
|
2016-07-21 16:22:24 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
std::string getPartPath() const;
|
2016-01-28 16:06:57 +00:00
|
|
|
|
2018-02-19 00:45:32 +00:00
|
|
|
Block getHeader() const override { return storage.getSampleBlock(); }
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// If the data is pre-sorted.
|
2017-04-01 07:20:54 +00:00
|
|
|
void write(const Block & block) override;
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2017-11-20 02:15:15 +00:00
|
|
|
/** If the data is not sorted, but we have previously calculated the permutation, that will sort it.
|
2017-04-16 15:00:33 +00:00
|
|
|
* This method is used to save RAM, since you do not need to keep two blocks at once - the original one and the sorted one.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void writeWithPermutation(const Block & block, const IColumn::Permutation * permutation);
|
2014-03-27 17:30:04 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
void writeSuffix() override;
|
2014-06-26 00:58:14 +00:00
|
|
|
|
2018-11-15 14:06:54 +00:00
|
|
|
/// Finilize writing part and fill inner structures
|
2017-08-30 19:03:19 +00:00
|
|
|
void writeSuffixAndFinalizePart(
|
|
|
|
MergeTreeData::MutableDataPartPtr & new_part,
|
2017-12-25 21:57:29 +00:00
|
|
|
const NamesAndTypesList * total_columns_list = nullptr,
|
2017-08-30 19:03:19 +00:00
|
|
|
MergeTreeData::DataPart::Checksums * additional_column_checksums = nullptr);
|
2016-11-03 12:00:44 +00:00
|
|
|
|
2019-03-30 13:44:23 +00:00
|
|
|
const MergeTreeIndexGranularity & getIndexGranularity() const
|
2018-11-30 15:36:10 +00:00
|
|
|
{
|
|
|
|
return index_granularity;
|
|
|
|
}
|
|
|
|
|
2015-04-10 15:31:51 +00:00
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
void init();
|
2015-04-10 15:31:51 +00:00
|
|
|
|
2017-05-09 19:07:35 +00:00
|
|
|
/** If `permutation` is given, it rearranges the values in the columns when writing.
|
2017-04-16 15:00:33 +00:00
|
|
|
* This is necessary to not keep the whole block in the RAM to sort it.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void writeImpl(const Block & block, const IColumn::Permutation * permutation);
|
2015-08-14 02:45:40 +00:00
|
|
|
|
2014-03-04 11:30:50 +00:00
|
|
|
private:
|
2017-12-25 21:57:29 +00:00
|
|
|
NamesAndTypesList columns_list;
|
2018-06-07 18:14:37 +00:00
|
|
|
SerializationStates serialization_states;
|
2017-04-01 07:20:54 +00:00
|
|
|
String part_path;
|
2014-03-13 12:48:07 +00:00
|
|
|
|
2017-10-24 14:11:53 +00:00
|
|
|
size_t rows_count = 0;
|
2014-03-27 12:32:37 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
std::unique_ptr<WriteBufferFromFile> index_file_stream;
|
|
|
|
std::unique_ptr<HashingWriteBuffer> index_stream;
|
2017-12-15 20:48:46 +00:00
|
|
|
MutableColumns index_columns;
|
2019-02-05 14:50:25 +00:00
|
|
|
|
|
|
|
std::vector<std::unique_ptr<ColumnStream>> skip_indices_streams;
|
2019-03-08 19:52:21 +00:00
|
|
|
MergeTreeIndexAggregators skip_indices_aggregators;
|
2019-02-05 14:50:25 +00:00
|
|
|
std::vector<size_t> skip_index_filling;
|
2013-04-24 10:31:32 +00:00
|
|
|
};
|
|
|
|
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2018-10-16 21:22:41 +00:00
|
|
|
/// Writes only those columns that are in `header`
|
2017-11-20 02:15:15 +00:00
|
|
|
class MergedColumnOnlyOutputStream final : public IMergedBlockOutputStream
|
2014-03-04 11:30:50 +00:00
|
|
|
{
|
|
|
|
public:
|
2018-04-23 03:30:28 +00:00
|
|
|
/// skip_offsets: used when ALTERing columns if we know that array offsets are not altered.
|
2018-10-16 21:25:45 +00:00
|
|
|
/// Pass empty 'already_written_offset_columns' first time then and pass the same object to subsequent instances of MergedColumnOnlyOutputStream
|
|
|
|
/// if you want to serialize elements of Nested data structure in different instances of MergedColumnOnlyOutputStream.
|
2017-04-01 07:20:54 +00:00
|
|
|
MergedColumnOnlyOutputStream(
|
2018-10-16 21:22:41 +00:00
|
|
|
MergeTreeData & storage_, const Block & header_, String part_path_, bool sync_,
|
2018-12-21 12:17:30 +00:00
|
|
|
CompressionCodecPtr default_codec_, bool skip_offsets_,
|
2018-11-30 15:36:10 +00:00
|
|
|
WrittenOffsetColumns & already_written_offset_columns,
|
2019-03-30 13:44:23 +00:00
|
|
|
const MergeTreeIndexGranularity & index_granularity_);
|
2014-03-27 17:30:04 +00:00
|
|
|
|
2018-02-19 00:45:32 +00:00
|
|
|
Block getHeader() const override { return header; }
|
2017-04-01 07:20:54 +00:00
|
|
|
void write(const Block & block) override;
|
|
|
|
void writeSuffix() override;
|
|
|
|
MergeTreeData::DataPart::Checksums writeSuffixAndGetChecksums();
|
2014-03-04 11:30:50 +00:00
|
|
|
|
|
|
|
private:
|
2018-02-19 00:45:32 +00:00
|
|
|
Block header;
|
2018-06-07 18:14:37 +00:00
|
|
|
SerializationStates serialization_states;
|
2017-04-01 07:20:54 +00:00
|
|
|
String part_path;
|
2014-03-04 11:30:50 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
bool initialized = false;
|
|
|
|
bool sync;
|
|
|
|
bool skip_offsets;
|
2018-10-16 21:22:41 +00:00
|
|
|
|
|
|
|
/// To correctly write Nested elements column-by-column.
|
|
|
|
WrittenOffsetColumns & already_written_offset_columns;
|
2014-03-04 11:30:50 +00:00
|
|
|
};
|
2014-06-26 00:58:14 +00:00
|
|
|
|
2013-09-26 19:16:43 +00:00
|
|
|
}
|