#pragma once
#include <IO/WriteBufferFromFile.h>
#include <IO/CompressedWriteBuffer.h>
#include <IO/HashingWriteBuffer.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <DataStreams/IBlockOutputStream.h>
#include <Columns/ColumnArray.h>
namespace DB
{
class IMergedBlockOutputStream : public IBlockOutputStream
2013-04-24 10:31:32 +00:00
{
public:
IMergedBlockOutputStream(
MergeTreeData & storage_,
size_t min_compress_block_size_,
size_t max_compress_block_size_,
CompressionMethod compression_method_,
size_t aio_threshold_);
2013-08-24 08:01:19 +00:00
protected:
using OffsetColumns = std::set<std::string>;
struct ColumnStream
{
ColumnStream(
const String & escaped_column_name_,
const String & data_path,
const std::string & data_file_extension_,
const std::string & marks_path,
const std::string & marks_file_extension_,
size_t max_compress_block_size,
CompressionMethod compression_method,
size_t estimated_size,
size_t aio_threshold);
String escaped_column_name;
std::string data_file_extension;
std::string marks_file_extension;
2014-04-14 13:08:26 +00:00
/// compressed -> compressed_buf -> plain_hashing -> plain_file
std::unique_ptr<WriteBufferFromFileBase> plain_file;
HashingWriteBuffer plain_hashing;
CompressedWriteBuffer compressed_buf;
HashingWriteBuffer compressed;
2014-04-14 13:08:26 +00:00
/// marks -> marks_file
WriteBufferFromFile marks_file;
HashingWriteBuffer marks;
2013-09-15 01:10:16 +00:00
void finalize();
void sync();
void addToChecksums(MergeTreeData::DataPart::Checksums & checksums);
};
using ColumnStreams = std::map<String, std::unique_ptr<ColumnStream>>;
void addStream(const String & path, const String & name, const IDataType & type, size_t estimated_size,
size_t level, const String & filename, bool skip_offsets);
2017-04-16 15:00:33 +00:00
/// Write data of one column.
void writeData(const String & name, const IDataType & type, const IColumn & column, OffsetColumns & offset_columns,
size_t level, bool skip_offsets);
MergeTreeData & storage;
ColumnStreams column_streams;
2017-04-16 15:00:33 +00:00
/// The offset to the first row of the block for which you want to write the index.
size_t index_offset = 0;
2014-04-08 15:29:12 +00:00
size_t min_compress_block_size;
size_t max_compress_block_size;
size_t aio_threshold;
CompressionMethod compression_method;
2016-10-20 10:13:07 +00:00
private:
/// Internal version of writeData.
void writeDataImpl(const String & name, const IDataType & type, const IColumn & column,
OffsetColumns & offset_columns, size_t level, bool write_array_data, bool skip_offsets);
};
2017-04-16 15:00:33 +00:00
/** To write one part.
* The data refers to one month, and are written in one part.
*/
class MergedBlockOutputStream : public IMergedBlockOutputStream
{
public:
MergedBlockOutputStream(
MergeTreeData & storage_,
String part_path_,
const NamesAndTypesList & columns_list_,
CompressionMethod compression_method);
MergedBlockOutputStream(
MergeTreeData & storage_,
String part_path_,
const NamesAndTypesList & columns_list_,
CompressionMethod compression_method,
const MergeTreeData::DataPart::ColumnToSize & merged_column_to_size_,
size_t aio_threshold_);
std::string getPartPath() const;
2016-01-28 16:06:57 +00:00
2017-04-16 15:00:33 +00:00
/// If the data is pre-sorted.
void write(const Block & block) override;
2017-04-16 15:00:33 +00:00
/** If the data is not sorted, but we have previously calculated the permutation, after which they will be sorted.
* This method is used to save RAM, since you do not need to keep two blocks at once - the original one and the sorted one.
*/
void writeWithPermutation(const Block & block, const IColumn::Permutation * permutation);
2014-03-27 17:30:04 +00:00
void writeSuffix() override;
MergeTreeData::DataPart::Checksums writeSuffixAndGetChecksums(
const NamesAndTypesList & total_column_list,
MergeTreeData::DataPart::Checksums * additional_column_checksums = nullptr);
MergeTreeData::DataPart::Checksums writeSuffixAndGetChecksums();
MergeTreeData::DataPart::Index & getIndex();
2017-04-16 15:00:33 +00:00
/// How many marks are already written.
size_t marksCount();
private:
void init();
/** If `permutation` is given, it rearranges the values in the columns when writing.
2017-04-16 15:00:33 +00:00
* This is necessary to not keep the whole block in the RAM to sort it.
*/
void writeImpl(const Block & block, const IColumn::Permutation * permutation);
private:
NamesAndTypesList columns_list;
String part_path;
2014-03-13 12:48:07 +00:00
size_t marks_count = 0;
std::unique_ptr<WriteBufferFromFile> index_file_stream;
std::unique_ptr<HashingWriteBuffer> index_stream;
MergeTreeData::DataPart::Index index_columns;
2013-04-24 10:31:32 +00:00
};
2017-04-16 15:00:33 +00:00
/// Writes only those columns that are in `block`
class MergedColumnOnlyOutputStream : public IMergedBlockOutputStream
{
public:
MergedColumnOnlyOutputStream(
MergeTreeData & storage_, String part_path_, bool sync_, CompressionMethod compression_method, bool skip_offsets_);
2014-03-27 17:30:04 +00:00
void write(const Block & block) override;
void writeSuffix() override;
MergeTreeData::DataPart::Checksums writeSuffixAndGetChecksums();
private:
String part_path;
bool initialized = false;
bool sync;
bool skip_offsets;
};
2013-09-26 19:16:43 +00:00
}