Keep indices for StorageStripeLog in memory.

2024-11-24 16:42:05 +00:00 · 2021-08-27 01:15:24 +03:00 · 2021-08-27 01:15:24 +03:00 · 0e8c9b089f
commit 0e8c9b089f
parent 42596b16bc
10 changed files with 359 additions and 175 deletions
--- a/src/Common/FileChecker.cpp
+++ b/src/Common/FileChecker.cpp
@ -50,11 +50,6 @@ void FileChecker::setEmpty(const String & full_file_path)
    map[fileName(full_file_path)] = 0;
 }

-const FileChecker::Map & FileChecker::getFileSizes() const
-{
-    return map;
-}
-
 size_t FileChecker::getFileSize(const String & full_file_path) const
 {
    auto it = map.find(fileName(full_file_path));
--- a/src/Common/FileChecker.h
+++ b/src/Common/FileChecker.h
@ -28,11 +28,6 @@ public:
    /// The purpose of this function is to rollback a group of unfinished writes.
    void repair();

-    /// File name -> size.
-    using Map = std::map<String, UInt64>;
-
-    const Map & getFileSizes() const;
-
    /// Returns stored file size.
    size_t getFileSize(const String & full_file_path) const;

@ -43,7 +38,7 @@ private:
    const Poco::Logger * log = &Poco::Logger::get("FileChecker");

    String files_info_path;
-    Map map;
+    std::map<String, size_t> map;
 };

 }
--- a/src/Formats/IndexForNativeFormat.cpp
+++ b/src/Formats/IndexForNativeFormat.cpp
@ -0,0 +1,91 @@
+#include <Formats/IndexForNativeFormat.h>
+#include <IO/ReadHelpers.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int INCORRECT_INDEX;
+}
+
+/// Deserializes the index of one block from `istr`: the column and row counts,
+/// followed by one (name, type, mark) record per column.
+/// Column records stored previously are discarded.
+void IndexOfBlockForNativeFormat::read(ReadBuffer & istr)
+{
+    readVarUInt(num_columns, istr);
+    readVarUInt(num_rows, istr);
+
+    columns.clear();
+    for (size_t remaining = num_columns; remaining != 0; --remaining)
+    {
+        IndexOfOneColumnForNativeFormat & entry = columns.emplace_back();
+        MarkInCompressedFile & mark = entry.location;
+
+        readBinary(entry.name, istr);
+        readBinary(entry.type, istr);
+        readBinary(mark.offset_in_compressed_file, istr);
+        readBinary(mark.offset_in_decompressed_block, istr);
+    }
+}
+
+/// Serializes the index of one block to `ostr` in the layout expected by `read`:
+/// the column and row counts, then one (name, type, mark) record per column.
+void IndexOfBlockForNativeFormat::write(WriteBuffer & ostr) const
+{
+    writeVarUInt(num_columns, ostr);
+    writeVarUInt(num_rows, ostr);
+
+    for (size_t pos = 0; pos != num_columns; ++pos)
+    {
+        const IndexOfOneColumnForNativeFormat & entry = columns[pos];
+        const MarkInCompressedFile & mark = entry.location;
+
+        writeBinary(entry.name, ostr);
+        writeBinary(entry.type, ostr);
+        writeBinary(mark.offset_in_compressed_file, ostr);
+        writeBinary(mark.offset_in_decompressed_block, ostr);
+    }
+}
+
+/// Builds a copy of this block index restricted to `required_columns`,
+/// keeping the columns in the order in which they appear in the index.
+/// Throws INCORRECT_INDEX if a required column is missing from the index
+/// or if a required column is listed in the index more than once.
+IndexOfBlockForNativeFormat IndexOfBlockForNativeFormat::extractIndexForColumns(const NameSet & required_columns) const
+{
+    /// Cheap early exit: the index cannot cover the request if it has fewer columns than required.
+    if (num_columns < required_columns.size())
+        throw Exception("Index contain less than required columns", ErrorCodes::INCORRECT_INDEX);
+
+    IndexOfBlockForNativeFormat res;
+    res.columns.reserve(required_columns.size());
+
+    /// Duplicates are detected by name rather than by comparing counts: otherwise an index
+    /// such as [a, a] would be accepted for the request {a, b}, hiding the missing column.
+    NameSet found_columns;
+    for (size_t i = 0; i < num_columns; ++i)
+    {
+        const auto & column = columns[i];
+        if (!required_columns.contains(column.name))
+            continue;
+
+        if (!found_columns.insert(column.name).second)
+            throw Exception("Index contain duplicate columns", ErrorCodes::INCORRECT_INDEX);
+
+        res.columns.push_back(column);
+    }
+
+    /// Every kept column is distinct here, so a smaller count means a required column is absent.
+    if (res.columns.size() < required_columns.size())
+        throw Exception("Index contain less than required columns", ErrorCodes::INCORRECT_INDEX);
+
+    res.num_columns = res.columns.size();
+    res.num_rows = num_rows;
+    return res;
+}
+
+
+/// Replaces the current contents with block indices read from `istr`,
+/// one after another, until the end of the stream is reached.
+void IndexForNativeFormat::read(ReadBuffer & istr)
+{
+    blocks.clear();
+
+    while (!istr.eof())
+        blocks.emplace_back().read(istr);
+}
+
+/// Writes every block index to `ostr`, in the order the blocks are stored.
+void IndexForNativeFormat::write(WriteBuffer & ostr) const
+{
+    for (auto it = blocks.begin(); it != blocks.end(); ++it)
+        it->write(ostr);
+}
+
+/// Returns a new index in which every block index is restricted to `required_columns`.
+/// Exceptions thrown while restricting an individual block propagate to the caller.
+IndexForNativeFormat IndexForNativeFormat::extractIndexForColumns(const NameSet & required_columns) const
+{
+    IndexForNativeFormat filtered;
+    Blocks & out = filtered.blocks;
+    out.reserve(blocks.size());
+
+    for (size_t pos = 0; pos != blocks.size(); ++pos)
+        out.emplace_back(blocks[pos].extractIndexForColumns(required_columns));
+
+    return filtered;
+}
+
+}
--- a/src/Formats/IndexForNativeFormat.h
+++ b/src/Formats/IndexForNativeFormat.h
@ -0,0 +1,60 @@
+#pragma once
+
+#include <Core/Names.h>
+#include <Formats/MarkInCompressedFile.h>
+
+namespace DB
+{
+
+/** The Native format can contain a separately located index,
+  *  which allows you to understand where what column is located,
+  *  and skip unnecessary columns.
+  */
+
+/** The position of one piece of a single column. */
+struct IndexOfOneColumnForNativeFormat
+{
+    /// Column name, as stored in the index.
+    String name;
+    /// Name of the column's data type, as stored in the index.
+    String type;
+    /// Where this piece of the column starts: an offset in the compressed file
+    /// plus an offset inside the decompressed block.
+    MarkInCompressedFile location;
+};
+
+/** The index for the data block. */
+struct IndexOfBlockForNativeFormat
+{
+    using Columns = std::vector<IndexOfOneColumnForNativeFormat>;
+
+    /// The counters are zero-initialized so that a default-constructed object
+    /// describes an empty block instead of holding indeterminate values.
+
+    /// Number of entries in `columns`.
+    size_t num_columns = 0;
+    /// Number of rows in the data block.
+    size_t num_rows = 0;
+    /// One entry per column of the block.
+    Columns columns;
+
+    /// Reads the index for the data block, replacing the current contents.
+    void read(ReadBuffer & istr);
+
+    /// Writes the index for the data block.
+    void write(WriteBuffer & ostr) const;
+
+    /// Returns the index only for the required columns.
+    /// Throws INCORRECT_INDEX when a required column is missing or duplicate columns are detected.
+    IndexOfBlockForNativeFormat extractIndexForColumns(const NameSet & required_columns) const;
+};
+
+/** The whole index: the indices of all data blocks, in the order the blocks were written. */
+struct IndexForNativeFormat
+{
+    using Blocks = std::vector<IndexOfBlockForNativeFormat>;
+    Blocks blocks;
+
+    /// Serialization.
+
+    /// Replaces the contents with block indices read from `istr` until its end.
+    void read(ReadBuffer & istr);
+
+    /// Writes all block indices to `ostr`.
+    void write(WriteBuffer & ostr) const;
+
+    /// Container-like helpers.
+
+    /// True if the index contains no blocks.
+    bool empty() const { return blocks.empty(); }
+
+    /// Drops all block indices.
+    void clear() { blocks.clear(); }
+
+    /// Produces an index in which every block is restricted to the required columns.
+    IndexForNativeFormat extractIndexForColumns(const NameSet & required_columns) const;
+};
+
+}
--- a/src/Formats/NativeReader.cpp
+++ b/src/Formats/NativeReader.cpp
@ -221,39 +221,4 @@ void NativeReader::updateAvgValueSizeHints(const Block & block)
    }
 }

-void IndexForNativeFormat::read(ReadBuffer & istr, const NameSet & required_columns)
-{
-    while (!istr.eof())
-    {
-        blocks.emplace_back();
-        IndexOfBlockForNativeFormat & block = blocks.back();
-
-        readVarUInt(block.num_columns, istr);
-        readVarUInt(block.num_rows, istr);
-
-        if (block.num_columns < required_columns.size())
-            throw Exception("Index contain less than required columns", ErrorCodes::INCORRECT_INDEX);
-
-        for (size_t i = 0; i < block.num_columns; ++i)
-        {
-            IndexOfOneColumnForNativeFormat column_index;
-
-            readBinary(column_index.name, istr);
-            readBinary(column_index.type, istr);
-            readBinary(column_index.location.offset_in_compressed_file, istr);
-            readBinary(column_index.location.offset_in_decompressed_block, istr);
-
-            if (required_columns.count(column_index.name))
-                block.columns.push_back(std::move(column_index));
-        }
-
-        if (block.columns.size() < required_columns.size())
-            throw Exception("Index contain less than required columns", ErrorCodes::INCORRECT_INDEX);
-        if (block.columns.size() > required_columns.size())
-            throw Exception("Index contain duplicate columns", ErrorCodes::INCORRECT_INDEX);
-
-        block.num_columns = block.columns.size();
-    }
-}
-
 }
--- a/src/Formats/NativeReader.h
+++ b/src/Formats/NativeReader.h
@ -1,5 +1,6 @@
 #pragma once

+#include <Formats/IndexForNativeFormat.h>
 #include <Formats/MarkInCompressedFile.h>
 #include <Common/PODArray.h>
 #include <Core/Block.h>
@ -9,48 +10,6 @@ namespace DB

 class CompressedReadBufferFromFile;

-
-/** The Native format can contain a separately located index,
-  *  which allows you to understand where what column is located,
-  *  and skip unnecessary columns.
-  */
-
-/** The position of one piece of a single column. */
-struct IndexOfOneColumnForNativeFormat
-{
-    String name;
-    String type;
-    MarkInCompressedFile location;
-};
-
-/** The index for the data block. */
-struct IndexOfBlockForNativeFormat
-{
-    using Columns = std::vector<IndexOfOneColumnForNativeFormat>;
-
-    size_t num_columns;
-    size_t num_rows;
-    Columns columns;
-};
-
-/** The whole index. */
-struct IndexForNativeFormat
-{
-    using Blocks = std::vector<IndexOfBlockForNativeFormat>;
-    Blocks blocks;
-
-    IndexForNativeFormat() {}
-
-    IndexForNativeFormat(ReadBuffer & istr, const NameSet & required_columns)
-    {
-        read(istr, required_columns);
-    }
-
-    /// Read the index, only for the required columns.
-    void read(ReadBuffer & istr, const NameSet & required_columns);
-};
-
-
 /** Deserializes the stream of blocks from the native binary format (with names and column types).
  * Designed for communication between servers.
  *
--- a/src/Formats/NativeWriter.cpp
+++ b/src/Formats/NativeWriter.cpp
@ -5,6 +5,7 @@
 #include <IO/VarInt.h>
 #include <Compression/CompressedWriteBuffer.h>

+#include <Formats/IndexForNativeFormat.h>
 #include <Formats/MarkInCompressedFile.h>
 #include <Formats/NativeWriter.h>

@ -22,11 +23,11 @@ namespace ErrorCodes

 NativeWriter::NativeWriter(
    WriteBuffer & ostr_, UInt64 client_revision_, const Block & header_, bool remove_low_cardinality_,
-    WriteBuffer * index_ostr_, size_t initial_size_of_file_)
+    IndexForNativeFormat * index_, size_t initial_size_of_file_)
    : ostr(ostr_), client_revision(client_revision_), header(header_),
-    index_ostr(index_ostr_), initial_size_of_file(initial_size_of_file_), remove_low_cardinality(remove_low_cardinality_)
+      index(index_), initial_size_of_file(initial_size_of_file_), remove_low_cardinality(remove_low_cardinality_)
 {
-    if (index_ostr)
+    if (index)
    {
        ostr_concrete = typeid_cast<CompressedWriteBuffer *>(&ostr);
        if (!ostr_concrete)
@ -80,18 +81,20 @@ void NativeWriter::write(const Block & block)
    /** The index has the same structure as the data stream.
      * But instead of column values, it contains a mark that points to the location in the data file where this part of the column is located.
      */
-    if (index_ostr)
+    IndexOfBlockForNativeFormat index_block;
+    if (index)
    {
-        writeVarUInt(columns, *index_ostr);
-        writeVarUInt(rows, *index_ostr);
+        index_block.num_columns = columns;
+        index_block.num_rows = rows;
+        index_block.columns.resize(columns);
    }

    for (size_t i = 0; i < columns; ++i)
    {
        /// For the index.
-        MarkInCompressedFile mark;
+        MarkInCompressedFile mark{0, 0};

-        if (index_ostr)
+        if (index)
        {
            ostr_concrete->next();  /// Finish compressed block.
            mark.offset_in_compressed_file = initial_size_of_file + ostr_concrete->getCompressedBytes();
@ -125,15 +128,17 @@ void NativeWriter::write(const Block & block)
        if (rows)    /// Zero items of data is always represented as zero number of bytes.
            writeData(*column.type, column.column, ostr, 0, 0);

-        if (index_ostr)
+        if (index)
        {
-            writeStringBinary(column.name, *index_ostr);
-            writeStringBinary(column.type->getName(), *index_ostr);
-
-            writeBinary(mark.offset_in_compressed_file, *index_ostr);
-            writeBinary(mark.offset_in_decompressed_block, *index_ostr);
+            index_block.columns[i].name = column.name;
+            index_block.columns[i].type = column.type->getName();
+            index_block.columns[i].location.offset_in_compressed_file = mark.offset_in_compressed_file;
+            index_block.columns[i].location.offset_in_decompressed_block = mark.offset_in_decompressed_block;
        }
    }
+
+    if (index)
+        index->blocks.emplace_back(std::move(index_block));
 }

 }
--- a/src/Formats/NativeWriter.h
+++ b/src/Formats/NativeWriter.h
@ -9,7 +9,7 @@ namespace DB

 class WriteBuffer;
 class CompressedWriteBuffer;
-
+struct IndexForNativeFormat;

 /** Serializes the stream of blocks in their native binary format (with names and column types).
  * Designed for communication between servers.
@ -24,7 +24,7 @@ public:
      */
    NativeWriter(
        WriteBuffer & ostr_, UInt64 client_revision_, const Block & header_, bool remove_low_cardinality_ = false,
-        WriteBuffer * index_ostr_ = nullptr, size_t initial_size_of_file_ = 0);
+        IndexForNativeFormat * index_ = nullptr, size_t initial_size_of_file_ = 0);

    Block getHeader() const { return header; }
    void write(const Block & block);
@ -36,7 +36,7 @@ private:
    WriteBuffer & ostr;
    UInt64 client_revision;
    Block header;
-    WriteBuffer * index_ostr;
+    IndexForNativeFormat * index = nullptr;
    size_t initial_size_of_file;    /// The initial size of the data file, if `append` done. Used for the index.
    /// If you need to write index, then `ostr` must be a CompressedWriteBuffer.
    CompressedWriteBuffer * ostr_concrete = nullptr;
--- a/src/Storages/StorageStripeLog.cpp
+++ b/src/Storages/StorageStripeLog.cpp
@ -47,11 +47,13 @@ namespace ErrorCodes
 }


+/// NOTE: The lock `StorageStripeLog::rwlock` is NOT kept locked while reading,
+/// because we read ranges of data that do not change.
 class StripeLogSource final : public SourceWithProgress
 {
 public:
    static Block getHeader(
-        StorageStripeLog & storage,
+        const StorageStripeLog & storage,
        const StorageMetadataPtr & metadata_snapshot,
        const Names & column_names,
        IndexForNativeFormat::Blocks::const_iterator index_begin,
@ -74,19 +76,18 @@ public:
    }

    StripeLogSource(
-        StorageStripeLog & storage_,
+        const StorageStripeLog & storage_,
        const StorageMetadataPtr & metadata_snapshot_,
        const Names & column_names,
        ReadSettings read_settings_,
-        std::shared_ptr<const IndexForNativeFormat> & index_,
+        std::shared_ptr<const IndexForNativeFormat> indices_,
        IndexForNativeFormat::Blocks::const_iterator index_begin_,
        IndexForNativeFormat::Blocks::const_iterator index_end_)
-        : SourceWithProgress(
-            getHeader(storage_, metadata_snapshot_, column_names, index_begin_, index_end_))
+        : SourceWithProgress(getHeader(storage_, metadata_snapshot_, column_names, index_begin_, index_end_))
        , storage(storage_)
        , metadata_snapshot(metadata_snapshot_)
        , read_settings(std::move(read_settings_))
-        , index(index_)
+        , indices(indices_)
        , index_begin(index_begin_)
        , index_end(index_end_)
    {
@ -109,7 +110,7 @@ protected:
            {
                block_in.reset();
                data_in.reset();
-                index.reset();
+                indices.reset();
            }
        }

@ -117,13 +118,14 @@ protected:
    }

 private:
-    StorageStripeLog & storage;
+    const StorageStripeLog & storage;
    StorageMetadataPtr metadata_snapshot;
    ReadSettings read_settings;

-    std::shared_ptr<const IndexForNativeFormat> index;
+    std::shared_ptr<const IndexForNativeFormat> indices;
    IndexForNativeFormat::Blocks::const_iterator index_begin;
    IndexForNativeFormat::Blocks::const_iterator index_end;
+
    Block header;

    /** optional - to create objects only on first reading
@ -141,40 +143,45 @@ private:
            started = true;

            String data_file_path = storage.table_path + "data.bin";
-            data_in.emplace(storage.disk->readFile(data_file_path, read_settings.adjustBufferSize(storage.disk->getFileSize(data_file_path))));
+
+            /// We cannot just use `storage.file_checker` to get the size of the file here,
+            /// because `storage.rwlock` is not locked at this point.
+            size_t data_file_size = storage.disk->getFileSize(data_file_path);
+
+            data_in.emplace(storage.disk->readFile(data_file_path, read_settings.adjustBufferSize(data_file_size)));
            block_in.emplace(*data_in, 0, index_begin, index_end);
        }
    }
 };


+/// NOTE: The lock `StorageStripeLog::rwlock` is kept locked in exclusive mode while writing.
 class StripeLogSink final : public SinkToStorage
 {
 public:
+    using WriteLock = std::unique_lock<std::shared_timed_mutex>;
+
    explicit StripeLogSink(
-        StorageStripeLog & storage_, const StorageMetadataPtr & metadata_snapshot_, std::unique_lock<std::shared_timed_mutex> && lock_)
+        StorageStripeLog & storage_, const StorageMetadataPtr & metadata_snapshot_, WriteLock && lock_)
        : SinkToStorage(metadata_snapshot_->getSampleBlock())
        , storage(storage_)
        , metadata_snapshot(metadata_snapshot_)
        , lock(std::move(lock_))
-        , data_out_file(storage.table_path + "data.bin")
-        , data_out_compressed(storage.disk->writeFile(data_out_file, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append))
+        , data_out_compressed(storage.disk->writeFile(storage.data_file_path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append))
        , data_out(std::make_unique<CompressedWriteBuffer>(
-            *data_out_compressed, CompressionCodecFactory::instance().getDefaultCodec(), storage.max_compress_block_size))
-        , index_out_file(storage.table_path + "index.mrk")
-        , index_out_compressed(storage.disk->writeFile(index_out_file, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append))
-        , index_out(std::make_unique<CompressedWriteBuffer>(*index_out_compressed))
-        , block_out(*data_out, 0, metadata_snapshot->getSampleBlock(), false, index_out.get(), storage.disk->getFileSize(data_out_file))
+              *data_out_compressed, CompressionCodecFactory::instance().getDefaultCodec(), storage.max_compress_block_size))
    {
        if (!lock)
            throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED);

-        if (storage.file_checker.empty())
-        {
-            storage.file_checker.setEmpty(storage.table_path + "data.bin");
-            storage.file_checker.setEmpty(storage.table_path + "index.mrk");
-            storage.file_checker.save();
-        }
+        /// Ensure that indices are loaded because we're going to update them.
+        storage.loadIndices(lock);
+
+        /// If there were no files, save zero file sizes to be able to rollback in case of error.
+        storage.saveFileSizes(lock);
+
+        size_t initial_data_size = storage.file_checker.getFileSize(storage.data_file_path);
+        block_out = std::make_unique<NativeWriter>(*data_out, 0, metadata_snapshot->getSampleBlock(), false, &storage.indices, initial_data_size);
    }

    String getName() const override { return "StripeLogSink"; }
@ -186,12 +193,16 @@ public:
            if (!done)
            {
                /// Rollback partial writes.
+
+                /// No more writing.
                data_out.reset();
                data_out_compressed.reset();
-                index_out.reset();
-                index_out_compressed.reset();

+                /// Truncate files to the older sizes.
                storage.file_checker.repair();
+
+                /// Remove excessive indices.
+                storage.removeUnsavedIndices(lock);
            }
        }
        catch (...)
@ -202,7 +213,7 @@ public:

    void consume(Chunk chunk) override
    {
-        block_out.write(getHeader().cloneWithColumns(chunk.detachColumns()));
+        block_out->write(getHeader().cloneWithColumns(chunk.detachColumns()));
    }

    void onFinish() override
@ -213,13 +224,12 @@ public:
        data_out->next();
        data_out_compressed->next();
        data_out_compressed->finalize();
-        index_out->next();
-        index_out_compressed->next();
-        index_out_compressed->finalize();

-        storage.file_checker.update(data_out_file);
-        storage.file_checker.update(index_out_file);
-        storage.file_checker.save();
+        /// Save the new indices.
+        storage.saveIndices(lock);
+
+        /// Save the new file sizes.
+        storage.saveFileSizes(lock);

        done = true;

@ -232,15 +242,11 @@ public:
 private:
    StorageStripeLog & storage;
    StorageMetadataPtr metadata_snapshot;
-    std::unique_lock<std::shared_timed_mutex> lock;
+    WriteLock lock;

-    String data_out_file;
    std::unique_ptr<WriteBuffer> data_out_compressed;
    std::unique_ptr<CompressedWriteBuffer> data_out;
-    String index_out_file;
-    std::unique_ptr<WriteBuffer> index_out_compressed;
-    std::unique_ptr<CompressedWriteBuffer> index_out;
-    NativeWriter block_out;
+    std::unique_ptr<NativeWriter> block_out;

    bool done = false;
 };
@ -258,8 +264,10 @@ StorageStripeLog::StorageStripeLog(
    : IStorage(table_id_)
    , disk(std::move(disk_))
    , table_path(relative_path_)
-    , max_compress_block_size(max_compress_block_size_)
+    , data_file_path(table_path + "data.bin")
+    , index_file_path(table_path + "index.mrk")
    , file_checker(disk, table_path + "sizes.json")
+    , max_compress_block_size(max_compress_block_size_)
    , log(&Poco::Logger::get("StorageStripeLog"))
 {
    StorageInMemoryMetadata storage_metadata;
@ -271,6 +279,13 @@ StorageStripeLog::StorageStripeLog(
    if (relative_path_.empty())
        throw Exception("Storage " + getName() + " requires data path", ErrorCodes::INCORRECT_FILE_NAME);

+    /// Ensure the file checker is initialized.
+    if (file_checker.empty())
+    {
+        file_checker.setEmpty(data_file_path);
+        file_checker.setEmpty(index_file_path);
+    }
+
    if (!attach)
    {
        /// create directories if they do not exist
@ -290,6 +305,9 @@ StorageStripeLog::StorageStripeLog(
 }


+StorageStripeLog::~StorageStripeLog() = default;
+
+
 void StorageStripeLog::rename(const String & new_path_to_table_data, const StorageID & new_table_id)
 {
    assert(table_path != new_path_to_table_data);
@ -297,6 +315,8 @@ void StorageStripeLog::rename(const String & new_path_to_table_data, const Stora
        disk->moveDirectory(table_path, new_path_to_table_data);

        table_path = new_path_to_table_data;
+        data_file_path = table_path + "data.bin";
+        index_file_path = table_path + "index.mrk";
        file_checker.setPath(table_path + "sizes.json");
    }
    renameInMemory(new_table_id);
@ -322,41 +342,38 @@ Pipe StorageStripeLog::read(
    const size_t /*max_block_size*/,
    unsigned num_streams)
 {
-    std::shared_lock lock(rwlock, getLockTimeout(context));
+    metadata_snapshot->check(column_names, getVirtuals(), getStorageID());
+
+    auto lock_timeout = getLockTimeout(context);
+    loadIndices(lock_timeout);
+
+    ReadLock lock{rwlock, lock_timeout};
    if (!lock)
        throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED);

-    metadata_snapshot->check(column_names, getVirtuals(), getStorageID());
-
-    NameSet column_names_set(column_names.begin(), column_names.end());
-
-    Pipes pipes;
-
-    String index_file = table_path + "index.mrk";
-    if (file_checker.empty() || !disk->exists(index_file))
-    {
+    if (!file_checker.getFileSize(data_file_path))
        return Pipe(std::make_shared<NullSource>(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID())));
-    }

-    ReadSettings read_settings = context->getReadSettings();
+    auto indices_for_selected_columns
+        = std::make_shared<IndexForNativeFormat>(indices.extractIndexForColumns(NameSet{column_names.begin(), column_names.end()}));

-    CompressedReadBufferFromFile index_in(disk->readFile(index_file, read_settings.adjustBufferSize(4096)));
-    std::shared_ptr<const IndexForNativeFormat> index{std::make_shared<IndexForNativeFormat>(index_in, column_names_set)};
-
-    size_t size = index->blocks.size();
+    size_t size = indices_for_selected_columns->blocks.size();
    if (num_streams > size)
        num_streams = size;

+    ReadSettings read_settings = context->getReadSettings();
+    Pipes pipes;
+
    for (size_t stream = 0; stream < num_streams; ++stream)
    {
-        IndexForNativeFormat::Blocks::const_iterator begin = index->blocks.begin();
-        IndexForNativeFormat::Blocks::const_iterator end = index->blocks.begin();
+        IndexForNativeFormat::Blocks::const_iterator begin = indices_for_selected_columns->blocks.begin();
+        IndexForNativeFormat::Blocks::const_iterator end = indices_for_selected_columns->blocks.begin();

        std::advance(begin, stream * size / num_streams);
        std::advance(end, (stream + 1) * size / num_streams);

        pipes.emplace_back(std::make_shared<StripeLogSource>(
-            *this, metadata_snapshot, column_names, read_settings, index, begin, end));
+            *this, metadata_snapshot, column_names, read_settings, indices_for_selected_columns, begin, end));
    }

    /// We do not keep read lock directly at the time of reading, because we read ranges of data that do not change.
@ -367,7 +384,7 @@ Pipe StorageStripeLog::read(

 SinkToStoragePtr StorageStripeLog::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr context)
 {
-    std::unique_lock lock(rwlock, getLockTimeout(context));
+    WriteLock lock{rwlock, getLockTimeout(context)};
    if (!lock)
        throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED);

@ -377,17 +394,91 @@ SinkToStoragePtr StorageStripeLog::write(const ASTPtr & /*query*/, const Storage

 CheckResults StorageStripeLog::checkData(const ASTPtr & /* query */, ContextPtr context)
 {
-    std::shared_lock lock(rwlock, getLockTimeout(context));
+    ReadLock lock{rwlock, getLockTimeout(context)};
    if (!lock)
        throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED);

    return file_checker.check();
 }

+
 void StorageStripeLog::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &)
 {
+    /// Clear the table directory on disk.
     disk->clearDirectory(table_path);
-    file_checker = FileChecker{disk, table_path + "sizes.json"};
+
+    /// Reset the in-memory state so that it describes an empty table.
+    indices.clear();
+    file_checker.setEmpty(data_file_path);
+    file_checker.setEmpty(index_file_path);
+
+    /// Nothing is left to load from disk and no index block is saved anymore.
+    /// NOTE(review): `rwlock` is not taken here; presumably the TableExclusiveLockHolder
+    /// argument is what protects this state from concurrent readers and writers — confirm.
+    indices_loaded = true;
+    num_indices_saved = 0;
 }
+
+
+/// Makes sure the indices are loaded, taking `rwlock` exclusively for at most `lock_timeout`.
+/// Throws TIMEOUT_EXCEEDED if the lock cannot be acquired in time.
+void StorageStripeLog::loadIndices(std::chrono::seconds lock_timeout)
+{
+    if (!indices_loaded)
+    {
+        /// The exclusive (write) lock is taken on purpose: it prevents a data race
+        /// between two threads trying to load the indices simultaneously.
+        WriteLock exclusive_lock{rwlock, lock_timeout};
+        if (!exclusive_lock)
+            throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED);
+
+        loadIndices(exclusive_lock);
+    }
+}
+
+
+/// Reads the index file, if it exists, into `indices`. The caller passes the exclusive lock it holds.
+/// Does nothing when the indices have been loaded already.
+void StorageStripeLog::loadIndices(const WriteLock & /* already locked exclusively */)
+{
+    if (indices_loaded)
+        return;
+
+    const bool has_index_file = disk->exists(index_file_path);
+    if (has_index_file)
+    {
+        CompressedReadBufferFromFile index_buf(disk->readFile(index_file_path, ReadSettings{}.adjustBufferSize(4096)));
+        indices.read(index_buf);
+    }
+
+    /// Everything that has just been read came from the file, so it counts as saved.
+    indices_loaded = true;
+    num_indices_saved = indices.blocks.size();
+}
+
+
+/// Appends the index blocks that have not been written yet to the index file
+/// and remembers how many blocks are saved now. Does nothing if all blocks are saved.
+void StorageStripeLog::saveIndices(const WriteLock & /* already locked for writing */)
+{
+    const size_t total = indices.blocks.size();
+    if (total == num_indices_saved)
+        return;
+
+    auto file_out = disk->writeFile(index_file_path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append);
+    auto compressed_out = std::make_unique<CompressedWriteBuffer>(*file_out);
+
+    for (size_t pos = num_indices_saved; pos != total; ++pos)
+        indices.blocks[pos].write(*compressed_out);
+
+    /// Flush the compressing buffer first, then the underlying file buffer.
+    compressed_out->next();
+    file_out->next();
+    file_out->finalize();
+
+    num_indices_saved = total;
+}
+
+
+/// Drops the in-memory index blocks beyond the first `num_indices_saved` ones.
+void StorageStripeLog::removeUnsavedIndices(const WriteLock & /* already locked for writing */)
+{
+    auto & blocks = indices.blocks;
+    if (num_indices_saved < blocks.size())
+        blocks.resize(num_indices_saved);
+}
+
+
+/// Updates the file checker's entries for the data file and the index file (in that order)
+/// and then saves the file checker.
+void StorageStripeLog::saveFileSizes(const WriteLock & /* already locked for writing */)
+{
+    for (const String * path : {&data_file_path, &index_file_path})
+        file_checker.update(*path);
+
+    file_checker.save();
+}


--- a/src/Storages/StorageStripeLog.h
+++ b/src/Storages/StorageStripeLog.h
@ -7,12 +7,15 @@

 #include <Core/Defines.h>
 #include <Storages/IStorage.h>
+#include <Formats/IndexForNativeFormat.h>
 #include <Common/FileChecker.h>
 #include <Common/escapeForFileName.h>


 namespace DB
 {
+struct IndexForNativeFormat;
+
 /** Implements a table engine that is suitable for small chunks of the log.
  * In doing so, stores all the columns in a single Native file, with a nearby index.
  */
@ -23,6 +26,8 @@ class StorageStripeLog final : public shared_ptr_helper<StorageStripeLog>, publi
    friend struct shared_ptr_helper<StorageStripeLog>;

 public:
+    ~StorageStripeLog() override;
+
    String getName() const override { return "StripeLog"; }

    Pipe read(
@ -57,18 +62,36 @@ protected:
        size_t max_compress_block_size_);

 private:
-    struct ColumnData
-    {
-        String data_file_path;
-    };
-    using Files = std::map<String, ColumnData>; /// file name -> column data
+    using ReadLock = std::shared_lock<std::shared_timed_mutex>;
+    using WriteLock = std::unique_lock<std::shared_timed_mutex>;

-    DiskPtr disk;
+    /// Reads the index file if it hasn't been read yet.
+    /// It is done lazily, so that with a large number of tables, the server starts quickly.
+    void loadIndices(std::chrono::seconds lock_timeout);
+    void loadIndices(const WriteLock &);
+
+    /// Saves the index file.
+    void saveIndices(const WriteLock &);
+
+    /// Removes all unsaved indices.
+    void removeUnsavedIndices(const WriteLock &);
+
+    /// Saves the sizes of the data and index files.
+    void saveFileSizes(const WriteLock &);
+
+    const DiskPtr disk;
    String table_path;

-    size_t max_compress_block_size;
-
+    String data_file_path;
+    String index_file_path;
    FileChecker file_checker;
+
+    IndexForNativeFormat indices;
+    std::atomic<bool> indices_loaded = false;
+    size_t num_indices_saved = 0;
+
+    const size_t max_compress_block_size;
+
    std::shared_timed_mutex rwlock;

    Poco::Logger * log;