ClickHouse/src/Storages/StorageMemory.cpp

#include <cassert>
#include <Common/Exception.h>

#include <DataStreams/IBlockInputStream.h>

#include <Interpreters/MutationsInterpreter.h>
#include <Storages/StorageFactory.h>
#include <Storages/StorageMemory.h>
#include <Storages/MemorySettings.h>

#include <IO/WriteHelpers.h>
#include <Processors/Sources/SourceWithProgress.h>
#include <Processors/Pipe.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}

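
/// Reads a snapshot of the table data. Each column of a LazyBlock is stored as a
/// function that materializes the column on demand, so columns can be kept compressed
/// in memory and are decompressed only when actually read.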
class MemorySource : public SourceWithProgress
{
    using InitializerFunc = std::function<void(std::shared_ptr<const LazyBlocks> &)>;

public:
    /// We hold an immutable snapshot of the data (shared_ptr to const LazyBlocks).
    /// No synchronisation is needed in this reader, because writers never modify the
    /// snapshot we hold: they publish a new one instead (see MemoryBlockOutputStream).
    MemorySource(
        Names column_names_,
        const StorageMemory & storage,
        const StorageMetadataPtr & metadata_snapshot,
        std::shared_ptr<const LazyBlocks> data_,
        std::shared_ptr<std::atomic<size_t>> parallel_execution_index_,
        InitializerFunc initializer_func_ = {})
        : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID()))
        , column_names_and_types(metadata_snapshot->getColumns().getAllWithSubcolumns().addTypes(std::move(column_names_)))
        , data(data_)
        , parallel_execution_index(parallel_execution_index_)
        , initializer_func(std::move(initializer_func_))
    {
        for (const auto & elem : column_names_and_types)
            column_positions.push_back(metadata_snapshot->getSampleBlock().getPositionByName(elem.getNameInStorage()));
    }

    String getName() const override { return "Memory"; }

protected:
    Chunk generate() override
    {
        if (initializer_func)
        {
            initializer_func(data);
            initializer_func = {};
        }

        size_t current_index = getAndIncrementExecutionIndex();

        if (current_index >= data->size())
        {
            return {};
        }

        const LazyBlock & src = (*data)[current_index];

        Columns columns;
        columns.reserve(column_names_and_types.size());

        /// Add only the required columns to `columns`, materializing each lazy column.
        size_t i = 0;
        for (const auto & elem : column_names_and_types)
        {
            auto current_column = src[column_positions[i]]();

            if (elem.isSubcolumn())
                columns.emplace_back(elem.getTypeInStorage()->getSubcolumn(elem.getSubcolumnName(), *current_column));
            else
                columns.emplace_back(std::move(current_column));

            ++i;
        }

        size_t rows = columns.at(0)->size();
        return Chunk(std::move(columns), rows);
    }

private:
    /// With several parallel sources, an atomic index shared between them decides which
    /// block each source reads next; a single source falls back to a local counter.
    size_t getAndIncrementExecutionIndex()
    {
        if (parallel_execution_index)
        {
            return (*parallel_execution_index)++;
        }
        else
        {
            return execution_index++;
        }
    }

    const NamesAndTypesList column_names_and_types;
    size_t execution_index = 0;
    std::shared_ptr<const LazyBlocks> data;
    std::shared_ptr<std::atomic<size_t>> parallel_execution_index;
    InitializerFunc initializer_func;
    std::vector<size_t> column_positions;
};
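
/// Accumulates inserted blocks locally and publishes them on writeSuffix() in a
/// copy-on-write manner: the current snapshot is copied, extended with the new
/// blocks, and atomically swapped in.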
class MemoryBlockOutputStream : public IBlockOutputStream
{
public:
    MemoryBlockOutputStream(
        StorageMemory & storage_,
        const StorageMetadataPtr & metadata_snapshot_)
        : storage(storage_)
        , metadata_snapshot(metadata_snapshot_)
    {
    }

    Block getHeader() const override { return metadata_snapshot->getSampleBlock(); }

    void write(const Block & block) override
    {
        metadata_snapshot->check(block, true);

        inserted_bytes += block.allocatedBytes();
        inserted_rows += block.rows();

        Block sample = metadata_snapshot->getSampleBlock();

        /// Wrap each column into a lazy column: either compressed in memory,
        /// or captured as-is and returned on demand.
        LazyColumns lazy_columns;
        lazy_columns.reserve(sample.columns());

        for (const auto & elem : sample)
        {
            const ColumnPtr & column = block.getByName(elem.name).column;

            if (storage.compress)
                lazy_columns.emplace_back(column->compress());
            else
                lazy_columns.emplace_back([=]{ return column; });
        }

        new_blocks.emplace_back(std::move(lazy_columns));
    }

    void writeSuffix() override
    {
        std::lock_guard lock(storage.mutex);

        auto new_data = std::make_unique<LazyBlocks>(*(storage.data.get()));
        new_data->insert(new_data->end(), new_blocks.begin(), new_blocks.end());

        storage.data.set(std::move(new_data));
        storage.total_size_bytes.fetch_add(inserted_bytes, std::memory_order_relaxed);
        storage.total_size_rows.fetch_add(inserted_rows, std::memory_order_relaxed);
    }

private:
    LazyBlocks new_blocks;
    size_t inserted_bytes = 0;
    size_t inserted_rows = 0;

    StorageMemory & storage;
    StorageMetadataPtr metadata_snapshot;
};
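
/// Table data lives behind a multi-version holder: readers take an immutable snapshot
/// with data.get(), writers build a new LazyBlocks and publish it with data.set().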
StorageMemory::StorageMemory(
    const StorageID & table_id_,
    ColumnsDescription columns_description_,
    ConstraintsDescription constraints_,
    bool compress_)
    : IStorage(table_id_), data(std::make_unique<const LazyBlocks>()), compress(compress_)
{
    StorageInMemoryMetadata storage_metadata;
    storage_metadata.setColumns(std::move(columns_description_));
    storage_metadata.setConstraints(std::move(constraints_));
    setInMemoryMetadata(storage_metadata);
}


Pipe StorageMemory::read(
    const Names & column_names,
    const StorageMetadataPtr & metadata_snapshot,
    SelectQueryInfo & /*query_info*/,
    const Context & /*context*/,
    QueryProcessingStage::Enum /*processed_stage*/,
    size_t /*max_block_size*/,
    unsigned num_streams)
{
    metadata_snapshot->check(column_names, getVirtuals(), getStorageID());

    if (delay_read_for_global_subqueries)
    {
        /// Note: for global subqueries we use a single source.
        /// The main reason is that at this point the table is empty,
        /// and we don't know how many blocks are going to be inserted into it.
        ///
        /// It may seem suboptimal, but the data from such a table is only used to fill
        /// a set for IN or a hash table for JOIN, which can't be done concurrently.
        /// Since no other manipulation with the data is done, multiple sources shouldn't give any profit.
        return Pipe(std::make_shared<MemorySource>(
            column_names,
            *this,
            metadata_snapshot,
            nullptr /* data */,
            nullptr /* parallel execution index */,
            [this](std::shared_ptr<const LazyBlocks> & data_to_initialize)
            {
                data_to_initialize = data.get();
            }));
    }

    auto current_data = data.get();
    size_t size = current_data->size();

    if (num_streams > size)
        num_streams = size;

    Pipes pipes;

    auto parallel_execution_index = std::make_shared<std::atomic<size_t>>(0);

    for (size_t stream = 0; stream < num_streams; ++stream)
    {
        pipes.emplace_back(std::make_shared<MemorySource>(column_names, *this, metadata_snapshot, current_data, parallel_execution_index));
    }

    return Pipe::unitePipes(std::move(pipes));
}


BlockOutputStreamPtr StorageMemory::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/)
{
    return std::make_shared<MemoryBlockOutputStream>(*this, metadata_snapshot);
}


void StorageMemory::drop()
{
    data.set(std::make_unique<LazyBlocks>());
    total_size_bytes.store(0, std::memory_order_relaxed);
    total_size_rows.store(0, std::memory_order_relaxed);
}
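
/// Overwrite the mutated columns of `old_block` with the corresponding columns of
/// `new_block`, matching positions by column name; other columns are left untouched.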
static inline void updateBlockData(LazyBlock & old_block, const LazyBlock & new_block, const Block & old_header, const Block & new_header)
{
    size_t i = 0;
    for (const auto & it : new_header)
    {
        old_block[old_header.getPositionByName(it.name)] = new_block[i];
        ++i;
    }
}


void StorageMemory::mutate(const MutationCommands & commands, const Context & context)
{
    std::lock_guard lock(mutex);
    auto metadata_snapshot = getInMemoryMetadataPtr();
    auto storage = getStorageID();
    auto storage_ptr = DatabaseCatalog::instance().getTable(storage, context);
    auto interpreter = std::make_unique<MutationsInterpreter>(storage_ptr, metadata_snapshot, commands, context, true);
    auto in = interpreter->execute();

    Block old_header = metadata_snapshot->getSampleBlock();
    Block mutation_header = in->getHeader();

    in->readPrefix();
    LazyBlocks out;
    while (Block block = in->read())
    {
        LazyColumns lazy_columns;

        for (const auto & elem : block)
        {
            if (compress)
                lazy_columns.emplace_back(elem.column->compress());
            else
                lazy_columns.emplace_back([=]{ return elem.column; });
        }

        out.emplace_back(std::move(lazy_columns));
    }
    in->readSuffix();

    std::unique_ptr<LazyBlocks> new_data;

    /// All columns affected: the mutation output replaces the old data entirely.
    if (interpreter->isAffectingAllColumns())
    {
        new_data = std::make_unique<LazyBlocks>(out);
    }
    else
    {
        /// Only some of the columns are affected: copy the old data and overwrite
        /// the mutated columns with the new ones.
        new_data = std::make_unique<LazyBlocks>(*(data.get()));

        auto data_it = new_data->begin();
        auto out_it = out.begin();

        while (data_it != new_data->end())
        {
            /// Mutation does not change the number of blocks.
            assert(out_it != out.end());

            updateBlockData(*data_it, *out_it, old_header, mutation_header);

            ++data_it;
            ++out_it;
        }

        assert(out_it == out.end());
    }

    /// Disabled: with lazy (possibly compressed) columns the sizes are not available
    /// without materializing every block.
    /* size_t rows = 0;
    size_t bytes = 0;
    for (const auto & buffer : *new_data)
    {
        rows += buffer.rows();
        bytes += buffer.bytes();
    }
    total_size_rows.store(rows, std::memory_order_relaxed);
    total_size_bytes.store(bytes, std::memory_order_relaxed); */

    data.set(std::move(new_data));
}


void StorageMemory::truncate(
    const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &)
{
    data.set(std::make_unique<LazyBlocks>());
    total_size_bytes.store(0, std::memory_order_relaxed);
    total_size_rows.store(0, std::memory_order_relaxed);
}


std::optional<UInt64> StorageMemory::totalRows(const Settings &) const
{
    /// All modifications of these counters are done under a mutex, which automatically guarantees synchronization/consistency.
    /// When run concurrently, we are fine with any value: "before" or "after".
    return total_size_rows.load(std::memory_order_relaxed);
}


std::optional<UInt64> StorageMemory::totalBytes(const Settings &) const
{
    return total_size_bytes.load(std::memory_order_relaxed);
}
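
/// The Memory engine takes no engine arguments but accepts storage-level settings
/// (see MemorySettings). A usage sketch, assuming the `compress` setting:
///
///     CREATE TABLE t (x UInt64) ENGINE = Memory SETTINGS compress = 1;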
void registerStorageMemory(StorageFactory & factory)
{
    factory.registerStorage("Memory", [](const StorageFactory::Arguments & args)
    {
        if (!args.engine_args.empty())
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                "Engine {} doesn't support any arguments ({} given)",
                args.engine_name, args.engine_args.size());

        bool has_settings = args.storage_def->settings;
        MemorySettings settings;
        if (has_settings)
            settings.loadFromQuery(*args.storage_def);

        return StorageMemory::create(args.table_id, args.columns, args.constraints, settings.compress);
    },
    {
        .supports_settings = true,
        .supports_parallel_insert = true,
    });
}

}