part besides reader

This commit is contained in:
Vladimir Makarov 2022-04-15 15:43:30 +00:00
parent f14b9ec2af
commit 0914c1bb1e
5 changed files with 75 additions and 1 deletions

View File

@ -0,0 +1,23 @@
#pragma once
#include <Storages/MergeTree/IMergeTreeIndices.h>
namespace DB
{
class IMergeTreeIndexReturnIdCondition : public IMergeTreeIndexCondition {
public:
virtual ~IMergeTreeIndexReturnIdCondition() override = default;
virtual bool alwaysUnknownOrTrue() const override = 0;
virtual bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const override = 0;
virtual std::vector<int32_t> returnIdRecords(MergeTreeIndexGranulePtr granule) const = 0;
};
using MergeTreeIndexReturnIdConditionPtr = std::shared_ptr<IMergeTreeIndexReturnIdCondition>;
using MergeTreeIndexReturnIdConditions = std::vector<MergeTreeIndexReturnIdConditionPtr>;
}

View File

@ -0,0 +1,15 @@
#include <Storage/MergeTree/MarkRange.h>
namespace DB
{
struct MarkRangeSelective
{
MarkRangeSelective() = default;
MarkRangeSelective(MarkRange range, const std::vector<size_t>& selected) : MarkRange(range), selected(selected) {}
MarkRangeSelective(MarkRange range, std::vector<size_t>&& selected) : MarkRange(range), selected(std::move(selected)) {}
std::vector<size_t> selected;
}
}

View File

@ -41,6 +41,9 @@
#include <Storages/MergeTree/StorageFromMergeTreeDataPart.h>
#include <IO/WriteBufferFromOStream.h>
#include <Storages/MergeTree/IMergeTreeIndexReturnIdCondition.h>
#include <Storages/MergeTree/MarkRangeSelective.h>
namespace DB
{
@ -1545,6 +1548,8 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex(
MarkRanges res;
auto return_id_condition = dynamic_cast<MergeTreeIndexReturnIdConditionPtr>(condition);
/// Some granules can cover two or more ranges,
/// this variable is stored to avoid reading the same granule twice.
MergeTreeIndexGranulePtr granule = nullptr;
@ -1573,6 +1578,10 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex(
continue;
}
if (return_id_condition)
{
    /// returnIdRecords() yields int32_t ids while MarkRangeSelective stores size_t,
    /// so the ids are converted element by element.
    /// NOTE(review): a negative id would wrap to a huge size_t — confirm ids are never negative.
    const auto ids = return_id_condition->returnIdRecords(granule);

    /// NOTE(review): `res` is declared as MarkRanges; if that container stores MarkRange
    /// by value, the `selected` list is sliced off on insertion — confirm the intended container.
    /// NOTE(review): control falls through to the regular push/merge of `data_range`
    /// right after this statement, so the range may be added a second time — confirm this is intended.
    res.push_back(MarkRangeSelective(data_range, std::vector<size_t>(ids.begin(), ids.end())));
}
if (res.empty() || res.back().end - data_range.begin > min_marks_for_seek)
res.push_back(data_range);
else

View File

@ -28,6 +28,8 @@ namespace Annoy
/// Tuning constants for the Annoy index.
constexpr int NUM_OF_TREES{20};
constexpr int DIMENSION{512};
/// Number of result slots reserved, and number of neighbours requested, by returnIdRecords().
constexpr int32_t LIMIT{10};
template<typename Dist>
void AnnoyIndexSerialize<Dist>::serialize(WriteBuffer& ostr) const
{
@ -197,6 +199,27 @@ bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr i
return dist[0] < min_distance;
}
/// Asks the Annoy index stored in `idx_granule` for the items nearest to the
/// target vector of the parsed expression and returns their ids.
///
/// At most LIMIT ids are requested. The expression must have been parsed
/// successfully (see alwaysUnknownOrTrue()), and `idx_granule` must be a
/// MergeTreeIndexGranuleAnnoy whose index is an Annoy::AnnoyIndexSerialize<>.
std::vector<int32_t> MergeTreeIndexConditionAnnoy::returnIdRecords(MergeTreeIndexGranulePtr idx_granule) const
{
    // TODO: Change assert to the exception
    assert(expression.has_value());

    /// The index only reads the query vector, so a reference avoids copying it on every call.
    const std::vector<float> & target_vec = expression.value().target;

    auto granule = std::dynamic_pointer_cast<MergeTreeIndexGranuleAnnoy>(idx_granule);
    // TODO: Change assert to the exception
    assert(granule != nullptr);

    auto annoy = std::dynamic_pointer_cast<Annoy::AnnoyIndexSerialize<>>(granule->index_base);
    // TODO: Change assert to the exception
    assert(annoy != nullptr);

    /// Upper limit on the size of the internal queue used by the search.
    constexpr int search_k = 200;

    std::vector<int32_t> items;
    items.reserve(LIMIT);

    /// LIMIT is the number of nearest neighbours requested; distances are not needed, hence nullptr.
    /// data() is used instead of &target_vec[0] so an empty target vector is not indexed.
    annoy->get_nns_by_vector(target_vec.data(), LIMIT, search_k, &items, nullptr);
    return items;
}
bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const
{
return !expression.has_value();

View File

@ -4,6 +4,8 @@
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/KeyCondition.h>
#include <Storages/MergeTree/IMergeTreeIndexReturnIdCondition.h>
#include <annoylib.h>
#include <kissrandom.h>
@ -69,7 +71,7 @@ struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator
};
class MergeTreeIndexConditionAnnoy final : public IMergeTreeIndexCondition
class MergeTreeIndexConditionAnnoy final : public IMergeTreeIndexReturnIdCondition
{
public:
MergeTreeIndexConditionAnnoy(
@ -81,6 +83,8 @@ public:
bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
std::vector<int32_t> returnIdRecords(MergeTreeIndexGranulePtr granule) const override;
~MergeTreeIndexConditionAnnoy() override = default;
private:
// Type of the vector to use as a target in the distance function