Fix and move Annoy

This commit is contained in:
Vladimir Makarov 2022-03-24 19:40:09 +00:00
parent b52fb9ef12
commit dd1df58559
13 changed files with 193 additions and 143 deletions

56
contrib/annoy/annoy.cpp Normal file
View File

@ -0,0 +1,56 @@
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <utility>
#include <vector>

#include "annoy.h"
#include "settings.h"
#include "point.h"
/// Builds the index: copies `points` into shared storage and grows
/// NUM_OF_TREES trees over them.
///
/// Every tree starts as a single leaf holding the index of every point and is
/// then split recursively by Node::TrySplit().
///
/// @param points  Data set to index; must not be empty (checked by assert).
Annoy::Annoy(const std::vector<Point>& points)
    : points_(std::make_shared<const std::vector<Point>>(points)), trees_(NUM_OF_TREES) {
    assert(!points.empty());
    // Seed std::rand(), which the nodes draw from when choosing split points.
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    // Record the dimensionality of the data so the member is never left
    // uninitialized.
    dim_ = static_cast<int>(points_->front().size());

    // 0, 1, ..., n-1: the root of every tree initially owns all points.
    std::vector<size_t> indexes(points_->size());
    std::iota(indexes.begin(), indexes.end(), size_t{0});

    for (auto& tree : trees_) {
        tree = std::make_shared<Node>(points_);
        tree->data.emplace<Node::LeafData>(Node::LeafData{indexes});
        tree->TrySplit();
    }
}
std::vector<Point> Annoy::FindKNN(const Point& x, size_t k) const {
std::multimap<double, std::shared_ptr<Node>> heap;
std::map<size_t, double> candidates_set;
for (const auto& tree : trees_) {
heap.insert({std::numeric_limits<double>::max(), tree});
}
while (candidates_set.size() < k && !heap.empty()) {
auto [dist, node] = *heap.rbegin();
heap.erase(std::prev(heap.end()));
if (node->IsList()) {
for (size_t i : std::get<Node::LeafData>(node->data).indexes) {
candidates_set[i] = ScalarMul(x - (*points_)[i], x - (*points_)[i]);
}
} else {
auto& leaf_data = std::get<Node::InnerData>(node->data);
double scalar_mul = ScalarMul(x - leaf_data.div_line_point, leaf_data.div_line_norm);
heap.insert({std::min(-scalar_mul, dist), leaf_data.left});
heap.insert({std::min(scalar_mul, dist), leaf_data.right});
}
}
std::vector<std::pair<size_t, double>> candidates;
candidates.reserve(candidates_set.size());
for (const auto& x : candidates_set) {
candidates.emplace_back(x);
}
std::vector<Point> result(std::min(k, candidates.size()));
std::partial_sort(candidates.begin(), candidates.begin() + result.size(), candidates.end());
for (int i = 0; i < result.size(); ++i) {
result[i] = (*points_)[candidates[i].first];
}
return result;
}

View File

@ -13,7 +13,6 @@ class Annoy {
std::vector<Point> FindKNN(const Point& x, size_t k) const;
private:
std::shared_ptr<const std::vector<Point>> points_;
std::shared_ptr<Node> tree_;
// std::vector<std::shared_ptr<Node>> trees_;
std::vector<std::shared_ptr<Node>> trees_;
int dim_;
};

46
contrib/annoy/node.cpp Normal file
View File

@ -0,0 +1,46 @@
#include "node.h"
#include "point.h"
#include "settings.h"
// NOTE(review): EPS is not referenced in the visible part of this file;
// presumably a floating-point comparison tolerance - confirm or remove.
const double EPS = 1e-5;
Node::Node(std::shared_ptr<const std::vector<Point>> points) : points(std::move(points)), data(LeafData()) {}
void Node::TrySplit() {
if (std::get<LeafData>(data).indexes.size() <= MAX_LEAF_NODE_SIZE) {
return;
}
auto indexes = std::move(std::get<LeafData>(data).indexes);
data.emplace<InnerData>();
auto& inner_node_data = std::get<InnerData>(data);
GenerateLine(inner_node_data, indexes);
inner_node_data.left = std::make_shared<Node>(points);
inner_node_data.right = std::make_shared<Node>(points);
auto& left_child_indexes = std::get<LeafData>(inner_node_data.left->data).indexes;
auto& right_child_indexes = std::get<LeafData>(inner_node_data.right->data).indexes;
for (size_t i : indexes) {
if (ScalarMul((*points)[i] - inner_node_data.div_line_point, inner_node_data.div_line_norm) < 0) {
left_child_indexes.push_back(i);
} else {
right_child_indexes.push_back(i);
}
}
inner_node_data.left->TrySplit();
inner_node_data.right->TrySplit();
}
bool Node::IsList() const {
return std::holds_alternative<LeafData>(data);
}
/// Chooses the splitting hyperplane for a node.
///
/// Two different positions i1 < i2 of `indexes` are drawn with std::rand().
/// The hyperplane passes through the midpoint of the two referenced points and
/// its normal is the vector from the first point to the second.
///
/// @param inner_node_data  Receives div_line_point and div_line_norm.
/// @param indexes          Indexes (into `points`) owned by the node; must
///                         contain at least two entries.
void Node::GenerateLine(InnerData& inner_node_data, const std::vector<size_t>& indexes) {
    // With fewer than two entries the modulo operands below would be zero or
    // wrap around.
    assert(indexes.size() >= 2);

    const size_t i1 = std::rand() % (indexes.size() - 1);           // in [0, n-2]
    const size_t i2 = i1 + 1 + std::rand() % (indexes.size() - i1 - 1);  // in (i1, n-1]

    const Point& first_point = (*points)[indexes[i1]];
    const Point& second_point = (*points)[indexes[i2]];
    inner_node_data.div_line_point = (first_point + second_point) * 0.5;
    inner_node_data.div_line_norm = second_point - first_point;
}

37
contrib/annoy/node.h Normal file
View File

@ -0,0 +1,37 @@
#pragma once
#include <cassert>
#include <memory>
#include <variant>
#include <vector>
/// One node of a space-partitioning tree over a shared set of points.
///
/// A node is either a leaf (LeafData: the indexes of the points it owns) or an
/// inner node (InnerData: a splitting hyperplane and two children).
struct Node {
    using Point = std::vector<double>;

    /// Payload of an inner node.
    struct InnerData {
        std::shared_ptr<Node> left;   ///< Child for points with a negative offset from the hyperplane.
        std::shared_ptr<Node> right;  ///< Child for all remaining points.
        Point div_line_point;         ///< A point lying on the splitting hyperplane.
        Point div_line_norm;          ///< Normal vector of the splitting hyperplane.
    };

    /// Payload of a leaf node.
    struct LeafData {
        std::vector<size_t> indexes;  ///< Indexes into `points` owned by this leaf.
    };

    /// Current payload. Explicitly starts as an empty leaf: a value-initialized
    /// variant would hold InnerData with null children, which IsList() reports
    /// as an inner node.
    std::variant<InnerData, LeafData> data{LeafData{}};
    size_t dim = 0;
    /// Shared, read-only storage of every indexed point.
    std::shared_ptr<const std::vector<Point>> points;

    /// True when the node is a leaf (`data` holds LeafData).
    bool IsList() const;

    Node() = default;
    Node(std::shared_ptr<const std::vector<Point>> points);

    /// Splits the leaf recursively while it holds too many points.
    void TrySplit();

private:
    /// Picks the splitting hyperplane from two points referenced by `indexes`.
    void GenerateLine(InnerData& inner_node_data, const std::vector<size_t>& indexes);
};

33
contrib/annoy/point.cpp Normal file
View File

@ -0,0 +1,33 @@
#include "point.h"
/// Returns the scalar (dot) product of two vectors.
///
/// Iterates over `first.size()` coordinates, so `second` must have at least as
/// many elements as `first`.
double ScalarMul(const Point& first, const Point& second) {
    double sum = 0.;
    // size_t index: same type as size(), as in operator+; avoids the
    // signed/unsigned comparison an int counter causes.
    for (size_t i = 0; i < first.size(); ++i) {
        sum += first[i] * second[i];
    }
    return sum;
}
/// Coordinate-wise sum of two vectors.
///
/// The result has `first.size()` elements; `second` must be at least as long.
Point operator+(const Point& first, const Point& second) {
    Point sum(first);
    for (size_t pos = 0; pos != sum.size(); ++pos) {
        sum[pos] += second[pos];
    }
    return sum;
}
/// Unary minus: every coordinate multiplied by -1.
Point operator-(const Point& point) {
    const double minus_one = -1.;
    return point * minus_one;
}
/// Coordinate-wise difference `first - second`.
///
/// Computed in a single pass; the former `first + (-second)` form allocated
/// two temporary vectors for the same values. The result has `first.size()`
/// elements; `second` must be at least as long.
Point operator-(const Point& first, const Point& second) {
    Point result(first);
    for (size_t i = 0; i < result.size(); ++i) {
        result[i] -= second[i];
    }
    return result;
}
/// Scales a vector: returns a copy of `point` with every coordinate
/// multiplied by `k`.
Point operator*(const Point& point, double k) {
    Point result = point;
    // Range-for instead of an int counter compared against size().
    for (double& coordinate : result) {
        coordinate *= k;
    }
    return result;
}

16
contrib/annoy/point.h Normal file
View File

@ -0,0 +1,16 @@
#pragma once
#include <cmath>
#include <vector>
// A point / vector: one double per coordinate.
using Point = std::vector<double>;
// Scalar (dot) product: sum of first[i] * second[i] over first.size() coordinates.
double ScalarMul(const Point& first, const Point& second);
// Coordinate-wise sum; the result has first.size() elements.
Point operator+(const Point& first, const Point& second);
// Unary minus: the point multiplied by -1.
Point operator-(const Point& point);
// Coordinate-wise difference, computed as first + (-second).
Point operator-(const Point& first, const Point& second);
// Copy of the point with every coordinate multiplied by k.
Point operator*(const Point& first, double k);

4
contrib/annoy/settings.h Normal file
View File

@ -0,0 +1,4 @@
// Guard against multiple inclusion: without it a second #include in the same
// translation unit would redefine the constants below.
#pragma once

#include <cstddef>

// Number of trees the Annoy index builds.
inline constexpr std::size_t NUM_OF_TREES = 3;
// Largest number of points a tree node keeps before it is split further.
inline constexpr std::size_t MAX_LEAF_NODE_SIZE = 1;

View File

@ -1,43 +0,0 @@
#include <algorithm>
#include <limits>
#include <map>
#include "annoy.h"
#include "point.h"
Annoy::Annoy(int dim) : dim_(dim) {}
/// Builds the index: copies `points` into shared storage and grows one tree
/// over them.
///
/// @param points  Data set to index; must not be empty (checked by assert).
Annoy::Annoy(const std::vector<Point>& points)
    : points_(std::make_shared<const std::vector<Point>>(points)) {
    assert(!points.empty());
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    dim_ = static_cast<int>(points_->front().size());

    tree_ = std::make_shared<Node>(dim_, points_);
    // The root must own every point before splitting. The Node constructor
    // leaves `indexes` empty, which made Split() produce an empty leaf.
    tree_->indexes.reserve(points_->size());
    for (size_t i = 0; i < points_->size(); ++i) {
        tree_->indexes.push_back(i);
    }
    tree_->Split();
}
std::vector<Point> Annoy::FindKNN(const Point& x, size_t k) const {
std::multimap<double, std::shared_ptr<Node>> heap;
std::vector<std::pair<size_t, double>> candidates;
heap.insert({std::numeric_limits<double>::max(), tree_});
while (candidates.size() < k && heap.size() > 0) {
auto [dist, node] = *heap.begin();
heap.erase(heap.begin());
if (node->is_list) {
for (size_t i : node->indexes) {
candidates.push_back({i, ScalarMul(x - (*points_)[i], x - (*points_)[i])});
}
} else {
double scalar_mul = std::fabs(ScalarMul(x - (*points_)[node->div_line_point], node->div_line_norm));
scalar_mul = std::min(scalar_mul, dist);
heap.insert({scalar_mul, node->left});
heap.insert({scalar_mul, node->right});
}
}
std::partial_sort(candidates.begin(), candidates.begin() + k, candidates.end());
std::vector<Point> result(k);
for (int i = 0; i < k; ++i) {
result[i] = (*points_)[candidates[i].first];
}
return result;
}

View File

@ -1,8 +0,0 @@
#include "annoy.h"
/// Smoke test: builds an Annoy index over the four corners of the unit square.
int main() {
    const std::vector<std::vector<double>> corners{{0, 0}, {0, 1}, {1, 0}, {1, 1}};
    Annoy annoy(corners);
    return 0;
}

View File

@ -1,35 +0,0 @@
#include "node.h"
#include "point.h"
Node::Node(int dim, std::shared_ptr<const std::vector<Point>> points) : dim(dim), points(points) {}
/// Recursively splits the node while it owns more than MAX_LEAF_NODE_SIZE
/// points; otherwise marks it as a leaf.
///
/// Two different owned points are sampled with std::rand(). The normal is the
/// vector between them, and each point is sent left or right depending on how
/// its offset from the first sampled point compares with half of
/// ScalarMul(norm, norm).
void Node::Split() {
    assert(dim > 0);
    if (indexes.size() <= MAX_LEAF_NODE_SIZE) {
        is_list = true;
        return;
    }

    // Two different positions inside `indexes`.
    const size_t i1 = std::rand() % indexes.size();
    size_t i2 = std::rand() % (indexes.size() - 1);
    i2 += (i2 >= i1);

    // Store the index of the POINT, not its position inside `indexes`: callers
    // look the value up in the shared point array.
    div_line_point = indexes[i1];
    div_line_norm = (*points)[indexes[i2]] - (*points)[indexes[i1]];
    const double scalar_mul = std::fabs(ScalarMul(div_line_norm, div_line_norm)) / 2;

    left = std::make_shared<Node>(dim, points);
    right = std::make_shared<Node>(dim, points);
    left->dim = dim;
    right->dim = dim;
    for (size_t i : indexes) {
        if (std::fabs(ScalarMul((*points)[i] - (*points)[indexes[i1]], div_line_norm)) < scalar_mul) {
            left->indexes.push_back(i);
        } else {
            right->indexes.push_back(i);
        }
    }

    // Coincident sampled points give a zero normal and put everything on one
    // side; recursing would repeat the same split forever. Keep the node as a
    // leaf instead.
    if (left->indexes.empty() || right->indexes.empty()) {
        left.reset();
        right.reset();
        is_list = true;
        return;
    }

    left->Split();
    right->Split();
}

View File

@ -1,28 +0,0 @@
#pragma once
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <memory>
#include <vector>
// Tunable constant: the maximum number of points in a tree leaf
const int MAX_LEAF_NODE_SIZE = 4;
/// One node of a space-partitioning tree over a shared set of points.
struct Node {
    using Point = std::vector<double>;

    std::shared_ptr<Node> left;   ///< First child; null until Split() creates it.
    std::shared_ptr<Node> right;  ///< Second child; null until Split() creates it.
    size_t dim = 0;               ///< Dimensionality of the points.
    /// Shared, read-only storage of every indexed point.
    std::shared_ptr<const std::vector<Point>> points;
    bool is_list = false;         ///< Set by Split() when the node stays a leaf.
    std::vector<size_t> indexes;  ///< Indexes into `points` owned by this node.
    /// Reference point of the split; value-initialized so that reading it
    /// before Split() ran is well defined.
    size_t div_line_point = 0;
    Point div_line_norm;          ///< Normal vector of the split.

    Node() = default;
    Node(int dim, std::shared_ptr<const std::vector<Point>> points);

    /// Splits the node recursively while it owns too many points.
    void Split();
};

View File

@ -1,17 +0,0 @@
#include "point.h"
/// Returns the scalar (dot) product of two vectors.
///
/// The accumulator is now initialized; it used to start from an indeterminate
/// value. The body also summed squared coordinate differences, which is not a
/// scalar product and made ScalarMul(v, v) collapse to zero for every v.
/// Iterates over `first.size()` coordinates, so `second` must have at least as
/// many elements as `first`.
double ScalarMul(const Point& first, const Point& second) {
    double sum = 0.;
    for (size_t i = 0; i < first.size(); ++i) {
        sum += first[i] * second[i];
    }
    return sum;
}
/// Coordinate-wise difference `first - second`.
///
/// The result has `first.size()` elements; `second` must be at least as long.
Point operator-(const Point& first, const Point& second) {
    Point difference(first);
    for (size_t pos = 0; pos != difference.size(); ++pos) {
        difference[pos] -= second[pos];
    }
    return difference;
}

View File

@ -1,10 +0,0 @@
#pragma once
#include <cmath>
#include <vector>
// A point / vector: one double per coordinate.
using Point = std::vector<double>;
// Combines two vectors into one double; presumably meant as the scalar (dot)
// product - confirm against the definition in point.cpp.
double ScalarMul(const Point& first, const Point& second);
// Coordinate-wise difference first - second; the result has first.size() elements.
Point operator-(const Point& first, const Point& second);