mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 16:50:48 +00:00
Fix and move Annoy
This commit is contained in:
parent
b52fb9ef12
commit
dd1df58559
56
contrib/annoy/annoy.cpp
Normal file
56
contrib/annoy/annoy.cpp
Normal file
@ -0,0 +1,56 @@
|
||||
#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <iterator>
#include <limits>
#include <map>
#include <numeric>
#include <set>
#include <utility>
#include <vector>

#include "annoy.h"
#include "settings.h"
#include "point.h"
|
||||
|
||||
|
||||
/// Builds the index: copies `points` into shared storage and grows
/// NUM_OF_TREES trees over them.
///
/// Every tree starts as a single leaf that lists the indexes of all points and
/// is then subdivided by Node::TrySplit().
///
/// @param points Data set to index; must not be empty (checked with assert).
///
/// Side effect: reseeds the global std::rand() generator from the current time.
Annoy::Annoy(const std::vector<Point>& points)
    : points_(std::make_shared<const std::vector<Point>>(points)),
      trees_(NUM_OF_TREES),
      // dim_ used to be left uninitialised by this constructor.
      dim_(points.empty() ? 0 : static_cast<int>(points.front().size())) {
    assert(!points.empty());
    std::srand(static_cast<unsigned>(std::time(nullptr)));

    // Identity permutation 0..n-1: each root initially owns every point.
    std::vector<size_t> indexes(points_->size());
    std::iota(indexes.begin(), indexes.end(), size_t{0});

    for (auto& tree : trees_) {
        tree = std::make_shared<Node>(points_);
        // Each tree gets its own copy of the index list before splitting.
        tree->data.emplace<Node::LeafData>(Node::LeafData{indexes});
        tree->TrySplit();
    }
}
|
||||
|
||||
std::vector<Point> Annoy::FindKNN(const Point& x, size_t k) const {
|
||||
std::multimap<double, std::shared_ptr<Node>> heap;
|
||||
std::map<size_t, double> candidates_set;
|
||||
for (const auto& tree : trees_) {
|
||||
heap.insert({std::numeric_limits<double>::max(), tree});
|
||||
}
|
||||
while (candidates_set.size() < k && !heap.empty()) {
|
||||
auto [dist, node] = *heap.rbegin();
|
||||
heap.erase(std::prev(heap.end()));
|
||||
if (node->IsList()) {
|
||||
for (size_t i : std::get<Node::LeafData>(node->data).indexes) {
|
||||
candidates_set[i] = ScalarMul(x - (*points_)[i], x - (*points_)[i]);
|
||||
}
|
||||
} else {
|
||||
auto& leaf_data = std::get<Node::InnerData>(node->data);
|
||||
double scalar_mul = ScalarMul(x - leaf_data.div_line_point, leaf_data.div_line_norm);
|
||||
heap.insert({std::min(-scalar_mul, dist), leaf_data.left});
|
||||
heap.insert({std::min(scalar_mul, dist), leaf_data.right});
|
||||
}
|
||||
}
|
||||
std::vector<std::pair<size_t, double>> candidates;
|
||||
candidates.reserve(candidates_set.size());
|
||||
for (const auto& x : candidates_set) {
|
||||
candidates.emplace_back(x);
|
||||
}
|
||||
std::vector<Point> result(std::min(k, candidates.size()));
|
||||
std::partial_sort(candidates.begin(), candidates.begin() + result.size(), candidates.end());
|
||||
for (int i = 0; i < result.size(); ++i) {
|
||||
result[i] = (*points_)[candidates[i].first];
|
||||
}
|
||||
return result;
|
||||
}
|
@ -13,7 +13,6 @@ class Annoy {
|
||||
std::vector<Point> FindKNN(const Point& x, size_t k) const;
|
||||
private:
|
||||
std::shared_ptr<const std::vector<Point>> points_;
|
||||
std::shared_ptr<Node> tree_;
|
||||
// std::vector<std::shared_ptr<Node>> trees_;
|
||||
std::vector<std::shared_ptr<Node>> trees_;
|
||||
int dim_;
|
||||
};
|
46
contrib/annoy/node.cpp
Normal file
46
contrib/annoy/node.cpp
Normal file
@ -0,0 +1,46 @@
|
||||
#include "node.h"
|
||||
#include "point.h"
|
||||
#include "settings.h"
|
||||
|
||||
// Small floating-point tolerance.
// NOTE(review): no code visible in node.cpp references EPS — confirm whether it is still needed.
const double EPS = 1e-5;
|
||||
|
||||
/// Creates an empty leaf that shares the point storage `points`.
/// Initialisers are listed in member declaration order (data, then points).
Node::Node(std::shared_ptr<const std::vector<Point>> points)
    : data(LeafData()),
      points(std::move(points)) {
}
|
||||
|
||||
|
||||
void Node::TrySplit() {
|
||||
if (std::get<LeafData>(data).indexes.size() <= MAX_LEAF_NODE_SIZE) {
|
||||
return;
|
||||
}
|
||||
auto indexes = std::move(std::get<LeafData>(data).indexes);
|
||||
data.emplace<InnerData>();
|
||||
auto& inner_node_data = std::get<InnerData>(data);
|
||||
|
||||
GenerateLine(inner_node_data, indexes);
|
||||
|
||||
inner_node_data.left = std::make_shared<Node>(points);
|
||||
inner_node_data.right = std::make_shared<Node>(points);
|
||||
auto& left_child_indexes = std::get<LeafData>(inner_node_data.left->data).indexes;
|
||||
auto& right_child_indexes = std::get<LeafData>(inner_node_data.right->data).indexes;
|
||||
for (size_t i : indexes) {
|
||||
if (ScalarMul((*points)[i] - inner_node_data.div_line_point, inner_node_data.div_line_norm) < 0) {
|
||||
left_child_indexes.push_back(i);
|
||||
} else {
|
||||
right_child_indexes.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
inner_node_data.left->TrySplit();
|
||||
inner_node_data.right->TrySplit();
|
||||
}
|
||||
|
||||
bool Node::IsList() const {
|
||||
return std::holds_alternative<LeafData>(data);
|
||||
}
|
||||
|
||||
/// Draws a random splitting hyperplane for the points listed in `indexes`.
///
/// Two different positions i1 < i2 are sampled with std::rand(). The plane
/// passes through the midpoint of the two selected points and its normal is the
/// vector from the first point to the second.
///
/// @param inner_node_data Receives div_line_point and div_line_norm.
/// @param indexes         Indexes into `points`; must contain at least two
///                        entries, otherwise the sampling below would take a
///                        modulo by zero.
void Node::GenerateLine(InnerData& inner_node_data, const std::vector<size_t>& indexes) {
    assert(indexes.size() >= 2);

    const size_t count = indexes.size();
    const size_t i1 = std::rand() % (count - 1);                // in [0, count - 2]
    const size_t i2 = i1 + 1 + std::rand() % (count - i1 - 1);  // in [i1 + 1, count - 1]

    const Point& first_point = (*points)[indexes[i1]];
    const Point& second_point = (*points)[indexes[i2]];
    inner_node_data.div_line_point = (first_point + second_point) * 0.5;
    inner_node_data.div_line_norm = second_point - first_point;
}
|
37
contrib/annoy/node.h
Normal file
37
contrib/annoy/node.h
Normal file
@ -0,0 +1,37 @@
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <memory>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
|
||||
/// One node of a space-partitioning tree built over a shared set of points.
///
/// A node is either a leaf (LeafData: indexes of the points it holds) or an
/// inner node (InnerData: a splitting hyperplane and two children).
struct Node {
    using Point = std::vector<double>;

    /// Payload of an inner node.
    struct InnerData {
        std::shared_ptr<Node> left;   ///< Points with a negative projection onto div_line_norm.
        std::shared_ptr<Node> right;  ///< All remaining points.
        Point div_line_point;         ///< A point lying on the splitting hyperplane.
        Point div_line_norm;          ///< Normal vector of the splitting hyperplane.
    };

    /// Payload of a leaf: indexes into `points`.
    struct LeafData {
        std::vector<size_t> indexes;
    };

    /// Current payload. A default-constructed Node holds InnerData (the first
    /// alternative) with null children.
    std::variant<InnerData, LeafData> data;

    /// NOTE(review): not read by any code visible here — presumably the point dimensionality.
    size_t dim = 0;

    /// Point storage shared by all nodes of a tree.
    std::shared_ptr<const std::vector<Point>> points;

    /// True when `data` holds LeafData.
    bool IsList() const;

    Node() = default;

    /// Creates an empty leaf over `points`. Explicit so that a shared_ptr is
    /// never converted to a Node implicitly.
    explicit Node(std::shared_ptr<const std::vector<Point>> points);

    /// Splits the leaf recursively while it holds too many indexes.
    void TrySplit();

private:
    /// Picks a random splitting hyperplane for the listed points.
    void GenerateLine(InnerData& inner_node_data, const std::vector<size_t>& indexes);
};
|
33
contrib/annoy/point.cpp
Normal file
33
contrib/annoy/point.cpp
Normal file
@ -0,0 +1,33 @@
|
||||
#include "point.h"
|
||||
|
||||
/// Dot product of two vectors.
///
/// Iterates over first.size() components, so `second` must be at least as long
/// as `first`.
///
/// @return Sum of first[i] * second[i]; 0 for empty input.
double ScalarMul(const Point& first, const Point& second) {
    double sum = 0.;
    // size_t index: the former int index was compared against an unsigned size.
    for (size_t i = 0; i < first.size(); ++i) {
        sum += first[i] * second[i];
    }
    return sum;
}
|
||||
|
||||
/// Component-wise sum. The result has first.size() components; `second` must be
/// at least as long as `first`.
Point operator+(const Point& first, const Point& second) {
    Point sum(first.size());
    size_t pos = 0;
    for (double& component : sum) {
        component = first[pos] + second[pos];
        ++pos;
    }
    return sum;
}
|
||||
|
||||
/// Negation: returns a copy of `point` with every component multiplied by -1.
Point operator-(const Point& point) {
    Point negated = point;
    for (double& component : negated) {
        component *= -1.;
    }
    return negated;
}
|
||||
|
||||
/// Component-wise difference first - second. The result has first.size()
/// components; `second` must be at least as long as `first`.
Point operator-(const Point& first, const Point& second) {
    Point difference(first.size());
    for (size_t i = 0; i < first.size(); ++i) {
        // Same value as first[i] + (-1 * second[i]).
        difference[i] = first[i] - second[i];
    }
    return difference;
}
|
||||
|
||||
/// Scales a vector: returns a copy of `point` with every component multiplied
/// by `k`.
Point operator*(const Point& point, double k) {
    Point scaled = point;
    // Range-for replaces the former int index compared against an unsigned size.
    for (double& component : scaled) {
        component *= k;
    }
    return scaled;
}
|
16
contrib/annoy/point.h
Normal file
16
contrib/annoy/point.h
Normal file
@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
/// A point / vector represented as a list of coordinates.
using Point = std::vector<double>;

/// Dot product of `lhs` and `rhs`.
double ScalarMul(const Point& lhs, const Point& rhs);

/// Component-wise sum.
Point operator+(const Point& lhs, const Point& rhs);

/// Negation of every component.
Point operator-(const Point& value);

/// Component-wise difference.
Point operator-(const Point& lhs, const Point& rhs);

/// Multiplication of every component by the scalar `k`.
Point operator*(const Point& value, double k);
|
4
contrib/annoy/settings.h
Normal file
4
contrib/annoy/settings.h
Normal file
@ -0,0 +1,4 @@
|
||||
#include <cstddef>
|
||||
|
||||
/// Number of trees the Annoy constructor builds.
constexpr size_t NUM_OF_TREES = 3;

/// A leaf holding at most this many point indexes is not split any further.
constexpr size_t MAX_LEAF_NODE_SIZE = 1;
|
@ -1,43 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
|
||||
#include "annoy.h"
|
||||
#include "point.h"
|
||||
|
||||
/// Creates an index without points; only the dimensionality `dim` is stored.
Annoy::Annoy(int dim)
    : dim_(dim) {
}
|
||||
|
||||
/// Builds a single tree over a copy of `points`.
///
/// @param points Data set to index; must not be empty (checked with assert).
///
/// Side effect: reseeds the global std::rand() generator from the current time.
Annoy::Annoy(const std::vector<Point>& points) : points_(std::make_shared<const std::vector<Point>>(points)) {
    assert(!points.empty());
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    dim_ = points_->empty() ? 0 : static_cast<int>(points_->front().size());
    tree_ = std::make_shared<Node>(dim_, points_);

    // Register every point with the root. Without this the root's index list
    // stayed empty and Split() produced a tree containing no points at all.
    tree_->indexes.resize(points_->size());
    for (size_t i = 0; i < points_->size(); ++i) {
        tree_->indexes[i] = i;
    }

    tree_->Split();
}
|
||||
|
||||
std::vector<Point> Annoy::FindKNN(const Point& x, size_t k) const {
|
||||
std::multimap<double, std::shared_ptr<Node>> heap;
|
||||
std::vector<std::pair<size_t, double>> candidates;
|
||||
heap.insert({std::numeric_limits<double>::max(), tree_});
|
||||
while (candidates.size() < k && heap.size() > 0) {
|
||||
|
||||
auto [dist, node] = *heap.begin();
|
||||
heap.erase(heap.begin());
|
||||
if (node->is_list) {
|
||||
for (size_t i : node->indexes) {
|
||||
candidates.push_back({i, ScalarMul(x - (*points_)[i], x - (*points_)[i])});
|
||||
}
|
||||
} else {
|
||||
double scalar_mul = std::fabs(ScalarMul(x - (*points_)[node->div_line_point], node->div_line_norm));
|
||||
scalar_mul = std::min(scalar_mul, dist);
|
||||
heap.insert({scalar_mul, node->left});
|
||||
heap.insert({scalar_mul, node->right});
|
||||
}
|
||||
}
|
||||
std::partial_sort(candidates.begin(), candidates.begin() + k, candidates.end());
|
||||
std::vector<Point> result(k);
|
||||
for (int i = 0; i < k; ++i) {
|
||||
result[i] = (*points_)[candidates[i].first];
|
||||
}
|
||||
return result;
|
||||
}
|
@ -1,8 +0,0 @@
|
||||
#include "annoy.h"
|
||||
|
||||
int main()
|
||||
{
|
||||
std::vector<std::vector<double>> points = {{0, 0},{0, 1}, {1, 0}, {1, 1}};
|
||||
Annoy annoy(points);
|
||||
return 0;
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
#include "node.h"
|
||||
#include "point.h"
|
||||
|
||||
/// Creates a node of dimensionality `dim` that shares the point storage
/// `points`.
Node::Node(int dim, std::shared_ptr<const std::vector<Point>> points)
    : dim(dim),
      points(points) {
}
|
||||
|
||||
void Node::Split() {
|
||||
assert(dim > 0);
|
||||
if (indexes.size() <= MAX_LEAF_NODE_SIZE) {
|
||||
is_list = true;
|
||||
return;
|
||||
}
|
||||
|
||||
size_t i1 = std::rand() % indexes.size();
|
||||
size_t i2 = std::rand() % (indexes.size() - 1);
|
||||
i2 += (i2 >= i1);
|
||||
div_line_point = i1;
|
||||
|
||||
div_line_norm = (*points)[indexes[i2]] - (*points)[indexes[i1]];
|
||||
double scalar_mul = std::fabs(ScalarMul(div_line_norm, div_line_norm)) / 2;
|
||||
|
||||
left = std::make_shared<Node>(dim, points);
|
||||
right = std::make_shared<Node>(dim, points);
|
||||
left->dim = dim;
|
||||
right->dim = dim;
|
||||
for (size_t i : indexes) {
|
||||
if (std::fabs(ScalarMul((*points)[i] - (*points)[indexes[i1]], div_line_norm)) < scalar_mul) {
|
||||
left->indexes.push_back(i);
|
||||
} else {
|
||||
right->indexes.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
left->Split();
|
||||
right->Split();
|
||||
}
|
@ -1,28 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
// Tunable constant: the maximum number of points in a tree leaf.
constexpr int MAX_LEAF_NODE_SIZE = 4;
|
||||
|
||||
/// One node of a binary space-partitioning tree over a shared set of points.
struct Node {
    using Point = std::vector<double>;

    std::shared_ptr<Node> left;   ///< First child; null until Split() creates it.
    std::shared_ptr<Node> right;  ///< Second child; null until Split() creates it.
    size_t dim = 0;               ///< Dimensionality passed to the constructor.
    std::shared_ptr<const std::vector<Point>> points;  ///< Point storage shared by all nodes.
    bool is_list = false;         ///< Set by Split() when the node is a leaf.
    std::vector<size_t> indexes;  ///< Indexes into `points` held by this node.
    size_t div_line_point = 0;    ///< Set by Split(); zero-initialised (was indeterminate before).
    Point div_line_norm;          ///< Normal of the dividing hyperplane, set by Split().

    Node() = default;

    /// Creates a node of dimensionality `dim` over the shared storage `points`.
    Node(int dim, std::shared_ptr<const std::vector<Point>> points);

    /// Recursively splits the node while it holds too many indexes.
    void Split();
};
|
@ -1,17 +0,0 @@
|
||||
#include "point.h"
|
||||
|
||||
/// Dot product of two vectors.
///
/// Fixes two defects: the accumulator was never initialised, and the loop added
/// (second[i] - first[i])^2 — a squared distance — although callers pass
/// (v, v) to obtain a squared length and (v, normal) to obtain a projection.
///
/// Iterates over first.size() components, so `second` must be at least as long
/// as `first`.
double ScalarMul(const Point& first, const Point& second) {
    double sum = 0.;
    for (size_t i = 0; i < first.size(); ++i) {
        sum += first[i] * second[i];
    }
    return sum;
}
|
||||
|
||||
/// Component-wise difference first - second. The result has first.size()
/// components; `second` must be at least as long as `first`.
Point operator-(const Point& first, const Point& second) {
    Point difference(first.size());
    size_t pos = 0;
    for (double& component : difference) {
        component = first[pos] - second[pos];
        ++pos;
    }
    return difference;
}
|
@ -1,10 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
/// A point / vector represented as a list of coordinates.
using Point = std::vector<double>;

/// Combines two vectors into a single scalar.
double ScalarMul(const Point& lhs, const Point& rhs);

/// Component-wise difference.
Point operator-(const Point& lhs, const Point& rhs);
|
Loading…
Reference in New Issue
Block a user