diff --git a/contrib/annoy/annoy.cpp b/contrib/annoy/annoy.cpp
new file mode 100644
index 00000000000..1f73db3812c
--- /dev/null
+++ b/contrib/annoy/annoy.cpp
@@ -0,0 +1,85 @@
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <ctime>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "annoy.h"
+#include "settings.h"
+#include "point.h"
+
+// Builds NUM_OF_TREES trees over a private copy of `points`. Every tree starts
+// as a single leaf holding all point indexes and is then split recursively by
+// Node::TrySplit(). `points` must not be empty.
+Annoy::Annoy(const std::vector<Point>& points)
+    : points_(std::make_shared<std::vector<Point>>(points)),
+      trees_(NUM_OF_TREES),
+      // Initialised here so that the member never holds an indeterminate value.
+      dim_(points.empty() ? 0 : static_cast<int>(points.front().size())) {
+    assert(!points.empty());
+    // Node::GenerateLine() draws from std::rand(); seed the generator here.
+    std::srand(static_cast<unsigned>(std::time(nullptr)));
+
+    std::vector<size_t> indexes(points_->size());
+    std::iota(indexes.begin(), indexes.end(), size_t{0});
+    for (auto& tree : trees_) {
+        tree = std::make_shared<Node>(points_);
+        tree->data.emplace<Node::LeafData>(Node::LeafData{indexes});
+        tree->TrySplit();
+    }
+}
+
+// Returns up to k stored points ordered by increasing squared Euclidean
+// distance to x. The search is approximate: subtrees stop being expanded as
+// soon as at least k distinct candidates were collected from visited leaves.
+std::vector<Point> Annoy::FindKNN(const Point& x, size_t k) const {
+    // Max-priority queue of subtrees. The key is the smallest signed margin of x
+    // over the splits on the path to the subtree; the largest key is taken first.
+    std::multimap<double, std::shared_ptr<Node>> heap;
+    // Point index -> squared distance to x; the map de-duplicates across trees.
+    std::map<size_t, double> candidates_set;
+    for (const auto& tree : trees_) {
+        heap.emplace(std::numeric_limits<double>::max(), tree);
+    }
+    while (candidates_set.size() < k && !heap.empty()) {
+        const auto top = std::prev(heap.end());
+        const double dist = top->first;
+        const std::shared_ptr<Node> node = top->second;
+        heap.erase(top);
+        if (node->IsList()) {
+            for (size_t i : std::get<Node::LeafData>(node->data).indexes) {
+                const Point diff = x - (*points_)[i];
+                candidates_set[i] = ScalarMul(diff, diff);
+            }
+        } else {
+            const auto& inner_data = std::get<Node::InnerData>(node->data);
+            const double margin = ScalarMul(x - inner_data.div_line_point, inner_data.div_line_norm);
+            // Node::TrySplit() puts the points with a negative margin into `left`.
+            heap.emplace(std::min(-margin, dist), inner_data.left);
+            heap.emplace(std::min(margin, dist), inner_data.right);
+        }
+    }
+
+    std::vector<std::pair<size_t, double>> candidates(candidates_set.begin(), candidates_set.end());
+    const size_t count = std::min(k, candidates.size());
+    // Order by distance. The default pair comparison would order by point index
+    // and hand back the lowest-numbered candidates instead of the nearest ones.
+    std::partial_sort(candidates.begin(), candidates.begin() + static_cast<std::ptrdiff_t>(count),
+                      candidates.end(),
+                      [](const auto& lhs, const auto& rhs) { return lhs.second < rhs.second; });
+
+    std::vector<Point> result;
+    result.reserve(count);
+    for (size_t i = 0; i < count; ++i) {
+        result.push_back((*points_)[candidates[i].first]);
+    }
+    return result;
+}
\ No newline at end of file
diff --git a/src/Algorithms/Annoy/annoy.h b/contrib/annoy/annoy.h
similarity index 81%
rename from 
src/Algorithms/Annoy/annoy.h
rename to contrib/annoy/annoy.h
index 096a305951d..9c920dca1a3 100644
--- a/src/Algorithms/Annoy/annoy.h
+++ b/contrib/annoy/annoy.h
@@ -13,7 +13,6 @@ class Annoy {
     std::vector FindKNN(const Point& x, size_t k) const;
 private:
     std::shared_ptr> points_;
-    std::shared_ptr tree_;
-    // std::vector> trees_;
+    std::vector<std::shared_ptr<Node>> trees_;
     int dim_;
 };
\ No newline at end of file
diff --git a/contrib/annoy/node.cpp b/contrib/annoy/node.cpp
new file mode 100644
index 00000000000..d1b5e0a0896
--- /dev/null
+++ b/contrib/annoy/node.cpp
@@ -0,0 +1,86 @@
+#include <cassert>
+#include <cstdlib>
+#include <utility>
+#include <vector>
+
+#include "node.h"
+#include "point.h"
+#include "settings.h"
+
+namespace {
+
+// Upper bound on the random split lines tried for one node before it is left as
+// an oversized leaf.
+constexpr int MAX_SPLIT_ATTEMPTS = 8;
+
+}  // namespace
+
+// Creates an empty leaf that shares ownership of the point set.
+Node::Node(std::shared_ptr<std::vector<Point>> points) : data(LeafData()), points(std::move(points)) {}
+
+// Turns an oversized leaf into an inner node with two child leaves and splits
+// the children recursively. A leaf with at most MAX_LEAF_NODE_SIZE indexes is
+// left untouched. Must only be called while the node holds LeafData.
+void Node::TrySplit() {
+    const std::vector<size_t>& indexes = std::get<LeafData>(data).indexes;
+    if (indexes.size() <= MAX_LEAF_NODE_SIZE) {
+        return;
+    }
+
+    for (int attempt = 0; attempt < MAX_SPLIT_ATTEMPTS; ++attempt) {
+        InnerData inner_node_data;
+        GenerateLine(inner_node_data, indexes);
+
+        std::vector<size_t> left_child_indexes;
+        std::vector<size_t> right_child_indexes;
+        for (size_t i : indexes) {
+            if (ScalarMul((*points)[i] - inner_node_data.div_line_point, inner_node_data.div_line_norm) < 0) {
+                left_child_indexes.push_back(i);
+            } else {
+                right_child_indexes.push_back(i);
+            }
+        }
+        // Coinciding pivot points give a zero normal, which sends every index to
+        // the right child; recursing on that child would never terminate. Draw
+        // another pair instead.
+        if (left_child_indexes.empty() || right_child_indexes.empty()) {
+            continue;
+        }
+
+        inner_node_data.left = std::make_shared<Node>(points);
+        inner_node_data.right = std::make_shared<Node>(points);
+        std::get<LeafData>(inner_node_data.left->data).indexes = std::move(left_child_indexes);
+        std::get<LeafData>(inner_node_data.right->data).indexes = std::move(right_child_indexes);
+
+        // Replacing the variant destroys the leaf payload that `indexes` refers
+        // to, so nothing below may touch `indexes`.
+        const std::shared_ptr<Node> left = inner_node_data.left;
+        const std::shared_ptr<Node> right = inner_node_data.right;
+        data.emplace<InnerData>(std::move(inner_node_data));
+        left->TrySplit();
+        right->TrySplit();
+        return;
+    }
+    // Every attempt was degenerate (e.g. all points of this node coincide): the
+    // node stays a leaf even though it exceeds MAX_LEAF_NODE_SIZE.
+}
+
+// True when the node currently holds LeafData.
+bool Node::IsList() const {
+    return std::holds_alternative<LeafData>(data);
+}
+
+// Picks two distinct positions of `indexes` at random and stores the
+// perpendicular bisector of the two points: their midpoint as the point on the
+// dividing hyperplane and their difference as its (unnormalised) normal.
+// Requires indexes.size() >= 2.
+void Node::GenerateLine(InnerData& inner_node_data, const std::vector<size_t>& indexes) {
+    assert(indexes.size() >= 2);
+    // i1 lies in [0, size - 2] and i2 in [i1 + 1, size - 1], hence i1 < i2.
+    const size_t i1 = std::rand() % (indexes.size() - 1);
+    const size_t i2 = i1 + 1 + std::rand() % (indexes.size() - i1 - 1);
+    const Point& first = (*points)[indexes[i1]];
+    const Point& second = (*points)[indexes[i2]];
+    inner_node_data.div_line_point = (first + second) * 0.5;
+    inner_node_data.div_line_norm = second - first;
+}
diff --git a/contrib/annoy/node.h b/contrib/annoy/node.h
new file mode 100644
index 00000000000..ea7b9abf7f6
--- /dev/null
+++ b/contrib/annoy/node.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <variant>
+#include <vector>
+
+
+// One node of a search tree: either a leaf listing point indexes or an inner
+// node holding a dividing hyperplane and two children.
+struct Node {
+    using Point = std::vector<double>;
+
+    struct InnerData {
+        std::shared_ptr<Node> left;
+        std::shared_ptr<Node> right;
+        Point div_line_point;  // a point on the dividing hyperplane
+        Point div_line_norm;   // normal of the dividing hyperplane (not normalised)
+    };
+    struct LeafData {
+        std::vector<size_t> indexes;  // positions in *points
+    };
+
+    // LeafData comes first so that a default-constructed Node is an empty leaf.
+    std::variant<LeafData, InnerData> data;
+
+    size_t dim = 0;
+    std::shared_ptr<std::vector<Point>> points;
+
+    bool IsList() const;
+
+    Node() = default;
+
+    explicit Node(std::shared_ptr<std::vector<Point>> points);
+
+    void TrySplit();
+
+private:
+    void GenerateLine(InnerData& inner_node_data, const std::vector<size_t>& indexes);
+};
diff --git a/contrib/annoy/point.cpp b/contrib/annoy/point.cpp
new file mode 100644
index 00000000000..fac1c81cfa7
--- /dev/null
+++ b/contrib/annoy/point.cpp
@@ -0,0 +1,49 @@
+#include <cassert>
+#include <cstddef>
+
+#include "point.h"
+
+// Dot product of two points of equal dimension.
+double ScalarMul(const Point& first, const Point& second) {
+    assert(first.size() == second.size());
+    double sum = 0.;
+    for (size_t i = 0; i < first.size(); ++i) {
+        sum += first[i] * second[i];
+    }
+    return sum;
+}
+
+// Component-wise sum of two points of equal dimension.
+Point operator+(const Point& first, const Point& second) {
+    assert(first.size() == second.size());
+    Point result(first.size());
+    for (size_t i = 0; i < first.size(); ++i) {
+        result[i] = first[i] + second[i];
+    }
+    return result;
+}
+
+// Component-wise negation.
+Point operator-(const Point& point) {
+    return point * (-1.);
+}
+
+// Component-wise difference, computed directly rather than as first + (-second)
+// so that no temporary vectors are allocated.
+Point operator-(const Point& first, const Point& second) {
+    assert(first.size() == second.size());
+    Point result(first.size());
+    for (size_t i = 0; i < first.size(); ++i) {
+        result[i] = first[i] - second[i];
+    }
+    return result;
+}
+
+// Scales every component by k.
+Point operator*(const Point& point, double k) {
+    Point result = point;
+    for (double& value : result) {
+        value *= k;
+    }
+    return result;
+}
\ No newline at end of file
diff --git a/contrib/annoy/point.h b/contrib/annoy/point.h
new file mode 100644
index 00000000000..efd7adf1721
--- /dev/null
+++ b/contrib/annoy/point.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+// A point is a dense coordinate vector. The binary operations below expect both
+// operands to have the same number of coordinates.
+using Point = std::vector<double>;
+
+double ScalarMul(const Point& first, const Point& second);
+
+Point operator+(const Point& first, const Point& second);
+
+Point operator-(const Point& point);
+
+Point operator-(const Point& first, const Point& second);
+
+Point operator*(const Point& point, double k);
\ No newline at end of file
diff --git a/contrib/annoy/settings.h b/contrib/annoy/settings.h
new file mode 100644
index 00000000000..40328552f89
--- /dev/null
+++ b/contrib/annoy/settings.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include <cstddef>
+
+// Number of independent trees built by Annoy.
+constexpr size_t NUM_OF_TREES = 3;
+// Largest number of point indexes a leaf may hold before Node::TrySplit() splits it.
+constexpr size_t MAX_LEAF_NODE_SIZE = 1;
\ No newline at end of file
diff --git a/src/Algorithms/Annoy/annoy.cpp b/src/Algorithms/Annoy/annoy.cpp
deleted file mode 100644
index f8a6718c7a1..00000000000
--- a/src/Algorithms/Annoy/annoy.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-#include
-#include
-#include
-
-#include "annoy.h"
-#include "point.h"
-
-Annoy::Annoy(int dim) : dim_(dim) {}
-
-Annoy::Annoy(const std::vector& points) : points_(std::make_shared>(points)) {
-    assert(!points.empty());
-    std::srand(std::time(nullptr));
-    dim_ = (*points_)[0].size();
-    tree_ = std::make_shared(dim_, points_);
-    tree_->Split();
-}
-
-std::vector Annoy::FindKNN(const Point& x, size_t k) const {
-    std::multimap> heap;
-    std::vector> candidates;
-    heap.insert({std::numeric_limits::max(), tree_});
-    while (candidates.size() < k && heap.size() > 0) {
-
-        auto [dist, node] = *heap.begin();
-        heap.erase(heap.begin());
-        if (node->is_list) {
-            for (size_t i : node->indexes) {
-                candidates.push_back({i, ScalarMul(x - (*points_)[i], x - (*points_)[i])});
-            }
-        } else {
-            double scalar_mul = std::fabs(ScalarMul(x - (*points_)[node->div_line_point], node->div_line_norm));
-            scalar_mul = std::min(scalar_mul, dist);
-            heap.insert({scalar_mul, node->left});
-            heap.insert({scalar_mul, node->right});
-        }
-    }
-    std::partial_sort(candidates.begin(), candidates.begin() + k, candidates.end());
-    std::vector result(k);
-    for (int i = 0; i < k; ++i) {
-        result[i] = (*points_)[candidates[i].first];
-    }
-    return result;
-}
\ No newline at end of file
diff --git a/src/Algorithms/Annoy/main.cpp 
b/src/Algorithms/Annoy/main.cpp deleted file mode 100644 index c18d1d9ae7b..00000000000 --- a/src/Algorithms/Annoy/main.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#include "annoy.h" - -int main() -{ - std::vector> points = {{0, 0},{0, 1}, {1, 0}, {1, 1}}; - Annoy annoy(points); - return 0; -} \ No newline at end of file diff --git a/src/Algorithms/Annoy/node.cpp b/src/Algorithms/Annoy/node.cpp deleted file mode 100644 index 07f19f0b267..00000000000 --- a/src/Algorithms/Annoy/node.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "node.h" -#include "point.h" - -Node::Node(int dim, std::shared_ptr> points) : dim(dim), points(points) {} - -void Node::Split() { - assert(dim > 0); - if (indexes.size() <= MAX_LEAF_NODE_SIZE) { - is_list = true; - return; - } - - size_t i1 = std::rand() % indexes.size(); - size_t i2 = std::rand() % (indexes.size() - 1); - i2 += (i2 >= i1); - div_line_point = i1; - - div_line_norm = (*points)[indexes[i2]] - (*points)[indexes[i1]]; - double scalar_mul = std::fabs(ScalarMul(div_line_norm, div_line_norm)) / 2; - - left = std::make_shared(dim, points); - right = std::make_shared(dim, points); - left->dim = dim; - right->dim = dim; - for (size_t i : indexes) { - if (std::fabs(ScalarMul((*points)[i] - (*points)[indexes[i1]], div_line_norm)) < scalar_mul) { - left->indexes.push_back(i); - } else { - right->indexes.push_back(i); - } - } - - left->Split(); - right->Split(); -} diff --git a/src/Algorithms/Annoy/node.h b/src/Algorithms/Annoy/node.h deleted file mode 100644 index e7994355980..00000000000 --- a/src/Algorithms/Annoy/node.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -// Подбираемая константа - максимальное количество точек в листе дерева -const int MAX_LEAF_NODE_SIZE = 4; - -struct Node { - using Point = std::vector; - std::shared_ptr left; - std::shared_ptr right; - size_t dim = 0; - std::shared_ptr> points; - bool is_list = false; - std::vector indexes; - size_t div_line_point; - Point 
div_line_norm; - - Node() = default; - - Node(int dim, std::shared_ptr> points); - - void Split(); -}; diff --git a/src/Algorithms/Annoy/point.cpp b/src/Algorithms/Annoy/point.cpp deleted file mode 100644 index 147c869597f..00000000000 --- a/src/Algorithms/Annoy/point.cpp +++ /dev/null @@ -1,17 +0,0 @@ -#include "point.h" - -double ScalarMul(const Point& first, const Point& second) { - double sum; - for (int i = 0; i < first.size(); ++i) { - sum += (second[i] - first[i]) * (second[i] - first[i]); - } - return sum; -} - -Point operator-(const Point& first, const Point& second) { - std::vector result(first.size()); - for (size_t i = 0; i < first.size(); ++i) { - result[i] = first[i] - second[i]; - } - return result; -} \ No newline at end of file diff --git a/src/Algorithms/Annoy/point.h b/src/Algorithms/Annoy/point.h deleted file mode 100644 index 22147fd0135..00000000000 --- a/src/Algorithms/Annoy/point.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include -#include - -using Point = std::vector; - -double ScalarMul(const Point& first, const Point& second); - -Point operator-(const Point& first, const Point& second); \ No newline at end of file