Add original and new Annoy

This commit is contained in:
Vladimir Makarov 2022-03-25 18:22:59 +00:00
parent 4a9fbdf4dc
commit 2f1213d25f
15 changed files with 79 additions and 80 deletions

View File

@ -90,6 +90,7 @@ add_contrib (openldap-cmake openldap)
add_contrib (grpc-cmake grpc)
add_contrib (msgpack-c-cmake msgpack-c)
add_contrib (spotify-annoy-cmake spotify-annoy)
add_contrib (annoy-cmake annoy)
if (ENABLE_FUZZING)
add_contrib (libprotobuf-mutator-cmake libprotobuf-mutator)

View File

@ -0,0 +1,21 @@
# Build script for the in-tree Annoy contrib library (contrib/annoy).
# Defines the library target `_annoy` and exposes it under the alias `ch_contrib::annoy`.

# Directory layout of the contrib checkout.
set(ANNOY_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/annoy")
set(ANNOY_SOURCE_DIR "${ANNOY_PROJECT_DIR}/src")
set(ANNOY_INCLUDE_DIR "${ANNOY_PROJECT_DIR}/include")

# Headers, given by file name and expanded to full paths under the include directory.
# NOTE(review): ANNOY_HEADERS is not referenced again in this file - confirm it is still needed.
set(ANNOY_HEADERS annoy.h node.h point.h)
list(TRANSFORM ANNOY_HEADERS PREPEND "${ANNOY_INCLUDE_DIR}/")

# Translation units compiled into the library, expanded to full paths under the source directory.
set(ANNOY_SRC annoy.cpp node.cpp point.cpp settings.cpp)
list(TRANSFORM ANNOY_SRC PREPEND "${ANNOY_SOURCE_DIR}/")

add_library(_annoy ${ANNOY_SRC})
# PUBLIC: targets linking against the library inherit the include directory;
# SYSTEM: it is passed to the compiler as a system include path.
target_include_directories(_annoy SYSTEM PUBLIC ${ANNOY_INCLUDE_DIR})
add_library(ch_contrib::annoy ALIAS _annoy)

1
contrib/annoy/Readme.md Normal file
View File

@ -0,0 +1 @@
The algorithm was inspired by Spotify's Annoy library: https://github.com/spotify/annoy.

View File

@ -5,6 +5,8 @@
#include "node.h"
namespace Annoy {
class Annoy {
using Point = std::vector<double>;
public:
@ -15,4 +17,6 @@ class Annoy {
std::shared_ptr<const std::vector<Point>> points_;
std::vector<std::shared_ptr<Node>> trees_;
int dim_;
};
};
};

View File

@ -5,6 +5,7 @@
#include <variant>
#include <vector>
namespace Annoy {
struct Node {
using Point = std::vector<double>;
@ -35,3 +36,5 @@ struct Node {
private:
void GenerateLine(InnerData& inner_node_data, const std::vector<size_t>& indexes);
};
};

View File

@ -3,6 +3,8 @@
#include <cmath>
#include <vector>
namespace Annoy {
using Point = std::vector<double>;
double ScalarMul(const Point& first, const Point& second);
@ -13,4 +15,6 @@ Point operator-(const Point& point);
Point operator-(const Point& first, const Point& second);
Point operator*(const Point& first, double k);
Point operator*(const Point& first, double k);
};

View File

@ -1,4 +0,0 @@
#include <cstddef>
// Tuning constants for the Annoy index.
// `inline constexpr` yields a single definition shared by every translation unit that
// includes this file and guarantees compile-time initialisation.

/// Number of trees an Annoy index is built with.
inline constexpr std::size_t NUM_OF_TREES = 3;

/// NOTE(review): presumably the largest number of points a leaf node may hold before it
/// is split further - confirm against the Node implementation.
inline constexpr std::size_t MAX_LEAF_NODE_SIZE = 1;

View File

@ -3,13 +3,14 @@
#include <map>
#include <set>
#include "annoy.h"
#include "settings.h"
#include "point.h"
#include "annoy/annoy.h"
#include "annoy/point.h"
#include "settings.cpp"
namespace Annoy {
Annoy::Annoy(const std::vector<Point>& points) : points_(std::make_shared<const std::vector<Point>>(points)), trees_(NUM_OF_TREES) {
assert(!points.empty());
std::srand(std::time(nullptr));
std::vector<size_t> indexes(points_->size());
for (int i = 0; i < points_->size(); ++i) {
@ -53,4 +54,6 @@ std::vector<Point> Annoy::FindKNN(const Point& x, size_t k) const {
result[i] = (*points_)[candidates[i].first];
}
return result;
}
}
};

View File

@ -1,6 +1,8 @@
#include "node.h"
#include "point.h"
#include "settings.h"
#include "annoy/node.h"
#include "annoy/point.h"
#include "settings.cpp"
namespace Annoy {
const double EPS = 1e-5;
@ -44,3 +46,5 @@ void Node::GenerateLine(InnerData& inner_node_data, const std::vector<size_t>& i
inner_node_data.div_line_point = ((*points)[indexes[i1]] + (*points)[indexes[i2]]) * 0.5;
inner_node_data.div_line_norm = (*points)[indexes[i2]] - (*points)[indexes[i1]];
}
};

View File

@ -1,4 +1,6 @@
#include "point.h"
#include "annoy/point.h"
namespace Annoy {
double ScalarMul(const Point& first, const Point& second) {
double sum = 0.;
@ -30,4 +32,6 @@ Point operator*(const Point& point, double k) {
result[i] *= k;
}
return result;
}
}
};

View File

@ -0,0 +1,8 @@
#include <cstddef>
namespace Annoy {

// Tuning constants for the Annoy index.
// `inline constexpr` yields a single definition shared by every translation unit that
// includes this file and guarantees compile-time initialisation.

/// Number of trees an Annoy index is built with.
inline constexpr std::size_t NUM_OF_TREES = 3;

/// NOTE(review): presumably the largest number of points a leaf node may hold before it
/// is split further - confirm against the Node implementation.
inline constexpr std::size_t MAX_LEAF_NODE_SIZE = 1;

}  // namespace Annoy

View File

@ -1,16 +1,17 @@
# Build description for the Spotify Annoy contrib.
# NOTE(review): this listing declares two alternative targets for the same library (the
# INTERFACE target `SpotifyAnnoy` and the regular target `_spotify_annoy`) and assigns
# several variables twice - confirm which variant is meant to remain.
cmake_minimum_required(VERSION 3.14)
# INTERFACE target: carries usage requirements only, no sources of its own.
add_library(SpotifyAnnoy INTERFACE)
# The second set() overrides the first; the first spells the directory "sporify-annoy".
set(SPOTIFY_ANNOY_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/sporify-annoy")
set(SPOTIFY_ANNOY_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/spotify-annoy")
set(SPOTIFY_ANNOY_SOURCE_DIR "${SPOTIFY_ANNOY_PROJECT_DIR}/src")
# Again the later value wins: <project>/include rather than <project>/include/annoy.
set(SPOTIFY_ANNOY_INCLUDE_DIR ${SPOTIFY_ANNOY_PROJECT_DIR}/include/annoy)
set(SPOTIFY_ANNOY_INCLUDE_DIR ${SPOTIFY_ANNOY_PROJECT_DIR}/include)
# Create the include directory and copy the three headers from src/ into it.
# file() commands run while CMake configures, not during the build.
file(MAKE_DIRECTORY ${SPOTIFY_ANNOY_INCLUDE_DIR})
foreach (HEADER annoylib.h kissrandom.h mman.h)
file(COPY ${SPOTIFY_ANNOY_PROJECT_DIR}/src/${HEADER} DESTINATION ${SPOTIFY_ANNOY_INCLUDE_DIR})
endforeach()
# NOTE(review): SPOTIFY_ANNOY_HEADERS is not referenced again in this listing.
set(SPOTIFY_ANNOY_HEADERS
${SPOTIFY_ANNOY_SOURCE_DIR}/annoylib.h
${SPOTIFY_ANNOY_SOURCE_DIR}/kissrandom.h
)
# Relative include path on the INTERFACE target.
target_include_directories(SpotifyAnnoy INTERFACE include/)
# NOTE(review): the only entry in the source list is a header (mman.h). A non-INTERFACE
# library with no compilable source may leave CMake unable to determine the linker
# language - confirm that configuration succeeds.
set(SPOTIFY_ANNOY_SRC
${SPOTIFY_ANNOY_SOURCE_DIR}/mman.h
)
# NOTE(review): this alias uses the `ch::contrib::` prefix, unlike the `ch_contrib::`
# alias declared at the end of this listing.
add_library(ch::contrib::spotify-annoy ALIAS SpotifyAnnoy)
# Regular library target; its include path points at the src/ directory.
add_library(_spotify_annoy ${SPOTIFY_ANNOY_SRC})
target_include_directories(_spotify_annoy SYSTEM PUBLIC ${SPOTIFY_ANNOY_SOURCE_DIR})
add_library(ch_contrib::spotify-annoy ALIAS _spotify_annoy)

View File

@ -523,6 +523,7 @@ endif()
dbms_target_link_libraries(PUBLIC ch_contrib::consistent_hashing)
dbms_target_link_libraries(PUBLIC ch_contrib::spotify-annoy)
dbms_target_link_libraries(PUBLIC ch_contrib::annoy)
include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake")

View File

@ -1,21 +0,0 @@
#include <Storages/MergeTree/MergeTreeIndexSimpleHnsw.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
MergeTreeIndexGranuleSimpleHnsw::MergeTreeIndexGranuleSimpleHnsw(const String & index_name_, const Block & index_sample_block_)
: index_name(index_name_)
, index_sample_block(index_sample_block_)
{}
void MergeTreeIndexGranuleMinMax::serializeBinary(WriteBuffer & /*ostr*/) const{}
void MergeTreeIndexGranuleMinMax::deserializeBinary(ReadBuffer & /*istr*/, MergeTreeIndexVersion /*version*/){}
}

View File

@ -1,31 +0,0 @@
#pragma once
#include <Storages/MergeTree/MergeTreeIndices.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/KeyCondition.h>
#include "Storages/MergeTree/MergeTreeIndexMinMax.h"
#include <memory>
#include "object.h"
#include <spotify-annoy>
namespace DB
{
/// Index granule type derived from IMergeTreeIndexGranule.
/// Holds the index name, a sample block and a batch of objects.
struct MergeTreeIndexGranuleSimpleSpotifyAnnoy final : public IMergeTreeIndexGranule
{
/// Takes the index name and the sample block for the granule.
/// The definition is not part of this file.
MergeTreeIndexGranuleSimpleSpotifyAnnoy(const String & index_name_, const Block & index_sample_block_);
~MergeTreeIndexGranuleSimpleSpotifyAnnoy() override = default;
/// Serialization hooks overriding the base-class interface; defined outside this file.
void serializeBinary(WriteBuffer & ostr) const override;
void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
/// NOTE(review): always reports the granule as empty, regardless of batch_data -
/// confirm this is intended.
bool empty() const override { return true; }
String index_name;
Block index_sample_block;
/// NOTE(review): similarity::ObjectVector is not declared in this file (presumably it
/// comes from "object.h") - confirm the providing library.
similarity::ObjectVector batch_data;
};
}