ClickHouse/src/Dictionaries/PolygonDictionary.h

294 lines
10 KiB
C++
Raw Normal View History

2019-12-02 15:26:59 +00:00
#pragma once
#include <atomic>
#include <variant>
#include <Core/Block.h>
2019-12-23 13:23:11 +00:00
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
2019-12-02 15:26:59 +00:00
#include <Common/Arena.h>
#include <boost/geometry.hpp>
#include <boost/geometry/geometries/multi_polygon.hpp>
#include "DictionaryStructure.h"
#include "IDictionary.h"
#include "IDictionarySource.h"
namespace DB
{
namespace bg = boost::geometry;
2019-12-25 17:32:02 +00:00
/** An interface for polygon dictionaries.
* Polygons are read and stored as multi_polygons from boost::geometry in Euclidean coordinates.
* An implementation should inherit from this base class and preprocess the data upon construction if needed.
* It must override the find method of this class which retrieves the polygon containing a single point.
*/
2019-12-02 15:26:59 +00:00
class IPolygonDictionary : public IDictionaryBase
{
public:
/** Controls the different types of polygons allowed as input.
* The structure of a multi-polygon is as follows:
* - A multi-polygon is represented by a nonempty array of polygons.
* - A polygon is represented by a nonempty array of rings. The first element represents the outer ring. Zero
* or more following rings are cut out from the polygon.
* - A ring is represented by a nonempty array of points.
* - A point is represented by its coordinates stored in an according structure (see below).
* A simple polygon is represented by an one-dimensional array of points, stored in the according structure.
*/
enum class InputType
{
MultiPolygon,
SimplePolygon
};
/** Controls the different types allowed for providing the coordinates of points.
* Right now a point can be represented by either an array or a tuple of two Float64 values.
*/
enum class PointType
{
Array,
Tuple,
};
2019-12-02 15:26:59 +00:00
IPolygonDictionary(
const std::string & database_,
2019-12-02 15:26:59 +00:00
const std::string & name_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
DictionaryLifetime dict_lifetime_,
InputType input_type_,
PointType point_type_);
2019-12-02 15:26:59 +00:00
const std::string & getDatabase() const override;
const std::string & getName() const override;
const std::string & getFullName() const override;
2019-12-02 15:26:59 +00:00
std::string getTypeName() const override;
2019-12-16 15:46:51 +00:00
std::string getKeyDescription() const;
2019-12-02 15:26:59 +00:00
size_t getBytesAllocated() const override;
size_t getQueryCount() const override;
double getHitRate() const override;
size_t getElementCount() const override;
double getLoadFactor() const override;
const IDictionarySource * getSource() const override;
const DictionaryStructure & getStructure() const override;
const DictionaryLifetime & getLifetime() const override;
bool isInjective(const std::string & attribute_name) const override;
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
2019-12-23 13:23:11 +00:00
template <typename T>
using ResultArrayType = std::conditional_t<IsDecimalNumber<T>, DecimalPaddedPODArray<T>, PaddedPODArray<T>>;
2019-12-24 18:21:50 +00:00
/** Functions used to retrieve attributes of specific type by key. */
2019-12-23 13:23:11 +00:00
#define DECLARE(TYPE) \
void get##TYPE( \
2019-12-23 13:39:17 +00:00
const std::string & attribute_name, const Columns & key_columns, const DataTypes &, ResultArrayType<TYPE> & out) const;
2019-12-23 13:23:11 +00:00
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(UInt128)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
DECLARE(Decimal32)
DECLARE(Decimal64)
DECLARE(Decimal128)
#undef DECLARE
2019-12-23 13:39:17 +00:00
void getString(const std::string & attribute_name, const Columns & key_columns, const DataTypes &, ColumnString * out) const;
2019-12-23 13:23:11 +00:00
#define DECLARE(TYPE) \
void get##TYPE( \
const std::string & attribute_name, \
const Columns & key_columns, \
2019-12-23 13:39:17 +00:00
const DataTypes &, \
2019-12-23 13:23:11 +00:00
const PaddedPODArray<TYPE> & def, \
ResultArrayType<TYPE> & out) const;
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(UInt128)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
DECLARE(Decimal32)
DECLARE(Decimal64)
DECLARE(Decimal128)
#undef DECLARE
2019-12-23 13:39:17 +00:00
void getString(
const std::string & attribute_name,
const Columns & key_columns,
const DataTypes &,
const ColumnString * const def,
ColumnString * const out) const;
2019-12-23 13:23:11 +00:00
#define DECLARE(TYPE) \
void get##TYPE( \
const std::string & attribute_name, \
const Columns & key_columns, \
2019-12-23 13:39:17 +00:00
const DataTypes &, \
2019-12-23 13:23:11 +00:00
const TYPE def, \
ResultArrayType<TYPE> & out) const;
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(UInt128)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
DECLARE(Decimal32)
DECLARE(Decimal64)
DECLARE(Decimal128)
#undef DECLARE
2019-12-24 18:21:50 +00:00
void getString(
const std::string & attribute_name,
const Columns & key_columns,
const DataTypes & key_types,
const String & def,
ColumnString * const out) const;
2019-12-23 13:23:11 +00:00
2019-12-24 18:21:50 +00:00
/** Checks whether or not a point can be found in one of the polygons in the dictionary.
* The check is performed for multiple points represented by columns of their x and y coordinates.
* The boolean result is written to out.
*/
// TODO: Refactor the whole dictionary design to perform stronger checks, i.e. make this an override.
2019-12-16 15:34:46 +00:00
void has(const Columns & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const;
2019-12-02 15:26:59 +00:00
/** A two-dimensional point in Euclidean coordinates. */
using Point = bg::model::point<Float64, 2, bg::cs::cartesian>;
2019-12-25 17:32:02 +00:00
/** A polygon in boost is a an outer ring of points with zero or more cut out inner rings. */
2019-12-02 15:26:59 +00:00
using Polygon = bg::model::polygon<Point>;
protected:
2019-12-24 18:21:50 +00:00
/** Returns true if the given point can be found in the polygon dictionary.
2019-12-25 17:32:02 +00:00
* If true id is set to the index of a polygon containing the given point.
2019-12-24 18:21:50 +00:00
* Overridden in different implementations of this interface.
*/
2019-12-02 15:26:59 +00:00
virtual bool find(const Point & point, size_t & id) const = 0;
std::vector<Polygon> polygons;
/** Since the original data may have been in the form of multi-polygons, an id is stored for each single polygon
* corresponding to the row in which any other attributes for this entry are located.
*/
std::vector<size_t> ids;
2019-12-24 18:21:50 +00:00
const std::string database;
2019-12-16 15:24:26 +00:00
const std::string name;
const std::string full_name;
2019-12-16 15:24:26 +00:00
const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
const DictionaryLifetime dict_lifetime;
const InputType input_type;
const PointType point_type;
2019-12-16 15:11:16 +00:00
private:
2019-12-24 18:21:50 +00:00
/** Helper functions for loading the data from the configuration.
* The polygons serving as keys are extracted into boost types.
* All other values are stored in one column per attribute.
*/
2019-12-02 15:26:59 +00:00
void createAttributes();
void blockToAttributes(const Block & block);
void loadData();
void calculateBytesAllocated();
2019-12-24 18:21:50 +00:00
/** Checks whether a given attribute exists and returns its index */
2019-12-23 13:23:11 +00:00
size_t getAttributeIndex(const std::string & attribute_name) const;
2019-12-24 18:21:50 +00:00
2019-12-27 13:06:03 +00:00
/** Helper functions to retrieve and instantiate the provided null value of an attribute.
* Since a null value is obligatory for every attribute they are simply appended to null_values defined below.
*/
2019-12-23 13:23:11 +00:00
template <typename T>
void appendNullValueImpl(const Field & null_value);
void appendNullValue(AttributeUnderlyingType type, const Field & value);
2019-12-23 13:23:11 +00:00
2019-12-24 18:21:50 +00:00
/** Helper function for retrieving the value of an attribute by key. */
2019-12-23 13:23:11 +00:00
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void getItemsImpl(size_t attribute_ind, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const;
2019-12-27 13:06:03 +00:00
/** A mapping from the names of the attributes to their index in the two vectors defined below. */
2019-12-24 18:21:50 +00:00
std::map<std::string, size_t> attribute_index_by_name;
2019-12-27 13:06:03 +00:00
/** A vector of columns storing the values of each attribute. */
2019-12-23 13:23:11 +00:00
Columns attributes;
2019-12-27 13:06:03 +00:00
/** A vector of null values corresponding to each attribute. */
std::vector<std::variant<
UInt8,
UInt16,
UInt32,
UInt64,
UInt128,
Int8,
Int16,
Int32,
Int64,
Decimal32,
Decimal64,
Decimal128,
Float32,
Float64,
String>> null_values;
2019-12-02 15:26:59 +00:00
size_t bytes_allocated = 0;
size_t element_count = 0;
mutable std::atomic<size_t> query_count{0};
2020-01-15 13:50:03 +00:00
/** Extracts a list of polygons from a column according to input_type and point_type.
* The polygons are appended to the dictionary with the corresponding ids.
2020-01-15 14:04:19 +00:00
*/
void extractPolygons(const ColumnPtr & column);
2019-12-02 15:26:59 +00:00
2019-12-24 18:21:50 +00:00
/** Extracts a list of points from two columns representing their x and y coordinates. */
2019-12-23 13:23:11 +00:00
static std::vector<Point> extractPoints(const Columns &key_columns);
2019-12-02 15:26:59 +00:00
};
2019-12-25 17:32:02 +00:00
/** Simple implementation of the polygon dictionary. Doesn't generate anything during its construction.
2019-12-24 18:21:50 +00:00
* Iterates over all stored polygons for each query, checking each of them in linear time.
2020-01-15 14:04:19 +00:00
* Retrieves the polygon with the smallest area containing the given point. If there is more than one any such polygon
* may be returned.
2019-12-24 18:21:50 +00:00
*/
2019-12-02 15:26:59 +00:00
class SimplePolygonDictionary : public IPolygonDictionary
{
2019-12-16 15:11:16 +00:00
public:
SimplePolygonDictionary(
const std::string & database_,
2019-12-16 15:11:16 +00:00
const std::string & name_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
DictionaryLifetime dict_lifetime_,
InputType input_type_,
PointType point_type_);
2019-12-16 15:24:26 +00:00
std::shared_ptr<const IExternalLoadable> clone() const override;
2019-12-02 15:26:59 +00:00
private:
bool find(const Point & point, size_t & id) const override;
};
2019-12-25 22:02:55 +00:00
}