Merge pull request #8086 from ClickHouse/geodist-less-wrong

Make the code of geodist less wrong.
2024-11-21 15:12:02 +00:00 · 2019-12-09 20:38:29 +03:00 · 2019-12-09 20:38:29 +03:00 · 36ca1a0a24
commit 36ca1a0a24
parent 39bb8e724c 1e3b9af66f
5 changed files with 182 additions and 102 deletions
--- a/dbms/src/Functions/greatCircleDistance.cpp
+++ b/dbms/src/Functions/greatCircleDistance.cpp
@ -14,70 +14,102 @@
 namespace DB
 {

-namespace ErrorCodes
-{
-    extern const int ARGUMENT_OUT_OF_BOUND;
-    extern const int ILLEGAL_COLUMN;
-    extern const int LOGICAL_ERROR;
-}
-
-/** https://en.wikipedia.org/wiki/Great-circle_distance
- *
- *  The function calculates distance in meters between two points on Earth specified by longitude and latitude in degrees.
- *  The function uses great circle distance formula https://en.wikipedia.org/wiki/Great-circle_distance .
- *  Throws exception when one or several input values are not within reasonable bounds.
- *  Latitude must be in [-90, 90], longitude must be [-180, 180].
- *  Original code of this implementation of this function is here https://github.com/sphinxsearch/sphinx/blob/409f2c2b5b2ff70b04e38f92b6b1a890326bad65/src/sphinxexpr.cpp#L3825.
- *  Andrey Aksenov, the author of original code, permitted to use this code in ClickHouse under the Apache 2.0 license.
- *  Presentation about this code from Highload++ Siberia 2019 is here https://github.com/ClickHouse/ClickHouse/files/3324740/1_._._GEODIST_._.pdf
- *  The main idea of this implementation is optimisations based on Taylor series, trigonometric identity and calculated constants once for cosine, arcsine(sqrt) and look up table.
- */
+/** Calculates the distance between two geographical locations.
+  * There are two variants:
+  * greatCircleDistance: calculates the distance on a sphere: https://en.wikipedia.org/wiki/Great-circle_distance
+  * geoDistance: calculates the distance on WGS-84 ellipsoid.
+  *
+  * The function calculates distance in meters between two points on Earth specified by longitude and latitude in degrees.
+  *
+  * Latitude must be in [-90, 90], longitude must be [-180, 180].
+  *
+  * The original code of this implementation is here:
+  * https://github.com/sphinxsearch/sphinx/blob/409f2c2b5b2ff70b04e38f92b6b1a890326bad65/src/sphinxexpr.cpp#L3825
+  * Andrey Aksenov, the author of the original code, permitted the use of this code in ClickHouse under the Apache 2.0 license.
+  * A presentation about this code from Highload++ Siberia 2019 is here: https://github.com/ClickHouse/ClickHouse/files/3324740/1_._._GEODIST_._.pdf
+  * The main idea of this implementation is a set of optimisations based on Taylor series, trigonometric identities,
+  *  and constants calculated once into lookup tables (for cosine, arcsine(sqrt) and the metric coefficients).
+  */

 namespace
 {

 constexpr double PI = 3.14159265358979323846;
-constexpr float TO_RADF = static_cast<float>(PI / 180.0);
-constexpr float TO_RADF2 = static_cast<float>(PI / 360.0);
+constexpr float RAD_IN_DEG = static_cast<float>(PI / 180.0);  /// multiply degrees by this to get radians
+constexpr float RAD_IN_DEG_HALF = static_cast<float>(PI / 360.0);  /// multiply degrees by this to get half of the angle in radians (haversine uses sin^2(angle / 2))

-constexpr size_t GEODIST_TABLE_COS = 1024; // maxerr 0.00063%
-constexpr size_t GEODIST_TABLE_ASIN = 512;
-constexpr size_t GEODIST_TABLE_K = 1024;
+constexpr size_t COS_LUT_SIZE = 1024; // maxerr 0.00063%; must stay a power of two because (COS_LUT_SIZE - 1) is used as an index mask
+constexpr size_t ASIN_SQRT_LUT_SIZE = 512;  /// number of intervals of the asin(sqrt(x)) table over x in [0, 1]
+constexpr size_t METRIC_LUT_SIZE = 1024;  /// number of latitude intervals of the metric tables over [-90, 90] degrees; (METRIC_LUT_SIZE - 1) is used as an index mask
+
+/** We use "WGS-84 ellipsoidal quadratic mean radius of Earth" as the approximation to calculate distances on sphere.
+  * The motivation for it is explained here: https://math.wikia.org/wiki/Ellipsoidal_quadratic_mean_radius
+  *
+  * Brief explanation:
+  * - the radius of the sphere is chosen to minimize the difference between the distance on that sphere and the distance on the WGS-84 ellipsoid between two points,
+  *   averaged uniformly (?) by all angles (?) between points.
+  * This is not clear enough to me: what set are we averaging over, and by what measure?
+  *
+  * The value should be calculated this way:
+  * WITH 6378137.0 AS a, 6356752.314245 AS b SELECT sqrt(3 * a * a + b * b) / 2
+  *
+  * But for an unknown reason, a slightly different value is used.
+  * This constant may be changed in the future, with a note about the backward incompatible change in the changelog.
+  *
+  * See also:
+  * https://github.com/Project-OSRM/osrm-backend/blob/bb1f4a025a3cefd3598a38b9d3e55485d1080ec5/third_party/libosmium/include/osmium/geom/haversine.hpp#L58-L59
+  * https://github.com/Project-OSRM/osrm-backend/issues/5051
+  * https://github.com/mapbox/turf-swift/issues/26
+  * https://github.com/Project-OSRM/osrm-backend/pull/5041
+  * https://en.wikipedia.org/wiki/Talk:Great-circle_distance/Archive_1
+  */
+constexpr float EARTH_RADIUS = 6372797.560856;
+constexpr float EARTH_DIAMETER = 2 * EARTH_RADIUS;
+
+
+float cos_lut[COS_LUT_SIZE + 1];       /// cos(x) table: entry i holds cos(2 * pi * i / COS_LUT_SIZE)
+float asin_sqrt_lut[ASIN_SQRT_LUT_SIZE + 1]; /// asin(sqrt(x)) * earth_diameter table: entry i corresponds to x = i / ASIN_SQRT_LUT_SIZE

+float sphere_metric_lut[METRIC_LUT_SIZE + 1];    /// sphere metric: the squared distance for one degree across longitude depending on latitude
+float wgs84_metric_lut[2 * (METRIC_LUT_SIZE + 1)];  /// ellipsoid metric: the squared distance across one degree of latitude (even indexes) / longitude (odd indexes) depending on latitude

-float g_GeoCos[GEODIST_TABLE_COS + 1];        /// cos(x) table
-float g_GeoAsin[GEODIST_TABLE_ASIN + 1];    /// asin(sqrt(x)) table
-float g_GeoFlatK[GEODIST_TABLE_K + 1][2];    /// geodistAdaptive() flat ellipsoid method k1, k2 coeffs table

 inline double sqr(double v)
 {
    return v * v;
 }

-inline float fsqr(float v)
+inline float sqrf(float v)
 {
    return v * v;
 }

 void geodistInit()
 {
-    for (size_t i = 0; i <= GEODIST_TABLE_COS; ++i)
-        g_GeoCos[i] = static_cast<float>(cos(2 * PI * i / GEODIST_TABLE_COS)); // [0, 2 * pi] -> [0, COSTABLE]
+    for (size_t i = 0; i <= COS_LUT_SIZE; ++i)
+        cos_lut[i] = static_cast<float>(cos(2 * PI * i / COS_LUT_SIZE)); // [0, 2 * pi] -> [0, COS_LUT_SIZE]; the extra last entry lets interpolation read index i + 1

-    for (size_t i = 0; i <= GEODIST_TABLE_ASIN; ++i)
-        g_GeoAsin[i] = static_cast<float>(asin(
-                sqrt(static_cast<double>(i) / GEODIST_TABLE_ASIN))); // [0, 1] -> [0, ASINTABLE]
+    for (size_t i = 0; i <= ASIN_SQRT_LUT_SIZE; ++i)
+        asin_sqrt_lut[i] = static_cast<float>(EARTH_DIAMETER * asin(
+            sqrt(static_cast<double>(i) / ASIN_SQRT_LUT_SIZE))); // [0, 1] -> [0, ASIN_SQRT_LUT_SIZE]; values are pre-multiplied by EARTH_DIAMETER

-    for (size_t i = 0; i <= GEODIST_TABLE_K; ++i)
+    for (size_t i = 0; i <= METRIC_LUT_SIZE; ++i)
     {
-        double x = PI * i / GEODIST_TABLE_K - PI * 0.5; // [-pi / 2, pi / 2] -> [0, KTABLE]
-        g_GeoFlatK[i][0] = static_cast<float>(sqr(111132.09 - 566.05 * cos(2 * x) + 1.20 * cos(4 * x)));
-        g_GeoFlatK[i][1] = static_cast<float>(sqr(111415.13 * cos(x) - 94.55 * cos(3 * x) + 0.12 * cos(5 * x)));
+        double latitude = i * (PI / METRIC_LUT_SIZE) - PI * 0.5; // [-pi / 2, pi / 2] -> [0, METRIC_LUT_SIZE]
+
+        /// Squared metric coefficients (for the distance in meters) on a tangent plane, for latitude and longitude (in degrees),
+        /// depending on the latitude (in radians).
+
+        wgs84_metric_lut[i * 2] = static_cast<float>(sqr(111132.09 - 566.05 * cos(2 * latitude) + 1.20 * cos(4 * latitude)));  /// even slot: latitude coefficient
+        wgs84_metric_lut[i * 2 + 1] = static_cast<float>(sqr(111415.13 * cos(latitude) - 94.55 * cos(3 * latitude) + 0.12 * cos(5 * latitude)));  /// odd slot: longitude coefficient
+
+        sphere_metric_lut[i] = static_cast<float>(sqr((EARTH_DIAMETER * PI / 360) * cos(latitude)));  /// (EARTH_DIAMETER * PI / 360) is the length of one degree of a great circle
     }
 }

 inline float geodistDegDiff(float f)
 {
-    f = static_cast<float>(fabs(f));
+    f = fabsf(f);
    while (f > 360)
        f -= 360;
    if (f > 180)
@ -87,50 +119,113 @@ inline float geodistDegDiff(float f)

 inline float geodistFastCos(float x)
 {
-    float y = static_cast<float>(fabs(x) * GEODIST_TABLE_COS / PI / 2);
-    int i = static_cast<int>(y);
+    float y = fabsf(x) * (COS_LUT_SIZE / PI / 2);  /// table position for x in radians; cos is even, so |x| is used; 2 * pi maps to COS_LUT_SIZE
+    size_t i = static_cast<size_t>(y);  /// integer part: index of the left table entry (y keeps the fractional part below)
     y -= i;
-    i &= (GEODIST_TABLE_COS - 1);
-    return g_GeoCos[i] + (g_GeoCos[i + 1] - g_GeoCos[i]) * y;
+    i &= (COS_LUT_SIZE - 1);  /// wrap the index into a single period of the table
+    return cos_lut[i] + (cos_lut[i + 1] - cos_lut[i]) * y;  /// linear interpolation between two adjacent table entries
 }

 inline float geodistFastSin(float x)
 {
-    float y = static_cast<float>(fabs(x) * GEODIST_TABLE_COS / PI / 2);
-    int i = static_cast<int>(y);
+    float y = fabsf(x) * (COS_LUT_SIZE / PI / 2);  /// table position for |x| in radians; NOTE: because of fabsf the result is sin(|x|), not sin(x), for negative x
+    size_t i = static_cast<size_t>(y);  /// integer part: index of the left table entry (y keeps the fractional part below)
     y -= i;
-    i = (i - GEODIST_TABLE_COS / 4) & (GEODIST_TABLE_COS - 1); // cos(x - pi / 2) = sin(x), costable / 4 = pi / 2
-    return g_GeoCos[i] + (g_GeoCos[i + 1] - g_GeoCos[i]) * y;
+    i = (i - COS_LUT_SIZE / 4) & (COS_LUT_SIZE - 1); // cos(x - pi / 2) = sin(x), COS_LUT_SIZE / 4 corresponds to pi / 2; i is unsigned, so the subtraction may wrap around and the mask brings it back into the table
+    return cos_lut[i] + (cos_lut[i + 1] - cos_lut[i]) * y;  /// linear interpolation between two adjacent table entries
 }

 /// fast implementation of asin(sqrt(x))
 /// max error in floats 0.00369%, in doubles 0.00072%
 inline float geodistFastAsinSqrt(float x)
 {
-    if (x < 0.122)
+    if (x < 0.122f)
    {
-        // distance under 4546km, Taylor error under 0.00072%
-        float y = static_cast<float>(sqrt(x));
-        return y + x * y * 0.166666666666666f + x * x * y * 0.075f + x * x * x * y * 0.044642857142857f;
+        // distance under 4546 km, Taylor error under 0.00072%
+        float y = sqrtf(x);
+        return EARTH_DIAMETER * (y + x * y * 0.166666666666666f + x * x * y * 0.075f + x * x * x * y * 0.044642857142857f);
    }
-    if (x < 0.948)
+    if (x < 0.948f)
    {
-        // distance under 17083km, 512-entry LUT error under 0.00072%
-        x *= GEODIST_TABLE_ASIN;
-        int i = static_cast<int>(x);
-        return g_GeoAsin[i] + (g_GeoAsin[i + 1] - g_GeoAsin[i]) * (x - i);
+        // distance under 17083 km, 512-entry LUT error under 0.00072%
+        x *= ASIN_SQRT_LUT_SIZE;
+        size_t i = static_cast<size_t>(x);
+        return asin_sqrt_lut[i] + (asin_sqrt_lut[i + 1] - asin_sqrt_lut[i]) * (x - i);
+    }
+    return asinf(sqrtf(x)); // distance over 17083 km, just compute exact
+}
+
+
+enum class Method
+{
+    SPHERE,  /// distance on a sphere; selects sphere_metric_lut and the function name "greatCircleDistance"
+    WGS84  /// distance on the WGS-84 ellipsoid; selects wgs84_metric_lut and the function name "geoDistance"
+};
+
+
+/** Distance in meters between two points given by longitude and latitude in degrees.
+  * If the longitude difference is below 13 degrees, a tangent-plane metric is used, with coefficients
+  * linearly interpolated from a lookup table at the latitude midpoint; otherwise the haversine formula is used.
+  * The `method` template parameter selects the sphere or the WGS-84 coefficients for the tangent-plane case.
+  */
+template <Method method>
+float distance(float lon1deg, float lat1deg, float lon2deg, float lat2deg)
+{
+    float lat_diff = geodistDegDiff(lat1deg - lat2deg);
+    float lon_diff = geodistDegDiff(lon1deg - lon2deg);
+
+    if (lon_diff < 13)
+    {
+        // points are close enough; use flat ellipsoid model
+        // interpolate metric coefficients using latitudes midpoint
+
+        /// Why comparing only difference in longitude?
+        /// If longitudes are different enough, there is a big difference between great circle line and a line with constant latitude.
+        ///  (Remember how a plane flies from Moscow to New York)
+        /// But if longitude is close but latitude is different enough, there is no difference between meridian and great circle line.
+
+        float latitude_midpoint = (lat1deg + lat2deg + 180) * METRIC_LUT_SIZE / 360; // [-90, 90] degrees -> [0, METRIC_LUT_SIZE] indexes
+
+        /// When both latitudes are 90 the position is exactly METRIC_LUT_SIZE. Masking alone would wrap the index to 0
+        /// (the south pole) and leave an interpolation fraction of METRIC_LUT_SIZE, so use the last interval with fraction 1 instead.
+        size_t latitude_midpoint_index = latitude_midpoint >= static_cast<float>(METRIC_LUT_SIZE)
+            ? METRIC_LUT_SIZE - 1
+            : (static_cast<size_t>(latitude_midpoint) & (METRIC_LUT_SIZE - 1));
+
+        /// This is linear interpolation between two table items at index "latitude_midpoint_index" and "latitude_midpoint_index + 1".
+
+        float k_lat;
+        float k_lon;
+
+        if constexpr (method == Method::SPHERE)
+        {
+            k_lat = sqr(EARTH_DIAMETER * PI / 360);
+
+            k_lon = sphere_metric_lut[latitude_midpoint_index]
+                + (sphere_metric_lut[latitude_midpoint_index + 1] - sphere_metric_lut[latitude_midpoint_index]) * (latitude_midpoint - latitude_midpoint_index);
+        }
+        else if constexpr (method == Method::WGS84)
+        {
+            k_lat = wgs84_metric_lut[latitude_midpoint_index * 2]
+                + (wgs84_metric_lut[(latitude_midpoint_index + 1) * 2] - wgs84_metric_lut[latitude_midpoint_index * 2]) * (latitude_midpoint - latitude_midpoint_index);
+
+            k_lon = wgs84_metric_lut[latitude_midpoint_index * 2 + 1]
+                + (wgs84_metric_lut[(latitude_midpoint_index + 1) * 2 + 1] - wgs84_metric_lut[latitude_midpoint_index * 2 + 1]) * (latitude_midpoint - latitude_midpoint_index);
+        }
+
+        /// Metric on a tangent plane: it differs from Euclidean metric only by scale of coordinates.
+        return sqrtf(k_lat * lat_diff * lat_diff + k_lon * lon_diff * lon_diff);
+    }
+    else
+    {
+        // points too far away; use haversine
+
+        float a = sqrf(geodistFastSin(lat_diff * RAD_IN_DEG_HALF))
+            + geodistFastCos(lat1deg * RAD_IN_DEG) * geodistFastCos(lat2deg * RAD_IN_DEG) * sqrf(geodistFastSin(lon_diff * RAD_IN_DEG_HALF));
+
+        return geodistFastAsinSqrt(a);
     }
-    return static_cast<float>(asin(sqrt(x))); // distance over 17083km, just compute honestly
 }

 }


-class FunctionGreatCircleDistance : public IFunction
+template <Method method>
+class FunctionGeoDistance : public IFunction
 {
 public:
-    static constexpr auto name = "greatCircleDistance";
-    static FunctionPtr create(const Context &) { return std::make_shared<FunctionGreatCircleDistance>(); }
+    static constexpr auto name = (method == Method::SPHERE) ? "greatCircleDistance" : "geoDistance";
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionGeoDistance<method>>(); }

 private:
    String getName() const override { return name; }
@ -143,50 +238,15 @@ private:
        for (const auto arg_idx : ext::range(0, arguments.size()))
        {
            const auto arg = arguments[arg_idx].get();
-            if (!WhichDataType(arg).isFloat())
+            if (!isNumber(WhichDataType(arg)))
                throw Exception(
-                    "Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function " + getName() + ". Must be Float64",
+                    "Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function " + getName() + ". Must be numeric",
                    ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
        }

        return std::make_shared<DataTypeFloat32>();
    }

-    Float32 greatCircleDistance(Float32 lon1deg, Float32 lat1deg, Float32 lon2deg, Float32 lat2deg)
-    {
-        if (lon1deg < -180 || lon1deg > 180 ||
-            lon2deg < -180 || lon2deg > 180 ||
-            lat1deg < -90 || lat1deg > 90 ||
-            lat2deg < -90 || lat2deg > 90)
-        {
-            throw Exception("Arguments values out of bounds for function " + getName(),
-                            ErrorCodes::ARGUMENT_OUT_OF_BOUND);
-        }
-
-        float lat_diff = geodistDegDiff(lat1deg - lat2deg);
-        float lon_diff = geodistDegDiff(lon1deg - lon2deg);
-
-        if (lon_diff < 13)
-        {
-            // points are close enough; use flat ellipsoid model
-            // interpolate sqr(k1), sqr(k2) coefficients using latitudes midpoint
-            float m = (lat1deg + lat2deg + 180) * GEODIST_TABLE_K / 360; // [-90, 90] degrees -> [0, KTABLE] indexes
-            size_t i = static_cast<size_t>(m) & (GEODIST_TABLE_K - 1);
-            float kk1 = g_GeoFlatK[i][0] + (g_GeoFlatK[i + 1][0] - g_GeoFlatK[i][0]) * (m - i);
-            float kk2 = g_GeoFlatK[i][1] + (g_GeoFlatK[i + 1][1] - g_GeoFlatK[i][1]) * (m - i);
-            return static_cast<float>(sqrt(kk1 * lat_diff * lat_diff + kk2 * lon_diff * lon_diff));
-        }
-        else
-        {
-            // points too far away; use haversine
-            static const float d = 2 * 6371000;
-            float a = fsqr(geodistFastSin(lat_diff * TO_RADF2)) +
-                geodistFastCos(lat1deg * TO_RADF) * geodistFastCos(lat2deg * TO_RADF) *
-                fsqr(geodistFastSin(lon_diff * TO_RADF2));
-            return static_cast<float>(d * geodistFastAsinSqrt(a));
-        }
-    }
-
    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
    {
        auto dst = ColumnVector<Float32>::create();
@ -199,7 +259,7 @@ private:
        const IColumn & col_lat2 = *block.getByPosition(arguments[3]).column;

        for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
-            dst_data[row_num] = greatCircleDistance(
+            dst_data[row_num] = distance<method>(
                col_lon1.getFloat32(row_num), col_lat1.getFloat32(row_num),
                col_lon2.getFloat32(row_num), col_lat2.getFloat32(row_num));

@ -208,10 +268,11 @@ private:
 };


-void registerFunctionGreatCircleDistance(FunctionFactory & factory)
+void registerFunctionGeoDistance(FunctionFactory & factory)
 {
    geodistInit();
-    factory.registerFunction<FunctionGreatCircleDistance>();
+    factory.registerFunction<FunctionGeoDistance<Method::SPHERE>>();  /// sphere variant; its `name` is "greatCircleDistance"
+    factory.registerFunction<FunctionGeoDistance<Method::WGS84>>();  /// WGS-84 variant; its `name` is "geoDistance"
 }

 }
--- a/dbms/src/Functions/registerFunctionsGeo.cpp
+++ b/dbms/src/Functions/registerFunctionsGeo.cpp
@ -5,7 +5,7 @@ namespace DB

 class FunctionFactory;

-void registerFunctionGreatCircleDistance(FunctionFactory & factory);
+void registerFunctionGeoDistance(FunctionFactory & factory);
 void registerFunctionPointInEllipses(FunctionFactory & factory);
 void registerFunctionPointInPolygon(FunctionFactory & factory);
 void registerFunctionGeohashEncode(FunctionFactory & factory);
@ -18,7 +18,7 @@ void registerFunctionGeoToH3(FunctionFactory &);

 void registerFunctionsGeo(FunctionFactory & factory)
 {
-    registerFunctionGreatCircleDistance(factory);
+    registerFunctionGeoDistance(factory);
    registerFunctionPointInEllipses(factory);
    registerFunctionPointInPolygon(factory);
    registerFunctionGeohashEncode(factory);
--- a/dbms/tests/performance/great_circle_dist.xml
+++ b/dbms/tests/performance/great_circle_dist.xml
@ -9,8 +9,8 @@
    </stop_conditions>

    <!-- lon [-180; 180], lat [-90; 90] -->
-    <query>SELECT count() FROM system.numbers WHERE NOT ignore(greatCircleDistance((rand() % 360) * 1. - 180, (number % 150) * 1.2 - 90, (number % 360) + toFloat64(rand()) / 4294967296 - 180, (rand() % 180) * 1. - 90))</query>
+    <query>SELECT count() FROM system.numbers WHERE NOT ignore(greatCircleDistance((rand(1) % 360) * 1. - 180, (number % 150) * 1.2 - 90, (number % 360) + toFloat64(rand(2)) / 4294967296 - 180, (rand(3) % 180) * 1. - 90))</query>
    <!-- 55.755830, 37.617780 is center of Moscow -->
-    <query>SELECT count() FROM system.numbers WHERE NOT ignore(greatCircleDistance(55. + toFloat64(rand()) / 4294967296, 37. + toFloat64(rand()) / 4294967296, 55. + toFloat64(rand()) / 4294967296, 37. + toFloat64(rand()) / 4294967296))</query>
+    <query>SELECT count() FROM system.numbers WHERE NOT ignore(greatCircleDistance(55. + toFloat64(rand(1)) / 4294967296, 37. + toFloat64(rand(2)) / 4294967296, 55. + toFloat64(rand(3)) / 4294967296, 37. + toFloat64(rand(4)) / 4294967296))</query>
 </test>

--- a/dbms/tests/queries/0_stateless/01043_geo_distance.reference
+++ b/dbms/tests/queries/0_stateless/01043_geo_distance.reference
@ -0,0 +1,8 @@
+111194.93
+111194.93
+110567.33
+111699.25
+10007543
+10007543
+10007543
+10001780
--- a/dbms/tests/queries/0_stateless/01043_geo_distance.sql
+++ b/dbms/tests/queries/0_stateless/01043_geo_distance.sql
@ -0,0 +1,11 @@
+SELECT greatCircleDistance(0., 0., 0., 1.);
+SELECT greatCircleDistance(0., 89., 0, 90.);
+
+SELECT geoDistance(0., 0., 0., 1.);
+SELECT geoDistance(0., 89., 0., 90.);
+
+SELECT greatCircleDistance(0., 0., 90., 0.);
+SELECT greatCircleDistance(0., 0., 0., 90.);
+
+SELECT geoDistance(0., 0., 90., 0.);
+SELECT geoDistance(0., 0., 0., 90.);