Merge pull request #323 from ludv1x/accurate-compare

Accurate numbers comparison
This commit is contained in:
alexey-milovidov 2017-01-12 23:31:07 +04:00 committed by GitHub
commit 9808a5ab9e
6 changed files with 519 additions and 16 deletions

View File

@ -0,0 +1,337 @@
#include <DB/Core/Types.h>
/** Perceptually-correct number comparisons.
* Example: Int8(-1) != UInt8(255)
*/
namespace accurate
{
/** Cases:
1) Safe conversion (in case of default C++ operators)
a) int vs any int
b) uint vs any uint
c) float vs any float
2) int vs uint
a) sizeof(int) <= sizeof(uint). Accurate comparison with MAX_INT thresholds
b) sizeof(int) > sizeof(uint). Casting to int
3) integral_type vs floating_type
a) sizeof(integral_type) <= 4. Comparison via casting arguments to Float64
b) sizeof(integral_type) == 8. Accurate comparison. Consider 3 sets of intervals:
1) interval between adjacent floats less or equal 1
2) interval between adjacent floats greater than or equal to 2
3) float is outside [MIN_INT64; MAX_INT64]
*/
/// Case 1. True when the built-in C++ operators already compare A and B correctly:
/// both types are floating point, or both are integers of the same signedness.
template <typename A, typename B>
using is_safe_convervsion = std::integral_constant<bool,
    (std::is_floating_point<A>::value && std::is_floating_point<B>::value)
    || (std::is_integral<A>::value && std::is_integral<B>::value
        && std::is_signed<A>::value == std::is_signed<B>::value)>;

/// Resolves to bool only for "safe" pairs; removes the overload from consideration otherwise.
template <typename A, typename B>
using bool_if_safe_convervsion = typename std::enable_if<is_safe_convervsion<A, B>::value, bool>::type;

/// Resolves to bool only for pairs that need an accurate comparison.
template <typename A, typename B>
using bool_if_not_safe_convervsion = typename std::enable_if<!is_safe_convervsion<A, B>::value, bool>::type;
/// Case 2. Is the pair exactly (signed integer, unsigned integer), in that order?
template <typename TInt, typename TUInt>
using is_any_int_vs_uint = std::integral_constant<bool,
    std::is_integral<TInt>::value && std::is_signed<TInt>::value
    && std::is_integral<TUInt>::value && std::is_unsigned<TUInt>::value>;

/// Case 2a. IntXX vs UIntYY with sizeof(IntXX) <= sizeof(UIntYY).
/// The unsigned type may hold values above the signed maximum, so the operands are
/// checked against the signed range before being compared as unsigned.
template <typename TInt, typename TUInt>
using is_le_int_vs_uint_t = std::integral_constant<bool,
    (sizeof(TInt) <= sizeof(TUInt)) && is_any_int_vs_uint<TInt, TUInt>::value>;

template <typename TInt, typename TUInt>
using bool_if_le_int_vs_uint_t = typename std::enable_if<is_le_int_vs_uint_t<TInt, TUInt>::value, bool>::type;

/// signed a > unsigned b
template <typename TInt, typename TUInt>
bool_if_le_int_vs_uint_t<TInt, TUInt> greaterOpTmpl(TInt a, TUInt b)
{
    /// A negative a, or a b beyond the signed maximum, can never be exceeded by a.
    if (a < 0 || b > static_cast<TUInt>(std::numeric_limits<TInt>::max()))
        return false;
    return static_cast<TUInt>(a) > b;
}

/// unsigned a > signed b
template <typename TUInt, typename TInt>
bool_if_le_int_vs_uint_t<TInt, TUInt> greaterOpTmpl(TUInt a, TInt b)
{
    /// A negative b, or an a beyond the signed maximum, always makes a the greater one.
    if (b < 0 || a > static_cast<TUInt>(std::numeric_limits<TInt>::max()))
        return true;
    return a > static_cast<TUInt>(b);
}

/// signed a == unsigned b
template <typename TInt, typename TUInt>
bool_if_le_int_vs_uint_t<TInt, TUInt> equalsOpTmpl(TInt a, TUInt b)
{
    /// Values outside the common range [0; max(TInt)] have no equal counterpart.
    if (a < 0 || b > static_cast<TUInt>(std::numeric_limits<TInt>::max()))
        return false;
    return static_cast<TUInt>(a) == b;
}

/// unsigned a == signed b
template <typename TUInt, typename TInt>
bool_if_le_int_vs_uint_t<TInt, TUInt> equalsOpTmpl(TUInt a, TInt b)
{
    if (b < 0 || a > static_cast<TUInt>(std::numeric_limits<TInt>::max()))
        return false;
    return a == static_cast<TUInt>(b);
}

/// Case 2b. IntXX vs UIntYY with sizeof(IntXX) > sizeof(UIntYY).
/// Every UIntYY value fits into IntXX, so the unsigned operand is cast to IntXX and compared directly.
template <typename TInt, typename TUInt>
using is_gt_int_vs_uint = std::integral_constant<bool,
    (sizeof(TInt) > sizeof(TUInt)) && is_any_int_vs_uint<TInt, TUInt>::value>;

template <typename TInt, typename TUInt>
using bool_if_gt_int_vs_uint = typename std::enable_if<is_gt_int_vs_uint<TInt, TUInt>::value, bool>::type;

/// signed a > unsigned b
template <typename TInt, typename TUInt>
bool_if_gt_int_vs_uint<TInt, TUInt> greaterOpTmpl(TInt a, TUInt b)
{
    const TInt widened_b = static_cast<TInt>(b);
    return a > widened_b;
}

/// unsigned a > signed b
template <typename TInt, typename TUInt>
bool_if_gt_int_vs_uint<TInt, TUInt> greaterOpTmpl(TUInt a, TInt b)
{
    const TInt widened_a = static_cast<TInt>(a);
    return widened_a > b;
}

/// signed a == unsigned b
template <typename TInt, typename TUInt>
bool_if_gt_int_vs_uint<TInt, TUInt> equalsOpTmpl(TInt a, TUInt b)
{
    const TInt widened_b = static_cast<TInt>(b);
    return a == widened_b;
}

/// unsigned a == signed b
template <typename TInt, typename TUInt>
bool_if_gt_int_vs_uint<TInt, TUInt> equalsOpTmpl(TUInt a, TInt b)
{
    const TInt widened_a = static_cast<TInt>(a);
    return widened_a == b;
}
/// Case 3a. Integer of at most 4 bytes vs floating point: both operands are cast to double and compared.
template <typename TAInt, typename TAFloat>
using bool_if_double_can_be_used = typename std::enable_if<
    std::is_floating_point<TAFloat>::value && std::is_integral<TAInt>::value && (sizeof(TAInt) <= 4),
    bool>::type;

/// integer a > float b
template <typename TAInt, typename TAFloat>
bool_if_double_can_be_used<TAInt, TAFloat> greaterOpTmpl(TAInt a, TAFloat b)
{
    const double lhs = static_cast<double>(a);
    const double rhs = static_cast<double>(b);
    return lhs > rhs;
}

/// float a > integer b
template <typename TAInt, typename TAFloat>
bool_if_double_can_be_used<TAInt, TAFloat> greaterOpTmpl(TAFloat a, TAInt b)
{
    const double lhs = static_cast<double>(a);
    const double rhs = static_cast<double>(b);
    return lhs > rhs;
}

/// integer a == float b
template <typename TAInt, typename TAFloat>
bool_if_double_can_be_used<TAInt, TAFloat> equalsOpTmpl(TAInt a, TAFloat b)
{
    const double lhs = static_cast<double>(a);
    const double rhs = static_cast<double>(b);
    return lhs == rhs;
}

/// float a == integer b
template <typename TAInt, typename TAFloat>
bool_if_double_can_be_used<TAInt, TAFloat> equalsOpTmpl(TAFloat a, TAInt b)
{
    const double lhs = static_cast<double>(a);
    const double rhs = static_cast<double>(b);
    return lhs == rhs;
}
/* Final implementations */

/// a > b for pairs that need an accurate comparison: dispatches to the matching greaterOpTmpl overload.
template <typename A, typename B>
inline bool_if_not_safe_convervsion<A, B> greaterOp(A a, B b)
{
    const bool is_greater = greaterOpTmpl(a, b);
    return is_greater;
}

/// a > b for pairs the built-in operator already compares correctly.
template <typename A, typename B>
inline bool_if_safe_convervsion<A, B> greaterOp(A a, B b)
{
    return b < a;
}
// Case 3b. 64-bit integers vs floats comparison.
// See hint at https://github.com/JuliaLang/julia/issues/257 (but it doesn't work properly for -2**63)

/// Integers whose absolute value does not exceed 2^53 convert to Float64 without rounding.
constexpr DB::Int64 MAX_INT64_WITH_EXACT_FLOAT64_REPR = 9007199254740992LL; // 2^53

/// f > i for Float64 vs Int64.
template<>
inline bool greaterOp<DB::Float64, DB::Int64>(DB::Float64 f, DB::Int64 i)
{
    /// i converts to Float64 exactly, so a plain floating-point comparison is accurate.
    const bool converts_exactly = -MAX_INT64_WITH_EXACT_FLOAT64_REPR <= i && i <= MAX_INT64_WITH_EXACT_FLOAT64_REPR;
    if (converts_exactly)
        return f > static_cast<DB::Float64>(i);

    const DB::Float64 upper_bound = static_cast<DB::Float64>(std::numeric_limits<DB::Int64>::max()); // 2**63 (not 2^63 - 1)
    const DB::Float64 lower_bound = static_cast<DB::Float64>(std::numeric_limits<DB::Int64>::min());

    /// f is above every Int64 value.
    if (f >= upper_bound)
        return true;
    /// f is at or below the Int64 minimum, or is NaN (every comparison with NaN is false).
    if (!(f > lower_bound))
        return false;
    /// f now lies strictly inside the Int64 range, so the cast is well defined.
    return static_cast<DB::Int64>(f) > i;
}
/// i > f for Int64 vs Float64.
template<>
inline bool greaterOp<DB::Int64, DB::Float64>(DB::Int64 i, DB::Float64 f)
{
    /// i converts to Float64 exactly, so a plain floating-point comparison is accurate.
    const bool converts_exactly = -MAX_INT64_WITH_EXACT_FLOAT64_REPR <= i && i <= MAX_INT64_WITH_EXACT_FLOAT64_REPR;
    if (converts_exactly)
        return f < static_cast<DB::Float64>(i);

    const DB::Float64 lower_bound = static_cast<DB::Float64>(std::numeric_limits<DB::Int64>::min());
    const DB::Float64 upper_bound = static_cast<DB::Float64>(std::numeric_limits<DB::Int64>::max());

    /// f is below every Int64 value.
    if (f < lower_bound)
        return true;
    /// f is at or above 2^63, or is NaN (every comparison with NaN is false).
    if (!(f < upper_bound))
        return false;
    /// f now lies inside [-2^63; 2^63), so the cast is well defined.
    return i > static_cast<DB::Int64>(f);
}
/// f > u for Float64 vs UInt64.
template<>
inline bool greaterOp<DB::Float64, DB::UInt64>(DB::Float64 f, DB::UInt64 u)
{
    /// u converts to Float64 exactly, so a plain floating-point comparison is accurate.
    const bool converts_exactly = u <= static_cast<DB::UInt64>(MAX_INT64_WITH_EXACT_FLOAT64_REPR);
    if (converts_exactly)
        return f > static_cast<DB::Float64>(u);

    const DB::Float64 upper_bound = static_cast<DB::Float64>(std::numeric_limits<DB::UInt64>::max());

    /// f is above every UInt64 value.
    if (f >= upper_bound)
        return true;
    /// f is negative or NaN (every comparison with NaN is false).
    if (!(f >= 0))
        return false;
    /// f now lies inside the UInt64 range, so the cast is well defined.
    return static_cast<DB::UInt64>(f) > u;
}
/// u > f for UInt64 vs Float64.
template<>
inline bool greaterOp<DB::UInt64, DB::Float64>(DB::UInt64 u, DB::Float64 f)
{
    /// u converts to Float64 exactly, so a plain floating-point comparison is accurate.
    if (u <= static_cast<DB::UInt64>(MAX_INT64_WITH_EXACT_FLOAT64_REPR))
        return static_cast<DB::Float64>(u) > f;

    /// Any negative f is below u. Otherwise f must be inside the UInt64 range before it is cast
    /// (NaN fails both comparisons, so the result is false).
    /// The cast target is spelled DB::UInt64 like every other type in this header.
    return (f < 0)
        || (f < static_cast<DB::Float64>(std::numeric_limits<DB::UInt64>::max()) && u > static_cast<DB::UInt64>(f));
}
// Case 3b for float32: the Float32 operand is widened to Float64 and the Float64 implementation is reused.

/// f > i for Float32 vs Int64.
template<>
inline bool greaterOp<DB::Float32, DB::Int64>(DB::Float32 f, DB::Int64 i)
{
    const DB::Float64 widened = static_cast<DB::Float64>(f);
    return greaterOp(widened, i);
}

/// i > f for Int64 vs Float32.
template<>
inline bool greaterOp<DB::Int64, DB::Float32>(DB::Int64 i, DB::Float32 f)
{
    const DB::Float64 widened = static_cast<DB::Float64>(f);
    return greaterOp(i, widened);
}

/// f > u for Float32 vs UInt64.
template<>
inline bool greaterOp<DB::Float32, DB::UInt64>(DB::Float32 f, DB::UInt64 u)
{
    const DB::Float64 widened = static_cast<DB::Float64>(f);
    return greaterOp(widened, u);
}

/// u > f for UInt64 vs Float32.
template<>
inline bool greaterOp<DB::UInt64, DB::Float32>(DB::UInt64 u, DB::Float32 f)
{
    const DB::Float64 widened = static_cast<DB::Float64>(f);
    return greaterOp(u, widened);
}
/// a == b for pairs that need an accurate comparison: dispatches to the matching equalsOpTmpl overload.
template <typename A, typename B>
inline bool_if_not_safe_convervsion<A, B> equalsOp(A a, B b)
{
    const bool is_equal = equalsOpTmpl(a, b);
    return is_equal;
}

/// a == b for pairs the built-in operator already compares correctly.
template <typename A, typename B>
inline bool_if_safe_convervsion<A, B> equalsOp(A a, B b)
{
    return b == a;
}
/// f == u for Float64 vs UInt64.
/// Equality holds only when the value survives the conversion in both directions.
template<>
inline bool equalsOp<DB::Float64, DB::UInt64>(DB::Float64 f, DB::UInt64 u)
{
    /// Converting NaN or an out-of-range value to an integer is undefined behaviour,
    /// so require f to lie in [0; 2^64) before casting. NaN fails both comparisons.
    if (!(f >= 0 && f < static_cast<DB::Float64>(std::numeric_limits<DB::UInt64>::max()))) // rhs is 2^64
        return false;
    return static_cast<DB::UInt64>(f) == u && f == static_cast<DB::Float64>(u);
}

/// u == f for UInt64 vs Float64.
template<>
inline bool equalsOp<DB::UInt64, DB::Float64>(DB::UInt64 u, DB::Float64 f)
{
    /// Same range guard as above: no UInt64 equals NaN, a negative value or a value >= 2^64.
    if (!(f >= 0 && f < static_cast<DB::Float64>(std::numeric_limits<DB::UInt64>::max()))) // rhs is 2^64
        return false;
    return u == static_cast<DB::UInt64>(f) && static_cast<DB::Float64>(u) == f;
}
/// f == u for Float64 vs Int64.
/// Equality holds only when the value survives the conversion in both directions.
template<>
inline bool equalsOp<DB::Float64, DB::Int64>(DB::Float64 f, DB::Int64 u)
{
    /// Converting NaN or an out-of-range value to an integer is undefined behaviour,
    /// so require f to lie in [-2^63; 2^63) before casting. NaN fails both comparisons.
    if (!(f >= static_cast<DB::Float64>(std::numeric_limits<DB::Int64>::min())
        && f < static_cast<DB::Float64>(std::numeric_limits<DB::Int64>::max()))) // rhs is 2^63
        return false;
    return static_cast<DB::Int64>(f) == u && f == static_cast<DB::Float64>(u);
}

/// u == f for Int64 vs Float64.
template<>
inline bool equalsOp<DB::Int64, DB::Float64>(DB::Int64 u, DB::Float64 f)
{
    /// Same range guard as above: no Int64 equals NaN or a value outside [-2^63; 2^63).
    if (!(f >= static_cast<DB::Float64>(std::numeric_limits<DB::Int64>::min())
        && f < static_cast<DB::Float64>(std::numeric_limits<DB::Int64>::max()))) // rhs is 2^63
        return false;
    return u == static_cast<DB::Int64>(f) && static_cast<DB::Float64>(u) == f;
}
/// f == u for Float32 vs UInt64.
/// Equality holds only when the value survives the conversion in both directions.
template<>
inline bool equalsOp<DB::Float32, DB::UInt64>(DB::Float32 f, DB::UInt64 u)
{
    /// Converting NaN or an out-of-range value to an integer is undefined behaviour,
    /// so require f to lie in [0; 2^64) before casting. NaN fails both comparisons.
    if (!(f >= 0 && f < static_cast<DB::Float32>(std::numeric_limits<DB::UInt64>::max()))) // rhs is 2^64
        return false;
    return static_cast<DB::UInt64>(f) == u && f == static_cast<DB::Float32>(u);
}

/// u == f for UInt64 vs Float32.
template<>
inline bool equalsOp<DB::UInt64, DB::Float32>(DB::UInt64 u, DB::Float32 f)
{
    /// Same range guard as above: no UInt64 equals NaN, a negative value or a value >= 2^64.
    if (!(f >= 0 && f < static_cast<DB::Float32>(std::numeric_limits<DB::UInt64>::max()))) // rhs is 2^64
        return false;
    return u == static_cast<DB::UInt64>(f) && static_cast<DB::Float32>(u) == f;
}
/// f == u for Float32 vs Int64.
/// Equality holds only when the value survives the conversion in both directions.
template<>
inline bool equalsOp<DB::Float32, DB::Int64>(DB::Float32 f, DB::Int64 u)
{
    /// Converting NaN or an out-of-range value to an integer is undefined behaviour,
    /// so require f to lie in [-2^63; 2^63) before casting. NaN fails both comparisons.
    if (!(f >= static_cast<DB::Float32>(std::numeric_limits<DB::Int64>::min())
        && f < static_cast<DB::Float32>(std::numeric_limits<DB::Int64>::max()))) // rhs is 2^63
        return false;
    return static_cast<DB::Int64>(f) == u && f == static_cast<DB::Float32>(u);
}

/// u == f for Int64 vs Float32.
template<>
inline bool equalsOp<DB::Int64, DB::Float32>(DB::Int64 u, DB::Float32 f)
{
    /// Same range guard as above: no Int64 equals NaN or a value outside [-2^63; 2^63).
    if (!(f >= static_cast<DB::Float32>(std::numeric_limits<DB::Int64>::min())
        && f < static_cast<DB::Float32>(std::numeric_limits<DB::Int64>::max()))) // rhs is 2^63
        return false;
    return u == static_cast<DB::Int64>(f) && static_cast<DB::Float32>(u) == f;
}
/// a != b for pairs that need an accurate comparison: the negation of equalsOp.
template <typename A, typename B>
inline bool_if_not_safe_convervsion<A, B> notEqualsOp(A a, B b)
{
    const bool is_equal = equalsOp(a, b);
    return !is_equal;
}

/// a != b for pairs the built-in operator already compares correctly.
template <typename A, typename B>
inline bool_if_safe_convervsion<A, B> notEqualsOp(A a, B b)
{
    return !(a == b);
}
/// a < b for pairs that need an accurate comparison: expressed as b > a.
template <typename A, typename B>
inline bool_if_not_safe_convervsion<A, B> lessOp(A a, B b)
{
    const bool b_is_greater = greaterOp(b, a);
    return b_is_greater;
}

/// a < b for pairs the built-in operator already compares correctly.
template <typename A, typename B>
inline bool_if_safe_convervsion<A, B> lessOp(A a, B b)
{
    return b > a;
}
/// NaN test usable with any arithmetic type: only a floating-point value can be NaN,
/// and NaN is the only value that differs from itself.
template <typename T>
inline std::enable_if_t<std::is_floating_point<T>::value, bool> isNaNOperand(T x)
{
    return x != x;
}

template <typename T>
inline std::enable_if_t<!std::is_floating_point<T>::value, bool> isNaNOperand(T)
{
    return false;
}

/// a <= b for pairs that need an accurate comparison.
/// Computed as !(a > b). That negation is only valid for ordered operands: greaterOp returns
/// false when an operand is NaN, which would make this return true, so NaN is rejected first.
template <typename A, typename B>
inline bool_if_not_safe_convervsion<A, B> lessOrEqualsOp(A a, B b)
{
    if (isNaNOperand(a) || isNaNOperand(b))
        return false;
    return !greaterOp(a, b);
}

/// a <= b for pairs the built-in operator already compares correctly.
template <typename A, typename B>
inline bool_if_safe_convervsion<A, B> lessOrEqualsOp(A a, B b)
{
    return a <= b;
}

/// a >= b for pairs that need an accurate comparison.
/// Computed as !(b > a), with the same NaN guard as lessOrEqualsOp.
template <typename A, typename B>
inline bool_if_not_safe_convervsion<A, B> greaterOrEqualsOp(A a, B b)
{
    if (isNaNOperand(a) || isNaNOperand(b))
        return false;
    return !greaterOp(b, a);
}

/// a >= b for pairs the built-in operator already compares correctly.
template <typename A, typename B>
inline bool_if_safe_convervsion<A, B> greaterOrEqualsOp(A a, B b)
{
    return a >= b;
}
}

View File

@ -16,10 +16,14 @@
#include <DB/Functions/FunctionsLogical.h>
#include <DB/Functions/IFunction.h>
#include <DB/Functions/AccurateComparison.h>
#include <DB/IO/ReadBufferFromString.h>
#include <DB/IO/ReadHelpers.h>
#include <limits>
#include <type_traits>
namespace DB
{
@ -40,21 +44,12 @@ namespace DB
* TODO Массивы.
*/
/** Ignore the warning about comparing signed and unsigned values.
* (The result may be incorrect.)
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-compare"
/// Comparison functors built directly on the built-in operators; each returns the result as UInt8.
template <typename A, typename B> struct EqualsOp { static UInt8 apply(A a, B b) { return a == b; } };
template <typename A, typename B> struct NotEqualsOp { static UInt8 apply(A a, B b) { return a != b; } };
template <typename A, typename B> struct LessOp { static UInt8 apply(A a, B b) { return a < b; } };
template <typename A, typename B> struct GreaterOp { static UInt8 apply(A a, B b) { return a > b; } };
template <typename A, typename B> struct LessOrEqualsOp { static UInt8 apply(A a, B b) { return a <= b; } };
template <typename A, typename B> struct GreaterOrEqualsOp { static UInt8 apply(A a, B b) { return a >= b; } };
#pragma GCC diagnostic pop
/// Comparison functors: each forwards to the corresponding accurate:: function and returns the result as UInt8.

/// a == b
template <typename A, typename B>
struct EqualsOp
{
    static UInt8 apply(A a, B b) { return accurate::equalsOp(a, b); }
};

/// a != b
template <typename A, typename B>
struct NotEqualsOp
{
    static UInt8 apply(A a, B b) { return accurate::notEqualsOp(a, b); }
};

/// a < b
template <typename A, typename B>
struct LessOp
{
    static UInt8 apply(A a, B b) { return accurate::lessOp(a, b); }
};

/// a > b
template <typename A, typename B>
struct GreaterOp
{
    static UInt8 apply(A a, B b) { return accurate::greaterOp(a, b); }
};

/// a <= b
template <typename A, typename B>
struct LessOrEqualsOp
{
    static UInt8 apply(A a, B b) { return accurate::lessOrEqualsOp(a, b); }
};

/// a >= b
template <typename A, typename B>
struct GreaterOrEqualsOp
{
    static UInt8 apply(A a, B b) { return accurate::greaterOrEqualsOp(a, b); }
};
template<typename A, typename B, typename Op>

View File

@ -88,7 +88,7 @@ def main(args):
break
case_file = os.path.join(suite_dir, case)
if os.path.isfile(case_file) and (case.endswith('.sh') or case.endswith('.sql')):
if os.path.isfile(case_file) and (case.endswith('.sh') or case.endswith('.py') or case.endswith('.sql')):
(name, ext) = os.path.splitext(case)
report_testcase = et.Element("testcase", attrib = {"name": name})

View File

@ -0,0 +1,27 @@
#!/bin/bash
# Benchmarks several comparison predicates over a generated table
# with Int64, UInt64 and Float64 columns.
set -e

# (Re)create the table and fill it with generated rows.
clickhouse-client -q "DROP TABLE IF EXISTS test.comparisons"
clickhouse-client -q "CREATE TABLE test.comparisons (i64 Int64, u64 UInt64, f64 Float64) ENGINE = Memory"
clickhouse-client -q "INSERT INTO test.comparisons SELECT toInt64(rand64()) + number AS i64, number AS u64, reinterpretAsFloat64(reinterpretAsString(rand64())) AS f64 FROM system.numbers LIMIT 90000000"

# Runs clickhouse-benchmark for a count() query filtered by the condition in $1 and
# prints "<condition> : <value>", where the value is the '0' entry of
# query_time_percentiles in the JSON report written by the benchmark.
function test_cmp {
    local condition="$1"
    printf '%s : ' "$condition"
    # Benchmark stdout is sent to this script's stderr; benchmark stderr is discarded.
    clickhouse-benchmark --max_threads=1 -i 20 -d 0 --json test.json 1>&2 2>/dev/null <<< "SELECT count() FROM test.comparisons WHERE ($condition)"
    python2 -c "import json; print json.load(open('test.json'))['query_time_percentiles']['0']"
    rm test.json
}

conditions=(
    "u64 = i64"
    "u64 >= i64"
    "i64 > -1 "
    "i64 = 0 "
    "u64 != 0 "
    "i64 = f64"
    "i64 < f64"
    "f64 >= 0 "
)

for condition in "${conditions[@]}"; do
    test_cmp "$condition"
done

clickhouse-client -q "DROP TABLE IF EXISTS test.comparisons"

View File

@ -0,0 +1,143 @@
#!/usr/bin/env python
from __future__ import print_function
import os, itertools, urllib
def get_ch_answer(query):
    """POST `query` to the HTTP endpoint at 127.0.0.1:8123 and return the raw response body."""
    response = urllib.urlopen('http://127.0.0.1:8123', data=query)
    try:
        return response.read()
    finally:
        # Release the connection explicitly instead of relying on garbage collection.
        response.close()
def check_answers(query, answer):
    """Run `query` and compare the reply with `answer`, ignoring surrounding whitespace.

    On a mismatch the query, the expected answer and the fetched answer are printed
    and the process exits with status -1.
    """
    ch_answer = get_ch_answer(query)
    if ch_answer.strip() == answer.strip():
        return

    print("FAIL on query:", query)
    print("Expected answer:", answer)
    print("Fetched answer :", ch_answer)
    exit(-1)
def get_values():
    """Return boundary integers around the 8/16/32/64-bit signed and unsigned limits.

    The list starts with 0, 1, -1. For each width it then adds 2^bits and 2^bits - 1,
    followed by the values adjacent to +2^(bits-1) and to -2^(bits-1).
    """
    values = [0, 1, -1]
    for bits in (8, 16, 32, 64):
        full = 2 ** bits
        half = 2 ** (bits - 1)
        values.extend([
            full, full - 1,
            half - 1, half, half + 1,
            -half - 1, -half, -half + 1,
        ])
    return values
def is_valid_integer(x):
    """Return True when x lies in [-2^63, 2^64 - 1]."""
    lowest = -2**63
    highest = 2**64 - 1
    return lowest <= x <= highest
# When True, every pair is additionally tested through explicit to<Type>() casts.
TEST_WITH_CASTING = True
# When True, main() also writes the queries and expected answers to .sql / .reference files.
GENERATE_TEST_FILES = False

# Properties of each type name: bit width, signedness, and whether it is a float type.
TYPES = {
    "UInt8":  {"bits": 8,  "sign": False, "float": False},
    "Int8":   {"bits": 8,  "sign": True,  "float": False},
    "UInt16": {"bits": 16, "sign": False, "float": False},
    "Int16":  {"bits": 16, "sign": True,  "float": False},
    "UInt32": {"bits": 32, "sign": False, "float": False},
    "Int32":  {"bits": 32, "sign": True,  "float": False},
    "UInt64": {"bits": 64, "sign": False, "float": False},
    "Int64":  {"bits": 64, "sign": True,  "float": False}
    #"Float32" : { "bits" : 32, "sign" : True, "float" : True },
    #"Float64" : { "bits" : 64, "sign" : True, "float" : True }
}


def inside_range(value, type_name):
    """Return True when `value` fits the type `type_name`; float types accept any value."""
    props = TYPES[type_name]
    bits, signed, is_float = props["bits"], props["sign"], props["float"]

    if is_float:
        return True

    if signed:
        lower, upper = -2**(bits - 1), 2**(bits - 1) - 1
    else:
        lower, upper = 0, 2**bits - 1
    return lower <= value <= upper
def test_operators(v1, v2, v1_passed, v2_passed):
query_str = "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2},\t".format(v1=v1_passed, v2=v2_passed)
query_str += "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2} ".format(v1=v2_passed, v2=v1_passed)
answers = [v1 == v2, v1 != v2, v1 < v2, v1 <= v2, v1 > v2, v1 >= v2]
answers += [v2 == v1, v2 != v1, v2 < v1, v2 <= v1, v2 > v1, v2 >= v1]
answers_str = "\t".join([str(int(x)) for x in answers])
return (query_str, answers_str)
# Boundary integers from get_values() that pass is_valid_integer().
VALUES = list(filter(is_valid_integer, get_values()))


def test_pair(v1, v2):
    """Build and check one query comparing the integers v1 and v2.

    The literals are compared first. When TEST_WITH_CASTING is set, each combination
    of types from TYPES whose range contains the respective value is compared as well.
    The query is checked through check_answers() and returned as (query, expected_answers).
    """
    plain_query, plain_answers = test_operators(v1, v2, str(v1), str(v2))
    query_parts = ["SELECT {}, {}, ".format(v1, v2) + plain_query]
    answer_parts = ["{}\t{}\t".format(v1, v2) + plain_answers]

    if TEST_WITH_CASTING:
        left_types = [t for t in TYPES if inside_range(v1, t)]
        right_types = [t for t in TYPES if inside_range(v2, t)]
        for t1 in left_types:
            for t2 in right_types:
                q, a = test_operators(v1, v2, 'to{}({})'.format(t1, v1), 'to{}({})'.format(t2, v2))
                query_parts.append(q)
                answer_parts.append(a)

    query = ', '.join(query_parts)
    answers = "\t".join(answer_parts)
    check_answers(query, answers)
    return query, answers
# Integers for the integer-vs-float tests: 64-bit limits and values around 2^51..2^53.
# The entry before -2**52 was written as 2*52 (= 104), a typo for 2**52.
VALUES_INT = [0, -1, 1, 2**64-1, 2**63, -2**63, 2**63-1, 2**51, 2**52, 2**53-1, 2**53, 2**53+1, 2**53+2, -2**53+1, -2**53, -2**53-1, -2**53-2, 2**52, -2**52]
# Float operands: every integer above converted to float, plus fractional values and
# values beyond the Int64 range.
VALUES_FLOAT = [float(x) for x in VALUES_INT + [-0.5, 0.5, -1.5, 1.5, 2**53, 2**51 - 0.5, 2**51 + 0.5, 2**60, -2**60, -2**63 - 10000, 2**63 + 10000]]
def test_float_pair(i, f):
    """Build and check one query comparing the integer i with the float f.

    The float is rendered with nine digits after the decimal point. The bare integer is
    compared first. When TEST_WITH_CASTING is set, i is also compared through a cast to
    each type in TYPES whose range contains it.
    The query is checked through check_answers() and returned as (query, expected_answers).
    """
    f_str = "{:.9f}".format(f)

    plain_query, plain_answers = test_operators(i, f, i, f_str)
    query_parts = ["SELECT '{}', '{}', ".format(i, f_str) + plain_query]
    answer_parts = ["{}\t{}\t".format(i, f_str) + plain_answers]

    if TEST_WITH_CASTING:
        for t1 in TYPES:
            if not inside_range(i, t1):
                continue
            q, a = test_operators(i, f, 'to{}({})'.format(t1, i), f_str)
            query_parts.append(q)
            answer_parts.append(a)

    query = ', '.join(query_parts)
    answers = "\t".join(answer_parts)
    check_answers(query, answers)
    return query, answers
def main():
    """Check every integer pair and every integer/float pair.

    When GENERATE_TEST_FILES is set, each query and its expected answer are also written
    to '00411_accurate_number_comparison.sql' and '.reference'. Prints "PASSED" at the end.
    """
    sql_file = None
    ref_file = None
    try:
        if GENERATE_TEST_FILES:
            base_name = '00411_accurate_number_comparison'
            sql_file = open(base_name + '.sql', 'wt')
            ref_file = open(base_name + '.reference', 'wt')

        for (v1, v2) in itertools.combinations(VALUES, 2):
            q, a = test_pair(v1, v2)
            if GENERATE_TEST_FILES:
                sql_file.write(q + ";\n")
                ref_file.write(a + "\n")

        for (i, f) in itertools.product(VALUES_INT, VALUES_FLOAT):
            q, a = test_float_pair(i, f)
            if GENERATE_TEST_FILES:
                sql_file.write(q + ";\n")
                ref_file.write(a + "\n")

        print("PASSED")
    finally:
        # Close the generated files even when a check fails (check_answers exits via SystemExit).
        for handle in (sql_file, ref_file):
            if handle is not None:
                handle.close()


if __name__ == "__main__":
    main()