2019-02-21 05:08:37 +00:00
|
|
|
#include <Functions/FunctionsStringSimilarity.h>
|
|
|
|
|
|
|
|
#include <Functions/FunctionFactory.h>
|
|
|
|
#include <Functions/FunctionsHashing.h>
|
|
|
|
#include <Common/HashTable/ClearableHashMap.h>
|
|
|
|
#include <Common/HashTable/Hash.h>
|
|
|
|
#include <Common/UTF8Helpers.h>
|
|
|
|
|
2019-02-22 03:02:10 +00:00
|
|
|
#include <algorithm>
|
|
|
|
#include <cstring>
|
|
|
|
#include <limits>
|
|
|
|
|
2019-02-21 05:08:37 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2019-02-22 03:02:10 +00:00
|
|
|
/** Distance function implementation.
|
|
|
|
* We calculate all the trigrams from left string and count by the index of
|
|
|
|
* 16 bits hash of them in the map.
|
|
|
|
* Then calculate all the trigrams from the right string and calculate
|
|
|
|
* the trigram distance on the flight by adding and subtracting from the hashmap.
|
|
|
|
* Then return the map into the condition of which it was after left string
|
|
|
|
* calculation. If the right string size is big (more than 2**15 bytes),
|
|
|
|
* the strings are not similar at all and we return 1.
|
|
|
|
*/
|
|
|
|
struct DistanceImpl
|
2019-02-21 05:08:37 +00:00
|
|
|
{
|
|
|
|
using ResultType = Float32;
|
|
|
|
using CodePoint = UInt32;
|
|
|
|
|
2019-02-22 03:50:06 +00:00
|
|
|
/// map_size for trigram difference
|
|
|
|
static constexpr size_t map_size = 1u << 16;
|
2019-02-22 03:02:10 +00:00
|
|
|
|
|
|
|
/// If the haystack size is bigger than this, behaviour is unspecified for this function
|
2019-02-22 03:50:06 +00:00
|
|
|
static constexpr size_t max_string_size = 1u << 15;
|
2019-02-22 03:02:10 +00:00
|
|
|
|
|
|
|
/// This fits mostly in L2 cache all the time
|
2019-02-22 03:50:06 +00:00
|
|
|
using TrigramStats = UInt16[map_size];
|
2019-02-21 05:08:37 +00:00
|
|
|
|
|
|
|
static inline CodePoint readCodePoint(const char *& pos, const char * end) noexcept
|
|
|
|
{
|
|
|
|
size_t length = UTF8::seqLength(*pos);
|
|
|
|
|
|
|
|
if (pos + length > end)
|
|
|
|
length = end - pos;
|
|
|
|
|
|
|
|
CodePoint res = 0;
|
2019-02-22 03:02:10 +00:00
|
|
|
/// this is faster than just memcpy because of compiler optimizations with moving bytes
|
2019-02-22 03:07:35 +00:00
|
|
|
switch (length)
|
|
|
|
{
|
2019-02-22 03:02:10 +00:00
|
|
|
case 1:
|
|
|
|
memcpy(&res, pos, 1);
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
memcpy(&res, pos, 2);
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
memcpy(&res, pos, 3);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
memcpy(&res, pos, 4);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2019-02-21 05:08:37 +00:00
|
|
|
pos += length;
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2019-02-22 03:50:06 +00:00
|
|
|
static inline size_t calculateNeedleStats(const char * data, const size_t size, TrigramStats & trigram_stats) noexcept
|
2019-02-21 05:08:37 +00:00
|
|
|
{
|
|
|
|
size_t len = 0;
|
2019-02-22 03:02:10 +00:00
|
|
|
size_t trigram_cnt = 0;
|
2019-02-21 05:08:37 +00:00
|
|
|
const char * start = data;
|
|
|
|
const char * end = data + size;
|
|
|
|
CodePoint cp1 = 0;
|
|
|
|
CodePoint cp2 = 0;
|
|
|
|
CodePoint cp3 = 0;
|
|
|
|
while (start != end)
|
|
|
|
{
|
|
|
|
cp1 = cp2;
|
|
|
|
cp2 = cp3;
|
|
|
|
cp3 = readCodePoint(start, end);
|
|
|
|
++len;
|
|
|
|
if (len < 3)
|
|
|
|
continue;
|
2019-02-22 03:02:10 +00:00
|
|
|
++trigram_cnt;
|
2019-02-22 03:50:06 +00:00
|
|
|
++trigram_stats[(intHashCRC32(intHashCRC32(cp1) ^ cp2) ^ cp3) & 0xFFFFu];
|
2019-02-21 05:08:37 +00:00
|
|
|
}
|
2019-02-22 03:02:10 +00:00
|
|
|
return trigram_cnt;
|
2019-02-21 05:08:37 +00:00
|
|
|
}
|
|
|
|
|
2019-02-22 03:50:06 +00:00
|
|
|
static inline UInt64 calculateHaystackStatsAndMetric(const char * data, const size_t size, TrigramStats & trigram_stats, size_t & distance)
|
2019-02-21 05:08:37 +00:00
|
|
|
{
|
2019-02-22 03:02:10 +00:00
|
|
|
size_t len = 0;
|
|
|
|
size_t trigram_cnt = 0;
|
|
|
|
const char * start = data;
|
|
|
|
const char * end = data + size;
|
|
|
|
CodePoint cp1 = 0;
|
|
|
|
CodePoint cp2 = 0;
|
|
|
|
CodePoint cp3 = 0;
|
|
|
|
|
|
|
|
/// allocation tricks, most strings are relatively small
|
|
|
|
static constexpr size_t small_buffer_size = 256;
|
|
|
|
std::unique_ptr<UInt16[]> big_buffer;
|
|
|
|
UInt16 small_buffer[small_buffer_size];
|
|
|
|
UInt16 * trigram_storage = small_buffer;
|
2019-02-21 05:08:37 +00:00
|
|
|
|
2019-02-22 03:02:10 +00:00
|
|
|
if (size > small_buffer_size)
|
2019-02-21 05:08:37 +00:00
|
|
|
{
|
2019-02-22 03:02:10 +00:00
|
|
|
trigram_storage = new UInt16[size];
|
|
|
|
big_buffer.reset(trigram_storage);
|
2019-02-21 05:08:37 +00:00
|
|
|
}
|
|
|
|
|
2019-02-22 03:02:10 +00:00
|
|
|
while (start != end)
|
|
|
|
{
|
|
|
|
cp1 = cp2;
|
|
|
|
cp2 = cp3;
|
|
|
|
cp3 = readCodePoint(start, end);
|
|
|
|
++len;
|
|
|
|
if (len < 3)
|
|
|
|
continue;
|
|
|
|
UInt16 hash = (intHashCRC32(intHashCRC32(cp1) ^ cp2) ^ cp3) & 0xFFFFu;
|
|
|
|
|
|
|
|
/// Unsigned integer tricks
|
2019-02-22 03:50:06 +00:00
|
|
|
if (trigram_stats[hash] < std::numeric_limits<UInt16>::max() / 2)
|
2019-02-22 03:02:10 +00:00
|
|
|
--distance;
|
2019-02-22 03:07:35 +00:00
|
|
|
else
|
2019-02-22 03:02:10 +00:00
|
|
|
++distance;
|
|
|
|
trigram_storage[trigram_cnt++] = hash;
|
2019-02-22 03:50:06 +00:00
|
|
|
--trigram_stats[hash];
|
2019-02-22 03:02:10 +00:00
|
|
|
}
|
2019-02-22 03:07:35 +00:00
|
|
|
for (size_t i = 0; i < trigram_cnt; ++i)
|
2019-02-22 03:50:06 +00:00
|
|
|
++trigram_stats[trigram_storage[i]];
|
2019-02-22 03:02:10 +00:00
|
|
|
return trigram_cnt;
|
2019-02-21 05:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void constant_constant(const std::string & data, const std::string & needle, Float32 & res)
|
|
|
|
{
|
2019-02-22 03:50:06 +00:00
|
|
|
TrigramStats common_stats;
|
2019-02-22 03:02:10 +00:00
|
|
|
memset(common_stats, std::numeric_limits<UInt8>::max(), sizeof(common_stats));
|
|
|
|
size_t second_size = calculateNeedleStats(needle.data(), needle.size(), common_stats);
|
|
|
|
size_t distance = second_size;
|
2019-02-22 03:50:06 +00:00
|
|
|
if (data.size() <= max_string_size)
|
2019-02-22 03:07:35 +00:00
|
|
|
{
|
2019-02-22 03:02:10 +00:00
|
|
|
size_t first_size = calculateHaystackStatsAndMetric(data.data(), data.size(), common_stats, distance);
|
|
|
|
res = distance * 1.0 / std::max(first_size + second_size, size_t(1));
|
2019-02-22 03:07:35 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2019-02-22 03:02:10 +00:00
|
|
|
res = 1.f;
|
|
|
|
}
|
2019-02-21 05:08:37 +00:00
|
|
|
}
|
|
|
|
|
2019-02-22 03:07:35 +00:00
|
|
|
static void vector_constant(
|
|
|
|
const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray<Float32> & res)
|
2019-02-21 05:08:37 +00:00
|
|
|
{
|
2019-02-22 03:50:06 +00:00
|
|
|
TrigramStats common_stats;
|
2019-02-22 03:02:10 +00:00
|
|
|
memset(common_stats, std::numeric_limits<UInt8>::max(), sizeof(common_stats));
|
|
|
|
const size_t needle_stats_size = calculateNeedleStats(needle.data(), needle.size(), common_stats);
|
|
|
|
size_t distance = needle_stats_size;
|
2019-02-21 05:08:37 +00:00
|
|
|
size_t prev_offset = 0;
|
|
|
|
for (size_t i = 0; i < offsets.size(); ++i)
|
|
|
|
{
|
|
|
|
const auto * haystack = &data[prev_offset];
|
|
|
|
const size_t haystack_size = offsets[i] - prev_offset - 1;
|
2019-02-22 03:50:06 +00:00
|
|
|
if (haystack_size <= max_string_size)
|
2019-02-22 03:07:35 +00:00
|
|
|
{
|
|
|
|
size_t haystack_stats_size
|
|
|
|
= calculateHaystackStatsAndMetric(reinterpret_cast<const char *>(haystack), haystack_size, common_stats, distance);
|
2019-02-22 03:02:10 +00:00
|
|
|
res[i] = distance * 1.0 / std::max(haystack_stats_size + needle_stats_size, size_t(1));
|
2019-02-22 03:07:35 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2019-02-22 03:02:10 +00:00
|
|
|
res[i] = 1.f;
|
|
|
|
}
|
|
|
|
distance = needle_stats_size;
|
2019-02-21 05:08:37 +00:00
|
|
|
prev_offset = offsets[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2019-02-22 03:02:10 +00:00
|
|
|
/// Carries the name under which the trigram distance function is exposed.
struct DistanceName
{
    static constexpr const char * name = "distance";
};
|
|
|
|
|
2019-02-22 03:02:10 +00:00
|
|
|
/// Binds the FunctionsStringSimilarity template to the trigram distance implementation and its name.
using FunctionDistance = FunctionsStringSimilarity<DistanceImpl, DistanceName>;
|
2019-02-21 05:08:37 +00:00
|
|
|
|
|
|
|
/// Adds the string similarity functions defined in this file to `factory`.
void registerFunctionsStringSimilarity(FunctionFactory & factory)
{
    /// Currently the only such function is the trigram distance.
    using Function = FunctionDistance;
    factory.registerFunction<Function>();
}
|
|
|
|
|
|
|
|
}
|