Slightly improve ip_dict loading performance, handle v4 to v6 masks in preprocessing, add more tests

This commit is contained in:
vdimir 2020-11-15 17:36:05 +03:00
parent 8b91e0984c
commit 5e0e22301b
No known key found for this signature in database
GPG Key ID: 4F25F52AFAF0C2C0
4 changed files with 161 additions and 26 deletions

View File

@ -28,11 +28,33 @@ namespace ErrorCodes
namespace
{
/// Intermediate structure used for loading data.
/// Holds one address together with its prefix length and the row it was built for.
struct IPRecord
{
    Poco::Net::IPAddress addr;
    UInt8 prefix;
    size_t row;
    /// Cached at construction: true when addr belongs to the IPv6 family.
    bool isv6;

    IPRecord(const Poco::Net::IPAddress & addr_, UInt8 prefix_, size_t row_)
        : addr(addr_), prefix(prefix_), row(row_), isv6(addr.family() == Poco::Net::IPAddress::IPv6)
    {
    }

    /// Returns a pointer to the 16-byte IPv6 representation of the address.
    /// For an IPv6 record the pointer refers to the storage returned by addr.addr() and buf is left untouched.
    /// Otherwise the IPv4-mapped form is written into buf (which must hold at least 16 bytes) and buf is returned.
    const uint8_t * asIPv6Binary(uint8_t * buf) const
    {
        if (!isv6)
        {
            /// IPv4-mapped layout: ten zero bytes, two 0xFF bytes, then the four address bytes.
            static constexpr uint8_t mapped_prefix[12] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF};
            memcpy(buf, mapped_prefix, sizeof(mapped_prefix));
            memcpy(buf + sizeof(mapped_prefix), addr.addr(), 4);
            return buf;
        }

        return reinterpret_cast<const uint8_t *>(addr.addr());
    }
};
struct IPv4Subnet
@ -70,6 +92,11 @@ static inline bool compPrefixes(UInt8 a, UInt8 b)
return a < b;
}
/// Reads four bytes stored in network byte order at addr and returns them as a host-order UInt32.
/// The bytes are copied with memcpy instead of being dereferenced through a UInt32 pointer:
/// callers pass pointers into byte buffers (e.g. &addr[12] of a uint8_t array), which are neither
/// guaranteed to be aligned for UInt32 nor allowed to be accessed through a UInt32 lvalue.
inline static UInt32 IPv4AsUInt32(const void * addr)
{
    UInt32 network_order_value;
    memcpy(&network_order_value, addr, sizeof(network_order_value));
    return Poco::ByteOrder::fromNetwork(network_order_value);
}
/// Convert mapped IPv6 to IPv4 if possible
inline static UInt32 mappedIPv4ToBinary(const uint8_t * addr, bool & success)
{
@ -81,7 +108,7 @@ inline static UInt32 mappedIPv4ToBinary(const uint8_t * addr, bool & success)
addr[10] == 0xff && addr[11] == 0xff;
if (!success)
return 0;
return Poco::ByteOrder::fromNetwork(*reinterpret_cast<const UInt32 *>(&addr[12]));
return IPv4AsUInt32(&addr[12]);
}
/// Convert IPv4 to IPv6-mapped and save results to buf
@ -114,10 +141,8 @@ static bool matchIPv6Subnet(const uint8_t * target, const uint8_t * addr, UInt8
{
auto offset = __builtin_ctz(mask);
if (offset < prefix / 8)
return false;
if (offset >= prefix / 8 + 1)
return true;
if (prefix / 8 != offset)
return prefix / 8 < offset;
auto cmpmask = ~(0xff >> (prefix % 8));
return (target[offset] & cmpmask) == addr[offset];
@ -454,14 +479,14 @@ void TrieDictionary::loadData()
UInt8 prefix = std::stoi(addr_str.substr(pos + 1), nullptr, 10);
addr = addr & IPAddress(prefix, addr.family());
ip_records.emplace_back(IPRecord{addr, prefix, row_number});
ip_records.emplace_back(addr, prefix, row_number);
}
else
{
IPAddress addr(addr_str);
has_ipv6 = has_ipv6 || (addr.family() == Poco::Net::IPAddress::IPv6);
UInt8 prefix = addr.length() * 8;
ip_records.emplace_back(IPRecord{addr, prefix, row_number});
ip_records.emplace_back(addr, prefix, row_number);
}
}
}
@ -473,10 +498,10 @@ void TrieDictionary::loadData()
std::sort(ip_records.begin(), ip_records.end(),
[](const auto & record_a, const auto & record_b)
{
auto a = IPv6ToBinary(record_a.addr);
auto b = IPv6ToBinary(record_b.addr);
auto cmpres = memcmp16(reinterpret_cast<const uint8_t *>(a.data()),
reinterpret_cast<const uint8_t *>(b.data()));
uint8_t a_buf[IPV6_BINARY_LENGTH];
uint8_t b_buf[IPV6_BINARY_LENGTH];
auto cmpres = memcmp16(record_a.asIPv6Binary(a_buf), record_b.asIPv6Binary(b_buf));
if (cmpres == 0)
return compPrefixes(record_a.prefix, record_b.prefix);
@ -520,7 +545,7 @@ void TrieDictionary::loadData()
ipv4_col.reserve(ip_records.size());
for (const auto & record : ip_records)
{
auto addr = Poco::ByteOrder::fromNetwork(*reinterpret_cast<const UInt32 *>(record.addr.addr()));
auto addr = IPv4AsUInt32(record.addr.addr());
ipv4_col.push_back(addr);
mask_column.push_back(record.prefix);
row_idx.push_back(record.row);
@ -532,24 +557,42 @@ void TrieDictionary::loadData()
for (const auto i : ext::range(0, ip_records.size()))
{
parent_subnet[i] = i;
const auto & cur_address = ip_records[i].addr;
while (!subnets_stack.empty())
{
size_t subnet_idx = subnets_stack.top();
const auto cur_subnet = ip_records[subnet_idx];
auto cur_addr_masked = cur_address & IPAddress(cur_subnet.prefix, cur_address.family());
if (cur_subnet.addr == cur_addr_masked)
size_t pi = subnets_stack.top();
if (has_ipv6)
{
parent_subnet[i] = subnet_idx;
uint8_t a_buf[IPV6_BINARY_LENGTH];
uint8_t b_buf[IPV6_BINARY_LENGTH];
const auto * cur_address = ip_records[i].asIPv6Binary(a_buf);
const auto * cur_subnet = ip_records[pi].asIPv6Binary(b_buf);
bool is_mask_smaller = ip_records[pi].prefix < ip_records[i].prefix;
if (is_mask_smaller && matchIPv6Subnet(cur_address, cur_subnet, ip_records[pi].prefix))
{
parent_subnet[i] = pi;
break;
}
}
else
{
UInt32 cur_address = IPv4AsUInt32(ip_records[i].addr.addr());
UInt32 cur_subnet = IPv4AsUInt32(ip_records[pi].addr.addr());
bool is_mask_smaller = ip_records[pi].prefix < ip_records[i].prefix;
if (is_mask_smaller && matchIPv4Subnet(cur_address, cur_subnet, ip_records[pi].prefix))
{
parent_subnet[i] = pi;
break;
}
}
subnets_stack.pop();
}
subnets_stack.push(i);
}
LOG_TRACE(logger, "{} ip records are read", ip_records.size());
if (require_nonempty && 0 == element_count)
throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY};
}

View File

@ -3,6 +3,7 @@
CREATE TABLE table_ip_trie
(
ip String,
ver UInt8,
val Float32
) ENGINE = TinyLog
</create_query>
@ -10,16 +11,18 @@
<create_query>
INSERT INTO table_ip_trie
SELECT
IPv4NumToString(ipv4) || '/' || toString(rand() % 25 + 8) as ip,
IPv4NumToString(ipv4) || '/' || toString(rand() % 17 + 16) as ip,
4 as ver,
val
FROM generateRandom('ipv4 UInt32, val Float32', 0, 30, 30)
LIMIT 1000000
LIMIT 500000
</create_query>
<create_query>
INSERT INTO table_ip_trie
SELECT
IPv6NumToString(ipv6) || '/' || toString(rand() % 113 + 16) as ip,
IPv6NumToString(ipv6) || '/' || toString(rand() % 65 + 64) as ip,
6 as ver,
val
FROM generateRandom('ipv6 FixedString(16), val Float32', 0, 30, 30)
LIMIT 2500000
@ -29,6 +32,7 @@
CREATE DICTIONARY dict_ip_trie
(
ip String,
ver UInt8,
val Float32
)
PRIMARY KEY ip
@ -41,10 +45,19 @@
CREATE TABLE dict_ip_trie_table
(
`ip` String,
`ver` UInt8,
`val` Float32
) ENGINE = Dictionary(default.dict_ip_trie)
</create_query>
<create_query>
CREATE TABLE table_ip_from_dict (`ip` String, `ver` UInt8) ENGINE = TinyLog
</create_query>
<create_query>
INSERT INTO table_ip_from_dict
SELECT ip, ver FROM dict_ip_trie_table
</create_query>
<query>
SELECT dictGetFloat32('default.dict_ip_trie', 'val', tuple(rand32()))
FROM numbers(500000)
@ -55,7 +68,22 @@
FROM numbers(500000)
</query>
<query>
SELECT dictGetFloat32('default.dict_ip_trie', 'val', tuple(IPv4StringToNum(ip)))
FROM table_ip_from_dict
WHERE ver == 4
LIMIT 500000
</query>
<query>
SELECT dictGetFloat32('default.dict_ip_trie', 'val', tuple(IPv6StringToNum(ip)))
FROM table_ip_from_dict
WHERE ver == 6
LIMIT 500000
</query>
<drop_query>DROP DICTIONARY IF EXISTS default.dict_ip_trie</drop_query>
<drop_query>DROP TABLE IF EXISTS table_ip_trie</drop_query>
<drop_query>DROP TABLE IF EXISTS dict_ip_trie_table</drop_query>
<drop_query>DROP TABLE IF EXISTS table_ip_from_dict</drop_query>
</test>

View File

@ -396,3 +396,29 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -266,13 +266,13 @@ SELECT 1 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv6StringToNum('200
SELECT 1 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv6StringToNum('2001:db8:ffff:ffff::')));
SELECT 1 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv6StringToNum('2001:db8:ffff:1::')));
SELECT '' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('654f:3716::')));
SELECT 0 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv6StringToNum('654f:3716::')));
SELECT 0 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv6StringToNum('654f:3716:ffff::')));
SELECT '0' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('654f:3716::')));
SELECT 'JA' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('::ffff:654f:3716')));
SELECT 'JA' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('::ffff:101.79.55.22')));
SELECT 'JA' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv4StringToNum('101.79.55.22')));
SELECT 1 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv4StringToNum('127.0.0.1')));
SELECT 1 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv6StringToNum('::ffff:127.0.0.1')));
SELECT '0' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('::0')));
SELECT '1' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('8000::')));
@ -439,6 +439,14 @@ SELECT
toString(number) AS val
FROM VALUES ('number UInt32', 5, 13, 24, 48, 49, 99, 127);
INSERT INTO database_for_dict.table_ip_trie VALUES ('101.79.55.22', 'JA');
INSERT INTO database_for_dict.table_ipv4_trie
SELECT
'255.255.255.255/' || toString(number) AS prefix,
toString(number) AS val
FROM VALUES ('number UInt32', 5, 13, 24, 30);
CREATE DICTIONARY database_for_dict.dict_ip_trie
(
prefix String,
@ -451,6 +459,14 @@ LIFETIME(MIN 10 MAX 100);
SELECT 0 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv6StringToNum('::ffff:1:1')));
SELECT '' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('654f:3716::')));
SELECT 0 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv6StringToNum('654f:3716::')));
SELECT 0 == dictHas('database_for_dict.dict_ip_trie', tuple(IPv6StringToNum('654f:3716:ffff::')));
SELECT 'JA' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('::ffff:654f:3716')));
SELECT 'JA' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('::ffff:101.79.55.22')));
SELECT 'JA' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv4StringToNum('101.79.55.22')));
SELECT '' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('::0')));
SELECT '' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('8000::')));
SELECT '' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('c000::')));
@ -587,4 +603,26 @@ SELECT '127' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv
SELECT '127' == dictGetString('database_for_dict.dict_ip_trie', 'val', tuple(IPv6StringToNum('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff')));
SELECT '3' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.0.0.0')));
SELECT '4' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.0.0.1')));
SELECT '3' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.0.0.127')));
SELECT '2' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.0.255.127')));
SELECT '15' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.255.127.127')));
SELECT '16' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.255.128.9')));
SELECT '16' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.255.128.127')));
SELECT '18' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.255.128.10')));
SELECT '19' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.255.128.255')));
SELECT '20' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv4StringToNum('127.255.255.128')));
SELECT '3' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7f00:0')));
SELECT '4' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7f00:1')));
SELECT '3' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7f00:7f')));
SELECT '2' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7f00:ff7f')));
SELECT '15' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7fff:7f7f')));
SELECT '16' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7fff:8009')));
SELECT '16' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7fff:807f')));
SELECT '18' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7fff:800a')));
SELECT '19' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7fff:80ff')));
SELECT '20' == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'val', tuple(IPv6StringToNum('::ffff:7fff:ff80')));
DROP DATABASE IF EXISTS database_for_dict;