Use a gperf perfect hash to check the TLD more accurately in firstSignificantSubdomain and cutToFirstSignificantSubdomain

This commit is contained in:
Guillaume Tassery 2019-04-17 13:21:26 +07:00
parent cdc65eca23
commit 4cc0ee677a
9 changed files with 49712 additions and 28 deletions

3
.gitignore vendored
View File

@ -223,6 +223,9 @@ config-preprocessed.xml
*.pb.cpp
*.pb.h
# Gperf generated file
dbms/src/Functions/tldLookup.cpp
# Ignore symlink to private repository
/private

View File

@ -250,6 +250,15 @@ if (USE_INCLUDE_WHAT_YOU_USE)
endif()
endif ()
# gperf generates perfect-hash lookup code. When USE_GPERF is enabled the
# executable must be available; its path is stored in GPERF and configuration
# stops with an error if it cannot be found.
option (USE_GPERF "Use gperf function hash generator tool" ON)
if (USE_GPERF)
    find_program (GPERF NAMES gperf)
    if (NOT GPERF)
        message (FATAL_ERROR "Could not find the program gperf")
    endif ()
endif ()
# Flags for test coverage
if (TEST_COVERAGE)
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -DIS_DEBUG")
@ -323,6 +332,7 @@ find_contrib_lib(metrohash)
# Look up third-party libraries through find_contrib_lib (helper defined
# outside this excerpt; presumably prefers the bundled contrib copy — confirm).
find_contrib_lib(btrie)
find_contrib_lib(double-conversion)
# Pull in the find_parquet module.
include (cmake/find_parquet.cmake)
# The find_gtest module is included only when ENABLE_TESTS is set.
if (ENABLE_TESTS)
include (cmake/find_gtest.cmake)
endif ()

View File

@ -3,6 +3,12 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
# Collect headers and sources for clickhouse_functions from GatherUtils and
# from this directory.
add_headers_and_sources(clickhouse_functions ./GatherUtils)
add_headers_and_sources(clickhouse_functions .)

# Generate the TLD perfect-hash lookup from its gperf description.
#  - ${GPERF} is the gperf executable, set by a find_program(GPERF gperf) call
#    elsewhere in the project.
#  - DEPENDS makes the build regenerate tldLookup.cpp whenever the .gperf input
#    changes; without it an edited TLD list would leave a stale lookup table.
#  - VERBATIM keeps argument escaping identical on every platform/generator.
# NOTE: the output is written into the source tree (it is listed in .gitignore).
add_custom_command(
    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/tldLookup.cpp"
    COMMAND ${GPERF} "${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf" "--output-file=${CMAKE_CURRENT_SOURCE_DIR}/tldLookup.cpp"
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf"
    COMMENT "Generating tldLookup.cpp with gperf"
    VERBATIM
)
# Add the generated file to the library sources so the rule above is triggered.
list(APPEND clickhouse_functions_sources tldLookup.cpp)

# These files are excluded from the clickhouse_functions lists
# (presumably they are built as part of another target — confirm).
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp)
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h)

View File

@ -2,7 +2,7 @@
#include <Functions/domain.h>
#include <common/find_symbols.h>
#include <Functions/tldLookup.h>
namespace DB
{
@ -58,33 +58,16 @@ struct ExtractFirstSignificantSubdomain
if (!last_3_periods[2])
last_3_periods[2] = begin - 1;
size_t size_of_second_subdomain_plus_period = last_3_periods[0] - last_3_periods[1];
if (size_of_second_subdomain_plus_period == 4 || size_of_second_subdomain_plus_period == 3)
{
/// We will key by four bytes that are either ".xyz" or ".xy.".
UInt32 key = unalignedLoad<UInt32>(last_3_periods[1]);
if (tldLookup::is_valid(last_3_periods[1] + 1, end - last_3_periods[1] - 1) != nullptr)
{
res_data += last_3_periods[2] + 1 - begin;
res_size = last_3_periods[1] - last_3_periods[2] - 1;
} else {
res_data += last_3_periods[1] + 1 - begin;
res_size = last_3_periods[0] - last_3_periods[1] - 1;
}
/// NOTE: assuming little endian.
/// NOTE: does the compiler generate SIMD code?
/// NOTE: for larger amount of cases we can use a perfect hash table (see 'gperf' as an example).
if ( key == '.' + 'c' * 0x100U + 'o' * 0x10000U + 'm' * 0x1000000U
|| key == '.' + 'n' * 0x100U + 'e' * 0x10000U + 't' * 0x1000000U
|| key == '.' + 'o' * 0x100U + 'r' * 0x10000U + 'g' * 0x1000000U
|| key == '.' + 'b' * 0x100U + 'i' * 0x10000U + 'z' * 0x1000000U
|| key == '.' + 'g' * 0x100U + 'o' * 0x10000U + 'v' * 0x1000000U
|| key == '.' + 'm' * 0x100U + 'i' * 0x10000U + 'l' * 0x1000000U
|| key == '.' + 'e' * 0x100U + 'd' * 0x10000U + 'u' * 0x1000000U
|| key == '.' + 'c' * 0x100U + 'o' * 0x10000U + '.' * 0x1000000U)
{
res_data += last_3_periods[2] + 1 - begin;
res_size = last_3_periods[1] - last_3_periods[2] - 1;
return;
}
}
res_data += last_3_periods[1] + 1 - begin;
res_size = last_3_periods[0] - last_3_periods[1] - 1;
}
}
};
}

41028
dbms/src/Functions/gperf/f.cpp Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,16 @@
#pragma once

#include <cstddef> // size_t is used in the declarations below

/// Declaration of the class generated by gperf from gperf/tldLookup.gperf.
/// The definitions live in the generated tldLookup.cpp; this header only
/// exposes the interface so other translation units can call the lookup.
///
/// NOTE(review): the declaration must match the generated code exactly. The
/// type of `len` in gperf output is believed to depend on the gperf version
/// (older releases emitting `unsigned int`) — confirm against the gperf
/// version used for the build.
class tldLookupHash
{
private:
    /// Hash function used internally by the generated lookup.
    static inline unsigned int hash(const char * str, size_t len);

public:
    /// Looks up the `len` bytes starting at `str` in the generated table.
    /// Callers treat a non-null result as "found" and nullptr as "not found".
    static const char * is_valid(const char * str, size_t len);
};

namespace DB
{
/// Alias that makes the generated class available inside the DB namespace.
using tldLookup = tldLookupHash;
}

View File

@ -70,6 +70,11 @@ export CXX=g++-7
sudo apt-get install libicu-dev libreadline-dev
```
## Install Additional Dependencies
```bash
sudo apt-get install gperf
```
## Checkout ClickHouse Sources
```bash

View File

@ -11,7 +11,7 @@ Build should work on Mac OS X 10.12.
## Install Required Compilers, Tools, and Libraries
```bash
brew install cmake ninja gcc icu4c openssl libtool gettext readline
brew install cmake ninja gcc icu4c openssl libtool gettext readline gperf
```
## Checkout ClickHouse Sources