mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Use a gperf perfect hash to check the TLD more accurately in firstSignificantSubdomain and cutToFirstSignificantSubdomain
This commit is contained in:
parent
cdc65eca23
commit
4cc0ee677a
3
.gitignore
vendored
3
.gitignore
vendored
@ -223,6 +223,9 @@ config-preprocessed.xml
|
||||
*.pb.cpp
|
||||
*.pb.h
|
||||
|
||||
# Gperf generated file
|
||||
dbms/src/Functions/tldLookup.cpp
|
||||
|
||||
# Ignore symlink to private repository
|
||||
/private
|
||||
|
||||
|
@ -250,6 +250,15 @@ if (USE_INCLUDE_WHAT_YOU_USE)
|
||||
endif()
|
||||
endif ()
|
||||
|
||||
# gperf support: USE_GPERF (ON by default) controls whether the gperf
# executable is looked up; its location is stored in GPERF.
option (USE_GPERF "Use gperf function hash generator tool" ON)

# Search for the executable only when the option is enabled.
if (USE_GPERF)
    find_program (GPERF gperf)
endif ()

# gperf was requested but could not be located: stop the configuration.
if (USE_GPERF AND NOT GPERF)
    message (FATAL_ERROR "Could not find the program gperf")
endif ()
|
||||
|
||||
# Flags for test coverage
|
||||
if (TEST_COVERAGE)
|
||||
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -DIS_DEBUG")
|
||||
@ -323,6 +332,7 @@ find_contrib_lib(metrohash)
|
||||
find_contrib_lib(btrie)
|
||||
find_contrib_lib(double-conversion)
|
||||
include (cmake/find_parquet.cmake)
|
||||
|
||||
if (ENABLE_TESTS)
|
||||
include (cmake/find_gtest.cmake)
|
||||
endif ()
|
||||
|
@ -3,6 +3,12 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
|
||||
add_headers_and_sources(clickhouse_functions ./GatherUtils)
|
||||
add_headers_and_sources(clickhouse_functions .)
|
||||
|
||||
# Generate tldLookup.cpp from gperf/tldLookup.gperf with the gperf executable
# stored in GPERF.
#
# The output is written next to the other sources of this directory, which is
# why the relative name "tldLookup.cpp" appended to the source list below
# refers to the generated file.
#
# DEPENDS makes the rule re-run whenever the .gperf description changes;
# without it an edited description would leave a stale tldLookup.cpp in place.
# VERBATIM keeps argument escaping identical on every platform.
add_custom_command(
    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/tldLookup.cpp"
    COMMAND ${GPERF} "${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf" "--output-file=${CMAKE_CURRENT_SOURCE_DIR}/tldLookup.cpp"
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf"
    VERBATIM
)

# Compile the generated file as part of clickhouse_functions.
list(APPEND clickhouse_functions_sources tldLookup.cpp)
|
||||
|
||||
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp)
|
||||
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h)
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
#include <Functions/domain.h>
|
||||
#include <common/find_symbols.h>
|
||||
|
||||
#include <Functions/tldLookup.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -58,33 +58,16 @@ struct ExtractFirstSignificantSubdomain
|
||||
if (!last_3_periods[2])
|
||||
last_3_periods[2] = begin - 1;
|
||||
|
||||
size_t size_of_second_subdomain_plus_period = last_3_periods[0] - last_3_periods[1];
|
||||
if (size_of_second_subdomain_plus_period == 4 || size_of_second_subdomain_plus_period == 3)
|
||||
{
|
||||
/// We will key by four bytes that are either ".xyz" or ".xy.".
|
||||
UInt32 key = unalignedLoad<UInt32>(last_3_periods[1]);
|
||||
if (tldLookup::is_valid(last_3_periods[1] + 1, end - last_3_periods[1] - 1) != nullptr)
|
||||
{
|
||||
res_data += last_3_periods[2] + 1 - begin;
|
||||
res_size = last_3_periods[1] - last_3_periods[2] - 1;
|
||||
} else {
|
||||
res_data += last_3_periods[1] + 1 - begin;
|
||||
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
||||
}
|
||||
|
||||
/// NOTE: assuming little endian.
|
||||
/// NOTE: does the compiler generate SIMD code?
|
||||
/// NOTE: for larger amount of cases we can use a perfect hash table (see 'gperf' as an example).
|
||||
if ( key == '.' + 'c' * 0x100U + 'o' * 0x10000U + 'm' * 0x1000000U
|
||||
|| key == '.' + 'n' * 0x100U + 'e' * 0x10000U + 't' * 0x1000000U
|
||||
|| key == '.' + 'o' * 0x100U + 'r' * 0x10000U + 'g' * 0x1000000U
|
||||
|| key == '.' + 'b' * 0x100U + 'i' * 0x10000U + 'z' * 0x1000000U
|
||||
|| key == '.' + 'g' * 0x100U + 'o' * 0x10000U + 'v' * 0x1000000U
|
||||
|| key == '.' + 'm' * 0x100U + 'i' * 0x10000U + 'l' * 0x1000000U
|
||||
|| key == '.' + 'e' * 0x100U + 'd' * 0x10000U + 'u' * 0x1000000U
|
||||
|| key == '.' + 'c' * 0x100U + 'o' * 0x10000U + '.' * 0x1000000U)
|
||||
{
|
||||
res_data += last_3_periods[2] + 1 - begin;
|
||||
res_size = last_3_periods[1] - last_3_periods[2] - 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
res_data += last_3_periods[1] + 1 - begin;
|
||||
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
41028
dbms/src/Functions/gperf/f.cpp
Normal file
41028
dbms/src/Functions/gperf/f.cpp
Normal file
File diff suppressed because it is too large
Load Diff
8633
dbms/src/Functions/gperf/tldLookup.gperf
Normal file
8633
dbms/src/Functions/gperf/tldLookup.gperf
Normal file
File diff suppressed because it is too large
Load Diff
16
dbms/src/Functions/tldLookup.h
Normal file
16
dbms/src/Functions/tldLookup.h
Normal file
@ -0,0 +1,16 @@
|
||||
#pragma once

#include <cstddef>  // size_t, so this header does not depend on what was included before it

/// Declaration of the lookup class that gperf generates from
/// gperf/tldLookup.gperf. Only the declaration lives here; the member
/// definitions are expected to come from the gperf-generated source file.
class tldLookupHash
{
private:
    /// Hash of the first `len` bytes of `str`.
    /// NOTE(review): presumably used internally by is_valid - confirm against the generated code.
    static inline unsigned int hash (const char *str, size_t len);

public:
    /// Looks up the `len` bytes starting at `str`.
    /// Callers compare the result against nullptr and treat a non-null
    /// pointer as "this string is a known TLD".
    static const char *is_valid (const char *str, size_t len);
};

namespace DB
{
    /// Name under which the generated class is used inside DB code.
    using tldLookup = tldLookupHash;
}
|
@ -70,6 +70,11 @@ export CXX=g++-7
|
||||
sudo apt-get install libicu-dev libreadline-dev
|
||||
```
|
||||
|
||||
## Install Additional Dependencies
|
||||
```bash
|
||||
sudo apt-get install gperf
|
||||
```
|
||||
|
||||
## Checkout ClickHouse Sources
|
||||
|
||||
```bash
|
||||
|
@ -11,7 +11,7 @@ Build should work on Mac OS X 10.12.
|
||||
## Install Required Compilers, Tools, and Libraries
|
||||
|
||||
```bash
|
||||
brew install cmake ninja gcc icu4c openssl libtool gettext readline
|
||||
brew install cmake ninja gcc icu4c openssl libtool gettext readline gperf
|
||||
```
|
||||
|
||||
## Checkout ClickHouse Sources
|
||||
|
Loading…
Reference in New Issue
Block a user