Merge pull request #5030 from PerformanceVision/top_level_domain

Changing how the function (cutTo)firstSignificantSubdomain detects the TLD (proposal)
This commit is contained in:
alexey-milovidov 2019-06-19 02:08:43 +03:00 committed by GitHub
commit 9bdbd84263
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 4956 additions and 36 deletions

View File

@ -268,6 +268,15 @@ if (USE_INCLUDE_WHAT_YOU_USE)
endif()
endif ()
# Locate the gperf executable when USE_GPERF is enabled (it is ON by default).
# Configuration aborts with a fatal error if the option is ON and gperf cannot be found;
# on success the path is available in the GPERF cache variable.
option (USE_GPERF "Use gperf function hash generator tool" ON)
if (USE_GPERF)
    find_program (GPERF NAMES gperf)
    if (NOT GPERF)
        message (FATAL_ERROR "Could not find the program gperf")
    endif ()
endif ()
# Flags for test coverage
if (TEST_COVERAGE)
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -DIS_DEBUG")
@ -344,6 +353,7 @@ find_contrib_lib(metrohash)
find_contrib_lib(btrie)
find_contrib_lib(double-conversion)
include (cmake/find_parquet.cmake)
if (ENABLE_TESTS)
include (cmake/find_gtest.cmake)
endif ()

View File

@ -29,6 +29,7 @@
#cmakedefine01 USE_RAPIDJSON
#cmakedefine01 USE_LFALLOC
#cmakedefine01 USE_LFALLOC_RANDOM_HINT
#cmakedefine01 USE_GPERF
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY
#cmakedefine01 LLVM_HAS_RTTI

View File

@ -3,6 +3,14 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_functions ./GatherUtils)
add_headers_and_sources(clickhouse_functions .)
# When gperf is enabled, generate tldLookup.cpp in the build tree from
# gperf/tldLookup.gperf and append it to clickhouse_functions_sources.
#  - DEPENDS makes the build re-run gperf whenever the .gperf description changes;
#    without it the generated file is never refreshed after the first build.
#  - VERBATIM gives platform-independent escaping of the command arguments.
#  - Paths are quoted so that directories containing spaces or ';' stay single arguments.
if (USE_GPERF)
    add_custom_command (
        OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp"
        COMMAND "${GPERF}" "${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf" "--output-file=${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp"
        DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf"
        VERBATIM
    )
    list (APPEND clickhouse_functions_sources "${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp")
endif ()
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp)
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h)

View File

@ -2,7 +2,10 @@
#include <Functions/domain.h>
#include <common/find_symbols.h>
#include <Common/config.h>
#if USE_GPERF
# include <Functions/tldLookup.h>
#endif
namespace DB
{
@ -58,33 +61,25 @@ struct ExtractFirstSignificantSubdomain
if (!last_3_periods[2])
last_3_periods[2] = begin - 1;
size_t size_of_second_subdomain_plus_period = last_3_periods[0] - last_3_periods[1];
if (size_of_second_subdomain_plus_period == 4 || size_of_second_subdomain_plus_period == 3)
auto end_of_level_domain = find_first_symbols<'/'>(last_3_periods[0], end);
if (!end_of_level_domain)
{
/// We will key by four bytes that are either ".xyz" or ".xy.".
UInt32 key = unalignedLoad<UInt32>(last_3_periods[1]);
/// NOTE: assuming little endian.
/// NOTE: does the compiler generate SIMD code?
/// NOTE: for larger amount of cases we can use a perfect hash table (see 'gperf' as an example).
if ( key == '.' + 'c' * 0x100U + 'o' * 0x10000U + 'm' * 0x1000000U
|| key == '.' + 'n' * 0x100U + 'e' * 0x10000U + 't' * 0x1000000U
|| key == '.' + 'o' * 0x100U + 'r' * 0x10000U + 'g' * 0x1000000U
|| key == '.' + 'b' * 0x100U + 'i' * 0x10000U + 'z' * 0x1000000U
|| key == '.' + 'g' * 0x100U + 'o' * 0x10000U + 'v' * 0x1000000U
|| key == '.' + 'm' * 0x100U + 'i' * 0x10000U + 'l' * 0x1000000U
|| key == '.' + 'e' * 0x100U + 'd' * 0x10000U + 'u' * 0x1000000U
|| key == '.' + 'c' * 0x100U + 'o' * 0x10000U + '.' * 0x1000000U)
{
res_data += last_3_periods[2] + 1 - begin;
res_size = last_3_periods[1] - last_3_periods[2] - 1;
return;
}
end_of_level_domain = end;
}
res_data += last_3_periods[1] + 1 - begin;
res_size = last_3_periods[0] - last_3_periods[1] - 1;
}
#if USE_GPERF
if (tldLookup::is_valid(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1) != nullptr)
{
res_data += last_3_periods[2] + 1 - begin;
res_size = last_3_periods[1] - last_3_periods[2] - 1;
}
else
{
res_data += last_3_periods[1] + 1 - begin;
res_size = last_3_periods[0] - last_3_periods[1] - 1;
}
#endif
}
};
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,18 @@
#pragma once
#include <Common/config.h>
#if USE_GPERF
# include <cstddef> /// size_t, so this header does not rely on an earlier include to provide it.
/// Declaration of the lookup class. Per the original note, the class itself is
/// generated by gperf from gperf/tldLookup.gperf; only the declaration lives here,
/// so it has to stay in sync with what gperf emits.
class tldLookupHash
{
private:
    /// Hash of the first `len` bytes of `str`.
    /// NOTE(review): presumably used internally by is_valid — confirm against the generated source.
    static inline unsigned int hash (const char *str, size_t len);
public:
    /// Looks up the first `len` bytes of `str`.
    /// NOTE(review): callers compare the result with nullptr; presumably a non-null
    /// result means the string is a known entry — confirm against the .gperf file.
    static const char *is_valid (const char *str, size_t len);
};
namespace DB
{
    /// Alias that makes the generated lookup class available inside namespace DB.
    using tldLookup = tldLookupHash;
}
#endif

View File

@ -1,3 +1,3 @@
canada congo net-domena
yandex yandex yandex yandex яндекс яндекс yandex
yandex yandex yandex яндекс yandex
canada hello hello hello hello hello canada canada

View File

@ -8,8 +8,6 @@ SELECT
firstSignificantSubdomain('https://www.yandex.ua/news.html'),
firstSignificantSubdomain('magnet:yandex.abc'),
firstSignificantSubdomain('ftp://www.yandex.co.uk/news.html'),
firstSignificantSubdomain('ftp://yandex.co.yandex'),
firstSignificantSubdomain('http://ввв.яндекс.org.рф'),
firstSignificantSubdomain('https://api.www3.static.dev.ввв.яндекс.рф'),
firstSignificantSubdomain('//www.yandex.com.tr/news.html');

View File

@ -1,3 +1,3 @@
usa
pentagon
stanford
gov
mil
edu

View File

@ -23,7 +23,8 @@ RUN apt-get update -y \
python-requests \
python-termcolor \
sudo \
tzdata
tzdata \
gperf
COPY build.sh /

View File

@ -45,7 +45,8 @@ RUN apt-get update -y \
ninja-build \
gperf \
git \
tzdata
tzdata \
gperf
COPY build.sh /
CMD ["/bin/bash", "/build.sh"]

View File

@ -66,7 +66,8 @@ RUN apt-get --allow-unauthenticated update -y \
libjemalloc-dev \
unixodbc-dev \
odbcinst \
tzdata
tzdata \
gperf
COPY build.sh /
CMD ["/bin/bash", "/build.sh"]

View File

@ -67,7 +67,7 @@ export CXX=g++-7
## Install Required Libraries from Packages
```bash
sudo apt-get install libicu-dev libreadline-dev
sudo apt-get install libicu-dev libreadline-dev gperf
```
## Checkout ClickHouse Sources

View File

@ -11,7 +11,7 @@ Build should work on Mac OS X 10.12.
## Install Required Compilers, Tools, and Libraries
```bash
brew install cmake ninja gcc icu4c openssl libtool gettext readline
brew install cmake ninja gcc icu4c openssl libtool gettext readline gperf
```
## Checkout ClickHouse Sources