Merge pull request #5030 from PerformanceVision/top_level_domain

Change how the functions firstSignificantSubdomain and cutToFirstSignificantSubdomain detect the TLD (proposal)
This commit is contained in:
alexey-milovidov 2019-06-19 02:08:43 +03:00 committed by GitHub
commit 9bdbd84263
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 4956 additions and 36 deletions

View File

@ -268,6 +268,15 @@ if (USE_INCLUDE_WHAT_YOU_USE)
endif() endif()
endif () endif ()
# gperf support: when USE_GPERF is enabled, locate the gperf executable and
# stop the configuration step with an error if it cannot be found.
option (USE_GPERF "Use gperf function hash generator tool" ON)
if (USE_GPERF)
    find_program (GPERF gperf)
    if (NOT GPERF)
        message (FATAL_ERROR "Could not find the program gperf")
    endif ()
endif ()
# Flags for test coverage # Flags for test coverage
if (TEST_COVERAGE) if (TEST_COVERAGE)
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -DIS_DEBUG") set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -DIS_DEBUG")
@ -344,6 +353,7 @@ find_contrib_lib(metrohash)
find_contrib_lib(btrie) find_contrib_lib(btrie)
find_contrib_lib(double-conversion) find_contrib_lib(double-conversion)
include (cmake/find_parquet.cmake) include (cmake/find_parquet.cmake)
if (ENABLE_TESTS) if (ENABLE_TESTS)
include (cmake/find_gtest.cmake) include (cmake/find_gtest.cmake)
endif () endif ()

View File

@ -29,6 +29,7 @@
#cmakedefine01 USE_RAPIDJSON #cmakedefine01 USE_RAPIDJSON
#cmakedefine01 USE_LFALLOC #cmakedefine01 USE_LFALLOC
#cmakedefine01 USE_LFALLOC_RANDOM_HINT #cmakedefine01 USE_LFALLOC_RANDOM_HINT
#cmakedefine01 USE_GPERF
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY #cmakedefine01 CLICKHOUSE_SPLIT_BINARY
#cmakedefine01 LLVM_HAS_RTTI #cmakedefine01 LLVM_HAS_RTTI

View File

@ -3,6 +3,14 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_functions ./GatherUtils) add_headers_and_sources(clickhouse_functions ./GatherUtils)
add_headers_and_sources(clickhouse_functions .) add_headers_and_sources(clickhouse_functions .)
if (USE_GPERF)
    # Generate tldLookup.cpp from its gperf description at build time and
    # compile it as part of the clickhouse_functions sources.
    set (tld_lookup_gperf "${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf")
    set (tld_lookup_cpp "${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp")

    # DEPENDS makes the build re-run gperf whenever the description file
    # changes; without it the generated source would silently go stale.
    # VERBATIM keeps argument escaping identical across platforms and generators.
    add_custom_command (
        OUTPUT "${tld_lookup_cpp}"
        COMMAND "${GPERF}" "${tld_lookup_gperf}" "--output-file=${tld_lookup_cpp}"
        DEPENDS "${tld_lookup_gperf}"
        COMMENT "Generating tldLookup.cpp with gperf"
        VERBATIM
    )
    list (APPEND clickhouse_functions_sources "${tld_lookup_cpp}")
endif ()
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp) list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp)
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h) list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h)

View File

@ -2,7 +2,10 @@
#include <Functions/domain.h> #include <Functions/domain.h>
#include <common/find_symbols.h> #include <common/find_symbols.h>
#include <Common/config.h>
#if USE_GPERF
# include <Functions/tldLookup.h>
#endif
namespace DB namespace DB
{ {
@ -58,33 +61,25 @@ struct ExtractFirstSignificantSubdomain
if (!last_3_periods[2]) if (!last_3_periods[2])
last_3_periods[2] = begin - 1; last_3_periods[2] = begin - 1;
size_t size_of_second_subdomain_plus_period = last_3_periods[0] - last_3_periods[1]; auto end_of_level_domain = find_first_symbols<'/'>(last_3_periods[0], end);
if (size_of_second_subdomain_plus_period == 4 || size_of_second_subdomain_plus_period == 3) if (!end_of_level_domain)
{ {
/// We will key by four bytes that are either ".xyz" or ".xy.". end_of_level_domain = end;
UInt32 key = unalignedLoad<UInt32>(last_3_periods[1]);
/// NOTE: assuming little endian.
/// NOTE: does the compiler generate SIMD code?
/// NOTE: for larger amount of cases we can use a perfect hash table (see 'gperf' as an example).
if ( key == '.' + 'c' * 0x100U + 'o' * 0x10000U + 'm' * 0x1000000U
|| key == '.' + 'n' * 0x100U + 'e' * 0x10000U + 't' * 0x1000000U
|| key == '.' + 'o' * 0x100U + 'r' * 0x10000U + 'g' * 0x1000000U
|| key == '.' + 'b' * 0x100U + 'i' * 0x10000U + 'z' * 0x1000000U
|| key == '.' + 'g' * 0x100U + 'o' * 0x10000U + 'v' * 0x1000000U
|| key == '.' + 'm' * 0x100U + 'i' * 0x10000U + 'l' * 0x1000000U
|| key == '.' + 'e' * 0x100U + 'd' * 0x10000U + 'u' * 0x1000000U
|| key == '.' + 'c' * 0x100U + 'o' * 0x10000U + '.' * 0x1000000U)
{
res_data += last_3_periods[2] + 1 - begin;
res_size = last_3_periods[1] - last_3_periods[2] - 1;
return;
}
} }
res_data += last_3_periods[1] + 1 - begin; #if USE_GPERF
res_size = last_3_periods[0] - last_3_periods[1] - 1; if (tldLookup::is_valid(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1) != nullptr)
} {
res_data += last_3_periods[2] + 1 - begin;
res_size = last_3_periods[1] - last_3_periods[2] - 1;
}
else
{
res_data += last_3_periods[1] + 1 - begin;
res_size = last_3_periods[0] - last_3_periods[1] - 1;
}
#endif
}
}; };
} }

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,18 @@
#pragma once

#include <cstddef> // size_t is used in the declarations below

#include <Common/config.h>

#if USE_GPERF
/// Declaration of the lookup class that gperf generates from gperf/tldLookup.gperf.
/// Only the interface is declared here; the member functions are implemented in
/// the generated source file.
class tldLookupHash
{
private:
    /// Hash of the first `len` bytes of `str` (implemented in the generated code).
    static inline unsigned int hash(const char * str, size_t len);

public:
    /// Looks up the first `len` bytes of `str` in the generated table.
    /// Callers test the result against nullptr; presumably nullptr means
    /// "not in the table" - confirm against the generated code.
    static const char * is_valid(const char * str, size_t len);
};

namespace DB
{
/// Shorter alias used by the code inside namespace DB.
using tldLookup = tldLookupHash;
}
#endif

View File

@ -1,3 +1,3 @@
canada congo net-domena canada congo net-domena
yandex yandex yandex yandex яндекс яндекс yandex yandex yandex yandex яндекс yandex
canada hello hello hello hello hello canada canada canada hello hello hello hello hello canada canada

View File

@ -8,8 +8,6 @@ SELECT
firstSignificantSubdomain('https://www.yandex.ua/news.html'), firstSignificantSubdomain('https://www.yandex.ua/news.html'),
firstSignificantSubdomain('magnet:yandex.abc'), firstSignificantSubdomain('magnet:yandex.abc'),
firstSignificantSubdomain('ftp://www.yandex.co.uk/news.html'), firstSignificantSubdomain('ftp://www.yandex.co.uk/news.html'),
firstSignificantSubdomain('ftp://yandex.co.yandex'),
firstSignificantSubdomain('http://ввв.яндекс.org.рф'),
firstSignificantSubdomain('https://api.www3.static.dev.ввв.яндекс.рф'), firstSignificantSubdomain('https://api.www3.static.dev.ввв.яндекс.рф'),
firstSignificantSubdomain('//www.yandex.com.tr/news.html'); firstSignificantSubdomain('//www.yandex.com.tr/news.html');

View File

@ -1,3 +1,3 @@
usa gov
pentagon mil
stanford edu

View File

@ -23,7 +23,8 @@ RUN apt-get update -y \
python-requests \ python-requests \
python-termcolor \ python-termcolor \
sudo \ sudo \
tzdata tzdata \
gperf
COPY build.sh / COPY build.sh /

View File

@ -45,7 +45,8 @@ RUN apt-get update -y \
ninja-build \ ninja-build \
gperf \ gperf \
git \ git \
tzdata tzdata \
gperf
COPY build.sh / COPY build.sh /
CMD ["/bin/bash", "/build.sh"] CMD ["/bin/bash", "/build.sh"]

View File

@ -66,7 +66,8 @@ RUN apt-get --allow-unauthenticated update -y \
libjemalloc-dev \ libjemalloc-dev \
unixodbc-dev \ unixodbc-dev \
odbcinst \ odbcinst \
tzdata tzdata \
gperf
COPY build.sh / COPY build.sh /
CMD ["/bin/bash", "/build.sh"] CMD ["/bin/bash", "/build.sh"]

View File

@ -67,7 +67,7 @@ export CXX=g++-7
## Install Required Libraries from Packages ## Install Required Libraries from Packages
```bash ```bash
sudo apt-get install libicu-dev libreadline-dev sudo apt-get install libicu-dev libreadline-dev gperf
``` ```
## Checkout ClickHouse Sources ## Checkout ClickHouse Sources

View File

@ -11,7 +11,7 @@ Build should work on Mac OS X 10.12.
## Install Required Compilers, Tools, and Libraries ## Install Required Compilers, Tools, and Libraries
```bash ```bash
brew install cmake ninja gcc icu4c openssl libtool gettext readline brew install cmake ninja gcc icu4c openssl libtool gettext readline gperf
``` ```
## Checkout ClickHouse Sources ## Checkout ClickHouse Sources