mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 08:32:02 +00:00
Merge pull request #5030 from PerformanceVision/top_level_domain
Changing how the function (cutTo)firstSignificantSubdomain detects the TLD (proposal)
This commit is contained in:
commit
9bdbd84263
@ -268,6 +268,15 @@ if (USE_INCLUDE_WHAT_YOU_USE)
|
||||
endif()
|
||||
endif ()
|
||||
|
||||
# gperf support.
# USE_GPERF (default ON) enables the gperf-based hash generator step. When it
# is enabled, the gperf executable is looked up and its location is stored in
# GPERF; configuration stops with a fatal error if the program cannot be found.
option (USE_GPERF "Use gperf function hash generator tool" ON)

if (USE_GPERF)
    find_program (GPERF gperf)

    if (NOT GPERF)
        message (FATAL_ERROR "Could not find the program gperf")
    endif ()
endif ()
|
||||
|
||||
# Flags for test coverage
|
||||
if (TEST_COVERAGE)
|
||||
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -DIS_DEBUG")
|
||||
@ -344,6 +353,7 @@ find_contrib_lib(metrohash)
|
||||
find_contrib_lib(btrie)
|
||||
find_contrib_lib(double-conversion)
|
||||
include (cmake/find_parquet.cmake)
|
||||
|
||||
if (ENABLE_TESTS)
|
||||
include (cmake/find_gtest.cmake)
|
||||
endif ()
|
||||
|
@ -29,6 +29,7 @@
|
||||
#cmakedefine01 USE_RAPIDJSON
|
||||
#cmakedefine01 USE_LFALLOC
|
||||
#cmakedefine01 USE_LFALLOC_RANDOM_HINT
|
||||
#cmakedefine01 USE_GPERF
|
||||
|
||||
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY
|
||||
#cmakedefine01 LLVM_HAS_RTTI
|
||||
|
@ -3,6 +3,14 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
|
||||
add_headers_and_sources(clickhouse_functions ./GatherUtils)
|
||||
add_headers_and_sources(clickhouse_functions .)
|
||||
|
||||
# When gperf is enabled, generate tldLookup.cpp in the build directory from
# gperf/tldLookup.gperf and compile it together with the other function sources.
#
# DEPENDS makes the build re-run gperf whenever the .gperf input changes;
# without it the generated file would go stale after the TLD list is edited.
# VERBATIM keeps argument escaping identical across platforms and generators.
if (USE_GPERF)
    add_custom_command(
        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp
        COMMAND ${GPERF} ${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf --output-file=${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf
        VERBATIM
    )

    list(APPEND clickhouse_functions_sources ${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp)
endif ()
|
||||
|
||||
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp)
|
||||
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h)
|
||||
|
@ -2,7 +2,10 @@
|
||||
|
||||
#include <Functions/domain.h>
|
||||
#include <common/find_symbols.h>
|
||||
|
||||
#include <Common/config.h>
|
||||
#if USE_GPERF
|
||||
# include <Functions/tldLookup.h>
|
||||
#endif
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -58,33 +61,25 @@ struct ExtractFirstSignificantSubdomain
|
||||
if (!last_3_periods[2])
|
||||
last_3_periods[2] = begin - 1;
|
||||
|
||||
size_t size_of_second_subdomain_plus_period = last_3_periods[0] - last_3_periods[1];
|
||||
if (size_of_second_subdomain_plus_period == 4 || size_of_second_subdomain_plus_period == 3)
|
||||
auto end_of_level_domain = find_first_symbols<'/'>(last_3_periods[0], end);
|
||||
if (!end_of_level_domain)
|
||||
{
|
||||
/// We will key by four bytes that are either ".xyz" or ".xy.".
|
||||
UInt32 key = unalignedLoad<UInt32>(last_3_periods[1]);
|
||||
|
||||
/// NOTE: assuming little endian.
|
||||
/// NOTE: does the compiler generate SIMD code?
|
||||
/// NOTE: for larger amount of cases we can use a perfect hash table (see 'gperf' as an example).
|
||||
if ( key == '.' + 'c' * 0x100U + 'o' * 0x10000U + 'm' * 0x1000000U
|
||||
|| key == '.' + 'n' * 0x100U + 'e' * 0x10000U + 't' * 0x1000000U
|
||||
|| key == '.' + 'o' * 0x100U + 'r' * 0x10000U + 'g' * 0x1000000U
|
||||
|| key == '.' + 'b' * 0x100U + 'i' * 0x10000U + 'z' * 0x1000000U
|
||||
|| key == '.' + 'g' * 0x100U + 'o' * 0x10000U + 'v' * 0x1000000U
|
||||
|| key == '.' + 'm' * 0x100U + 'i' * 0x10000U + 'l' * 0x1000000U
|
||||
|| key == '.' + 'e' * 0x100U + 'd' * 0x10000U + 'u' * 0x1000000U
|
||||
|| key == '.' + 'c' * 0x100U + 'o' * 0x10000U + '.' * 0x1000000U)
|
||||
{
|
||||
res_data += last_3_periods[2] + 1 - begin;
|
||||
res_size = last_3_periods[1] - last_3_periods[2] - 1;
|
||||
return;
|
||||
}
|
||||
end_of_level_domain = end;
|
||||
}
|
||||
|
||||
res_data += last_3_periods[1] + 1 - begin;
|
||||
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
||||
}
|
||||
#if USE_GPERF
|
||||
if (tldLookup::is_valid(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1) != nullptr)
|
||||
{
|
||||
res_data += last_3_periods[2] + 1 - begin;
|
||||
res_size = last_3_periods[1] - last_3_periods[2] - 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
res_data += last_3_periods[1] + 1 - begin;
|
||||
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
4887
dbms/src/Functions/gperf/tldLookup.gperf
Normal file
4887
dbms/src/Functions/gperf/tldLookup.gperf
Normal file
File diff suppressed because it is too large
Load Diff
18
dbms/src/Functions/tldLookup.h
Normal file
18
dbms/src/Functions/tldLookup.h
Normal file
@ -0,0 +1,18 @@
|
||||
#pragma once

#include <cstddef> // size_t is used in the declarations below

#include <Common/config.h>
#if USE_GPERF

// Declaration of the lookup class that gperf generates from gperf/tldLookup.gperf.
// Only the declaration lives in this header; the member definitions come from
// the gperf-generated source file, so the signatures here must stay in sync
// with what gperf emits.
class tldLookupHash
{
private:
    // Hash helper emitted by gperf (see the generated source for its definition).
    static inline unsigned int hash (const char *str, size_t len);
public:
    // Looks up the string given by (str, len). Callers compare the result
    // against nullptr to decide whether the string was found.
    static const char *is_valid (const char *str, size_t len);
};

namespace DB
{
    // Expose the generated class under a shorter name inside namespace DB.
    using tldLookup = tldLookupHash;
}
#endif
|
@ -1,3 +1,3 @@
|
||||
canada congo net-domena
|
||||
yandex yandex yandex yandex яндекс яндекс yandex
|
||||
yandex yandex yandex яндекс yandex
|
||||
canada hello hello hello hello hello canada canada
|
||||
|
@ -8,8 +8,6 @@ SELECT
|
||||
firstSignificantSubdomain('https://www.yandex.ua/news.html'),
|
||||
firstSignificantSubdomain('magnet:yandex.abc'),
|
||||
firstSignificantSubdomain('ftp://www.yandex.co.uk/news.html'),
|
||||
firstSignificantSubdomain('ftp://yandex.co.yandex'),
|
||||
firstSignificantSubdomain('http://ввв.яндекс.org.рф'),
|
||||
firstSignificantSubdomain('https://api.www3.static.dev.ввв.яндекс.рф'),
|
||||
firstSignificantSubdomain('//www.yandex.com.tr/news.html');
|
||||
|
||||
|
@ -1,3 +1,3 @@
|
||||
usa
|
||||
pentagon
|
||||
stanford
|
||||
gov
|
||||
mil
|
||||
edu
|
||||
|
@ -23,7 +23,8 @@ RUN apt-get update -y \
|
||||
python-requests \
|
||||
python-termcolor \
|
||||
sudo \
|
||||
tzdata
|
||||
tzdata \
|
||||
gperf
|
||||
|
||||
COPY build.sh /
|
||||
|
||||
|
@ -45,7 +45,8 @@ RUN apt-get update -y \
|
||||
ninja-build \
|
||||
gperf \
|
||||
git \
|
||||
tzdata
|
||||
tzdata \
|
||||
gperf
|
||||
|
||||
COPY build.sh /
|
||||
CMD ["/bin/bash", "/build.sh"]
|
||||
|
@ -66,7 +66,8 @@ RUN apt-get --allow-unauthenticated update -y \
|
||||
libjemalloc-dev \
|
||||
unixodbc-dev \
|
||||
odbcinst \
|
||||
tzdata
|
||||
tzdata \
|
||||
gperf
|
||||
|
||||
COPY build.sh /
|
||||
CMD ["/bin/bash", "/build.sh"]
|
||||
|
@ -67,7 +67,7 @@ export CXX=g++-7
|
||||
## Install Required Libraries from Packages
|
||||
|
||||
```bash
|
||||
sudo apt-get install libicu-dev libreadline-dev
|
||||
sudo apt-get install libicu-dev libreadline-dev gperf
|
||||
```
|
||||
|
||||
## Checkout ClickHouse Sources
|
||||
|
@ -11,7 +11,7 @@ Build should work on Mac OS X 10.12.
|
||||
## Install Required Compilers, Tools, and Libraries
|
||||
|
||||
```bash
|
||||
brew install cmake ninja gcc icu4c openssl libtool gettext readline
|
||||
brew install cmake ninja gcc icu4c openssl libtool gettext readline gperf
|
||||
```
|
||||
|
||||
## Checkout ClickHouse Sources
|
||||
|
Loading…
Reference in New Issue
Block a user