mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 16:42:05 +00:00
Merge pull request #5030 from PerformanceVision/top_level_domain
Changing how the function (cutTo)firstSignificantSubdomain detects the TLD (proposal)
This commit is contained in:
commit
9bdbd84263
@ -268,6 +268,15 @@ if (USE_INCLUDE_WHAT_YOU_USE)
|
|||||||
endif()
|
endif()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# gperf support: the option is ON by default, and when it is enabled the
# configure step aborts unless a gperf executable can be located.
option (USE_GPERF "Use gperf function hash generator tool" ON)

# Only search for the tool when the feature is requested.
if (USE_GPERF)
    find_program (GPERF gperf)
endif ()

# GPERF is left as GPERF-NOTFOUND (false) when the lookup above fails.
if (USE_GPERF AND NOT GPERF)
    message (FATAL_ERROR "Could not find the program gperf")
endif ()
|
||||||
|
|
||||||
# Flags for test coverage
|
# Flags for test coverage
|
||||||
if (TEST_COVERAGE)
|
if (TEST_COVERAGE)
|
||||||
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -DIS_DEBUG")
|
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -DIS_DEBUG")
|
||||||
@ -344,6 +353,7 @@ find_contrib_lib(metrohash)
|
|||||||
find_contrib_lib(btrie)
|
find_contrib_lib(btrie)
|
||||||
find_contrib_lib(double-conversion)
|
find_contrib_lib(double-conversion)
|
||||||
include (cmake/find_parquet.cmake)
|
include (cmake/find_parquet.cmake)
|
||||||
|
|
||||||
if (ENABLE_TESTS)
|
if (ENABLE_TESTS)
|
||||||
include (cmake/find_gtest.cmake)
|
include (cmake/find_gtest.cmake)
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -29,6 +29,7 @@
|
|||||||
#cmakedefine01 USE_RAPIDJSON
|
#cmakedefine01 USE_RAPIDJSON
|
||||||
#cmakedefine01 USE_LFALLOC
|
#cmakedefine01 USE_LFALLOC
|
||||||
#cmakedefine01 USE_LFALLOC_RANDOM_HINT
|
#cmakedefine01 USE_LFALLOC_RANDOM_HINT
|
||||||
|
#cmakedefine01 USE_GPERF
|
||||||
|
|
||||||
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY
|
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY
|
||||||
#cmakedefine01 LLVM_HAS_RTTI
|
#cmakedefine01 LLVM_HAS_RTTI
|
||||||
|
@ -3,6 +3,14 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
|
|||||||
add_headers_and_sources(clickhouse_functions ./GatherUtils)
|
add_headers_and_sources(clickhouse_functions ./GatherUtils)
|
||||||
add_headers_and_sources(clickhouse_functions .)
|
add_headers_and_sources(clickhouse_functions .)
|
||||||
|
|
||||||
|
# Generate tldLookup.cpp from the gperf keyword file and compile it as part of
# clickhouse_functions. Only done when gperf support is enabled.
if (USE_GPERF)
    add_custom_command(
        OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp"
        COMMAND "${GPERF}" "${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf" "--output-file=${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp"
        # Re-run gperf whenever the keyword file changes; without this the
        # generated source goes stale after an edit to tldLookup.gperf.
        DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/gperf/tldLookup.gperf"
        COMMENT "Generating tldLookup.cpp with gperf"
        # Pass the arguments to the tool exactly as written on every platform.
        VERBATIM
    )

    # The generated file lives in the binary dir, so it is appended explicitly
    # to the source list rather than picked up from the source directory.
    list(APPEND clickhouse_functions_sources "${CMAKE_CURRENT_BINARY_DIR}/tldLookup.cpp")
endif ()
|
||||||
|
|
||||||
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp)
|
list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp)
|
||||||
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h)
|
list(REMOVE_ITEM clickhouse_functions_headers IFunction.h FunctionFactory.h FunctionHelpers.h)
|
||||||
|
@ -2,7 +2,10 @@
|
|||||||
|
|
||||||
#include <Functions/domain.h>
|
#include <Functions/domain.h>
|
||||||
#include <common/find_symbols.h>
|
#include <common/find_symbols.h>
|
||||||
|
#include <Common/config.h>
|
||||||
|
#if USE_GPERF
|
||||||
|
# include <Functions/tldLookup.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -58,33 +61,25 @@ struct ExtractFirstSignificantSubdomain
|
|||||||
if (!last_3_periods[2])
|
if (!last_3_periods[2])
|
||||||
last_3_periods[2] = begin - 1;
|
last_3_periods[2] = begin - 1;
|
||||||
|
|
||||||
size_t size_of_second_subdomain_plus_period = last_3_periods[0] - last_3_periods[1];
|
auto end_of_level_domain = find_first_symbols<'/'>(last_3_periods[0], end);
|
||||||
if (size_of_second_subdomain_plus_period == 4 || size_of_second_subdomain_plus_period == 3)
|
if (!end_of_level_domain)
|
||||||
{
|
{
|
||||||
/// We will key by four bytes that are either ".xyz" or ".xy.".
|
end_of_level_domain = end;
|
||||||
UInt32 key = unalignedLoad<UInt32>(last_3_periods[1]);
|
|
||||||
|
|
||||||
/// NOTE: assuming little endian.
|
|
||||||
/// NOTE: does the compiler generate SIMD code?
|
|
||||||
/// NOTE: for larger amount of cases we can use a perfect hash table (see 'gperf' as an example).
|
|
||||||
if ( key == '.' + 'c' * 0x100U + 'o' * 0x10000U + 'm' * 0x1000000U
|
|
||||||
|| key == '.' + 'n' * 0x100U + 'e' * 0x10000U + 't' * 0x1000000U
|
|
||||||
|| key == '.' + 'o' * 0x100U + 'r' * 0x10000U + 'g' * 0x1000000U
|
|
||||||
|| key == '.' + 'b' * 0x100U + 'i' * 0x10000U + 'z' * 0x1000000U
|
|
||||||
|| key == '.' + 'g' * 0x100U + 'o' * 0x10000U + 'v' * 0x1000000U
|
|
||||||
|| key == '.' + 'm' * 0x100U + 'i' * 0x10000U + 'l' * 0x1000000U
|
|
||||||
|| key == '.' + 'e' * 0x100U + 'd' * 0x10000U + 'u' * 0x1000000U
|
|
||||||
|| key == '.' + 'c' * 0x100U + 'o' * 0x10000U + '.' * 0x1000000U)
|
|
||||||
{
|
|
||||||
res_data += last_3_periods[2] + 1 - begin;
|
|
||||||
res_size = last_3_periods[1] - last_3_periods[2] - 1;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
res_data += last_3_periods[1] + 1 - begin;
|
#if USE_GPERF
|
||||||
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
if (tldLookup::is_valid(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1) != nullptr)
|
||||||
}
|
{
|
||||||
|
res_data += last_3_periods[2] + 1 - begin;
|
||||||
|
res_size = last_3_periods[1] - last_3_periods[2] - 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
res_data += last_3_periods[1] + 1 - begin;
|
||||||
|
res_size = last_3_periods[0] - last_3_periods[1] - 1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
4887
dbms/src/Functions/gperf/tldLookup.gperf
Normal file
4887
dbms/src/Functions/gperf/tldLookup.gperf
Normal file
File diff suppressed because it is too large
Load Diff
18
dbms/src/Functions/tldLookup.h
Normal file
18
dbms/src/Functions/tldLookup.h
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#pragma once

#include <cstddef> /// size_t, so this header does not rely on what other headers happen to include

#include <Common/config.h>

#if USE_GPERF
/// Declaration of the class whose members are generated by gperf from
/// gperf/tldLookup.gperf. Only the declaration lives here; the definitions
/// come from the generated tldLookup.cpp.
class tldLookupHash
{
private:
    /// Hash function used internally by the generated lookup code.
    static inline unsigned int hash (const char *str, size_t len);

public:
    /// Looks up the string given by pointer `str` and length `len`.
    /// The caller in Functions treats a non-null result as "this is a known
    /// top-level domain" and nullptr as "not found".
    /// NOTE(review): what the returned pointer refers to is decided by the
    /// generated code -- confirm before dereferencing it.
    static const char *is_valid (const char *str, size_t len);
};

namespace DB
{
    /// Alias so that code inside namespace DB can write tldLookup::is_valid(...).
    using tldLookup = tldLookupHash;
}
#endif
|
@ -1,3 +1,3 @@
|
|||||||
canada congo net-domena
|
canada congo net-domena
|
||||||
yandex yandex yandex yandex яндекс яндекс yandex
|
yandex yandex yandex яндекс yandex
|
||||||
canada hello hello hello hello hello canada canada
|
canada hello hello hello hello hello canada canada
|
||||||
|
@ -8,8 +8,6 @@ SELECT
|
|||||||
firstSignificantSubdomain('https://www.yandex.ua/news.html'),
|
firstSignificantSubdomain('https://www.yandex.ua/news.html'),
|
||||||
firstSignificantSubdomain('magnet:yandex.abc'),
|
firstSignificantSubdomain('magnet:yandex.abc'),
|
||||||
firstSignificantSubdomain('ftp://www.yandex.co.uk/news.html'),
|
firstSignificantSubdomain('ftp://www.yandex.co.uk/news.html'),
|
||||||
firstSignificantSubdomain('ftp://yandex.co.yandex'),
|
|
||||||
firstSignificantSubdomain('http://ввв.яндекс.org.рф'),
|
|
||||||
firstSignificantSubdomain('https://api.www3.static.dev.ввв.яндекс.рф'),
|
firstSignificantSubdomain('https://api.www3.static.dev.ввв.яндекс.рф'),
|
||||||
firstSignificantSubdomain('//www.yandex.com.tr/news.html');
|
firstSignificantSubdomain('//www.yandex.com.tr/news.html');
|
||||||
|
|
||||||
|
@ -1,3 +1,3 @@
|
|||||||
usa
|
gov
|
||||||
pentagon
|
mil
|
||||||
stanford
|
edu
|
||||||
|
@ -23,7 +23,8 @@ RUN apt-get update -y \
|
|||||||
python-requests \
|
python-requests \
|
||||||
python-termcolor \
|
python-termcolor \
|
||||||
sudo \
|
sudo \
|
||||||
tzdata
|
tzdata \
|
||||||
|
gperf
|
||||||
|
|
||||||
COPY build.sh /
|
COPY build.sh /
|
||||||
|
|
||||||
|
@ -45,7 +45,8 @@ RUN apt-get update -y \
|
|||||||
ninja-build \
|
ninja-build \
|
||||||
gperf \
|
gperf \
|
||||||
git \
|
git \
|
||||||
tzdata
|
tzdata \
|
||||||
|
gperf
|
||||||
|
|
||||||
COPY build.sh /
|
COPY build.sh /
|
||||||
CMD ["/bin/bash", "/build.sh"]
|
CMD ["/bin/bash", "/build.sh"]
|
||||||
|
@ -66,7 +66,8 @@ RUN apt-get --allow-unauthenticated update -y \
|
|||||||
libjemalloc-dev \
|
libjemalloc-dev \
|
||||||
unixodbc-dev \
|
unixodbc-dev \
|
||||||
odbcinst \
|
odbcinst \
|
||||||
tzdata
|
tzdata \
|
||||||
|
gperf
|
||||||
|
|
||||||
COPY build.sh /
|
COPY build.sh /
|
||||||
CMD ["/bin/bash", "/build.sh"]
|
CMD ["/bin/bash", "/build.sh"]
|
||||||
|
@ -67,7 +67,7 @@ export CXX=g++-7
|
|||||||
## Install Required Libraries from Packages
|
## Install Required Libraries from Packages
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo apt-get install libicu-dev libreadline-dev
|
sudo apt-get install libicu-dev libreadline-dev gperf
|
||||||
```
|
```
|
||||||
|
|
||||||
## Checkout ClickHouse Sources
|
## Checkout ClickHouse Sources
|
||||||
|
@ -11,7 +11,7 @@ Build should work on Mac OS X 10.12.
|
|||||||
## Install Required Compilers, Tools, and Libraries
|
## Install Required Compilers, Tools, and Libraries
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
brew install cmake ninja gcc icu4c openssl libtool gettext readline
|
brew install cmake ninja gcc icu4c openssl libtool gettext readline gperf
|
||||||
```
|
```
|
||||||
|
|
||||||
## Checkout ClickHouse Sources
|
## Checkout ClickHouse Sources
|
||||||
|
Loading…
Reference in New Issue
Block a user