Adds a better way to include binary resources

- Uses a small assembly file to include binary resources, rather than
  objcopy
- Updates `base/common/getResource.cpp` for this new method of inclusion
- Removes linux-only guards in CMake files, as this solution is
  cross-platform.

The resulting binary resources are available in the ClickHouse server
binary on Linux, macOS, and illumos platforms. FreeBSD has not been
tested, but will likely work as well.
This commit is contained in:
Benjamin Naecker 2021-06-07 14:56:32 -07:00
parent 23bf4cf09f
commit c3506bf16d
8 changed files with 169 additions and 124 deletions

View File

@ -183,24 +183,31 @@ endif ()
# Make sure the final executable has symbols exported
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic")
if (OS_LINUX)
find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-12" "llvm-objcopy-11" "llvm-objcopy-10" "llvm-objcopy-9" "llvm-objcopy-8" "objcopy")
if (OBJCOPY_PATH)
message(STATUS "Using objcopy: ${OBJCOPY_PATH}.")
if (OS_LINUX)
if (ARCH_AMD64)
set(OBJCOPY_ARCH_OPTIONS -O elf64-x86-64 -B i386)
elseif (ARCH_AARCH64)
set(OBJCOPY_ARCH_OPTIONS -O elf64-aarch64 -B aarch64)
endif ()
elseif (OS_DARWIN)
set(OBJCOPY_ARCH_OPTIONS -O mach-o-x86-64 -B i386)
elseif (OS_SUNOS)
set(OBJCOPY_ARCH_OPTIONS -O elf64-x86-64-sol2 -B i386)
endif()
else ()
message(FATAL_ERROR "Cannot find objcopy.")
endif ()
endif ()
if (OS_DARWIN)
set(WHOLE_ARCHIVE -all_load)
set(NO_WHOLE_ARCHIVE -noall_load)
# The `-all_load` flag forces loading of all symbols from all libraries,
# and leads to multiply-defined symbols. This flag allows force loading
# from a _specific_ library, which is what we need.
set(WHOLE_ARCHIVE -force_load)
# The `-noall_load` flag is the default and now obsolete.
set(NO_WHOLE_ARCHIVE "")
else ()
set(WHOLE_ARCHIVE --whole-archive)
set(NO_WHOLE_ARCHIVE --no-whole-archive)

View File

@ -4,23 +4,41 @@
#include <string>
#include <boost/algorithm/string/replace.hpp>
std::string_view getResource(std::string_view name)
{
// Convert the resource file name into the form generated by `ld -r -b binary`.
std::string name_replaced(name);
std::replace(name_replaced.begin(), name_replaced.end(), '/', '_');
std::replace(name_replaced.begin(), name_replaced.end(), '-', '_');
std::replace(name_replaced.begin(), name_replaced.end(), '.', '_');
boost::replace_all(name_replaced, "+", "_PLUS_");
/// These are the names that are generated by "ld -r -b binary"
std::string symbol_name_data = "_binary_" + name_replaced + "_start";
std::string symbol_name_size = "_binary_" + name_replaced + "_size";
// In most `dlsym(3)` APIs, one passes the symbol name as it appears via
// something like `nm` or `objdump -t`. For example, a symbol `_foo` would be
// looked up with the string `"_foo"`.
//
// Apple's linker is confusingly different. The NOTES on the man page for
// `dlsym(3)` claim that one looks up the symbol with "the name used in C
// source code". In this example, that would mean using the string `"foo"`.
// This apparently applies even in the case where the symbol did not originate
// from C source, such as the embedded binary resource files used here. So
// the symbol name must not have a leading `_` on Apple platforms. It's not
// clear how this applies to other symbols, such as those which _have_ a leading
// underscore in them by design, many leading underscores, etc.
#if defined OS_DARWIN
std::string prefix = "binary_";
#else
std::string prefix = "_binary_";
#endif
std::string symbol_name_start = prefix + name_replaced + "_start";
std::string symbol_name_end = prefix + name_replaced + "_end";
const void * sym_data = dlsym(RTLD_DEFAULT, symbol_name_data.c_str());
const void * sym_size = dlsym(RTLD_DEFAULT, symbol_name_size.c_str());
auto sym_start = reinterpret_cast<const char*>(dlsym(RTLD_DEFAULT, symbol_name_start.c_str()));
auto sym_end = reinterpret_cast<const char*>(dlsym(RTLD_DEFAULT, symbol_name_end.c_str()));
if (sym_data && sym_size)
return { static_cast<const char *>(sym_data), unalignedLoad<size_t>(&sym_size) };
if (sym_start && sym_end) {
auto resource_size = static_cast<size_t>(std::distance(sym_start, sym_end));
return { sym_start, resource_size };
}
return {};
}

68
cmake/embed_binary.cmake Normal file
View File

@ -0,0 +1,68 @@
# Embed a set of resource files into a resulting object file.
#
# Signature: `clickhouse_embed_binaries(TARGET <target> RESOURCE_DIR <dir> RESOURCES <resource> ...)`
#
# This will generate a static library target named `<target>`, which contains the contents of
# each `<resource>` file. The files should be located in `<dir>`. <dir> defaults to
# ${CMAKE_CURRENT_SOURCE_DIR}, and the resources may not be empty.
#
# Each resource will result in three symbols in the final archive, based on the name `<resource>`.
# These are:
#   1. `_binary_<name>_start`: Points to the start of the binary data from `<resource>`.
#   2. `_binary_<name>_end`: Points to the end of the binary data from `<resource>`.
#   3. `_binary_<name>_size`: Points to the size of the binary data from `<resource>`.
#
# `<name>` is a normalized name derived from `<resource>`, by replacing the characters "./-" with
# the character "_", and the character "+" with "_PLUS_". This scheme is similar to those generated
# by `ld -r -b binary`, and matches the expectations in `./base/common/getResource.cpp`.
#
# Each object file is rebuilt whenever its resource file or its generated assembly stub changes.
#
# Note: this is a macro, so the helper variables it sets (EMBED_*, RESOURCE_*, SYMBOL_NAME, ...)
# are visible in the caller's scope after the call.
macro(clickhouse_embed_binaries)
    set(one_value_args TARGET RESOURCE_DIR)
    set(resources RESOURCES)
    cmake_parse_arguments(EMBED "" "${one_value_args}" ${resources} ${ARGN})

    if (NOT DEFINED EMBED_TARGET)
        message(FATAL_ERROR "A target name must be provided for embedding binary resources into")
    endif()

    if (NOT DEFINED EMBED_RESOURCE_DIR)
        set(EMBED_RESOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
    endif()

    list(LENGTH EMBED_RESOURCES N_RESOURCES)
    if (N_RESOURCES LESS 1)
        message(FATAL_ERROR "The list of binary resources to embed may not be empty")
    endif()

    # Assembly template; BINARY_FILE_NAME and SYMBOL_NAME are substituted into it per resource.
    set(EMBED_TEMPLATE_FILE "${PROJECT_SOURCE_DIR}/programs/embed_binary.S.in")

    set(RESOURCE_OBJS)
    foreach(RESOURCE_FILE ${EMBED_RESOURCES})
        set(RESOURCE_OBJ "${RESOURCE_FILE}.o")
        list(APPEND RESOURCE_OBJS "${RESOURCE_OBJ}")

        # Normalize the name of the resource
        set(BINARY_FILE_NAME "${RESOURCE_FILE}")
        string(REGEX REPLACE "[\./-]" "_" SYMBOL_NAME "${RESOURCE_FILE}") # - must be last in regex
        string(REPLACE "+" "_PLUS_" SYMBOL_NAME "${SYMBOL_NAME}")
        set(ASSEMBLY_FILE_NAME "${RESOURCE_FILE}.S")

        # Put the configured assembly file in the output directory, so that it is
        # cleaned up as usual together with the rest of the build tree.
        configure_file("${EMBED_TEMPLATE_FILE}" "${CMAKE_CURRENT_BINARY_DIR}/${ASSEMBLY_FILE_NAME}" @ONLY)

        # Generate the output object file by compiling the assembly. The compiler runs
        # in the resource directory so that the `.incbin` directive, which names the
        # resource by its relative path, can find the file.
        add_custom_command(
            OUTPUT "${RESOURCE_OBJ}"
            COMMAND ${CMAKE_C_COMPILER} -c -o
                    "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}"
                    "${CMAKE_CURRENT_BINARY_DIR}/${ASSEMBLY_FILE_NAME}"
            # Without these the object would never be rebuilt after the resource
            # (or the symbol naming in the assembly stub) changes.
            DEPENDS
                    "${EMBED_RESOURCE_DIR}/${RESOURCE_FILE}"
                    "${CMAKE_CURRENT_BINARY_DIR}/${ASSEMBLY_FILE_NAME}"
            WORKING_DIRECTORY "${EMBED_RESOURCE_DIR}"
            VERBATIM
        )
        set_source_files_properties("${RESOURCE_OBJ}" PROPERTIES EXTERNAL_OBJECT true GENERATED true)
    endforeach()

    # The archive consists solely of pre-built objects, so the linker language must be set explicitly.
    add_library("${EMBED_TARGET}" STATIC ${RESOURCE_OBJS})
    set_target_properties("${EMBED_TARGET}" PROPERTIES LINKER_LANGUAGE C)
endmacro()

View File

@ -39,6 +39,7 @@ if (NOT USE_INTERNAL_CCTZ_LIBRARY)
endif()
if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS)
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)
set(USE_INTERNAL_CCTZ_LIBRARY 1)
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cctz")
@ -70,19 +71,18 @@ if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS)
set(SYSTEM_STORAGE_TZ_FILE "${CMAKE_BINARY_DIR}/src/Storages/System/StorageSystemTimeZones.generated.cpp")
# remove existing copies so that it is generated fresh on each build.
file(REMOVE ${SYSTEM_STORAGE_TZ_FILE})
# Build a library with embedded tzdata
if (OS_LINUX)
# get the list of timezones from tzdata shipped with cctz
set(TZDIR "${LIBRARY_DIR}/testdata/zoneinfo")
file(STRINGS "${LIBRARY_DIR}/testdata/version" TZDATA_VERSION)
set_property(GLOBAL PROPERTY TZDATA_VERSION_PROP "${TZDATA_VERSION}")
message(STATUS "Packaging with tzdata version: ${TZDATA_VERSION}")
set(TZ_OBJS)
set(TIMEZONE_RESOURCE_FILES)
# each file in that dir (except for *.tab and localtime) stores the info about a timezone
execute_process(COMMAND
bash -c "cd ${TZDIR} && find * -type f -and ! -name '*.tab' -and ! -name 'localtime' | sort | paste -sd ';'"
bash -c "cd ${TZDIR} && find * -type f -and ! -name '*.tab' -and ! -name 'localtime' | sort | paste -sd ';' -"
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE TIMEZONES)
@ -91,42 +91,16 @@ if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS)
foreach(TIMEZONE ${TIMEZONES})
file(APPEND ${SYSTEM_STORAGE_TZ_FILE} " \"${TIMEZONE}\",\n")
string(REPLACE "/" "_" TIMEZONE_ID ${TIMEZONE})
string(REPLACE "+" "_PLUS_" TIMEZONE_ID ${TIMEZONE_ID})
set(TZ_OBJ ${TIMEZONE_ID}.o)
set(TZ_OBJS ${TZ_OBJS} ${TZ_OBJ})
# https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake
# PPC64LE fails to do this with objcopy, use ld or lld instead
if (ARCH_PPC64LE)
add_custom_command(OUTPUT ${TZ_OBJ}
COMMAND cp "${TZDIR}/${TIMEZONE}" "${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}"
COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${TZ_OBJ} ${TIMEZONE_ID}
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}")
else()
add_custom_command(OUTPUT ${TZ_OBJ}
COMMAND cp "${TZDIR}/${TIMEZONE}" "${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}"
COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS}
--rename-section .data=.rodata,alloc,load,readonly,data,contents ${TIMEZONE_ID} ${TZ_OBJ}
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}")
endif()
set_source_files_properties(${TZ_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true)
list(APPEND TIMEZONE_RESOURCE_FILES "${TIMEZONE}")
endforeach(TIMEZONE)
file(APPEND ${SYSTEM_STORAGE_TZ_FILE} " nullptr};\n")
add_library(tzdata STATIC ${TZ_OBJS})
set_target_properties(tzdata PROPERTIES LINKER_LANGUAGE C)
# whole-archive prevents symbols from being discarded for unknown reason
# CMake can shuffle each of target_link_libraries arguments with other
# libraries in linker command. To avoid this we hardcode whole-archive
# library into single string.
clickhouse_embed_binaries(
TARGET tzdata
RESOURCE_DIR "${TZDIR}"
RESOURCES ${TIMEZONE_RESOURCE_FILES}
)
add_dependencies(cctz tzdata)
target_link_libraries(cctz INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:tzdata> -Wl,${NO_WHOLE_ARCHIVE}")
else ()
file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "// autogenerated by ClickHouse/contrib/cctz-cmake/CMakeLists.txt\n")
file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "const char * auto_time_zones[] {nullptr};\n" )
endif ()
endif ()
message (STATUS "Using cctz")

View File

@ -204,55 +204,6 @@ macro(clickhouse_program_add name)
clickhouse_program_add_executable(${name})
endmacro()
# Embed default config files as a resource into the binary.
# This is needed for two purposes:
# 1. Allow to run the binary without download of any other files.
# 2. Allow to implement "sudo clickhouse install" tool.
#
# Arguments: target (server, client, keeper, etc.) and list of files
#
# Also dependency on TARGET_FILE is required, look at examples in programs/server and programs/keeper
macro(clickhouse_embed_binaries)
# TODO We actually need this on Mac, FreeBSD.
if (OS_LINUX)
set(arguments_list "${ARGN}")
list(GET arguments_list 0 target)
# for some reason cmake iterates loop including <stop>
math(EXPR arguments_count "${ARGC}-1")
foreach(RESOURCE_POS RANGE 1 "${arguments_count}")
list(GET arguments_list "${RESOURCE_POS}" RESOURCE_FILE)
set(RESOURCE_OBJ ${RESOURCE_FILE}.o)
set(RESOURCE_OBJS ${RESOURCE_OBJS} ${RESOURCE_OBJ})
# https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake
# PPC64LE fails to do this with objcopy, use ld or lld instead
if (ARCH_PPC64LE)
add_custom_command(OUTPUT ${RESOURCE_OBJ}
COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}" ${RESOURCE_FILE})
else()
add_custom_command(OUTPUT ${RESOURCE_OBJ}
COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}"
COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents
"${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}" "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}")
endif()
set_source_files_properties(${RESOURCE_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true)
endforeach()
add_library(clickhouse_${target}_configs STATIC ${RESOURCE_OBJS})
set_target_properties(clickhouse_${target}_configs PROPERTIES LINKER_LANGUAGE C)
# whole-archive prevents symbols from being discarded for unknown reason
# CMake can shuffle each of target_link_libraries arguments with other
# libraries in linker command. To avoid this we hardcode whole-archive
# library into single string.
add_dependencies(clickhouse-${target}-lib clickhouse_${target}_configs)
endif ()
endmacro()
add_subdirectory (server)
add_subdirectory (client)
add_subdirectory (local)

View File

@ -0,0 +1,17 @@
// Embed a binary file into an executable.
//
// This is a template: the `@...@` placeholders below are filled in once per
// resource before the file is compiled.
// The variable BINARY_FILE_NAME is the actual name of the file to include
// The variable SYMBOL_NAME is the "normalized" name of the symbol, with
// symbols like `-`, `.`, and `/` replaced with `_`, and `+` replaced with
// `_PLUS_`. This is to match how objcopy rewrites symbol names, and matches
// the expectation in `base/common/getResource.cpp`
//
// Three global symbols are defined: `_start` and `_end` label the first byte
// of the embedded data and the byte just past it; `_size` labels a 64-bit
// word holding `_end - _start`.
#if defined(__linux__)
    // Declare that this object does not need an executable stack. Without this
    // note GNU ld assumes the opposite for hand-written assembly and may mark
    // the stack of the whole linked binary as executable.
    .section .note.GNU-stack,"",%progbits
#endif
    .data
    .global _binary_@SYMBOL_NAME@_start
_binary_@SYMBOL_NAME@_start:
    .incbin "@BINARY_FILE_NAME@"
    .global _binary_@SYMBOL_NAME@_end
_binary_@SYMBOL_NAME@_end:
    .global _binary_@SYMBOL_NAME@_size
_binary_@SYMBOL_NAME@_size:
    .quad _binary_@SYMBOL_NAME@_end - _binary_@SYMBOL_NAME@_start

View File

@ -1,3 +1,5 @@
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)
set(CLICKHOUSE_KEEPER_SOURCES
Keeper.cpp
)
@ -21,4 +23,8 @@ clickhouse_program_add(keeper)
install (FILES keeper_config.xml DESTINATION "${CLICKHOUSE_ETC_DIR}/clickhouse-keeper" COMPONENT clickhouse-keeper)
clickhouse_embed_binaries(keeper keeper_config.xml keeper_embedded.xml)
clickhouse_embed_binaries(
TARGET clickhouse_keeper_configs
RESOURCES keeper_config.xml keeper_embedded.xml
)
add_dependencies(clickhouse-keeper-lib clickhouse_keeper_configs)

View File

@ -1,11 +1,11 @@
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)
set(CLICKHOUSE_SERVER_SOURCES
MetricsTransmitter.cpp
Server.cpp
)
if (OS_LINUX)
set (LINK_RESOURCE_LIB INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:clickhouse_server_configs> -Wl,${NO_WHOLE_ARCHIVE}")
endif ()
set (CLICKHOUSE_SERVER_LINK
PRIVATE
@ -31,4 +31,8 @@ clickhouse_program_add(server)
install(FILES config.xml users.xml DESTINATION "${CLICKHOUSE_ETC_DIR}/clickhouse-server" COMPONENT clickhouse)
clickhouse_embed_binaries(server config.xml users.xml embedded.xml play.html)
clickhouse_embed_binaries(
TARGET clickhouse_server_configs
RESOURCES config.xml users.xml embedded.xml play.html
)
add_dependencies(clickhouse-server-lib clickhouse_server_configs)