From c3506bf16d6870299f899b7a356fe59221f6e91b Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Mon, 7 Jun 2021 14:56:32 -0700 Subject: [PATCH] Adds a better way to include binary resources - Uses a small assembly file to include binary resources, rather than objcopy - Updates `base/common/getResource.cpp` for this new method of inclusion - Removes linux-only guards in CMake files, as this solution is cross-platform. The resulting binary resources are available in the ClickHouse server binary on Linux, macOS, and illumos platforms. FreeBSD has not been tested, but will likely work as well. --- CMakeLists.txt | 27 +++++++---- base/common/getResource.cpp | 34 ++++++++++---- cmake/embed_binary.cmake | 68 +++++++++++++++++++++++++++ contrib/cctz-cmake/CMakeLists.txt | 78 +++++++++++-------------------- programs/CMakeLists.txt | 49 ------------------- programs/embed_binary.S.in | 17 +++++++ programs/keeper/CMakeLists.txt | 8 +++- programs/server/CMakeLists.txt | 12 +++-- 8 files changed, 169 insertions(+), 124 deletions(-) create mode 100644 cmake/embed_binary.cmake create mode 100644 programs/embed_binary.S.in diff --git a/CMakeLists.txt b/CMakeLists.txt index ce0f58e2521..36784cfc226 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,24 +183,31 @@ endif () # Make sure the final executable has symbols exported set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic") -if (OS_LINUX) - find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-12" "llvm-objcopy-11" "llvm-objcopy-10" "llvm-objcopy-9" "llvm-objcopy-8" "objcopy") - if (OBJCOPY_PATH) - message(STATUS "Using objcopy: ${OBJCOPY_PATH}.") - +find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-12" "llvm-objcopy-11" "llvm-objcopy-10" "llvm-objcopy-9" "llvm-objcopy-8" "objcopy") +if (OBJCOPY_PATH) + message(STATUS "Using objcopy: ${OBJCOPY_PATH}.") + if (OS_LINUX) if (ARCH_AMD64) set(OBJCOPY_ARCH_OPTIONS -O elf64-x86-64 -B i386) elseif (ARCH_AARCH64) set(OBJCOPY_ARCH_OPTIONS -O 
elf64-aarch64 -B aarch64) endif () - else () - message(FATAL_ERROR "Cannot find objcopy.") - endif () + elseif (OS_DARWIN) + set(OBJCOPY_ARCH_OPTIONS -O mach-o-x86-64 -B i386) + elseif (OS_SUNOS) + set(OBJCOPY_ARCH_OPTIONS -O elf64-x86-64-sol2 -B i386) + endif() +else () + message(FATAL_ERROR "Cannot find objcopy.") endif () if (OS_DARWIN) - set(WHOLE_ARCHIVE -all_load) - set(NO_WHOLE_ARCHIVE -noall_load) + # The `-all_load` flag forces loading of all symbols from all libraries, + # and leads to multiply-defined symbols. This flag allows force loading + # from a _specific_ library, which is what we need. + set(WHOLE_ARCHIVE -force_load) + # The `-noall_load` flag is the default and now obsolete. + set(NO_WHOLE_ARCHIVE "") else () set(WHOLE_ARCHIVE --whole-archive) set(NO_WHOLE_ARCHIVE --no-whole-archive) diff --git a/base/common/getResource.cpp b/base/common/getResource.cpp index 5d5f18047b3..bf2e9b72ed1 100644 --- a/base/common/getResource.cpp +++ b/base/common/getResource.cpp @@ -4,23 +4,41 @@ #include #include - std::string_view getResource(std::string_view name) { + // Convert the resource file name into the form generated by `ld -r -b binary`. std::string name_replaced(name); std::replace(name_replaced.begin(), name_replaced.end(), '/', '_'); std::replace(name_replaced.begin(), name_replaced.end(), '-', '_'); std::replace(name_replaced.begin(), name_replaced.end(), '.', '_'); boost::replace_all(name_replaced, "+", "_PLUS_"); - /// These are the names that are generated by "ld -r -b binary" - std::string symbol_name_data = "_binary_" + name_replaced + "_start"; - std::string symbol_name_size = "_binary_" + name_replaced + "_size"; + // In most `dlsym(3)` APIs, one passes the symbol name as it appears via + // something like `nm` or `objdump -t`. For example, a symbol `_foo` would be + // looked up with the string `"_foo"`. + // + // Apple's linker is confusingly different. 
The NOTES on the man page for + // `dlsym(3)` claim that one looks up the symbol with "the name used in C + // source code". In this example, that would mean using the string `"foo"`. + // This apparently applies even in the case where the symbol did not originate + // from C source, such as the embedded binary resource files used here. So + // the symbol name must not have a leading `_` on Apple platforms. It's not + // clear how this applies to other symbols, such as those which _have_ a leading + // underscore in them by design, many leading underscores, etc. +#if defined OS_DARWIN + std::string prefix = "binary_"; +#else + std::string prefix = "_binary_"; +#endif + std::string symbol_name_start = prefix + name_replaced + "_start"; + std::string symbol_name_end = prefix + name_replaced + "_end"; - const void * sym_data = dlsym(RTLD_DEFAULT, symbol_name_data.c_str()); - const void * sym_size = dlsym(RTLD_DEFAULT, symbol_name_size.c_str()); + auto sym_start = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_start.c_str())); + auto sym_end = reinterpret_cast(dlsym(RTLD_DEFAULT, symbol_name_end.c_str())); - if (sym_data && sym_size) - return { static_cast(sym_data), unalignedLoad(&sym_size) }; + if (sym_start && sym_end) { + auto resource_size = static_cast(std::distance(sym_start, sym_end)); + return { sym_start, resource_size }; + } return {}; } diff --git a/cmake/embed_binary.cmake b/cmake/embed_binary.cmake new file mode 100644 index 00000000000..ad67e808d9e --- /dev/null +++ b/cmake/embed_binary.cmake @@ -0,0 +1,68 @@ +# Embed a set of resource files into a resulting object file. +# +# Signature: `clickhouse_embed_binaries(TARGET <target> RESOURCE_DIR <resource_dir> RESOURCES <resource> ...)` +# +# This will generate a static library target named `<target>`, which contains the contents of +# each `<resource>` file. The files should be located in `<resource_dir>`. <resource_dir> defaults to +# ${CMAKE_CURRENT_SOURCE_DIR}, and the resources may not be empty. 
+# +# Each resource will result in three symbols in the final archive, based on the name `<name>`. +# These are: +# 1. `_binary_<name>_start`: Points to the start of the binary data from `<resource>`. +# 2. `_binary_<name>_end`: Points to the end of the binary data from `<resource>`. +# 3. `_binary_<name>_size`: Points to the size of the binary data from `<resource>`. +# +# `<name>` is a normalized name derived from `<resource>`, by replacing the characters "./-" with +# the character "_", and the character "+" with "_PLUS_". This scheme is similar to those generated +# by `ld -r -b binary`, and matches the expectations in `./base/common/getResource.cpp`. +macro(clickhouse_embed_binaries) + set(one_value_args TARGET RESOURCE_DIR) + set(resources RESOURCES) + cmake_parse_arguments(EMBED "" "${one_value_args}" ${resources} ${ARGN}) + + if (NOT DEFINED EMBED_TARGET) + message(FATAL_ERROR "A target name must be provided for embedding binary resources into") + endif() + + if (NOT DEFINED EMBED_RESOURCE_DIR) + set(EMBED_RESOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") + endif() + + list(LENGTH EMBED_RESOURCES N_RESOURCES) + if (N_RESOURCES LESS 1) + message(FATAL_ERROR "The list of binary resources to embed may not be empty") + endif() + + set(EMBED_TEMPLATE_FILE "${PROJECT_SOURCE_DIR}/programs/embed_binary.S.in") + set(RESOURCE_OBJS) + foreach(RESOURCE_FILE ${EMBED_RESOURCES}) + set(RESOURCE_OBJ "${RESOURCE_FILE}.o") + list(APPEND RESOURCE_OBJS "${RESOURCE_OBJ}") + + # Normalize the name of the resource + set(BINARY_FILE_NAME "${RESOURCE_FILE}") + string(REGEX REPLACE "[\./-]" "_" SYMBOL_NAME "${RESOURCE_FILE}") # - must be last in regex + string(REPLACE "+" "_PLUS_" SYMBOL_NAME "${SYMBOL_NAME}") + set(ASSEMBLY_FILE_NAME "${RESOURCE_FILE}.S") + + # Put the configured assembly file in the output directory. + # This is so we can clean it up as usual, and we CD to the + # source directory before compiling, so that the assembly + # `.incbin` directive can find the file. 
+ configure_file("${EMBED_TEMPLATE_FILE}" "${CMAKE_CURRENT_BINARY_DIR}/${ASSEMBLY_FILE_NAME}" @ONLY) + + # Generate the output object file by compiling the assembly, in the directory of + # the sources so that the resource file may also be found + add_custom_command( + OUTPUT ${RESOURCE_OBJ} + COMMAND cd "${EMBED_RESOURCE_DIR}" && + ${CMAKE_C_COMPILER} -c -o + "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}" + "${CMAKE_CURRENT_BINARY_DIR}/${ASSEMBLY_FILE_NAME}" + ) + set_source_files_properties("${RESOURCE_OBJ}" PROPERTIES EXTERNAL_OBJECT true GENERATED true) + endforeach() + + add_library("${EMBED_TARGET}" STATIC ${RESOURCE_OBJS}) + set_target_properties("${EMBED_TARGET}" PROPERTIES LINKER_LANGUAGE C) +endmacro() diff --git a/contrib/cctz-cmake/CMakeLists.txt b/contrib/cctz-cmake/CMakeLists.txt index 93413693796..96e2af5fb03 100644 --- a/contrib/cctz-cmake/CMakeLists.txt +++ b/contrib/cctz-cmake/CMakeLists.txt @@ -39,6 +39,7 @@ if (NOT USE_INTERNAL_CCTZ_LIBRARY) endif() if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS) + include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake) set(USE_INTERNAL_CCTZ_LIBRARY 1) set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cctz") @@ -70,63 +71,36 @@ if (NOT EXTERNAL_CCTZ_LIBRARY_FOUND OR NOT EXTERNAL_CCTZ_LIBRARY_WORKS) set(SYSTEM_STORAGE_TZ_FILE "${CMAKE_BINARY_DIR}/src/Storages/System/StorageSystemTimeZones.generated.cpp") # remove existing copies so that its generated fresh on each build. 
file(REMOVE ${SYSTEM_STORAGE_TZ_FILE}) - # Build a libray with embedded tzdata - if (OS_LINUX) - # get the list of timezones from tzdata shipped with cctz - set(TZDIR "${LIBRARY_DIR}/testdata/zoneinfo") - file(STRINGS "${LIBRARY_DIR}/testdata/version" TZDATA_VERSION) - set_property(GLOBAL PROPERTY TZDATA_VERSION_PROP "${TZDATA_VERSION}") - message(STATUS "Packaging with tzdata version: ${TZDATA_VERSION}") - set(TZ_OBJS) + # get the list of timezones from tzdata shipped with cctz + set(TZDIR "${LIBRARY_DIR}/testdata/zoneinfo") + file(STRINGS "${LIBRARY_DIR}/testdata/version" TZDATA_VERSION) + set_property(GLOBAL PROPERTY TZDATA_VERSION_PROP "${TZDATA_VERSION}") + message(STATUS "Packaging with tzdata version: ${TZDATA_VERSION}") - # each file in that dir (except of tab and localtime) store the info about timezone - execute_process(COMMAND - bash -c "cd ${TZDIR} && find * -type f -and ! -name '*.tab' -and ! -name 'localtime' | sort | paste -sd ';'" - OUTPUT_STRIP_TRAILING_WHITESPACE - OUTPUT_VARIABLE TIMEZONES) + set(TIMEZONE_RESOURCE_FILES) - file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "// autogenerated by ClickHouse/contrib/cctz-cmake/CMakeLists.txt\n") - file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "const char * auto_time_zones[] {\n" ) + # each file in that dir (except of tab and localtime) store the info about timezone + execute_process(COMMAND + bash -c "cd ${TZDIR} && find * -type f -and ! -name '*.tab' -and ! 
-name 'localtime' | sort | paste -sd ';' -" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE TIMEZONES) - foreach(TIMEZONE ${TIMEZONES}) - file(APPEND ${SYSTEM_STORAGE_TZ_FILE} " \"${TIMEZONE}\",\n") - string(REPLACE "/" "_" TIMEZONE_ID ${TIMEZONE}) - string(REPLACE "+" "_PLUS_" TIMEZONE_ID ${TIMEZONE_ID}) - set(TZ_OBJ ${TIMEZONE_ID}.o) - set(TZ_OBJS ${TZ_OBJS} ${TZ_OBJ}) + file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "// autogenerated by ClickHouse/contrib/cctz-cmake/CMakeLists.txt\n") + file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "const char * auto_time_zones[] {\n" ) - # https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake - # PPC64LE fails to do this with objcopy, use ld or lld instead - if (ARCH_PPC64LE) - add_custom_command(OUTPUT ${TZ_OBJ} - COMMAND cp "${TZDIR}/${TIMEZONE}" "${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}" - COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o ${TZ_OBJ} ${TIMEZONE_ID} - COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}") - else() - add_custom_command(OUTPUT ${TZ_OBJ} - COMMAND cp "${TZDIR}/${TIMEZONE}" "${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}" - COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} - --rename-section .data=.rodata,alloc,load,readonly,data,contents ${TIMEZONE_ID} ${TZ_OBJ} - COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/${TIMEZONE_ID}") - endif() - set_source_files_properties(${TZ_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true) - endforeach(TIMEZONE) - - file(APPEND ${SYSTEM_STORAGE_TZ_FILE} " nullptr};\n") - - add_library(tzdata STATIC ${TZ_OBJS}) - set_target_properties(tzdata PROPERTIES LINKER_LANGUAGE C) - # whole-archive prevents symbols from being discarded for unknown reason - # CMake can shuffle each of target_link_libraries arguments with other - # libraries in linker command. To avoid this we hardcode whole-archive - # library into single string. 
- add_dependencies(cctz tzdata) - target_link_libraries(cctz INTERFACE "-Wl,${WHOLE_ARCHIVE} $ -Wl,${NO_WHOLE_ARCHIVE}") - else () - file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "// autogenerated by ClickHouse/contrib/cctz-cmake/CMakeLists.txt\n") - file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "const char * auto_time_zones[] {nullptr};\n" ) - endif () + foreach(TIMEZONE ${TIMEZONES}) + file(APPEND ${SYSTEM_STORAGE_TZ_FILE} " \"${TIMEZONE}\",\n") + list(APPEND TIMEZONE_RESOURCE_FILES "${TIMEZONE}") + endforeach(TIMEZONE) + file(APPEND ${SYSTEM_STORAGE_TZ_FILE} " nullptr};\n") + clickhouse_embed_binaries( + TARGET tzdata + RESOURCE_DIR "${TZDIR}" + RESOURCES ${TIMEZONE_RESOURCE_FILES} + ) + add_dependencies(cctz tzdata) + target_link_libraries(cctz INTERFACE "-Wl,${WHOLE_ARCHIVE} $ -Wl,${NO_WHOLE_ARCHIVE}") endif () message (STATUS "Using cctz") diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index 2af0331c70b..12aec76a303 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -204,55 +204,6 @@ macro(clickhouse_program_add name) clickhouse_program_add_executable(${name}) endmacro() -# Embed default config files as a resource into the binary. -# This is needed for two purposes: -# 1. Allow to run the binary without download of any other files. -# 2. Allow to implement "sudo clickhouse install" tool. -# -# Arguments: target (server, client, keeper, etc.) and list of files -# -# Also dependency on TARGET_FILE is required, look at examples in programs/server and programs/keeper -macro(clickhouse_embed_binaries) - # TODO We actually need this on Mac, FreeBSD. 
- if (OS_LINUX) - - set(arguments_list "${ARGN}") - list(GET arguments_list 0 target) - - # for some reason cmake iterates loop including - math(EXPR arguments_count "${ARGC}-1") - - foreach(RESOURCE_POS RANGE 1 "${arguments_count}") - list(GET arguments_list "${RESOURCE_POS}" RESOURCE_FILE) - set(RESOURCE_OBJ ${RESOURCE_FILE}.o) - set(RESOURCE_OBJS ${RESOURCE_OBJS} ${RESOURCE_OBJ}) - - # https://stackoverflow.com/questions/14776463/compile-and-add-an-object-file-from-a-binary-with-cmake - # PPC64LE fails to do this with objcopy, use ld or lld instead - if (ARCH_PPC64LE) - add_custom_command(OUTPUT ${RESOURCE_OBJ} - COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${CMAKE_LINKER} -m elf64lppc -r -b binary -o "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}" ${RESOURCE_FILE}) - else() - add_custom_command(OUTPUT ${RESOURCE_OBJ} - COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && ${OBJCOPY_PATH} -I binary ${OBJCOPY_ARCH_OPTIONS} ${RESOURCE_FILE} "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}" - COMMAND ${OBJCOPY_PATH} --rename-section .data=.rodata,alloc,load,readonly,data,contents - "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}" "${CMAKE_CURRENT_BINARY_DIR}/${RESOURCE_OBJ}") - endif() - set_source_files_properties(${RESOURCE_OBJ} PROPERTIES EXTERNAL_OBJECT true GENERATED true) - endforeach() - - add_library(clickhouse_${target}_configs STATIC ${RESOURCE_OBJS}) - set_target_properties(clickhouse_${target}_configs PROPERTIES LINKER_LANGUAGE C) - - # whole-archive prevents symbols from being discarded for unknown reason - # CMake can shuffle each of target_link_libraries arguments with other - # libraries in linker command. To avoid this we hardcode whole-archive - # library into single string. 
- add_dependencies(clickhouse-${target}-lib clickhouse_${target}_configs) - endif () -endmacro() - - add_subdirectory (server) add_subdirectory (client) add_subdirectory (local) diff --git a/programs/embed_binary.S.in b/programs/embed_binary.S.in new file mode 100644 index 00000000000..47d56d2a2ae --- /dev/null +++ b/programs/embed_binary.S.in @@ -0,0 +1,17 @@ +// Embed a binary file into an executable. + +// The variable BINARY_FILE_NAME is the actual name of the file to include +// The variable SYMBOL_NAME is the "normalized" name of the symbol, with +// symbols like `-`, `.`, and `/` replaced with `_`. This is to match how +// objcopy rewrites symbol names, and matches the expectation in +// `base/common/getResource.cpp` + + .data + .global _binary_@SYMBOL_NAME@_start +_binary_@SYMBOL_NAME@_start: + .incbin "@BINARY_FILE_NAME@" + .global _binary_@SYMBOL_NAME@_end +_binary_@SYMBOL_NAME@_end: + .global _binary_@SYMBOL_NAME@_size +_binary_@SYMBOL_NAME@_size: + .quad _binary_@SYMBOL_NAME@_end - _binary_@SYMBOL_NAME@_start diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt index e604d0e304e..5a50a7074d3 100644 --- a/programs/keeper/CMakeLists.txt +++ b/programs/keeper/CMakeLists.txt @@ -1,3 +1,5 @@ +include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake) + set(CLICKHOUSE_KEEPER_SOURCES Keeper.cpp ) @@ -21,4 +23,8 @@ clickhouse_program_add(keeper) install (FILES keeper_config.xml DESTINATION "${CLICKHOUSE_ETC_DIR}/clickhouse-keeper" COMPONENT clickhouse-keeper) -clickhouse_embed_binaries(keeper keeper_config.xml keeper_embedded.xml) +clickhouse_embed_binaries( + TARGET clickhouse_keeper_configs + RESOURCES keeper_config.xml keeper_embedded.xml +) +add_dependencies(clickhouse-keeper-lib clickhouse_keeper_configs) diff --git a/programs/server/CMakeLists.txt b/programs/server/CMakeLists.txt index f7f76fdb450..739d1004025 100644 --- a/programs/server/CMakeLists.txt +++ b/programs/server/CMakeLists.txt @@ -1,11 +1,11 @@ 
+include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake) + set(CLICKHOUSE_SERVER_SOURCES MetricsTransmitter.cpp Server.cpp ) -if (OS_LINUX) - set (LINK_RESOURCE_LIB INTERFACE "-Wl,${WHOLE_ARCHIVE} $ -Wl,${NO_WHOLE_ARCHIVE}") -endif () +set (LINK_RESOURCE_LIB INTERFACE "-Wl,${WHOLE_ARCHIVE} $ -Wl,${NO_WHOLE_ARCHIVE}") set (CLICKHOUSE_SERVER_LINK PRIVATE @@ -31,4 +31,8 @@ clickhouse_program_add(server) install(FILES config.xml users.xml DESTINATION "${CLICKHOUSE_ETC_DIR}/clickhouse-server" COMPONENT clickhouse) -clickhouse_embed_binaries(server config.xml users.xml embedded.xml play.html) +clickhouse_embed_binaries( + TARGET clickhouse_server_configs + RESOURCES config.xml users.xml embedded.xml play.html +) +add_dependencies(clickhouse-server-lib clickhouse_server_configs)