Use incbin for resources, part 2

This commit is contained in:
Alexey Milovidov 2023-07-23 05:25:14 +02:00
parent 00d6f2ee08
commit 4170d1458b
10 changed files with 58 additions and 234 deletions

View File

@ -1,58 +0,0 @@
# Embed a set of resource files into a resulting object file.
#
# Signature: `clickhouse_embed_binaries(TARGET <target> RESOURCE_DIR <dir> RESOURCES <resource> ...)
#
# This will generate a static library target named `<target>`, which contains the contents of
# each `<resource>` file. The files should be located in `<dir>`. <dir> defaults to
# ${CMAKE_CURRENT_SOURCE_DIR}, and the resources may not be empty.
#
# Each resource will result in three symbols in the final archive, based on the name `<resource>`.
# These are:
# 1. `_binary_<name>_start`: Points to the start of the binary data from `<resource>`.
# 2. `_binary_<name>_end`: Points to the end of the binary data from `<resource>`.
# 2. `_binary_<name>_size`: Points to the size of the binary data from `<resource>`.
#
# `<name>` is a normalized name derived from `<resource>`, by replacing the characters "./-" with
# the character "_", and the character "+" with "_PLUS_". This scheme is similar to those generated
# by `ld -r -b binary`, and matches the expectations in `./base/common/getResource.cpp`.
macro(clickhouse_embed_binaries)
set(one_value_args TARGET RESOURCE_DIR)
set(resources RESOURCES)
cmake_parse_arguments(EMBED "" "${one_value_args}" ${resources} ${ARGN})
if (NOT DEFINED EMBED_TARGET)
message(FATAL_ERROR "A target name must be provided for embedding binary resources into")
endif()
if (NOT DEFINED EMBED_RESOURCE_DIR)
set(EMBED_RESOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
endif()
list(LENGTH EMBED_RESOURCES N_RESOURCES)
if (N_RESOURCES LESS 1)
message(FATAL_ERROR "The list of binary resources to embed may not be empty")
endif()
add_library("${EMBED_TARGET}" STATIC)
set_target_properties("${EMBED_TARGET}" PROPERTIES LINKER_LANGUAGE C)
set(EMBED_TEMPLATE_FILE "${PROJECT_SOURCE_DIR}/programs/embed_binary.S.in")
foreach(RESOURCE_FILE ${EMBED_RESOURCES})
set(ASSEMBLY_FILE_NAME "${RESOURCE_FILE}.S")
set(BINARY_FILE_NAME "${RESOURCE_FILE}")
# Normalize the name of the resource.
string(REGEX REPLACE "[\./-]" "_" SYMBOL_NAME "${RESOURCE_FILE}") # - must be last in regex
string(REPLACE "+" "_PLUS_" SYMBOL_NAME "${SYMBOL_NAME}")
# Generate the configured assembly file in the output directory.
configure_file("${EMBED_TEMPLATE_FILE}" "${CMAKE_CURRENT_BINARY_DIR}/${ASSEMBLY_FILE_NAME}" @ONLY)
# Set the include directory for relative paths specified for `.incbin` directive.
set_property(SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${ASSEMBLY_FILE_NAME}" APPEND PROPERTY INCLUDE_DIRECTORIES "${EMBED_RESOURCE_DIR}")
target_sources("${EMBED_TARGET}" PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/${ASSEMBLY_FILE_NAME}")
set_target_properties("${EMBED_TARGET}" PROPERTIES OBJECT_DEPENDS "${RESOURCE_FILE}")
endforeach()
endmacro()

View File

@ -1,4 +1,3 @@
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cctz")
set (SRCS
@ -23,12 +22,10 @@ if (OS_FREEBSD)
endif ()
# Related to time_zones table:
# StorageSystemTimeZones.generated.cpp is autogenerated each time during a build
# data in this file will be used to populate the system.time_zones table, this is specific to OS_LINUX
# as the library that's built using embedded tzdata is also specific to OS_LINUX
set(SYSTEM_STORAGE_TZ_FILE "${PROJECT_BINARY_DIR}/src/Storages/System/StorageSystemTimeZones.generated.cpp")
# TimeZones.generated.cpp is autogenerated each time during a build
set(TIMEZONES_FILE "${CMAKE_CURRENT_BINARY_DIR}/TimeZones.generated.cpp")
# remove existing copies so that its generated fresh on each build.
file(REMOVE ${SYSTEM_STORAGE_TZ_FILE})
file(REMOVE ${TIMEZONES_FILE})
# get the list of timezones from tzdata shipped with cctz
set(TZDIR "${LIBRARY_DIR}/testdata/zoneinfo")
@ -36,28 +33,36 @@ file(STRINGS "${LIBRARY_DIR}/testdata/version" TZDATA_VERSION)
set_property(GLOBAL PROPERTY TZDATA_VERSION_PROP "${TZDATA_VERSION}")
message(STATUS "Packaging with tzdata version: ${TZDATA_VERSION}")
set(TIMEZONE_RESOURCE_FILES)
# each file in that dir (except of tab and localtime) store the info about timezone
execute_process(COMMAND
bash -c "cd ${TZDIR} && find * -type f -and ! -name '*.tab' -and ! -name 'localtime' | LC_ALL=C sort | paste -sd ';' -"
OUTPUT_STRIP_TRAILING_WHITESPACE
OUTPUT_VARIABLE TIMEZONES)
file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "// autogenerated by ClickHouse/contrib/cctz-cmake/CMakeLists.txt\n")
file(APPEND ${SYSTEM_STORAGE_TZ_FILE} "const char * auto_time_zones[] {\n" )
file(APPEND ${TIMEZONES_FILE} "// autogenerated by ClickHouse/contrib/cctz-cmake/CMakeLists.txt\n")
file(APPEND ${TIMEZONES_FILE} "#include <incbin.h>\n")
set (COUNTER 1)
foreach(TIMEZONE ${TIMEZONES})
file(APPEND ${SYSTEM_STORAGE_TZ_FILE} " \"${TIMEZONE}\",\n")
list(APPEND TIMEZONE_RESOURCE_FILES "${TIMEZONE}")
file(APPEND ${TIMEZONES_FILE} "INCBIN(resource_timezone${COUNTER}, \"${TIMEZONE}\");\n")
MATH(EXPR COUNTER "${COUNTER}+1")
endforeach(TIMEZONE)
file(APPEND ${SYSTEM_STORAGE_TZ_FILE} " nullptr};\n")
clickhouse_embed_binaries(
TARGET tzdata
RESOURCE_DIR "${TZDIR}"
RESOURCES ${TIMEZONE_RESOURCE_FILES}
)
add_dependencies(_cctz tzdata)
target_link_libraries(_cctz INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:tzdata> -Wl,${NO_WHOLE_ARCHIVE}")
file(APPEND ${TIMEZONES_FILE} "#include <cstddef>\n")
file(APPEND ${TIMEZONES_FILE} "struct TimeZone { const char * name; const unsigned char * data; size_t size; };\n")
file(APPEND ${TIMEZONES_FILE} "TimeZone auto_time_zones[] {\n" )
set (COUNTER 1)
foreach(TIMEZONE ${TIMEZONES})
file(APPEND ${TIMEZONES_FILE} " {\"${TIMEZONE}\", gresource_timezone${COUNTER}Data, gresource_timezone${COUNTER}Size},\n")
MATH(EXPR COUNTER "${COUNTER}+1")
endforeach(TIMEZONE)
file(APPEND ${TIMEZONES_FILE} " {nullptr, nullptr, 0}};\n")
add_library (tzdata ${TIMEZONES_FILE})
target_link_libraries(tzdata ch_contrib::incbin)
target_include_directories(tzdata PRIVATE ${TZDIR})
target_link_libraries(_cctz tzdata)
add_library(ch_contrib::cctz ALIAS _cctz)

View File

@ -19,7 +19,6 @@
#include <Common/ZooKeeper/KeeperException.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/Exception.h>
#include <Common/getResource.h>
#include <Common/XMLUtils.h>
#include <Common/logger_useful.h>
#include <base/errnoToString.h>

View File

@ -3,7 +3,6 @@
#include <cctz/civil_time.h>
#include <cctz/time_zone.h>
#include <cctz/zone_info_source.h>
#include <Common/getResource.h>
#include <Poco/Exception.h>
#include <algorithm>
@ -13,6 +12,10 @@
#include <memory>
/// Embedded timezones.
struct TimeZone { const char * name; const unsigned char * data; size_t size; };
extern TimeZone auto_time_zones[];
namespace
{
@ -249,9 +252,15 @@ namespace cctz_extension
const std::string & name,
const std::function<std::unique_ptr<cctz::ZoneInfoSource>(const std::string & name)> & fallback)
{
std::string_view resource = getResource(name);
if (!resource.empty())
return std::make_unique<Source>(resource.data(), resource.size());
const TimeZone * timezone = auto_time_zones;
while (timezone->name != nullptr)
{
if (timezone->name == name)
break;
++timezone;
}
if (timezone->size)
return std::make_unique<Source>(reinterpret_cast<const char *>(timezone->data), timezone->size);
return fallback(name);
}

View File

@ -87,50 +87,13 @@ namespace
/// https://stackoverflow.com/questions/32088140/multiple-string-tables-in-elf-object
void updateResources(ElfW(Addr) base_address, std::string_view object_name, std::string_view name, const void * address, SymbolIndex::Resources & resources)
{
const char * char_address = static_cast<const char *>(address);
if (name.starts_with("_binary_") || name.starts_with("binary_"))
{
if (name.ends_with("_start"))
{
name = name.substr((name[0] == '_') + strlen("binary_"));
name = name.substr(0, name.size() - strlen("_start"));
auto & resource = resources[name];
if (!resource.base_address || resource.base_address == base_address)
{
resource.base_address = base_address;
resource.start = std::string_view{char_address, 0}; // NOLINT(bugprone-string-constructor)
resource.object_name = object_name;
}
}
if (name.ends_with("_end"))
{
name = name.substr((name[0] == '_') + strlen("binary_"));
name = name.substr(0, name.size() - strlen("_end"));
auto & resource = resources[name];
if (!resource.base_address || resource.base_address == base_address)
{
resource.base_address = base_address;
resource.end = std::string_view{char_address, 0}; // NOLINT(bugprone-string-constructor)
resource.object_name = object_name;
}
}
}
}
/// Based on the code of musl-libc and the answer of Kanalpiroge on
/// https://stackoverflow.com/questions/15779185/list-all-the-functions-symbols-on-the-fly-in-c-code-on-a-linux-architecture
/// It does not extract all the symbols (but only public - exported and used for dynamic linking),
/// but will work if we cannot find or parse ELF files.
void collectSymbolsFromProgramHeaders(
dl_phdr_info * info,
std::vector<SymbolIndex::Symbol> & symbols,
SymbolIndex::Resources & resources)
std::vector<SymbolIndex::Symbol> & symbols)
{
/* Iterate over all headers of the current shared lib
* (first call is for the executable itself)
@ -248,9 +211,6 @@ void collectSymbolsFromProgramHeaders(
/// We are not interested in empty symbols.
if (elf_sym[sym_index].st_size)
symbols.push_back(symbol);
/// But resources can be represented by a pair of empty symbols (indicating their boundaries).
updateResources(base_address, info->dlpi_name, symbol.name, symbol.address_begin, resources);
}
break;
@ -281,8 +241,7 @@ void collectSymbolsFromELFSymbolTable(
const Elf & elf,
const Elf::Section & symbol_table,
const Elf::Section & string_table,
std::vector<SymbolIndex::Symbol> & symbols,
SymbolIndex::Resources & resources)
std::vector<SymbolIndex::Symbol> & symbols)
{
/// Iterate symbol table.
const ElfSym * symbol_table_entry = reinterpret_cast<const ElfSym *>(symbol_table.begin());
@ -312,8 +271,6 @@ void collectSymbolsFromELFSymbolTable(
if (symbol_table_entry->st_size)
symbols.push_back(symbol);
updateResources(info->dlpi_addr, info->dlpi_name, symbol.name, symbol.address_begin, resources);
}
}
@ -323,8 +280,7 @@ bool searchAndCollectSymbolsFromELFSymbolTable(
const Elf & elf,
unsigned section_header_type,
const char * string_table_name,
std::vector<SymbolIndex::Symbol> & symbols,
SymbolIndex::Resources & resources)
std::vector<SymbolIndex::Symbol> & symbols)
{
std::optional<Elf::Section> symbol_table;
std::optional<Elf::Section> string_table;
@ -342,7 +298,7 @@ bool searchAndCollectSymbolsFromELFSymbolTable(
return false;
}
collectSymbolsFromELFSymbolTable(info, elf, *symbol_table, *string_table, symbols, resources);
collectSymbolsFromELFSymbolTable(info, elf, *symbol_table, *string_table, symbols);
return true;
}
@ -351,7 +307,6 @@ void collectSymbolsFromELF(
dl_phdr_info * info,
std::vector<SymbolIndex::Symbol> & symbols,
std::vector<SymbolIndex::Object> & objects,
SymbolIndex::Resources & resources,
String & build_id)
{
String object_name;
@ -462,11 +417,11 @@ void collectSymbolsFromELF(
object.name = object_name;
objects.push_back(std::move(object));
searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_SYMTAB, ".strtab", symbols, resources);
searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_SYMTAB, ".strtab", symbols);
/// Unneeded if they were parsed from "program headers" of loaded objects.
#if defined USE_MUSL
searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_DYNSYM, ".dynstr", symbols, resources);
searchAndCollectSymbolsFromELFSymbolTable(info, *objects.back().elf, SHT_DYNSYM, ".dynstr", symbols);
#endif
}
@ -479,8 +434,8 @@ int collectSymbols(dl_phdr_info * info, size_t, void * data_ptr)
{
SymbolIndex::Data & data = *reinterpret_cast<SymbolIndex::Data *>(data_ptr);
collectSymbolsFromProgramHeaders(info, data.symbols, data.resources);
collectSymbolsFromELF(info, data.symbols, data.objects, data.resources, data.build_id);
collectSymbolsFromProgramHeaders(info, data.symbols);
collectSymbolsFromELF(info, data.symbols, data.objects, data.build_id);
/* Continue iterations */
return 0;

View File

@ -8,6 +8,7 @@
#include <Common/Elf.h>
#include <boost/noncopyable.hpp>
namespace DB
{
@ -45,44 +46,15 @@ public:
const std::vector<Symbol> & symbols() const { return data.symbols; }
const std::vector<Object> & objects() const { return data.objects; }
std::string_view getResource(String name) const
{
if (auto it = data.resources.find(name); it != data.resources.end())
return it->second.data();
return {};
}
/// The BuildID that is generated by compiler.
String getBuildID() const { return data.build_id; }
String getBuildIDHex() const;
struct ResourcesBlob
{
/// Symbol can be presented in multiple shared objects,
/// base_address will be used to compare only symbols from the same SO.
ElfW(Addr) base_address = 0;
/// Just a human name of the SO.
std::string_view object_name;
/// Data blob.
std::string_view start;
std::string_view end;
std::string_view data() const
{
assert(end.data() >= start.data());
return std::string_view{start.data(), static_cast<size_t>(end.data() - start.data())};
}
};
using Resources = std::unordered_map<std::string_view /* symbol name */, ResourcesBlob>;
struct Data
{
std::vector<Symbol> symbols;
std::vector<Object> objects;
String build_id;
/// Resources (embedded binary data) are located by symbols in form of _binary_name_start and _binary_name_end.
Resources resources;
};
private:
Data data;

View File

@ -1,52 +0,0 @@
#include "getResource.h"
#include <dlfcn.h>
#include <string>
#include <boost/algorithm/string/replace.hpp>
#include <Common/SymbolIndex.h>
std::string_view getResource(std::string_view name)
{
// Convert the resource file name into the form generated by `ld -r -b binary`.
std::string name_replaced(name);
std::replace(name_replaced.begin(), name_replaced.end(), '/', '_');
std::replace(name_replaced.begin(), name_replaced.end(), '-', '_');
std::replace(name_replaced.begin(), name_replaced.end(), '.', '_');
boost::replace_all(name_replaced, "+", "_PLUS_");
#if defined USE_MUSL
/// If static linking is used, we cannot use dlsym and have to parse ELF symbol table by ourself.
return DB::SymbolIndex::instance().getResource(name_replaced);
#else
// In most `dlsym(3)` APIs, one passes the symbol name as it appears via
// something like `nm` or `objdump -t`. For example, a symbol `_foo` would be
// looked up with the string `"_foo"`.
//
// Apple's linker is confusingly different. The NOTES on the man page for
// `dlsym(3)` claim that one looks up the symbol with "the name used in C
// source code". In this example, that would mean using the string `"foo"`.
// This apparently applies even in the case where the symbol did not originate
// from C source, such as the embedded binary resource files used here. So
// the symbol name must not have a leading `_` on Apple platforms. It's not
// clear how this applies to other symbols, such as those which _have_ a leading
// underscore in them by design, many leading underscores, etc.
#if defined OS_DARWIN
std::string prefix = "binary_";
#else
std::string prefix = "_binary_";
#endif
std::string symbol_name_start = prefix + name_replaced + "_start";
std::string symbol_name_end = prefix + name_replaced + "_end";
const char * sym_start = reinterpret_cast<const char *>(dlsym(RTLD_DEFAULT, symbol_name_start.c_str()));
const char * sym_end = reinterpret_cast<const char *>(dlsym(RTLD_DEFAULT, symbol_name_end.c_str()));
if (sym_start && sym_end)
{
auto resource_size = static_cast<size_t>(std::distance(sym_start, sym_end));
return { sym_start, resource_size };
}
return {};
#endif
}

View File

@ -1,7 +0,0 @@
#pragma once
#include <string_view>
/// Get resource from binary if exists. Otherwise return empty string view.
/// Resources are data that is embedded into executable at link time.
std::string_view getResource(std::string_view name);

View File

@ -15,7 +15,8 @@
#endif
// All timezones present at build time and embedded into ClickHouse binary.
extern const char * auto_time_zones[];
struct TimeZone { const char * name; const unsigned char * data; size_t size; };
extern TimeZone auto_time_zones[];
namespace
{
@ -32,14 +33,14 @@ std::vector<const char*> allTimezones(bool with_weird_offsets = true)
{
std::vector<const char*> result;
const auto * timezone_name = auto_time_zones;
while (*timezone_name)
const TimeZone * timezone = auto_time_zones;
while (timezone->name)
{
bool weird_offsets = (std::string_view(*timezone_name) == "Africa/Monrovia");
bool weird_offsets = (std::string_view(timezone->name) == "Africa/Monrovia");
if (!weird_offsets || with_weird_offsets)
result.push_back(*timezone_name);
++timezone_name;
result.push_back(timezone->name);
++timezone;
}
return result;
@ -548,4 +549,3 @@ INSTANTIATE_TEST_SUITE_P(AllTimezones_Year1970,
// {0, 0 + 11 * 3600 * 24 + 12, 11},
}))
);

View File

@ -4,7 +4,8 @@
#include <DataTypes/DataTypeString.h>
extern const char * auto_time_zones[];
struct TimeZone { const char * name; const unsigned char * data; size_t size; };
extern TimeZone auto_time_zones[];
namespace DB
{
@ -17,7 +18,7 @@ NamesAndTypesList StorageSystemTimeZones::getNamesAndTypes()
void StorageSystemTimeZones::fillData(MutableColumns & res_columns, ContextPtr, const SelectQueryInfo &) const
{
for (auto * it = auto_time_zones; *it; ++it)
res_columns[0]->insert(String(*it));
for (auto * it = auto_time_zones; it->name != nullptr; ++it)
res_columns[0]->insert(String(it->name));
}
}