mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-03 13:02:00 +00:00
Use incbin for resources, part 1
This commit is contained in:
parent
2467eaf32d
commit
00d6f2ee08
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -340,3 +340,6 @@
|
||||
[submodule "contrib/c-ares"]
|
||||
path = contrib/c-ares
|
||||
url = https://github.com/c-ares/c-ares.git
|
||||
[submodule "contrib/incbin"]
|
||||
path = contrib/incbin
|
||||
url = https://github.com/graphitemaster/incbin.git
|
||||
|
2
contrib/CMakeLists.txt
vendored
2
contrib/CMakeLists.txt
vendored
@ -164,13 +164,13 @@ add_contrib (libpq-cmake libpq)
|
||||
add_contrib (nuraft-cmake NuRaft)
|
||||
add_contrib (fast_float-cmake fast_float)
|
||||
add_contrib (datasketches-cpp-cmake datasketches-cpp)
|
||||
add_contrib (incbin-cmake incbin)
|
||||
|
||||
option(ENABLE_NLP "Enable NLP functions support" ${ENABLE_LIBRARIES})
|
||||
if (ENABLE_NLP)
|
||||
add_contrib (libstemmer-c-cmake libstemmer_c)
|
||||
add_contrib (wordnet-blast-cmake wordnet-blast)
|
||||
add_contrib (lemmagen-c-cmake lemmagen-c)
|
||||
add_contrib (nlp-data-cmake nlp-data)
|
||||
add_contrib (cld2-cmake cld2)
|
||||
endif()
|
||||
|
||||
|
1
contrib/incbin
vendored
Submodule
1
contrib/incbin
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 6e576cae5ab5810f25e2631f2e0b80cbe7dc8cbf
|
4
contrib/incbin-cmake/CMakeLists.txt
Normal file
4
contrib/incbin-cmake/CMakeLists.txt
Normal file
@ -0,0 +1,4 @@
|
||||
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/incbin")
|
||||
add_library(_incbin INTERFACE)
|
||||
target_include_directories(_incbin SYSTEM INTERFACE ${LIBRARY_DIR})
|
||||
add_library(ch_contrib::incbin ALIAS _incbin)
|
@ -1,15 +0,0 @@
|
||||
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)
|
||||
|
||||
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/nlp-data")
|
||||
|
||||
add_library (_nlp_data INTERFACE)
|
||||
|
||||
clickhouse_embed_binaries(
|
||||
TARGET nlp_dictionaries
|
||||
RESOURCE_DIR "${LIBRARY_DIR}"
|
||||
RESOURCES charset.zst tonality_ru.zst programming.zst
|
||||
)
|
||||
|
||||
add_dependencies(_nlp_data nlp_dictionaries)
|
||||
target_link_libraries(_nlp_data INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:nlp_dictionaries> -Wl,${NO_WHOLE_ARCHIVE}")
|
||||
add_library(ch_contrib::nlp_data ALIAS _nlp_data)
|
@ -10,3 +10,6 @@ set (CLICKHOUSE_INSTALL_LINK
|
||||
)
|
||||
|
||||
clickhouse_program_add_library(install)
|
||||
|
||||
# For incbin
|
||||
target_include_directories(clickhouse-install-lib PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../server")
|
||||
|
@ -20,10 +20,7 @@
|
||||
#include <Common/formatReadable.h>
|
||||
#include <Common/Config/ConfigProcessor.h>
|
||||
#include <Common/OpenSSLHelpers.h>
|
||||
#include <base/hex.h>
|
||||
#include <Common/getResource.h>
|
||||
#include <base/sleep.h>
|
||||
#include <IO/ReadBufferFromFileDescriptor.h>
|
||||
#include <IO/WriteBufferFromFileDescriptor.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
@ -35,6 +32,12 @@
|
||||
|
||||
#include <Poco/Util/XMLConfiguration.h>
|
||||
|
||||
#include <incbin.h>
|
||||
|
||||
/// Embedded configuration files used inside the install program
|
||||
INCBIN(resource_config_xml, "config.xml");
|
||||
INCBIN(resource_users_xml, "users.xml");
|
||||
|
||||
|
||||
/** This tool can be used to install ClickHouse without a deb/rpm/tgz package, having only "clickhouse" binary.
|
||||
* It also allows to avoid dependency on systemd, upstart, SysV init.
|
||||
@ -560,7 +563,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
|
||||
|
||||
if (!fs::exists(main_config_file))
|
||||
{
|
||||
std::string_view main_config_content = getResource("config.xml");
|
||||
std::string_view main_config_content(reinterpret_cast<const char *>(gresource_config_xmlData), gresource_config_xmlSize);
|
||||
if (main_config_content.empty())
|
||||
{
|
||||
fmt::print("There is no default config.xml, you have to download it and place to {}.\n", main_config_file.string());
|
||||
@ -672,7 +675,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
|
||||
|
||||
if (!fs::exists(users_config_file))
|
||||
{
|
||||
std::string_view users_config_content = getResource("users.xml");
|
||||
std::string_view users_config_content(reinterpret_cast<const char *>(gresource_users_xmlData), gresource_users_xmlSize);
|
||||
if (users_config_content.empty())
|
||||
{
|
||||
fmt::print("There is no default users.xml, you have to download it and place to {}.\n", users_config_file.string());
|
||||
|
@ -1,16 +1,3 @@
|
||||
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)
|
||||
|
||||
if (OS_LINUX)
|
||||
set (LINK_RESOURCE_LIB INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:clickhouse_keeper_configs> -Wl,${NO_WHOLE_ARCHIVE}")
|
||||
# for some reason INTERFACE linkage doesn't work for standalone binary
|
||||
set (LINK_RESOURCE_LIB_STANDALONE_KEEPER "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:clickhouse_keeper_configs> -Wl,${NO_WHOLE_ARCHIVE}")
|
||||
endif ()
|
||||
|
||||
clickhouse_embed_binaries(
|
||||
TARGET clickhouse_keeper_configs
|
||||
RESOURCES keeper_config.xml keeper_embedded.xml
|
||||
)
|
||||
|
||||
set(CLICKHOUSE_KEEPER_SOURCES
|
||||
Keeper.cpp
|
||||
)
|
||||
@ -29,7 +16,6 @@ set (CLICKHOUSE_KEEPER_LINK
|
||||
clickhouse_program_add(keeper)
|
||||
|
||||
install(FILES keeper_config.xml DESTINATION "${CLICKHOUSE_ETC_DIR}/clickhouse-keeper" COMPONENT clickhouse-keeper)
|
||||
add_dependencies(clickhouse-keeper-lib clickhouse_keeper_configs)
|
||||
|
||||
if (BUILD_STANDALONE_KEEPER)
|
||||
# Straight list of all required sources
|
||||
@ -215,7 +201,6 @@ if (BUILD_STANDALONE_KEEPER)
|
||||
${LINK_RESOURCE_LIB_STANDALONE_KEEPER}
|
||||
)
|
||||
|
||||
add_dependencies(clickhouse-keeper clickhouse_keeper_configs)
|
||||
set_target_properties(clickhouse-keeper PROPERTIES RUNTIME_OUTPUT_DIRECTORY ../)
|
||||
|
||||
if (SPLIT_DEBUG_SYMBOLS)
|
||||
|
@ -457,8 +457,10 @@ try
|
||||
const std::string key_path = config().getString("openSSL.server.privateKeyFile", "");
|
||||
|
||||
std::vector<std::string> extra_paths = {include_from_path};
|
||||
if (!cert_path.empty()) extra_paths.emplace_back(cert_path);
|
||||
if (!key_path.empty()) extra_paths.emplace_back(key_path);
|
||||
if (!cert_path.empty())
|
||||
extra_paths.emplace_back(cert_path);
|
||||
if (!key_path.empty())
|
||||
extra_paths.emplace_back(key_path);
|
||||
|
||||
/// ConfigReloader has too strict parameters, which are redundant in our case
|
||||
auto main_config_reloader = std::make_unique<ConfigReloader>(
|
||||
|
@ -1,12 +1,8 @@
|
||||
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)
|
||||
|
||||
set(CLICKHOUSE_SERVER_SOURCES
|
||||
MetricsTransmitter.cpp
|
||||
Server.cpp
|
||||
)
|
||||
|
||||
set (LINK_RESOURCE_LIB INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:clickhouse_server_configs> -Wl,${NO_WHOLE_ARCHIVE}")
|
||||
|
||||
set (CLICKHOUSE_SERVER_LINK
|
||||
PRIVATE
|
||||
clickhouse_aggregate_functions
|
||||
@ -33,10 +29,4 @@ endif()
|
||||
|
||||
clickhouse_program_add(server)
|
||||
|
||||
install(FILES config.xml users.xml DESTINATION "${CLICKHOUSE_ETC_DIR}/clickhouse-server" COMPONENT clickhouse)
|
||||
|
||||
clickhouse_embed_binaries(
|
||||
TARGET clickhouse_server_configs
|
||||
RESOURCES config.xml users.xml embedded.xml play.html dashboard.html js/uplot.js
|
||||
)
|
||||
add_dependencies(clickhouse-server-lib clickhouse_server_configs)
|
||||
target_include_directories(clickhouse-server-lib PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
@ -128,6 +128,10 @@
|
||||
# include <azure/storage/common/internal/xml_wrapper.hpp>
|
||||
#endif
|
||||
|
||||
#include <incbin.h>
|
||||
/// A minimal file used when the server is run without installation
|
||||
INCBIN(resource_embedded_xml, "embedded.xml");
|
||||
|
||||
namespace CurrentMetrics
|
||||
{
|
||||
extern const Metric Revision;
|
||||
@ -393,6 +397,7 @@ int Server::run()
|
||||
|
||||
void Server::initialize(Poco::Util::Application & self)
|
||||
{
|
||||
ConfigProcessor::registerEmbeddedConfig("config.xml", std::string_view(reinterpret_cast<const char *>(gresource_embedded_xmlData), gresource_embedded_xmlSize));
|
||||
BaseDaemon::initialize(self);
|
||||
logger().information("starting up");
|
||||
|
||||
@ -1105,8 +1110,10 @@ try
|
||||
const std::string key_path = config().getString("openSSL.server.privateKeyFile", "");
|
||||
|
||||
std::vector<std::string> extra_paths = {include_from_path};
|
||||
if (!cert_path.empty()) extra_paths.emplace_back(cert_path);
|
||||
if (!key_path.empty()) extra_paths.emplace_back(key_path);
|
||||
if (!cert_path.empty())
|
||||
extra_paths.emplace_back(cert_path);
|
||||
if (!key_path.empty())
|
||||
extra_paths.emplace_back(key_path);
|
||||
|
||||
auto main_config_reloader = std::make_unique<ConfigReloader>(
|
||||
config_path,
|
||||
|
0
programs/server/resources.cpp
Normal file
0
programs/server/resources.cpp
Normal file
@ -210,7 +210,7 @@ if (TARGET ch_contrib::jemalloc)
|
||||
target_link_libraries (clickhouse_storages_system PRIVATE ch_contrib::jemalloc)
|
||||
endif()
|
||||
|
||||
target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::sparsehash)
|
||||
target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::sparsehash ch_contrib::incbin)
|
||||
|
||||
add_subdirectory(Access/Common)
|
||||
add_subdirectory(Common/ZooKeeper)
|
||||
@ -296,7 +296,7 @@ macro (dbms_target_include_directories)
|
||||
endforeach ()
|
||||
endmacro ()
|
||||
|
||||
dbms_target_include_directories (PUBLIC "${ClickHouse_SOURCE_DIR}/src" "${ClickHouse_BINARY_DIR}/src")
|
||||
dbms_target_include_directories (PUBLIC "${ClickHouse_SOURCE_DIR}/src" "${ClickHouse_BINARY_DIR}/src" "${ClickHouse_SOURCE_DIR}/programs/server")
|
||||
target_include_directories (clickhouse_common_io PUBLIC "${ClickHouse_SOURCE_DIR}/src" "${ClickHouse_BINARY_DIR}/src")
|
||||
|
||||
if (TARGET ch_contrib::llvm)
|
||||
@ -561,7 +561,7 @@ if (ENABLE_NLP)
|
||||
dbms_target_link_libraries (PUBLIC ch_contrib::stemmer)
|
||||
dbms_target_link_libraries (PUBLIC ch_contrib::wnb)
|
||||
dbms_target_link_libraries (PUBLIC ch_contrib::lemmagen)
|
||||
dbms_target_link_libraries (PUBLIC ch_contrib::nlp_data)
|
||||
target_include_directories(clickhouse_common_io PUBLIC ${CMAKE_SOURCE_DIR}/contrib/nlp-data)
|
||||
endif()
|
||||
|
||||
if (TARGET ch_contrib::ulid)
|
||||
|
@ -9,5 +9,5 @@ if (ENABLE_EXAMPLES)
|
||||
endif()
|
||||
|
||||
if (ENABLE_MYSQL)
|
||||
add_subdirectory (mysqlxx)
|
||||
add_subdirectory(mysqlxx)
|
||||
endif ()
|
||||
|
@ -83,6 +83,13 @@ ConfigProcessor::~ConfigProcessor()
|
||||
Poco::Logger::destroy("ConfigProcessor");
|
||||
}
|
||||
|
||||
static std::unordered_map<std::string, std::string_view> embedded_configs;
|
||||
|
||||
void ConfigProcessor::registerEmbeddedConfig(std::string name, std::string_view content)
|
||||
{
|
||||
embedded_configs[name] = content;
|
||||
}
|
||||
|
||||
|
||||
/// Vector containing the name of the element and a sorted list of attribute names and values
|
||||
/// (except "remove" and "replace" attributes).
|
||||
@ -281,15 +288,15 @@ void ConfigProcessor::doIncludesRecursive(
|
||||
{
|
||||
std::string value = node->nodeValue();
|
||||
|
||||
bool replace_occured = false;
|
||||
bool replace_occurred = false;
|
||||
size_t pos;
|
||||
while ((pos = value.find(substitution.first)) != std::string::npos)
|
||||
{
|
||||
value.replace(pos, substitution.first.length(), substitution.second);
|
||||
replace_occured = true;
|
||||
replace_occurred = true;
|
||||
}
|
||||
|
||||
if (replace_occured)
|
||||
if (replace_occurred)
|
||||
node->setNodeValue(value);
|
||||
}
|
||||
}
|
||||
@ -528,26 +535,14 @@ XMLDocumentPtr ConfigProcessor::processConfig(
|
||||
}
|
||||
else
|
||||
{
|
||||
/// These embedded files added during build with some cmake magic.
|
||||
/// Look at the end of programs/server/CMakeLists.txt.
|
||||
std::string embedded_name;
|
||||
if (path == "config.xml")
|
||||
embedded_name = "embedded.xml";
|
||||
|
||||
if (path == "keeper_config.xml")
|
||||
embedded_name = "keeper_embedded.xml";
|
||||
|
||||
/// When we can use config embedded in binary.
|
||||
if (!embedded_name.empty())
|
||||
/// When we can use a config embedded in the binary.
|
||||
if (auto it = embedded_configs.find(path); it != embedded_configs.end())
|
||||
{
|
||||
auto resource = getResource(embedded_name);
|
||||
if (resource.empty())
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Configuration file {} doesn't exist and there is no embedded config", path);
|
||||
LOG_DEBUG(log, "There is no file '{}', will use embedded config.", path);
|
||||
config = dom_parser.parseMemory(resource.data(), resource.size());
|
||||
config = dom_parser.parseMemory(it->second.data(), it->second.size());
|
||||
}
|
||||
else
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Configuration file {} doesn't exist", path);
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Configuration file {} doesn't exist and there is no embedded config", path);
|
||||
}
|
||||
|
||||
std::vector<std::string> contributing_files;
|
||||
|
@ -65,6 +65,9 @@ public:
|
||||
zkutil::ZooKeeperNodeCache * zk_node_cache = nullptr,
|
||||
const zkutil::EventPtr & zk_changed_event = nullptr);
|
||||
|
||||
/// These configurations will be used if there is no configuration file.
|
||||
static void registerEmbeddedConfig(std::string name, std::string_view content);
|
||||
|
||||
|
||||
/// loadConfig* functions apply processConfig and create Poco::Util::XMLConfiguration.
|
||||
/// The resulting XML document is saved into a file with the name
|
||||
|
181
src/Common/FrequencyHolder.cpp
Normal file
181
src/Common/FrequencyHolder.cpp
Normal file
@ -0,0 +1,181 @@
|
||||
#include <Common/FrequencyHolder.h>
|
||||
|
||||
#include <incbin.h>
|
||||
|
||||
/// Embedded zstd-compressed dictionaries for the text classification functions
|
||||
INCBIN(resource_charset_zst, "charset.zst");
|
||||
INCBIN(resource_tonality_ru_zst, "tonality_ru.zst");
|
||||
INCBIN(resource_programming_zst, "programming.zst");
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int FILE_DOESNT_EXIST;
|
||||
}
|
||||
|
||||
|
||||
/// Returns the single FrequencyHolder shared by all callers.
/// The object is a function-local static: it is constructed on the first call
/// (its constructor loads the embedded dictionaries) and the same object is returned afterwards.
FrequencyHolder & FrequencyHolder::getInstance()
{
    static FrequencyHolder holder;
    return holder;
}
|
||||
|
||||
/// Loads every embedded dictionary right away: the emotional dictionary, the charset
/// frequencies and the programming language frequencies, in this order.
/// Each loader throws FILE_DOESNT_EXIST if its embedded resource is empty.
FrequencyHolder::FrequencyHolder()
|
||||
{
|
||||
loadEmotionalDict();
|
||||
loadEncodingsFrequency();
|
||||
loadProgrammingFrequency();
|
||||
}
|
||||
|
||||
void FrequencyHolder::loadEncodingsFrequency()
|
||||
{
|
||||
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
|
||||
|
||||
LOG_TRACE(log, "Loading embedded charset frequencies");
|
||||
|
||||
std::string_view resource(reinterpret_cast<const char *>(gresource_charset_zstData), gresource_charset_zstSize);
|
||||
if (resource.empty())
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded charset frequencies");
|
||||
|
||||
String line;
|
||||
UInt16 bigram;
|
||||
Float64 frequency;
|
||||
String charset_name;
|
||||
|
||||
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
||||
ZstdInflatingReadBuffer in(std::move(buf));
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
readString(line, in);
|
||||
in.ignore();
|
||||
|
||||
if (line.empty())
|
||||
continue;
|
||||
|
||||
ReadBufferFromString buf_line(line);
|
||||
|
||||
// Start loading a new charset
|
||||
if (line.starts_with("// "))
|
||||
{
|
||||
// Skip "// "
|
||||
buf_line.ignore(3);
|
||||
readString(charset_name, buf_line);
|
||||
|
||||
/* In our dictionary we have lines with form: <Language>_<Charset>
|
||||
* If we need to find language of data, we return <Language>
|
||||
* If we need to find charset of data, we return <Charset>.
|
||||
*/
|
||||
size_t sep = charset_name.find('_');
|
||||
|
||||
Encoding enc;
|
||||
enc.lang = charset_name.substr(0, sep);
|
||||
enc.name = charset_name.substr(sep + 1);
|
||||
encodings_freq.push_back(std::move(enc));
|
||||
}
|
||||
else
|
||||
{
|
||||
readIntText(bigram, buf_line);
|
||||
buf_line.ignore();
|
||||
readFloatText(frequency, buf_line);
|
||||
|
||||
encodings_freq.back().map[bigram] = frequency;
|
||||
}
|
||||
}
|
||||
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
|
||||
}
|
||||
|
||||
void FrequencyHolder::loadEmotionalDict()
|
||||
{
|
||||
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
|
||||
LOG_TRACE(log, "Loading embedded emotional dictionary");
|
||||
|
||||
std::string_view resource(reinterpret_cast<const char *>(gresource_tonality_ru_zstData), gresource_tonality_ru_zstSize);
|
||||
if (resource.empty())
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded emotional dictionary");
|
||||
|
||||
String line;
|
||||
String word;
|
||||
Float64 tonality;
|
||||
size_t count = 0;
|
||||
|
||||
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
||||
ZstdInflatingReadBuffer in(std::move(buf));
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
readString(line, in);
|
||||
in.ignore();
|
||||
|
||||
if (line.empty())
|
||||
continue;
|
||||
|
||||
ReadBufferFromString buf_line(line);
|
||||
|
||||
readStringUntilWhitespace(word, buf_line);
|
||||
buf_line.ignore();
|
||||
readFloatText(tonality, buf_line);
|
||||
|
||||
StringRef ref{string_pool.insert(word.data(), word.size()), word.size()};
|
||||
emotional_dict[ref] = tonality;
|
||||
++count;
|
||||
}
|
||||
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
|
||||
}
|
||||
|
||||
void FrequencyHolder::loadProgrammingFrequency()
|
||||
{
|
||||
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
|
||||
|
||||
LOG_TRACE(log, "Loading embedded programming languages frequencies loading");
|
||||
|
||||
std::string_view resource(reinterpret_cast<const char *>(gresource_programming_zstData), gresource_programming_zstSize);
|
||||
if (resource.empty())
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded programming languages frequencies");
|
||||
|
||||
String line;
|
||||
String bigram;
|
||||
Float64 frequency;
|
||||
String programming_language;
|
||||
|
||||
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
||||
ZstdInflatingReadBuffer in(std::move(buf));
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
readString(line, in);
|
||||
in.ignore();
|
||||
|
||||
if (line.empty())
|
||||
continue;
|
||||
|
||||
ReadBufferFromString buf_line(line);
|
||||
|
||||
// Start loading a new language
|
||||
if (line.starts_with("// "))
|
||||
{
|
||||
// Skip "// "
|
||||
buf_line.ignore(3);
|
||||
readString(programming_language, buf_line);
|
||||
|
||||
Language lang;
|
||||
lang.name = programming_language;
|
||||
programming_freq.push_back(std::move(lang));
|
||||
}
|
||||
else
|
||||
{
|
||||
readStringUntilWhitespace(bigram, buf_line);
|
||||
buf_line.ignore();
|
||||
readFloatText(frequency, buf_line);
|
||||
|
||||
StringRef ref{string_pool.insert(bigram.data(), bigram.size()), bigram.size()};
|
||||
programming_freq.back().map[ref] = frequency;
|
||||
}
|
||||
}
|
||||
LOG_TRACE(log, "Programming languages frequencies was added");
|
||||
}
|
||||
|
||||
}
|
@ -7,7 +7,6 @@
|
||||
#include <unordered_map>
|
||||
|
||||
#include <Common/Arena.h>
|
||||
#include <Common/getResource.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
@ -20,11 +19,6 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int FILE_DOESNT_EXIST;
|
||||
}
|
||||
|
||||
/// FrequencyHolder class is responsible for storing and loading dictionaries
|
||||
/// needed for text classification functions:
|
||||
///
|
||||
@ -56,11 +50,7 @@ public:
|
||||
using EncodingMap = HashMap<UInt16, Float64>;
|
||||
using EncodingContainer = std::vector<Encoding>;
|
||||
|
||||
static FrequencyHolder & getInstance()
|
||||
{
|
||||
static FrequencyHolder instance;
|
||||
return instance;
|
||||
}
|
||||
static FrequencyHolder & getInstance();
|
||||
|
||||
const Map & getEmotionalDict() const
|
||||
{
|
||||
@ -78,161 +68,11 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
FrequencyHolder();
|
||||
|
||||
FrequencyHolder()
|
||||
{
|
||||
loadEmotionalDict();
|
||||
loadEncodingsFrequency();
|
||||
loadProgrammingFrequency();
|
||||
}
|
||||
|
||||
void loadEncodingsFrequency()
|
||||
{
|
||||
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
|
||||
|
||||
LOG_TRACE(log, "Loading embedded charset frequencies");
|
||||
|
||||
auto resource = getResource("charset.zst");
|
||||
if (resource.empty())
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded charset frequencies");
|
||||
|
||||
String line;
|
||||
UInt16 bigram;
|
||||
Float64 frequency;
|
||||
String charset_name;
|
||||
|
||||
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
||||
ZstdInflatingReadBuffer in(std::move(buf));
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
readString(line, in);
|
||||
in.ignore();
|
||||
|
||||
if (line.empty())
|
||||
continue;
|
||||
|
||||
ReadBufferFromString buf_line(line);
|
||||
|
||||
// Start loading a new charset
|
||||
if (line.starts_with("// "))
|
||||
{
|
||||
// Skip "// "
|
||||
buf_line.ignore(3);
|
||||
readString(charset_name, buf_line);
|
||||
|
||||
/* In our dictionary we have lines with form: <Language>_<Charset>
|
||||
* If we need to find language of data, we return <Language>
|
||||
* If we need to find charset of data, we return <Charset>.
|
||||
*/
|
||||
size_t sep = charset_name.find('_');
|
||||
|
||||
Encoding enc;
|
||||
enc.lang = charset_name.substr(0, sep);
|
||||
enc.name = charset_name.substr(sep + 1);
|
||||
encodings_freq.push_back(std::move(enc));
|
||||
}
|
||||
else
|
||||
{
|
||||
readIntText(bigram, buf_line);
|
||||
buf_line.ignore();
|
||||
readFloatText(frequency, buf_line);
|
||||
|
||||
encodings_freq.back().map[bigram] = frequency;
|
||||
}
|
||||
}
|
||||
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
|
||||
}
|
||||
|
||||
void loadEmotionalDict()
|
||||
{
|
||||
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
|
||||
LOG_TRACE(log, "Loading embedded emotional dictionary");
|
||||
|
||||
auto resource = getResource("tonality_ru.zst");
|
||||
if (resource.empty())
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded emotional dictionary");
|
||||
|
||||
String line;
|
||||
String word;
|
||||
Float64 tonality;
|
||||
size_t count = 0;
|
||||
|
||||
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
||||
ZstdInflatingReadBuffer in(std::move(buf));
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
readString(line, in);
|
||||
in.ignore();
|
||||
|
||||
if (line.empty())
|
||||
continue;
|
||||
|
||||
ReadBufferFromString buf_line(line);
|
||||
|
||||
readStringUntilWhitespace(word, buf_line);
|
||||
buf_line.ignore();
|
||||
readFloatText(tonality, buf_line);
|
||||
|
||||
StringRef ref{string_pool.insert(word.data(), word.size()), word.size()};
|
||||
emotional_dict[ref] = tonality;
|
||||
++count;
|
||||
}
|
||||
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
|
||||
}
|
||||
|
||||
void loadProgrammingFrequency()
|
||||
{
|
||||
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
|
||||
|
||||
LOG_TRACE(log, "Loading embedded programming languages frequencies loading");
|
||||
|
||||
auto resource = getResource("programming.zst");
|
||||
if (resource.empty())
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded programming languages frequencies");
|
||||
|
||||
String line;
|
||||
String bigram;
|
||||
Float64 frequency;
|
||||
String programming_language;
|
||||
|
||||
auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
|
||||
ZstdInflatingReadBuffer in(std::move(buf));
|
||||
|
||||
while (!in.eof())
|
||||
{
|
||||
readString(line, in);
|
||||
in.ignore();
|
||||
|
||||
if (line.empty())
|
||||
continue;
|
||||
|
||||
ReadBufferFromString buf_line(line);
|
||||
|
||||
// Start loading a new language
|
||||
if (line.starts_with("// "))
|
||||
{
|
||||
// Skip "// "
|
||||
buf_line.ignore(3);
|
||||
readString(programming_language, buf_line);
|
||||
|
||||
Language lang;
|
||||
lang.name = programming_language;
|
||||
programming_freq.push_back(std::move(lang));
|
||||
}
|
||||
else
|
||||
{
|
||||
readStringUntilWhitespace(bigram, buf_line);
|
||||
buf_line.ignore();
|
||||
readFloatText(frequency, buf_line);
|
||||
|
||||
StringRef ref{string_pool.insert(bigram.data(), bigram.size()), bigram.size()};
|
||||
programming_freq.back().map[ref] = frequency;
|
||||
}
|
||||
}
|
||||
LOG_TRACE(log, "Programming languages frequencies was added");
|
||||
}
|
||||
void loadEncodingsFrequency();
|
||||
void loadEmotionalDict();
|
||||
void loadProgrammingFrequency();
|
||||
|
||||
Arena string_pool;
|
||||
|
||||
|
@ -38,7 +38,6 @@
|
||||
#include <base/coverage.h>
|
||||
#include <base/sleep.h>
|
||||
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
#include <IO/WriteBufferFromFileDescriptorDiscardOnFailure.h>
|
||||
#include <IO/ReadBufferFromFileDescriptor.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
|
@ -6,10 +6,16 @@
|
||||
#include <Poco/Util/LayeredConfiguration.h>
|
||||
|
||||
#include <IO/HTTPCommon.h>
|
||||
#include <Common/getResource.h>
|
||||
|
||||
#include <re2/re2.h>
|
||||
|
||||
#include <incbin.h>
|
||||
|
||||
/// Embedded HTML pages
|
||||
INCBIN(resource_play_html, "play.html");
|
||||
INCBIN(resource_dashboard_html, "dashboard.html");
|
||||
INCBIN(resource_uplot_js, "js/uplot.js");
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -34,13 +40,13 @@ void WebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerR
|
||||
if (request.getURI().starts_with("/play"))
|
||||
{
|
||||
response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_OK);
|
||||
*response.send() << getResource("play.html");
|
||||
*response.send() << std::string_view(reinterpret_cast<const char *>(gresource_play_htmlData), gresource_play_htmlSize);
|
||||
}
|
||||
else if (request.getURI().starts_with("/dashboard"))
|
||||
{
|
||||
response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_OK);
|
||||
|
||||
std::string html(getResource("dashboard.html"));
|
||||
std::string html(reinterpret_cast<const char *>(gresource_dashboard_htmlData), gresource_dashboard_htmlSize);
|
||||
|
||||
/// Replace the link to the external JavaScript file with a link to the embedded file.
|
||||
/// This allows opening the HTML without running a server, as well as hosting it on the server.
|
||||
@ -55,7 +61,7 @@ void WebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerR
|
||||
else if (request.getURI() == "/js/uplot.js")
|
||||
{
|
||||
response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_OK);
|
||||
*response.send() << getResource("js/uplot.js");
|
||||
*response.send() << std::string_view(reinterpret_cast<const char *>(gresource_uplot_jsData), gresource_uplot_jsSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -43,18 +43,9 @@ list (APPEND storages_system_sources ${GENERATED_TIMEZONES_SRC})
|
||||
# Overlength strings
|
||||
set_source_files_properties(${GENERATED_LICENSES_SRC} PROPERTIES COMPILE_FLAGS -w)
|
||||
|
||||
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)
|
||||
clickhouse_embed_binaries(
|
||||
TARGET information_schema_metadata
|
||||
RESOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/InformationSchema/"
|
||||
RESOURCES schemata.sql tables.sql views.sql columns.sql
|
||||
)
|
||||
|
||||
list (SORT storages_system_sources) # Reproducible build
|
||||
add_library(clickhouse_storages_system ${storages_system_sources})
|
||||
|
||||
add_dependencies(clickhouse_storages_system information_schema_metadata)
|
||||
|
||||
target_link_libraries(clickhouse_storages_system PRIVATE
|
||||
dbms
|
||||
common
|
||||
@ -62,5 +53,6 @@ target_link_libraries(clickhouse_storages_system PRIVATE
|
||||
clickhouse_common_zookeeper
|
||||
clickhouse_parsers
|
||||
Poco::JSON
|
||||
INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:information_schema_metadata> -Wl,${NO_WHOLE_ARCHIVE}"
|
||||
)
|
||||
|
||||
target_include_directories(clickhouse_storages_system PRIVATE InformationSchema)
|
||||
|
@ -3,14 +3,21 @@
|
||||
#include <Storages/System/attachSystemTablesImpl.h>
|
||||
#include <Parsers/ParserCreateQuery.h>
|
||||
#include <Parsers/parseQuery.h>
|
||||
#include <Common/getResource.h>
|
||||
#include <incbin.h>
|
||||
|
||||
/// Embedded SQL definitions
|
||||
INCBIN(resource_schemata_sql, "schemata.sql");
|
||||
INCBIN(resource_tables_sql, "tables.sql");
|
||||
INCBIN(resource_views_sql, "views.sql");
|
||||
INCBIN(resource_columns_sql, "columns.sql");
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// View structures are taken from http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt
|
||||
|
||||
static void createInformationSchemaView(ContextMutablePtr context, IDatabase & database, const String & view_name)
|
||||
static void createInformationSchemaView(ContextMutablePtr context, IDatabase & database, const String & view_name, std::string_view query)
|
||||
{
|
||||
try
|
||||
{
|
||||
@ -21,12 +28,11 @@ static void createInformationSchemaView(ContextMutablePtr context, IDatabase & d
|
||||
bool is_uppercase = database.getDatabaseName() == DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE;
|
||||
|
||||
String metadata_resource_name = view_name + ".sql";
|
||||
auto attach_query = getResource(metadata_resource_name);
|
||||
if (attach_query.empty())
|
||||
if (query.empty())
|
||||
return;
|
||||
|
||||
ParserCreateQuery parser;
|
||||
ASTPtr ast = parseQuery(parser, attach_query.data(), attach_query.data() + attach_query.size(),
|
||||
ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(),
|
||||
"Attach query from embedded resource " + metadata_resource_name,
|
||||
DBMS_DEFAULT_MAX_QUERY_SIZE, DBMS_DEFAULT_MAX_PARSER_DEPTH);
|
||||
|
||||
@ -50,10 +56,10 @@ static void createInformationSchemaView(ContextMutablePtr context, IDatabase & d
|
||||
|
||||
void attachInformationSchema(ContextMutablePtr context, IDatabase & information_schema_database)
|
||||
{
|
||||
createInformationSchemaView(context, information_schema_database, "schemata");
|
||||
createInformationSchemaView(context, information_schema_database, "tables");
|
||||
createInformationSchemaView(context, information_schema_database, "views");
|
||||
createInformationSchemaView(context, information_schema_database, "columns");
|
||||
createInformationSchemaView(context, information_schema_database, "schemata", std::string_view(reinterpret_cast<const char *>(gresource_schemata_sqlData), gresource_schemata_sqlSize));
|
||||
createInformationSchemaView(context, information_schema_database, "tables", std::string_view(reinterpret_cast<const char *>(gresource_tables_sqlData), gresource_tables_sqlSize));
|
||||
createInformationSchemaView(context, information_schema_database, "views", std::string_view(reinterpret_cast<const char *>(gresource_views_sqlData), gresource_views_sqlSize));
|
||||
createInformationSchemaView(context, information_schema_database, "columns", std::string_view(reinterpret_cast<const char *>(gresource_columns_sqlData), gresource_columns_sqlSize));
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user