Merge pull request #16192 from filimonov/clickhouse-local-segfault

Add setTemporaryStorage to clickhouse-local to make OPTIMIZE work
Alexander Kuzmenkov 2020-10-22 20:47:28 +03:00 committed by GitHub
commit a1a7bc0217
6 changed files with 164 additions and 41 deletions
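
In short: clickhouse-local never configured a temporary volume, so OPTIMIZE (which triggers merges) crashed. The fix registers temporary storage and a flags path on the global context while the working path is initialized. The sketch below condenses the relevant part of LocalServer::tryInitPath() from the diff that follows; how the path option is read and the empty-path handling are assumptions here, not part of the diff.

/// Condensed sketch of the patched LocalServer::tryInitPath() (see the full diff below).
void LocalServer::tryInitPath()
{
    std::string path = config().getString("path", "");   // assumed: how the path option is obtained
    if (path.empty())
        return;                                          // assumed: nothing to set up without a path

    if (path.back() != '/')
        path += '/';

    global_context->setPath(path);
    global_context->setTemporaryStorage(path + "tmp");   // the fix: merges started by OPTIMIZE need a temporary volume
    global_context->setFlagsPath(path + "flags");
    global_context->setUserFilesPath("");                // user's files are everywhere
}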

programs/local/LocalServer.cpp

@@ -57,8 +57,8 @@ LocalServer::LocalServer() = default;
LocalServer::~LocalServer()
{
if (context)
context->shutdown(); /// required for proper exception handling
if (global_context)
global_context->shutdown(); /// required for proper exception handling
}
@@ -95,9 +95,9 @@ void LocalServer::initialize(Poco::Util::Application & self)
}
}
void LocalServer::applyCmdSettings()
void LocalServer::applyCmdSettings(Context & context)
{
context->applySettingsChanges(cmd_settings.changes());
context.applySettingsChanges(cmd_settings.changes());
}
/// If path is specified and not empty, will try to setup server environment and load existing metadata
@@ -151,8 +151,12 @@ void LocalServer::tryInitPath()
if (path.back() != '/')
path += '/';
context->setPath(path);
context->setUserFilesPath(""); // user's files are everywhere
global_context->setPath(path);
global_context->setTemporaryStorage(path + "tmp");
global_context->setFlagsPath(path + "flags");
global_context->setUserFilesPath(""); // user's files are everywhere
}
@@ -186,9 +190,9 @@ try
}
shared_context = Context::createShared();
context = std::make_unique<Context>(Context::createGlobal(shared_context.get()));
context->makeGlobalContext();
context->setApplicationType(Context::ApplicationType::LOCAL);
global_context = std::make_unique<Context>(Context::createGlobal(shared_context.get()));
global_context->makeGlobalContext();
global_context->setApplicationType(Context::ApplicationType::LOCAL);
tryInitPath();
std::optional<StatusFile> status;
@@ -210,32 +214,32 @@ try
/// Maybe useless
if (config().has("macros"))
context->setMacros(std::make_unique<Macros>(config(), "macros", log));
global_context->setMacros(std::make_unique<Macros>(config(), "macros", log));
/// Skip networking
/// Sets external authenticators config (LDAP).
context->setExternalAuthenticatorsConfig(config());
global_context->setExternalAuthenticatorsConfig(config());
setupUsers();
/// Limit on total number of concurrently executing queries.
/// There is no need for concurrent queries, override max_concurrent_queries.
context->getProcessList().setMaxSize(0);
global_context->getProcessList().setMaxSize(0);
/// Size of cache for uncompressed blocks. Zero means disabled.
size_t uncompressed_cache_size = config().getUInt64("uncompressed_cache_size", 0);
if (uncompressed_cache_size)
context->setUncompressedCache(uncompressed_cache_size);
global_context->setUncompressedCache(uncompressed_cache_size);
/// Size of cache for marks (index of MergeTree family of tables). It is necessary.
/// Specify default value for mark_cache_size explicitly!
size_t mark_cache_size = config().getUInt64("mark_cache_size", 5368709120);
if (mark_cache_size)
context->setMarkCache(mark_cache_size);
global_context->setMarkCache(mark_cache_size);
/// Load global settings from default_profile and system_profile.
context->setDefaultProfiles(config());
global_context->setDefaultProfiles(config());
/** Init dummy default DB
* NOTE: We force using isolated default database to avoid conflicts with default database from server environment
@@ -243,34 +247,34 @@ try
* if such tables will not be dropped, clickhouse-server will not be able to load them due to security reasons.
*/
std::string default_database = config().getString("default_database", "_local");
DatabaseCatalog::instance().attachDatabase(default_database, std::make_shared<DatabaseMemory>(default_database, *context));
context->setCurrentDatabase(default_database);
applyCmdOptions();
DatabaseCatalog::instance().attachDatabase(default_database, std::make_shared<DatabaseMemory>(default_database, *global_context));
global_context->setCurrentDatabase(default_database);
applyCmdOptions(*global_context);
String path = context->getPath();
String path = global_context->getPath();
if (!path.empty())
{
/// Lock path directory before read
status.emplace(context->getPath() + "status", StatusFile::write_full_info);
status.emplace(global_context->getPath() + "status", StatusFile::write_full_info);
LOG_DEBUG(log, "Loading metadata from {}", path);
Poco::File(path + "data/").createDirectories();
Poco::File(path + "metadata/").createDirectories();
loadMetadataSystem(*context);
attachSystemTables(*context);
loadMetadata(*context);
loadMetadataSystem(*global_context);
attachSystemTables(*global_context);
loadMetadata(*global_context);
DatabaseCatalog::instance().loadDatabases();
LOG_DEBUG(log, "Loaded metadata.");
}
else
{
attachSystemTables(*context);
attachSystemTables(*global_context);
}
processQueries();
context->shutdown();
context.reset();
global_context->shutdown();
global_context.reset();
status.reset();
cleanup();
@@ -323,7 +327,7 @@ void LocalServer::processQueries()
String initial_create_query = getInitialCreateTableQuery();
String queries_str = initial_create_query + config().getRawString("query");
const auto & settings = context->getSettingsRef();
const auto & settings = global_context->getSettingsRef();
std::vector<String> queries;
auto parse_res = splitMultipartQuery(queries_str, queries, settings.max_query_size, settings.max_parser_depth);
@@ -331,15 +335,19 @@ void LocalServer::processQueries()
if (!parse_res.second)
throw Exception("Cannot parse and execute the following part of query: " + String(parse_res.first), ErrorCodes::SYNTAX_ERROR);
context->makeSessionContext();
context->makeQueryContext();
/// We can't mutate the global context (it was already passed to some background threads, so mutating it can lead to races),
/// so we can't reuse it safely as a query context and need a copy here
auto context = Context(*global_context);
context->setUser("default", "", Poco::Net::SocketAddress{});
context->setCurrentQueryId("");
applyCmdSettings();
context.makeSessionContext();
context.makeQueryContext();
context.setUser("default", "", Poco::Net::SocketAddress{});
context.setCurrentQueryId("");
applyCmdSettings(context);
/// Use the same query_id (and thread group) for all queries
CurrentThread::QueryScope query_scope_holder(*context);
CurrentThread::QueryScope query_scope_holder(context);
bool echo_queries = config().hasOption("echo") || config().hasOption("verbose");
std::exception_ptr exception;
@@ -358,7 +366,7 @@ void LocalServer::processQueries()
try
{
executeQuery(read_buf, write_buf, /* allow_into_outfile = */ true, *context, {});
executeQuery(read_buf, write_buf, /* allow_into_outfile = */ true, context, {});
}
catch (...)
{
@@ -423,7 +431,7 @@ void LocalServer::setupUsers()
}
if (users_config)
context->setUsersConfig(users_config);
global_context->setUsersConfig(users_config);
else
throw Exception("Can't load config for users", ErrorCodes::CANNOT_LOAD_CONFIG);
}
@@ -577,10 +585,10 @@ void LocalServer::init(int argc, char ** argv)
argsToConfig(arguments, config(), 100);
}
void LocalServer::applyCmdOptions()
void LocalServer::applyCmdOptions(Context & context)
{
context->setDefaultFormat(config().getString("output-format", config().getString("format", "TSV")));
applyCmdSettings();
context.setDefaultFormat(config().getString("output-format", config().getString("format", "TSV")));
applyCmdSettings(context);
}
}
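
The other half of the change is that each query now runs against a copy of the global context rather than the global context itself: as the comment in the diff notes, the global context may already be referenced by background threads, so mutating it per query could race. A minimal sketch of the resulting pattern in processQueries(), using only calls that appear in the diff above:

/// Per-query context setup, condensed from the diff above.
auto context = Context(*global_context);   // copy; the shared global context is never mutated
context.makeSessionContext();
context.makeQueryContext();
context.setUser("default", "", Poco::Net::SocketAddress{});
context.setCurrentQueryId("");
applyCmdSettings(context);                 // command-line settings are applied to the copy

/// The same query_id (and thread group) is used for all queries.
CurrentThread::QueryScope query_scope_holder(context);

/// Each query is then executed against the copy:
/// executeQuery(read_buf, write_buf, /* allow_into_outfile = */ true, context, {});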

programs/local/LocalServer.h

@@ -36,15 +36,15 @@ private:
std::string getInitialCreateTableQuery();
void tryInitPath();
void applyCmdOptions();
void applyCmdSettings();
void applyCmdOptions(Context & context);
void applyCmdSettings(Context & context);
void processQueries();
void setupUsers();
void cleanup();
protected:
SharedContextHolder shared_context;
std::unique_ptr<Context> context;
std::unique_ptr<Context> global_context;
/// Settings specified via command line args
Settings cmd_settings;

tests/queries/0_stateless/01527_clickhouse_local_optimize.sh

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CURDIR"/../shell_config.sh
WORKING_FOLDER_01527="${CLICKHOUSE_TMP}/01527_clickhouse_local_optimize"
rm -rf "${WORKING_FOLDER_01527}"
mkdir -p "${WORKING_FOLDER_01527}"
# OPTIMIZE was crashing due to the lack of a temporary volume in clickhouse-local
${CLICKHOUSE_LOCAL} --query "drop database if exists d; create database d; create table d.t engine MergeTree order by a as select 1 a; optimize table d.t final" -- --path="${WORKING_FOLDER_01527}"
rm -rf "${WORKING_FOLDER_01527}"

tests/queries/0_stateless/01528_clickhouse_local_prepare_parts.reference

@@ -0,0 +1,19 @@
Option 1. Prepare parts from a table with Engine=File defined in metadata, read from an arbitrary path
1 2020-01-01 String
2 2020-02-02 Another string
3 2020-03-03 One more string
4 2020-01-02 String for first partition
Option 2. Prepare parts from a table with Engine=File defined in metadata, read from stdin (pipe)
11 2020-01-01 String
12 2020-02-02 Another string
13 2020-03-03 One more string
14 2020-01-02 String for first partition
Option 3. Prepare parts from a table with Engine=File defined via command line, read from stdin (pipe)
21 2020-01-01 String
22 2020-02-02 Another string
23 2020-03-03 One more string
24 2020-01-02 String for first partition
Possibility to run optimize on prepared parts before sending parts to server
202001 1
202002 1
202003 1

tests/queries/0_stateless/01528_clickhouse_local_prepare_parts.sh

@@ -0,0 +1,83 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CURDIR"/../shell_config.sh
WORKING_FOLDER_01528="${CLICKHOUSE_TMP}/01528_clickhouse_local_prepare_parts"
rm -rf "${WORKING_FOLDER_01528}"
mkdir -p "${WORKING_FOLDER_01528}/metadata/local"
## Checks the scenario of preparing parts offline with clickhouse-local.
## Below is the metadata for the table we want to fill;
## the schema should match the schema of the table on the server
## (the easiest way is just to copy it from the server).
cat <<EOF > "${WORKING_FOLDER_01528}/metadata/local/test.sql"
ATTACH TABLE local.test (id UInt64, d Date, s String) Engine=MergeTree ORDER BY id PARTITION BY toYYYYMM(d);
EOF
#################
echo "Option 1. Prepare parts from from table with Engine=File defined in metadata, read from an arbitrary path"
## Source file:
cat <<EOF > "${WORKING_FOLDER_01528}/data.csv"
1,2020-01-01,"String"
2,2020-02-02,"Another string"
3,2020-03-03,"One more string"
4,2020-01-02,"String for first partition"
EOF
## metadata written into file
cat <<EOF > "${WORKING_FOLDER_01528}/metadata/local/data_csv.sql"
ATTACH TABLE local.data_csv (id UInt64, d Date, s String) Engine=File(CSV, '${WORKING_FOLDER_01528}/data.csv');
EOF
## feed the table
${CLICKHOUSE_LOCAL} --query "INSERT INTO local.test SELECT * FROM local.data_csv;" -- --path="${WORKING_FOLDER_01528}"
## check the parts were created
${CLICKHOUSE_LOCAL} --query "SELECT * FROM local.test WHERE id < 10 ORDER BY id;" -- --path="${WORKING_FOLDER_01528}"
#################
echo "Option 2. Prepare parts from from table with Engine=File defined in metadata, read from stdin (pipe)"
cat <<EOF > "${WORKING_FOLDER_01528}/metadata/local/stdin.sql"
ATTACH TABLE local.stdin (id UInt64, d Date, s String) Engine=File(CSV, stdin);
EOF
cat <<EOF | ${CLICKHOUSE_LOCAL} --query "INSERT INTO local.test SELECT * FROM local.stdin;" -- --path="${WORKING_FOLDER_01528}"
11,2020-01-01,"String"
12,2020-02-02,"Another string"
13,2020-03-03,"One more string"
14,2020-01-02,"String for first partition"
EOF
${CLICKHOUSE_LOCAL} --query "SELECT * FROM local.test WHERE id BETWEEN 10 AND 19 ORDER BY id;" -- --path="${WORKING_FOLDER_01528}"
#################
echo "Option 3. Prepare parts from from table with Engine=File defined via command line, read from stdin (pipe)"
cat <<EOF | ${CLICKHOUSE_LOCAL} --query "INSERT INTO local.test SELECT * FROM table;" -S "id UInt64, d Date, s String" --input-format=CSV -- --path="${WORKING_FOLDER_01528}"
21,2020-01-01,"String"
22,2020-02-02,"Another string"
23,2020-03-03,"One more string"
24,2020-01-02,"String for first partition"
EOF
${CLICKHOUSE_LOCAL} --query "SELECT * FROM local.test WHERE id BETWEEN 20 AND 29 ORDER BY id;" -- --path="${WORKING_FOLDER_01528}"
#################
echo "Possibility to run optimize on prepared parts before sending parts to server"
${CLICKHOUSE_LOCAL} --query "OPTIMIZE TABLE local.test FINAL;" -- --path="${WORKING_FOLDER_01528}"
# ensure we have one part per partition
${CLICKHOUSE_LOCAL} --query "SELECT toYYYYMM(d) m, uniqExact(_part) FROM local.test GROUP BY m ORDER BY m" -- --path="${WORKING_FOLDER_01528}"
# cleanup
rm -rf "${WORKING_FOLDER_01528}"