Merge branch 'master' of github.com:yandex/ClickHouse

Alexey Milovidov 2019-02-25 17:36:48 +03:00
commit c26657ce85
44 changed files with 1521 additions and 180 deletions

View File

@ -1,3 +1,11 @@
## ClickHouse release 19.3.5, 2019-02-21
### Bug fixes:
* Fixed a bug in the server-side processing of long HTTP insert queries. [#4454](https://github.com/yandex/ClickHouse/pull/4454) ([alesapin](https://github.com/alesapin))
* Fixed backward incompatibility with older versions caused by an incorrect implementation of the `send_logs_level` setting. [#4445](https://github.com/yandex/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Fixed backward incompatibility of the `remote` table function introduced by the addition of column comments. [#4446](https://github.com/yandex/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov))
## ClickHouse release 19.3.4, 2019-02-16
### Improvements:
@ -125,6 +133,13 @@
* Reduced the wait time for server shutdown and for completion of `ALTER` queries. [#4372](https://github.com/yandex/ClickHouse/pull/4372) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Added information about the value of the `replicated_can_become_leader` setting to the `system.replicas` table. Added logging for the case when a replica is not going to become the leader. [#4379](https://github.com/yandex/ClickHouse/pull/4379) ([Alex Zatelepin](https://github.com/ztlpn))
## ClickHouse release 19.1.9, 2019-02-21
### Bug fixes:
* Fixed backward incompatibility with older versions caused by an incorrect implementation of the `send_logs_level` setting. [#4445](https://github.com/yandex/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Fixed backward incompatibility of the `remote` table function introduced by the addition of column comments. [#4446](https://github.com/yandex/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov))
## ClickHouse release 19.1.8, 2019-02-16
### Bug fixes:

View File

@ -21,7 +21,7 @@ SELECT UserID FROM {table} WHERE UserID = 12345678901234567890;
SELECT count() FROM {table} WHERE URL LIKE '%metrika%';
SELECT SearchPhrase, any(URL), count() AS c FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, any(URL), any(Title), count() AS c, uniq(UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT * FROM {table} PREWHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;

View File

@ -513,7 +513,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
{
/// DDL worker should be started after all tables were loaded
String ddl_zookeeper_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/");
global_context->setDDLWorker(std::make_shared<DDLWorker>(ddl_zookeeper_path, *global_context, &config(), "distributed_ddl"));
global_context->setDDLWorker(std::make_unique<DDLWorker>(ddl_zookeeper_path, *global_context, &config(), "distributed_ddl"));
}
std::unique_ptr<DNSCacheUpdater> dns_cache_updater;

View File

@ -294,12 +294,20 @@
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
</query_log>
<!-- Query thread log. Has information about all threads participated in query execution.
Used only for queries with setting log_query_threads = 1. -->
<query_thread_log>
<database>system</database>
<table>query_thread_log</table>
<partition_by>toYYYYMM(event_date)</partition_by>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
</query_thread_log>
<!-- Uncomment if use part_log
<!-- Uncomment if use part log.
Part log contains information about all actions with parts in MergeTree tables (creation, deletion, merges, downloads).
<part_log>
<database>system</database>
<table>part_log</table>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
</part_log>
-->

View File

@ -192,7 +192,8 @@ MongoDBDictionarySource::MongoDBDictionarySource(
{
# if POCO_VERSION >= 0x01070800
Poco::MongoDB::Database poco_db(db);
poco_db.authenticate(*connection, user, password, method.empty() ? Poco::MongoDB::Database::AUTH_SCRAM_SHA1 : method);
if (!poco_db.authenticate(*connection, user, password, method.empty() ? Poco::MongoDB::Database::AUTH_SCRAM_SHA1 : method))
throw Exception("Cannot authenticate in MongoDB, incorrect user or password", ErrorCodes::MONGODB_CANNOT_AUTHENTICATE);
# else
authenticate(*connection, db, user, password);
# endif

View File

@ -28,6 +28,8 @@ namespace ErrorCodes
class ArrayJoinedColumnsMatcher
{
public:
using Visitor = InDepthNodeVisitor<ArrayJoinedColumnsMatcher, true>;
struct Data
{
const Aliases & aliases;
@ -36,8 +38,6 @@ public:
NameToNameMap & array_join_result_to_source;
};
static constexpr const char * label = "ArrayJoinedColumns";
static bool needChildVisit(ASTPtr & node, const ASTPtr & child)
{
if (typeid_cast<ASTTablesInSelectQuery *>(node.get()))
@ -50,17 +50,16 @@ public:
return true;
}
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data)
static void visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<ASTIdentifier *>(ast.get()))
visit(*t, ast, data);
if (auto * t = typeid_cast<ASTSelectQuery *>(ast.get()))
return visit(*t, ast, data);
return {};
visit(*t, ast, data);
}
private:
static std::vector<ASTPtr *> visit(const ASTSelectQuery & node, ASTPtr &, Data & data)
static void visit(const ASTSelectQuery & node, ASTPtr &, Data & data)
{
ASTPtr array_join_expression_list = node.array_join_expression_list();
if (!array_join_expression_list)
@ -87,7 +86,8 @@ private:
out.emplace_back(&child2);
}
return out;
for (ASTPtr * add_node : out)
Visitor(data).visit(*add_node);
}
static void visit(const ASTIdentifier & node, ASTPtr &, Data & data)
@ -130,6 +130,6 @@ private:
}
};
using ArrayJoinedColumnsVisitor = InDepthNodeVisitor<ArrayJoinedColumnsMatcher, true>;
using ArrayJoinedColumnsVisitor = ArrayJoinedColumnsMatcher::Visitor;
}

View File

@ -41,6 +41,7 @@
#include <Interpreters/QueryThreadLog.h>
#include <Interpreters/PartLog.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Common/DNSResolver.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/UncompressedCache.h>
@ -141,7 +142,7 @@ struct ContextShared
std::optional<BackgroundSchedulePool> schedule_pool; /// A thread pool that can run different jobs in background (used in replicated tables)
MultiVersion<Macros> macros; /// Substitutions extracted from config.
std::optional<Compiler> compiler; /// Used for dynamic compilation of queries' parts if it necessary.
std::shared_ptr<DDLWorker> ddl_worker; /// Process ddl commands from zk.
std::unique_ptr<DDLWorker> ddl_worker; /// Process ddl commands from zk.
/// Rules for selecting the compression settings, depending on the size of the part.
mutable std::unique_ptr<CompressionCodecSelector> compression_codec_selector;
std::optional<MergeTreeSettings> merge_tree_settings; /// Settings of MergeTree* engines.
@ -274,6 +275,7 @@ struct ContextShared
external_models.reset();
background_pool.reset();
schedule_pool.reset();
ddl_worker.reset();
}
private:
@ -1360,12 +1362,12 @@ BackgroundSchedulePool & Context::getSchedulePool()
return *shared->schedule_pool;
}
void Context::setDDLWorker(std::shared_ptr<DDLWorker> ddl_worker)
void Context::setDDLWorker(std::unique_ptr<DDLWorker> ddl_worker)
{
auto lock = getLock();
if (shared->ddl_worker)
throw Exception("DDL background thread has already been initialized.", ErrorCodes::LOGICAL_ERROR);
shared->ddl_worker = ddl_worker;
shared->ddl_worker = std::move(ddl_worker);
}
DDLWorker & Context::getDDLWorker() const
@ -1578,7 +1580,7 @@ PartLog * Context::getPartLog(const String & part_database)
{
auto lock = getLock();
/// System logs are shutting down.
/// No part log or system logs are shutting down.
if (!shared->system_logs || !shared->system_logs->part_log)
return nullptr;

View File

@ -371,7 +371,7 @@ public:
BackgroundProcessingPool & getBackgroundPool();
BackgroundSchedulePool & getSchedulePool();
void setDDLWorker(std::shared_ptr<DDLWorker> ddl_worker);
void setDDLWorker(std::unique_ptr<DDLWorker> ddl_worker);
DDLWorker & getDDLWorker() const;
Clusters & getClusters() const;

View File

@ -181,11 +181,10 @@ static ASTPtr getCrossJoin(ASTSelectQuery & select, std::vector<DatabaseAndTable
}
std::vector<ASTPtr *> CrossToInnerJoinMatcher::visit(ASTPtr & ast, Data & data)
void CrossToInnerJoinMatcher::visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<ASTSelectQuery *>(ast.get()))
visit(*t, ast, data);
return {};
}
void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr & ast, Data & data)

View File

@ -16,10 +16,8 @@ public:
bool done = false;
};
static constexpr const char * label = "JoinToSubqueryTransform";
static bool needChildVisit(ASTPtr &, const ASTPtr &) { return true; }
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data);
static void visit(ASTPtr & ast, Data & data);
private:
static void visit(ASTSelectQuery & select, ASTPtr & ast, Data & data);

View File

@ -60,13 +60,12 @@ bool ExecuteScalarSubqueriesMatcher::needChildVisit(ASTPtr & node, const ASTPtr
return true;
}
std::vector<ASTPtr *> ExecuteScalarSubqueriesMatcher::visit(ASTPtr & ast, Data & data)
void ExecuteScalarSubqueriesMatcher::visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<ASTSubquery *>(ast.get()))
visit(*t, ast, data);
if (auto * t = typeid_cast<ASTFunction *>(ast.get()))
return visit(*t, ast, data);
return {};
visit(*t, ast, data);
}
void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr & ast, Data & data)
@ -134,7 +133,7 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr
}
}
std::vector<ASTPtr *> ExecuteScalarSubqueriesMatcher::visit(const ASTFunction & func, ASTPtr & ast, Data &)
void ExecuteScalarSubqueriesMatcher::visit(const ASTFunction & func, ASTPtr & ast, Data & data)
{
/// Don't descend into subqueries in arguments of IN operator.
    /// But if an argument is not a subquery, then deeper there may be scalar subqueries and we need to descend into them.
@ -156,7 +155,8 @@ std::vector<ASTPtr *> ExecuteScalarSubqueriesMatcher::visit(const ASTFunction &
for (auto & child : ast->children)
out.push_back(&child);
return out;
for (ASTPtr * add_node : out)
Visitor(data).visit(*add_node);
}
}

View File

@ -30,22 +30,22 @@ struct ASTTableExpression;
class ExecuteScalarSubqueriesMatcher
{
public:
using Visitor = InDepthNodeVisitor<ExecuteScalarSubqueriesMatcher, true>;
struct Data
{
const Context & context;
size_t subquery_depth;
};
static constexpr const char * label = "ExecuteScalarSubqueries";
static bool needChildVisit(ASTPtr & node, const ASTPtr &);
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data);
static void visit(ASTPtr & ast, Data & data);
private:
static void visit(const ASTSubquery & subquery, ASTPtr & ast, Data & data);
static std::vector<ASTPtr *> visit(const ASTFunction & func, ASTPtr & ast, Data & data);
static void visit(const ASTFunction & func, ASTPtr & ast, Data & data);
};
using ExecuteScalarSubqueriesVisitor = InDepthNodeVisitor<ExecuteScalarSubqueriesMatcher, true>;
using ExecuteScalarSubqueriesVisitor = ExecuteScalarSubqueriesMatcher::Visitor;
}

View File

@ -20,24 +20,20 @@ public:
Tables & external_tables;
};
static constexpr const char * label = "ExternalTables";
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data)
static void visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<ASTIdentifier *>(ast.get()))
return visit(*t, ast, data);
return {};
visit(*t, ast, data);
}
static bool needChildVisit(ASTPtr &, const ASTPtr &) { return true; }
private:
static std::vector<ASTPtr *> visit(const ASTIdentifier & node, ASTPtr &, Data & data)
static void visit(const ASTIdentifier & node, ASTPtr &, Data & data)
{
if (auto opt_name = IdentifierSemantic::getTableName(node))
if (StoragePtr external_storage = data.context.tryGetExternalTable(*opt_name))
data.external_tables[*opt_name] = external_storage;
return {};
}
};

View File

@ -138,15 +138,12 @@ public:
}
};
static constexpr const char * label = "GlobalSubqueries";
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data)
static void visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<ASTFunction *>(ast.get()))
visit(*t, ast, data);
if (auto * t = typeid_cast<ASTTablesInSelectQueryElement *>(ast.get()))
visit(*t, ast, data);
return {};
}
static bool needChildVisit(ASTPtr &, const ASTPtr & child)

View File

@ -1,5 +1,6 @@
#pragma once
#include <typeinfo>
#include <vector>
#include <Common/typeid_cast.h>
#include <Parsers/DumpASTNode.h>
@ -8,7 +9,7 @@ namespace DB
{
/// Visits AST tree in depth, call functions for nodes according to Matcher type data.
/// You need to define Data, label, visit() and needChildVisit() in Matcher class.
/// You need to define Data, visit() and needChildVisit() in Matcher class.
template <typename Matcher, bool _top_to_bottom>
class InDepthNodeVisitor
{
@ -23,17 +24,12 @@ public:
void visit(ASTPtr & ast)
{
DumpASTNode dump(*ast, ostr, visit_depth, Matcher::label);
DumpASTNode dump(*ast, ostr, visit_depth, typeid(Matcher).name());
if constexpr (!_top_to_bottom)
visitChildren(ast);
/// It operates with ASTPtr * cause we may want to rewrite ASTPtr in visit().
std::vector<ASTPtr *> additional_nodes = Matcher::visit(ast, data);
/// visit additional nodes (ex. only part of children)
for (ASTPtr * node : additional_nodes)
visit(*node);
Matcher::visit(ast, data);
if constexpr (_top_to_bottom)
visitChildren(ast);
@ -60,15 +56,12 @@ public:
using Data = _Data;
using TypeToVisit = typename Data::TypeToVisit;
static constexpr const char * label = "";
static bool needChildVisit(ASTPtr &, const ASTPtr &) { return _visit_children; }
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data)
static void visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<TypeToVisit *>(ast.get()))
data.visit(*t, ast);
return {};
}
};
@ -79,15 +72,12 @@ class LinkedMatcher
public:
using Data = std::pair<typename First::Data, typename Second::Data>;
static constexpr const char * label = "";
static bool needChildVisit(ASTPtr &, const ASTPtr &) { return true; }
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data)
static void visit(ASTPtr & ast, Data & data)
{
First::visit(ast, data.first);
Second::visit(ast, data.second);
return {};
}
};
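For illustration, here is a minimal sketch of a matcher written against the updated contract shown above: no `label` member, `visit()` returning `void` instead of a vector of extra nodes, and the `Visitor` alias declared inside the matcher itself. The matcher name and the collected data are hypothetical; only the interface shape follows the code in this commit.

#include <vector>
#include <Common/typeid_cast.h>
#include <Parsers/ASTIdentifier.h>
#include <Interpreters/InDepthNodeVisitor.h>

namespace DB
{

/// Hypothetical example matcher: collects alias-or-column names of all identifiers in a query.
class CollectIdentifiersMatcher
{
public:
    using Visitor = InDepthNodeVisitor<CollectIdentifiersMatcher, true>;

    struct Data
    {
        std::vector<String> & names;    /// filled while visiting
    };

    /// Descend into every child node.
    static bool needChildVisit(ASTPtr &, const ASTPtr &) { return true; }

    static void visit(ASTPtr & ast, Data & data)
    {
        if (auto * identifier = typeid_cast<ASTIdentifier *>(ast.get()))
            data.names.push_back(identifier->getAliasOrColumnName());
    }
};

using CollectIdentifiersVisitor = CollectIdentifiersMatcher::Visitor;

}

Running it over a parsed query would look like `CollectIdentifiersVisitor(data).visit(ast);`, the same `Visitor(data).visit(...)` call the refactored matchers use to visit additional nodes explicitly instead of returning them.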

View File

@ -79,8 +79,6 @@ struct ColumnAliasesMatcher
}
};
static constexpr const char * label = "ColumnAliases";
static bool needChildVisit(ASTPtr & node, const ASTPtr &)
{
if (typeid_cast<const ASTQualifiedAsterisk *>(node.get()))
@ -88,7 +86,7 @@ struct ColumnAliasesMatcher
return true;
}
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data)
static void visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<ASTIdentifier *>(ast.get()))
visit(*t, ast, data);
@ -96,7 +94,6 @@ struct ColumnAliasesMatcher
if (typeid_cast<ASTAsterisk *>(ast.get()) ||
typeid_cast<ASTQualifiedAsterisk *>(ast.get()))
throw Exception("Multiple JOIN do not support asterisks yet", ErrorCodes::NOT_IMPLEMENTED);
return {};
}
static void visit(ASTIdentifier & node, ASTPtr &, Data & data)
@ -225,11 +222,10 @@ using AppendSemanticVisitor = InDepthNodeVisitor<AppendSemanticMatcher, true>;
} /// anonymous namespace
std::vector<ASTPtr *> JoinToSubqueryTransformMatcher::visit(ASTPtr & ast, Data & data)
void JoinToSubqueryTransformMatcher::visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<ASTSelectQuery *>(ast.get()))
visit(*t, ast, data);
return {};
}
void JoinToSubqueryTransformMatcher::visit(ASTSelectQuery & select, ASTPtr &, Data & data)

View File

@ -18,10 +18,8 @@ public:
bool done = false;
};
static constexpr const char * label = "JoinToSubqueryTransform";
static bool needChildVisit(ASTPtr &, const ASTPtr &) { return true; }
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data);
static void visit(ASTPtr & ast, Data & data);
private:
/// - combines two source TablesInSelectQueryElement into resulting one (Subquery)

View File

@ -39,20 +39,19 @@ bool QueryAliasesMatcher::needChildVisit(ASTPtr & node, const ASTPtr &)
return true;
}
std::vector<ASTPtr *> QueryAliasesMatcher::visit(ASTPtr & ast, Data & data)
void QueryAliasesMatcher::visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<ASTSubquery *>(ast.get()))
return visit(*t, ast, data);
if (auto * t = typeid_cast<ASTArrayJoin *>(ast.get()))
return visit(*t, ast, data);
visitOther(ast, data);
return {};
if (auto * s = typeid_cast<ASTSubquery *>(ast.get()))
visit(*s, ast, data);
else if (auto * aj = typeid_cast<ASTArrayJoin *>(ast.get()))
visit(*aj, ast, data);
else
visitOther(ast, data);
}
/// The top-level aliases in the ARRAY JOIN section have a special meaning, we will not add them
/// (skip the expression list itself and its children).
std::vector<ASTPtr *> QueryAliasesMatcher::visit(const ASTArrayJoin &, const ASTPtr & ast, Data & data)
void QueryAliasesMatcher::visit(const ASTArrayJoin &, const ASTPtr & ast, Data & data)
{
visitOther(ast, data);
@ -64,14 +63,13 @@ std::vector<ASTPtr *> QueryAliasesMatcher::visit(const ASTArrayJoin &, const AST
/// create own visitor to run bottom to top
for (auto & child : grand_children)
QueryAliasesVisitor(data).visit(child);
return {};
Visitor(data).visit(child);
}
/// set unique aliases for all subqueries. this is needed, because:
/// 1) content of subqueries could change after recursive analysis, and auto-generated column names could become incorrect
/// 2) result of different scalar subqueries can be cached inside expressions compilation cache and must have different names
std::vector<ASTPtr *> QueryAliasesMatcher::visit(ASTSubquery & subquery, const ASTPtr & ast, Data & data)
void QueryAliasesMatcher::visit(ASTSubquery & subquery, const ASTPtr & ast, Data & data)
{
Aliases & aliases = data.aliases;
@ -92,7 +90,6 @@ std::vector<ASTPtr *> QueryAliasesMatcher::visit(ASTSubquery & subquery, const A
}
else
visitOther(ast, data);
return {};
}
void QueryAliasesMatcher::visitOther(const ASTPtr & ast, Data & data)

View File

@ -15,23 +15,23 @@ struct ASTArrayJoin;
class QueryAliasesMatcher
{
public:
using Visitor = InDepthNodeVisitor<QueryAliasesMatcher, false>;
struct Data
{
Aliases & aliases;
};
static constexpr const char * label = "QueryAliases";
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data);
static void visit(ASTPtr & ast, Data & data);
static bool needChildVisit(ASTPtr & node, const ASTPtr & child);
private:
static std::vector<ASTPtr *> visit(ASTSubquery & subquery, const ASTPtr & ast, Data & data);
static std::vector<ASTPtr *> visit(const ASTArrayJoin &, const ASTPtr & ast, Data & data);
static void visit(ASTSubquery & subquery, const ASTPtr & ast, Data & data);
static void visit(const ASTArrayJoin &, const ASTPtr & ast, Data & data);
static void visitOther(const ASTPtr & ast, Data & data);
};
/// Visits AST nodes and collect their aliases in one map (with links to source nodes).
using QueryAliasesVisitor = InDepthNodeVisitor<QueryAliasesMatcher, false>;
using QueryAliasesVisitor = QueryAliasesMatcher::Visitor;
}
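A usage sketch for the new `Visitor` alias (hypothetical surrounding code; `query` stands for an already-parsed `ASTPtr`):

Aliases aliases;
QueryAliasesMatcher::Data query_aliases_data{aliases};
QueryAliasesVisitor(query_aliases_data).visit(query);

This is the same `Visitor(data).visit(...)` pattern the matcher uses above to revisit the grandchildren of an ARRAY JOIN from bottom to top.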

View File

@ -62,20 +62,20 @@ bool RequiredSourceColumnsMatcher::needChildVisit(ASTPtr & node, const ASTPtr &
return true;
}
std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(ASTPtr & ast, Data & data)
void RequiredSourceColumnsMatcher::visit(ASTPtr & ast, Data & data)
{
/// results are columns
if (auto * t = typeid_cast<ASTIdentifier *>(ast.get()))
{
visit(*t, ast, data);
return {};
return;
}
if (auto * t = typeid_cast<ASTFunction *>(ast.get()))
{
data.addColumnAliasIfAny(*ast);
visit(*t, ast, data);
return {};
return;
}
/// results are tables
@ -83,24 +83,24 @@ std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(ASTPtr & ast, Data & d
if (auto * t = typeid_cast<ASTTablesInSelectQueryElement *>(ast.get()))
{
visit(*t, ast, data);
return {};
return;
}
if (auto * t = typeid_cast<ASTTableExpression *>(ast.get()))
{
//data.addTableAliasIfAny(*ast); alias is attached to child
visit(*t, ast, data);
return {};
return;
}
if (auto * t = typeid_cast<ASTSelectQuery *>(ast.get()))
{
data.addTableAliasIfAny(*ast);
return visit(*t, ast, data);
visit(*t, ast, data);
return;
}
if (typeid_cast<ASTSubquery *>(ast.get()))
{
data.addTableAliasIfAny(*ast);
return {};
return;
}
/// other
@ -108,13 +108,12 @@ std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(ASTPtr & ast, Data & d
if (auto * t = typeid_cast<ASTArrayJoin *>(ast.get()))
{
data.has_array_join = true;
return visit(*t, ast, data);
visit(*t, ast, data);
return;
}
return {};
}
std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(ASTSelectQuery & select, const ASTPtr &, Data & data)
void RequiredSourceColumnsMatcher::visit(ASTSelectQuery & select, const ASTPtr &, Data & data)
{
    /// special case for top-level SELECT items: they are public
for (auto & node : select.select_expression_list->children)
@ -132,7 +131,9 @@ std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(ASTSelectQuery & selec
/// revisit select_expression_list (with children) when all the aliases are set
out.push_back(&select.select_expression_list);
return out;
for (ASTPtr * add_node : out)
Visitor(data).visit(*add_node);
}
void RequiredSourceColumnsMatcher::visit(const ASTIdentifier & node, const ASTPtr &, Data & data)
@ -180,29 +181,20 @@ void RequiredSourceColumnsMatcher::visit(ASTTablesInSelectQueryElement & node, c
data.tables.emplace_back(ColumnNamesContext::JoinedTable{expr, join});
}
std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(ASTTableExpression & node, const ASTPtr &, Data & data)
/// ASTIdentifiers here are tables. Do not visit them as generic ones.
void RequiredSourceColumnsMatcher::visit(ASTTableExpression & node, const ASTPtr &, Data & data)
{
/// ASTIdentifiers here are tables. Do not visit them as generic ones.
if (node.database_and_table_name)
data.addTableAliasIfAny(*node.database_and_table_name);
std::vector<ASTPtr *> out;
if (node.table_function)
{
data.addTableAliasIfAny(*node.table_function);
out.push_back(&node.table_function);
}
if (node.subquery)
{
data.addTableAliasIfAny(*node.subquery);
out.push_back(&node.subquery);
}
return out;
}
std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(const ASTArrayJoin & node, const ASTPtr &, Data & data)
void RequiredSourceColumnsMatcher::visit(const ASTArrayJoin & node, const ASTPtr &, Data & data)
{
ASTPtr expression_list = node.expression_list;
if (!expression_list || expression_list->children.empty())
@ -224,7 +216,8 @@ std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(const ASTArrayJoin & n
out.push_back(&expr);
}
return out;
for (ASTPtr * add_node : out)
Visitor(data).visit(*add_node);
}
}

View File

@ -21,25 +21,24 @@ struct ASTTableExpression;
class RequiredSourceColumnsMatcher
{
public:
using Visitor = InDepthNodeVisitor<RequiredSourceColumnsMatcher, false>;
using Data = ColumnNamesContext;
static constexpr const char * label = "RequiredSourceColumns";
static bool needChildVisit(ASTPtr & node, const ASTPtr & child);
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data);
static void visit(ASTPtr & ast, Data & data);
private:
static void visit(const ASTIdentifier & node, const ASTPtr &, Data & data);
static void visit(const ASTFunction & node, const ASTPtr &, Data & data);
static void visit(ASTTablesInSelectQueryElement & node, const ASTPtr &, Data & data);
static std::vector<ASTPtr *> visit(ASTTableExpression & node, const ASTPtr &, Data & data);
static std::vector<ASTPtr *> visit(const ASTArrayJoin & node, const ASTPtr &, Data & data);
static std::vector<ASTPtr *> visit(ASTSelectQuery & select, const ASTPtr &, Data & data);
static void visit(ASTTableExpression & node, const ASTPtr &, Data & data);
static void visit(const ASTArrayJoin & node, const ASTPtr &, Data & data);
static void visit(ASTSelectQuery & select, const ASTPtr &, Data & data);
};
/// Extracts all the information about columns and tables from ASTSelectQuery block into ColumnNamesContext object.
/// It doesn't use anything but AST. It visits nodes from bottom to top except ASTFunction content to get aliases in right manner.
/// @note There's some ambiguousness with nested columns names that can't be solved without schema.
using RequiredSourceColumnsVisitor = InDepthNodeVisitor<RequiredSourceColumnsMatcher, false>;
using RequiredSourceColumnsVisitor = RequiredSourceColumnsMatcher::Visitor;
}
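A hypothetical invocation sketch (assuming `ColumnNamesContext` is default-constructible and `query` is a parsed `ASTPtr`; `Data` is the `ColumnNamesContext` aliased above):

RequiredSourceColumnsMatcher::Data columns_context;
RequiredSourceColumnsVisitor(columns_context).visit(query);
/// columns_context now describes the columns and joined tables required by the query.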

View File

@ -9,11 +9,41 @@
namespace DB
{
namespace
{
constexpr size_t DEFAULT_SYSTEM_LOG_FLUSH_INTERVAL_MILLISECONDS = 7500;
/// Creates a system log with MergeTree engine using parameters from config
template <typename TSystemLog>
std::unique_ptr<TSystemLog> createSystemLog(
Context & context,
const String & default_database_name,
const String & default_table_name,
const Poco::Util::AbstractConfiguration & config,
const String & config_prefix)
{
if (!config.has(config_prefix))
return {};
String database = config.getString(config_prefix + ".database", default_database_name);
String table = config.getString(config_prefix + ".table", default_table_name);
String partition_by = config.getString(config_prefix + ".partition_by", "toYYYYMM(event_date)");
String engine = "ENGINE = MergeTree PARTITION BY (" + partition_by + ") ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024";
size_t flush_interval_milliseconds = config.getUInt64(config_prefix + ".flush_interval_milliseconds", DEFAULT_SYSTEM_LOG_FLUSH_INTERVAL_MILLISECONDS);
return std::make_unique<TSystemLog>(context, database, table, engine, flush_interval_milliseconds);
}
}
SystemLogs::SystemLogs(Context & global_context, const Poco::Util::AbstractConfiguration & config)
{
query_log = createDefaultSystemLog<QueryLog>(global_context, "system", "query_log", config, "query_log");
query_thread_log = createDefaultSystemLog<QueryThreadLog>(global_context, "system", "query_thread_log", config, "query_thread_log");
part_log = createDefaultSystemLog<PartLog>(global_context, "system", "part_log", config, "part_log");
query_log = createSystemLog<QueryLog>(global_context, "system", "query_log", config, "query_log");
query_thread_log = createSystemLog<QueryThreadLog>(global_context, "system", "query_thread_log", config, "query_thread_log");
part_log = createSystemLog<PartLog>(global_context, "system", "part_log", config, "part_log");
part_log_database = config.getString("part_log.database", "system");
}

View File

@ -378,26 +378,4 @@ void SystemLog<LogElement>::prepareTable()
is_prepared = true;
}
/// Creates a system log with MergeTree engine using parameters from config
template<typename TSystemLog>
std::unique_ptr<TSystemLog> createDefaultSystemLog(
Context & context,
const String & default_database_name,
const String & default_table_name,
const Poco::Util::AbstractConfiguration & config,
const String & config_prefix)
{
static constexpr size_t DEFAULT_SYSTEM_LOG_FLUSH_INTERVAL_MILLISECONDS = 7500;
String database = config.getString(config_prefix + ".database", default_database_name);
String table = config.getString(config_prefix + ".table", default_table_name);
String partition_by = config.getString(config_prefix + ".partition_by", "toYYYYMM(event_date)");
String engine = "ENGINE = MergeTree PARTITION BY (" + partition_by + ") ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024";
size_t flush_interval_milliseconds = config.getUInt64(config_prefix + ".flush_interval_milliseconds", DEFAULT_SYSTEM_LOG_FLUSH_INTERVAL_MILLISECONDS);
return std::make_unique<TSystemLog>(context, database, table, engine, flush_interval_milliseconds);
}
}

View File

@ -44,22 +44,21 @@ bool TranslateQualifiedNamesMatcher::needChildVisit(ASTPtr & node, const ASTPtr
return true;
}
std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(ASTPtr & ast, Data & data)
void TranslateQualifiedNamesMatcher::visit(ASTPtr & ast, Data & data)
{
if (auto * t = typeid_cast<ASTIdentifier *>(ast.get()))
return visit(*t, ast, data);
visit(*t, ast, data);
if (auto * t = typeid_cast<ASTTableJoin *>(ast.get()))
return visit(*t, ast, data);
visit(*t, ast, data);
if (auto * t = typeid_cast<ASTSelectQuery *>(ast.get()))
return visit(*t, ast, data);
visit(*t, ast, data);
if (auto * node = typeid_cast<ASTExpressionList *>(ast.get()))
visit(*node, ast, data);
if (auto * node = typeid_cast<ASTFunction *>(ast.get()))
visit(*node, ast, data);
return {};
}
std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(ASTIdentifier & identifier, ASTPtr &, Data & data)
void TranslateQualifiedNamesMatcher::visit(ASTIdentifier & identifier, ASTPtr &, Data & data)
{
if (IdentifierSemantic::getColumnName(identifier))
{
@ -82,8 +81,6 @@ std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(ASTIdentifier & iden
if (!data.tables.empty())
IdentifierSemantic::setColumnNormalName(identifier, data.tables[best_table_pos].first);
}
return {};
}
/// As special case, treat count(*) as count(), not as count(list of all columns).
@ -98,7 +95,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTFunction & node, const ASTPtr &, D
func_arguments->children.clear();
}
std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAsterisk & , const ASTPtr & ast, Data & data)
void TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAsterisk & , const ASTPtr & ast, Data & data)
{
if (ast->children.size() != 1)
throw Exception("Logical error: qualified asterisk must have exactly one child", ErrorCodes::LOGICAL_ERROR);
@ -110,22 +107,24 @@ std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAs
for (const auto & known_table : data.tables)
if (db_and_table.satisfies(known_table.first, true))
return {};
return;
throw Exception("Unknown qualified identifier: " + ident->getAliasOrColumnName(), ErrorCodes::UNKNOWN_IDENTIFIER);
}
std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(ASTTableJoin & join, const ASTPtr & , Data &)
void TranslateQualifiedNamesMatcher::visit(ASTTableJoin & join, const ASTPtr & , Data & data)
{
std::vector<ASTPtr *> out;
if (join.using_expression_list)
out.push_back(&join.using_expression_list);
else if (join.on_expression)
out.push_back(&join.on_expression);
return out;
for (ASTPtr * add_node : out)
Visitor(data).visit(*add_node);
}
std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(ASTSelectQuery & select, const ASTPtr & , Data & data)
void TranslateQualifiedNamesMatcher::visit(ASTSelectQuery & select, const ASTPtr & , Data & data)
{
if (auto join = select.join())
extractJoinUsingColumns(join->table_join, data);
@ -139,7 +138,9 @@ std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(ASTSelectQuery & sel
out.push_back(&select.where_expression);
if (select.having_expression)
out.push_back(&select.having_expression);
return out;
for (ASTPtr * add_node : out)
Visitor(data).visit(*add_node);
}
/// qualified names for duplicates

View File

@ -20,6 +20,8 @@ class ASTFunction;
class TranslateQualifiedNamesMatcher
{
public:
using Visitor = InDepthNodeVisitor<TranslateQualifiedNamesMatcher, true>;
struct Data
{
const NameSet & source_columns;
@ -46,16 +48,14 @@ public:
bool processAsterisks() const { return !tables.empty() && has_columns; }
};
static constexpr const char * label = "TranslateQualifiedNames";
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data);
static void visit(ASTPtr & ast, Data & data);
static bool needChildVisit(ASTPtr & node, const ASTPtr & child);
private:
static std::vector<ASTPtr *> visit(ASTIdentifier & node, ASTPtr & ast, Data &);
static std::vector<ASTPtr *> visit(const ASTQualifiedAsterisk & node, const ASTPtr & ast, Data &);
static std::vector<ASTPtr *> visit(ASTTableJoin & node, const ASTPtr & ast, Data &);
static std::vector<ASTPtr *> visit(ASTSelectQuery & node, const ASTPtr & ast, Data &);
static void visit(ASTIdentifier & node, ASTPtr & ast, Data &);
static void visit(const ASTQualifiedAsterisk & node, const ASTPtr & ast, Data &);
static void visit(ASTTableJoin & node, const ASTPtr & ast, Data &);
static void visit(ASTSelectQuery & node, const ASTPtr & ast, Data &);
static void visit(ASTExpressionList &, const ASTPtr &, Data &);
static void visit(ASTFunction &, const ASTPtr &, Data &);
@ -64,6 +64,6 @@ private:
/// Visits AST for names qualification.
/// It finds columns and translate their names to the normal form. Expand asterisks and qualified asterisks with column names.
using TranslateQualifiedNamesVisitor = InDepthNodeVisitor<TranslateQualifiedNamesMatcher, true>;
using TranslateQualifiedNamesVisitor = TranslateQualifiedNamesMatcher::Visitor;
}

View File

@ -14,7 +14,7 @@ Don't use Docker from your system repository.
* [pip](https://pypi.python.org/pypi/pip). To install: `sudo apt-get install python-pip`
* [py.test](https://docs.pytest.org/) testing framework. To install: `sudo -H pip install pytest`
* [docker-compose](https://docs.docker.com/compose/) and additional python libraries. To install: `sudo -H pip install docker-compose docker dicttoxml kazoo PyMySQL psycopg2`
* [docker-compose](https://docs.docker.com/compose/) and additional python libraries. To install: `sudo -H pip install docker-compose docker dicttoxml kazoo PyMySQL psycopg2 pymongo tzlocal`
(highly not recommended) If you really want to use OS packages on modern debian/ubuntu instead of "pip": `sudo apt install -y docker docker-compose python-pytest python-dicttoxml python-docker python-pymysql python-kazoo python-psycopg2`

View File

@ -15,6 +15,8 @@ from kazoo.client import KazooClient
from kazoo.exceptions import KazooException
import psycopg2
import requests
import base64
import pymongo
import docker
from docker.errors import ContainerError
@ -98,6 +100,7 @@ class ClickHouseCluster:
self.with_kafka = False
self.with_odbc_drivers = False
self.with_hdfs = False
self.with_mongo = False
self.docker_client = None
self.is_up = False
@ -109,7 +112,7 @@ class ClickHouseCluster:
cmd += " client"
return cmd
def add_instance(self, name, config_dir=None, main_configs=[], user_configs=[], macros={}, with_zookeeper=False, with_mysql=False, with_kafka=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_hdfs=False, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", stay_alive=False, ipv4_address=None, ipv6_address=None):
def add_instance(self, name, config_dir=None, main_configs=[], user_configs=[], macros={}, with_zookeeper=False, with_mysql=False, with_kafka=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_hdfs=False, with_mongo=False, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", stay_alive=False, ipv4_address=None, ipv6_address=None):
"""Add an instance to the cluster.
name - the name of the instance directory and the value of the 'instance' macro in ClickHouse.
@ -127,7 +130,7 @@ class ClickHouseCluster:
instance = ClickHouseInstance(
self, self.base_dir, name, config_dir, main_configs, user_configs, macros, with_zookeeper,
self.zookeeper_config_path, with_mysql, with_kafka, self.base_configs_dir, self.server_bin_path,
self.zookeeper_config_path, with_mysql, with_kafka, with_mongo, self.base_configs_dir, self.server_bin_path,
self.odbc_bridge_bin_path, clickhouse_path_dir, with_odbc_drivers, hostname=hostname,
env_variables=env_variables, image=image, stay_alive=stay_alive, ipv4_address=ipv4_address, ipv6_address=ipv6_address)
@ -176,6 +179,11 @@ class ClickHouseCluster:
self.base_hdfs_cmd = ['docker-compose', '--project-directory', self.base_dir, '--project-name',
self.project_name, '--file', p.join(HELPERS_DIR, 'docker_compose_hdfs.yml')]
if with_mongo and not self.with_mongo:
self.with_mongo = True
self.base_cmd.extend(['--file', p.join(HELPERS_DIR, 'docker_compose_mongo.yml')])
self.base_mongo_cmd = ['docker-compose', '--project-directory', self.base_dir, '--project-name',
self.project_name, '--file', p.join(HELPERS_DIR, 'docker_compose_mongo.yml')]
return instance
@ -248,6 +256,20 @@ class ClickHouseCluster:
raise Exception("Can't wait HDFS to start")
def wait_mongo_to_start(self, timeout=30):
connection_str = 'mongodb://{user}:{password}@{host}:{port}'.format(
host='localhost', port='27018', user='root', password='clickhouse')
connection = pymongo.MongoClient(connection_str)
start = time.time()
while time.time() - start < timeout:
try:
connection.database_names()
print "Connected to Mongo dbs:", connection.database_names()
return
except Exception as ex:
print "Can't connect to Mongo " + str(ex)
time.sleep(1)
def start(self, destroy_dirs=True):
if self.is_up:
return
@ -290,6 +312,10 @@ class ClickHouseCluster:
subprocess_check_call(self.base_hdfs_cmd + ['up', '-d', '--force-recreate'])
self.wait_hdfs_to_start(120)
if self.with_mongo and self.base_mongo_cmd:
subprocess_check_call(self.base_mongo_cmd + ['up', '-d', '--force-recreate'])
self.wait_mongo_to_start(30)
subprocess_check_call(self.base_cmd + ['up', '-d', '--no-recreate'])
start_deadline = time.time() + 20.0 # seconds
@ -388,7 +414,7 @@ class ClickHouseInstance:
def __init__(
self, cluster, base_path, name, custom_config_dir, custom_main_configs, custom_user_configs, macros,
with_zookeeper, zookeeper_config_path, with_mysql, with_kafka, base_configs_dir, server_bin_path, odbc_bridge_bin_path,
with_zookeeper, zookeeper_config_path, with_mysql, with_kafka, with_mongo, base_configs_dir, server_bin_path, odbc_bridge_bin_path,
clickhouse_path_dir, with_odbc_drivers, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test",
stay_alive=False, ipv4_address=None, ipv6_address=None):
@ -412,6 +438,7 @@ class ClickHouseInstance:
self.with_mysql = with_mysql
self.with_kafka = with_kafka
self.with_mongo = with_mongo
self.path = p.join(self.cluster.instances_dir, name)
self.docker_compose_path = p.join(self.path, 'docker_compose.yml')
@ -456,10 +483,10 @@ class ClickHouseInstance:
return self.client.get_query_request(*args, **kwargs)
def exec_in_container(self, cmd, **kwargs):
def exec_in_container(self, cmd, detach=False, **kwargs):
container = self.get_docker_handle()
exec_id = self.docker_client.api.exec_create(container.id, cmd, **kwargs)
output = self.docker_client.api.exec_start(exec_id, detach=False)
output = self.docker_client.api.exec_start(exec_id, detach=detach)
output = output.decode('utf8')
exit_code = self.docker_client.api.exec_inspect(exec_id)['ExitCode']
@ -467,6 +494,13 @@ class ClickHouseInstance:
raise Exception('Cmd "{}" failed! Return code {}. Output: {}'.format(' '.join(cmd), exit_code, output))
return output
def copy_file_to_container(self, local_path, dest_path):
with open(local_path, 'r') as fdata:
data = fdata.read()
encoded_data = base64.b64encode(data)
self.exec_in_container(["bash", "-c", "echo {} | base64 --decode > {}".format(encoded_data, dest_path)])
def get_docker_handle(self):
return self.docker_client.containers.get(self.docker_id)

View File

@ -0,0 +1,10 @@
version: '2.2'
services:
mongo1:
image: mongo:3.6
restart: always
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: clickhouse
ports:
- 27018:27017

View File

@ -24,7 +24,7 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes -
ENV TZ=Europe/Moscow
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN pip install pytest docker-compose==1.22.0 docker dicttoxml kazoo PyMySQL psycopg2
RUN pip install pytest docker-compose==1.22.0 docker dicttoxml kazoo PyMySQL psycopg2 pymongo tzlocal
ENV DOCKER_CHANNEL stable
ENV DOCKER_VERSION 17.09.1-ce
@ -61,4 +61,4 @@ RUN set -x \
VOLUME /var/lib/docker
EXPOSE 2375
ENTRYPOINT ["dockerd-entrypoint.sh"]
CMD []
CMD []

View File

@ -0,0 +1,30 @@
<?xml version="1.0"?>
<yandex>
<logger>
<level>trace</level>
<log>/var/log/clickhouse-server/clickhouse-server.log</log>
<errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
<size>1000M</size>
<count>10</count>
</logger>
<tcp_port>9000</tcp_port>
<listen_host>127.0.0.1</listen_host>
<openSSL>
<client>
<cacheSessions>true</cacheSessions>
<verificationMode>none</verificationMode>
<invalidCertificateHandler>
<name>AcceptCertificateHandler</name>
</invalidCertificateHandler>
</client>
</openSSL>
<max_concurrent_queries>500</max_concurrent_queries>
<mark_cache_size>5368709120</mark_cache_size>
<path>./clickhouse/</path>
<users_config>users.xml</users_config>
<dictionaries_config>/etc/clickhouse-server/config.d/*.xml</dictionaries_config>
</yandex>

View File

@ -0,0 +1,23 @@
<?xml version="1.0"?>
<yandex>
<profiles>
<default>
</default>
</profiles>
<users>
<default>
<password></password>
<networks incl="networks" replace="replace">
<ip>::/0</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
</default>
</users>
<quotas>
<default>
</default>
</quotas>
</yandex>

View File

@ -0,0 +1,337 @@
#-*- coding: utf-8 -*-
import copy
class Layout(object):
LAYOUTS_STR_DICT = {
'flat': '<flat/>',
'hashed': '<hashed/>',
'cache': '<cache><size_in_cells>128</size_in_cells></cache>',
'complex_key_hashed': '<complex_key_hashed/>',
'complex_key_cache': '<complex_key_cache><size_in_cells>128</size_in_cells></complex_key_cache>',
'range_hashed': '<range_hashed/>'
}
def __init__(self, name):
self.name = name
self.is_complex = False
self.is_simple = False
self.is_ranged = False
if self.name.startswith('complex'):
self.layout_type = "complex"
self.is_complex = True
elif name.startswith("range"):
self.layout_type = "ranged"
self.is_ranged = True
else:
self.layout_type = "simple"
self.is_simple = True
def get_str(self):
return self.LAYOUTS_STR_DICT[self.name]
def get_key_block_name(self):
if self.is_complex:
return 'key'
else:
return 'id'
class Row(object):
def __init__(self, fields, values):
self.data = {}
for field, value in zip(fields, values):
self.data[field.name] = value
def get_value_by_name(self, name):
return self.data[name]
class Field(object):
def __init__(self, name, field_type, is_key=False, is_range_key=False, default=None, hierarchical=False, range_hash_type=None, default_value_for_get=None):
self.name = name
self.field_type = field_type
self.is_key = is_key
self.default = default
self.hierarchical = hierarchical
self.range_hash_type = range_hash_type
self.is_range = self.range_hash_type is not None
self.is_range_key = is_range_key
self.default_value_for_get = default_value_for_get
def get_attribute_str(self):
return '''
<attribute>
<name>{name}</name>
<type>{field_type}</type>
<null_value>{default}</null_value>
<hierarchical>{hierarchical}</hierarchical>
</attribute>'''.format(
name=self.name,
field_type=self.field_type,
default=self.default if self.default else '',
hierarchical='true' if self.hierarchical else 'false',
)
def get_simple_index_str(self):
return '<name>{name}</name>'.format(name=self.name)
def get_range_hash_str(self):
if not self.range_hash_type:
raise Exception("Field {} is not range hashed".format(self.name))
return '''
<range_{type}>
<name>{name}</name>
</range_{type}>
'''.format(type=self.range_hash_type, name=self.name)
class DictionaryStructure(object):
def __init__(self, layout, fields):
self.layout = layout
self.keys = []
self.range_key = None
self.ordinary_fields = []
self.range_fields = []
for field in fields:
if field.is_key:
self.keys.append(field)
elif field.is_range:
self.range_fields.append(field)
else:
self.ordinary_fields.append(field)
if field.is_range_key:
if self.range_key is not None:
raise Exception("Duplicate range key {}".format(field.name))
self.range_key = field
if not self.layout.is_complex and len(self.keys) > 1:
raise Exception("More than one key {} field in non complex layout {}".format(len(self.keys), self.layout.name))
if self.layout.is_ranged and (not self.range_key or len(self.range_fields) != 2):
raise Exception("Inconsistent configuration of ranged dictionary")
def get_structure_str(self):
fields_strs = []
for field in self.ordinary_fields:
fields_strs.append(field.get_attribute_str())
key_strs = []
if self.layout.is_complex:
for key_field in self.keys:
key_strs.append(key_field.get_attribute_str())
else: # same for simple and ranged
for key_field in self.keys:
key_strs.append(key_field.get_simple_index_str())
ranged_strs = []
if self.layout.is_ranged:
for range_field in self.range_fields:
ranged_strs.append(range_field.get_range_hash_str())
return '''
<layout>
{layout_str}
</layout>
<structure>
<{key_block_name}>
{key_str}
</{key_block_name}>
{range_strs}
{attributes_str}
</structure>'''.format(
layout_str=self.layout.get_str(),
key_block_name=self.layout.get_key_block_name(),
key_str='\n'.join(key_strs),
attributes_str='\n'.join(fields_strs),
range_strs='\n'.join(ranged_strs),
)
def get_ordered_names(self):
fields_strs = []
for key_field in self.keys:
fields_strs.append(key_field.name)
for range_field in self.range_fields:
fields_strs.append(range_field.name)
for field in self.ordinary_fields:
fields_strs.append(field.name)
return fields_strs
def get_all_fields(self):
return self.keys + self.range_fields + self.ordinary_fields
def _get_dict_get_common_expression(self, dict_name, field, row, or_default, with_type, has):
if field in self.keys:
raise Exception("Trying to receive key field {} from dictionary".format(field.name))
if not self.layout.is_complex:
if not or_default:
key_expr = ', toUInt64({})'.format(row.data[self.keys[0].name])
else:
key_expr = ', toUInt64({})'.format(self.keys[0].default_value_for_get)
else:
key_exprs_strs = []
for key in self.keys:
if not or_default:
val = row.data[key.name]
else:
val = key.default_value_for_get
if isinstance(val, str):
val = "'" + val + "'"
key_exprs_strs.append('to{type}({value})'.format(type=key.field_type, value=val))
key_expr = ', (' + ','.join(key_exprs_strs) + ')'
date_expr = ''
if self.layout.is_ranged:
val = row.data[self.range_key.name]
if isinstance(val, str):
val = "'" + val + "'"
val = "to{type}({val})".format(type=self.range_key.field_type, val=val)
date_expr = ', ' + val
if or_default:
raise Exception("Can create 'dictGetOrDefault' query for ranged dictionary")
if or_default:
or_default_expr = 'OrDefault'
if field.default_value_for_get is None:
raise Exception("Can create 'dictGetOrDefault' query for field {} without default_value_for_get".format(field.name))
val = field.default_value_for_get
if isinstance(val, str):
val = "'" + val + "'"
default_value_for_get = ', to{type}({value})'.format(type=field.field_type, value=val)
else:
or_default_expr = ''
default_value_for_get = ''
if with_type:
field_type = field.field_type
else:
field_type = ''
field_name = ", '" + field.name + "'"
if has:
what = "Has"
field_type = ''
or_default = ''
field_name = ''
date_expr = ''
def_for_get = ''
else:
what = "Get"
return "dict{what}{field_type}{or_default}('{dict_name}'{field_name}{key_expr}{date_expr}{def_for_get})".format(
what=what,
field_type=field_type,
dict_name=dict_name,
field_name=field_name,
key_expr=key_expr,
date_expr=date_expr,
or_default=or_default_expr,
def_for_get=default_value_for_get,
)
def get_get_expressions(self, dict_name, field, row):
return [
self._get_dict_get_common_expression(dict_name, field, row, or_default=False, with_type=False, has=False),
self._get_dict_get_common_expression(dict_name, field, row, or_default=False, with_type=True, has=False),
]
def get_get_or_default_expressions(self, dict_name, field, row):
if not self.layout.is_ranged:
return [
self._get_dict_get_common_expression(dict_name, field, row, or_default=True, with_type=False, has=False),
self._get_dict_get_common_expression(dict_name, field, row, or_default=True, with_type=True, has=False),
]
return []
def get_has_expressions(self, dict_name, field, row):
if not self.layout.is_ranged:
return [self._get_dict_get_common_expression(dict_name, field, row, or_default=False, with_type=False, has=True)]
return []
def get_hierarchical_expressions(self, dict_name, row):
if self.layout.is_simple:
key_expr = 'toUInt64({})'.format(row.data[self.keys[0].name])
return [
"dictGetHierarchy('{dict_name}', {key})".format(
dict_name=dict_name,
key=key_expr,
),
]
return []
def get_is_in_expressions(self, dict_name, row, parent_row):
if self.layout.is_simple:
child_key_expr = 'toUInt64({})'.format(row.data[self.keys[0].name])
parent_key_expr = 'toUInt64({})'.format(parent_row.data[self.keys[0].name])
return [
"dictIsIn('{dict_name}', {child_key}, {parent_key})".format(
dict_name=dict_name,
child_key=child_key_expr,
parent_key=parent_key_expr,)
]
return []
class Dictionary(object):
def __init__(self, name, structure, source, config_path, table_name):
self.name = name
self.structure = copy.deepcopy(structure)
self.source = copy.deepcopy(source)
self.config_path = config_path
self.table_name = table_name
def generate_config(self):
with open(self.config_path, 'w') as result:
result.write('''
<dictionaries>
<dictionary>
<lifetime>
<min>3</min>
<max>5</max>
</lifetime>
<name>{name}</name>
{structure}
<source>
{source}
</source>
</dictionary>
</dictionaries>
'''.format(
name=self.name,
structure=self.structure.get_structure_str(),
source=self.source.get_source_str(self.table_name),
))
def prepare_source(self, cluster):
self.source.prepare(self.structure, self.table_name, cluster)
def load_data(self, data):
if not self.source.prepared:
raise Exception("Cannot load data for dictionary {}, source is not prepared".format(self.name))
self.source.load_data(data, self.table_name)
def get_select_get_queries(self, field, row):
return ['select {}'.format(expr) for expr in self.structure.get_get_expressions(self.name, field, row)]
def get_select_get_or_default_queries(self, field, row):
return ['select {}'.format(expr) for expr in self.structure.get_get_or_default_expressions(self.name, field, row)]
def get_select_has_queries(self, field, row):
return ['select {}'.format(expr) for expr in self.structure.get_has_expressions(self.name, field, row)]
def get_hierarchical_queries(self, row):
return ['select {}'.format(expr) for expr in self.structure.get_hierarchical_expressions(self.name, row)]
def get_is_in_queries(self, row, parent_row):
return ['select {}'.format(expr) for expr in self.structure.get_is_in_expressions(self.name, row, parent_row)]
def is_complex(self):
return self.structure.layout.is_complex

View File

@ -0,0 +1,374 @@
# -*- coding: utf-8 -*-
import warnings
import pymysql.cursors
import pymongo
from tzlocal import get_localzone
import datetime
import os
class ExternalSource(object):
def __init__(self, name, internal_hostname, internal_port,
docker_hostname, docker_port, user, password):
self.name = name
self.internal_hostname = internal_hostname
self.internal_port = int(internal_port)
self.docker_hostname = docker_hostname
self.docker_port = int(docker_port)
self.user = user
self.password = password
def get_source_str(self, table_name):
raise NotImplementedError("Method {} is not implemented for {}".format(
"get_source_config_part", self.__class__.__name__))
def prepare(self, structure, table_name, cluster):
raise NotImplementedError("Method {} is not implemented for {}".format(
"prepare_remote_source", self.__class__.__name__))
    # data is a bunch of Row objects
def load_data(self, data):
raise NotImplementedError("Method {} is not implemented for {}".format(
"prepare_remote_source", self.__class__.__name__))
def compatible_with_layout(self, layout):
return True
class SourceMySQL(ExternalSource):
TYPE_MAPPING = {
'UInt8': 'tinyint unsigned',
'UInt16': 'smallint unsigned',
'UInt32': 'int unsigned',
'UInt64': 'bigint unsigned',
'Int8': 'tinyint',
'Int16': 'smallint',
'Int32': 'int',
'Int64': 'bigint',
'UUID': 'varchar(36)',
'Date': 'date',
'DateTime': 'datetime',
'String': 'text',
'Float32': 'float',
'Float64': 'double'
}
def create_mysql_conn(self):
self.connection = pymysql.connect(
user=self.user,
password=self.password,
host=self.internal_hostname,
port=self.internal_port)
def execute_mysql_query(self, query):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
with self.connection.cursor() as cursor:
cursor.execute(query)
self.connection.commit()
def get_source_str(self, table_name):
return '''
<mysql>
<replica>
<priority>1</priority>
<host>127.0.0.1</host>
<port>3333</port> <!-- Wrong port, for testing basic failover to work. -->
</replica>
<replica>
<priority>2</priority>
<host>{hostname}</host>
<port>{port}</port>
</replica>
<user>{user}</user>
<password>{password}</password>
<db>test</db>
<table>{tbl}</table>
</mysql>'''.format(
hostname=self.docker_hostname,
port=self.docker_port,
user=self.user,
password=self.password,
tbl=table_name,
)
def prepare(self, structure, table_name, cluster):
self.create_mysql_conn()
self.execute_mysql_query("create database if not exists test default character set 'utf8'")
fields_strs = []
for field in structure.keys + structure.ordinary_fields + structure.range_fields:
fields_strs.append(field.name + ' ' + self.TYPE_MAPPING[field.field_type])
create_query = '''create table test.{table_name} (
{fields_str});
'''.format(table_name=table_name, fields_str=','.join(fields_strs))
self.execute_mysql_query(create_query)
self.ordered_names = structure.get_ordered_names()
self.prepared = True
def load_data(self, data, table_name):
values_strs = []
if not data:
return
for row in data:
sorted_row = []
for name in self.ordered_names:
data = row.data[name]
if isinstance(row.data[name], str):
data = "'" + data + "'"
else:
data = str(data)
sorted_row.append(data)
values_strs.append('(' + ','.join(sorted_row) + ')')
query = 'insert into test.{} ({}) values {}'.format(
table_name,
','.join(self.ordered_names),
','.join(values_strs))
self.execute_mysql_query(query)
class SourceMongo(ExternalSource):
def get_source_str(self, table_name):
return '''
<mongodb>
<host>{host}</host>
<port>{port}</port>
<user>{user}</user>
<password>{password}</password>
<db>test</db>
<collection>{tbl}</collection>
</mongodb>
'''.format(
host=self.docker_hostname,
port=self.docker_port,
user=self.user,
password=self.password,
tbl=table_name,
)
def prepare(self, structure, table_name, cluster):
connection_str = 'mongodb://{user}:{password}@{host}:{port}'.format(
host=self.internal_hostname, port=self.internal_port,
user=self.user, password=self.password)
self.connection = pymongo.MongoClient(connection_str)
self.converters = {}
for field in structure.get_all_fields():
if field.field_type == "Date":
self.converters[field.name] = lambda x: datetime.datetime.strptime(x, "%Y-%m-%d")
elif field.field_type == "DateTime":
self.converters[field.name] = lambda x: get_localzone().localize(datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
else:
self.converters[field.name] = lambda x: x
self.db = self.connection['test']
self.db.add_user(self.user, self.password)
self.prepared = True
def load_data(self, data, table_name):
tbl = self.db[table_name]
to_insert = []
for row in data:
row_dict = {}
for cell_name, cell_value in row.data.items():
row_dict[cell_name] = self.converters[cell_name](cell_value)
to_insert.append(row_dict)
result = tbl.insert_many(to_insert)
class SourceClickHouse(ExternalSource):
def get_source_str(self, table_name):
return '''
<clickhouse>
<host>{host}</host>
<port>{port}</port>
<user>{user}</user>
<password>{password}</password>
<db>test</db>
<table>{tbl}</table>
</clickhouse>
'''.format(
host=self.docker_hostname,
port=self.docker_port,
user=self.user,
password=self.password,
tbl=table_name,
)
def prepare(self, structure, table_name, cluster):
self.node = cluster.instances[self.docker_hostname]
self.node.query("CREATE DATABASE IF NOT EXISTS test")
fields_strs = []
for field in structure.keys + structure.ordinary_fields + structure.range_fields:
fields_strs.append(field.name + ' ' + field.field_type)
create_query = '''CREATE TABLE test.{table_name} (
{fields_str}) ENGINE MergeTree ORDER BY tuple();
'''.format(table_name=table_name, fields_str=','.join(fields_strs))
self.node.query(create_query)
self.ordered_names = structure.get_ordered_names()
self.prepared = True
def load_data(self, data, table_name):
values_strs = []
if not data:
return
for row in data:
sorted_row = []
for name in self.ordered_names:
row_data = row.data[name]
if isinstance(row_data, str):
row_data = "'" + row_data + "'"
else:
row_data = str(row_data)
sorted_row.append(row_data)
values_strs.append('(' + ','.join(sorted_row) + ')')
query = 'INSERT INTO test.{} ({}) values {}'.format(
table_name,
','.join(self.ordered_names),
','.join(values_strs))
self.node.query(query)
class SourceFile(ExternalSource):
def get_source_str(self, table_name):
table_path = "/" + table_name + ".tsv"
return '''
<file>
<path>{path}</path>
<format>TabSeparated</format>
</file>
'''.format(
path=table_path,
)
def prepare(self, structure, table_name, cluster):
self.node = cluster.instances[self.docker_hostname]
path = "/" + table_name + ".tsv"
self.node.exec_in_container(["bash", "-c", "touch {}".format(path)])
self.ordered_names = structure.get_ordered_names()
self.prepared = True
def load_data(self, data, table_name):
if not data:
return
path = "/" + table_name + ".tsv"
for row in list(data):
sorted_row = []
for name in self.ordered_names:
sorted_row.append(str(row.data[name]))
str_data = '\t'.join(sorted_row)
self.node.exec_in_container(["bash", "-c", "echo \"{row}\" >> {fname}".format(row=str_data, fname=path)])
def compatible_with_layout(self, layout):
return 'cache' not in layout.name
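# Base class for executable sources: data is kept in a TSV file and read through a
# shell command supplied by subclasses.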
class _SourceExecutableBase(ExternalSource):
def _get_cmd(self, path):
raise NotImplementedError("Method {} is not implemented for {}".format(
"_get_cmd", self.__class__.__name__))
def get_source_str(self, table_name):
table_path = "/" + table_name + ".tsv"
return '''
<executable>
<command>{cmd}</command>
<format>TabSeparated</format>
</executable>
'''.format(
cmd=self._get_cmd(table_path),
)
def prepare(self, structure, table_name, cluster):
self.node = cluster.instances[self.docker_hostname]
path = "/" + table_name + ".tsv"
self.node.exec_in_container(["bash", "-c", "touch {}".format(path)])
self.ordered_names = structure.get_ordered_names()
self.prepared = True
def load_data(self, data, table_name):
if not data:
return
path = "/" + table_name + ".tsv"
for row in list(data):
sorted_row = []
for name in self.ordered_names:
sorted_row.append(str(row.data[name]))
str_data = '\t'.join(sorted_row)
self.node.exec_in_container(["bash", "-c", "echo \"{row}\" >> {fname}".format(row=str_data, fname=path)])
class SourceExecutableCache(_SourceExecutableBase):
def _get_cmd(self, path):
return "cat {}".format(path)
def compatible_with_layout(self, layout):
return 'cache' not in layout.name
class SourceExecutableHashed(_SourceExecutableBase):
def _get_cmd(self, path):
return "cat - >/dev/null;cat {}".format(path)
def compatible_with_layout(self, layout):
return 'cache' in layout.name
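# Base class for HTTP(S) sources: the data file is served by http_server.py started
# inside the container; each generated config takes the next port from PORT_COUNTER.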
class SourceHTTPBase(ExternalSource):
PORT_COUNTER = 5555
def get_source_str(self, table_name):
self.http_port = SourceHTTPBase.PORT_COUNTER
url = "{schema}://{host}:{port}/".format(schema=self._get_schema(), host=self.docker_hostname, port=self.http_port)
SourceHTTPBase.PORT_COUNTER += 1
return '''
<http>
<url>{url}</url>
<format>TabSeparated</format>
</http>
'''.format(url=url)
def prepare(self, structure, table_name, cluster):
self.node = cluster.instances[self.docker_hostname]
path = "/" + table_name + ".tsv"
self.node.exec_in_container(["bash", "-c", "touch {}".format(path)])
script_dir = os.path.dirname(os.path.realpath(__file__))
self.node.copy_file_to_container(os.path.join(script_dir, './http_server.py'), '/http_server.py')
self.node.copy_file_to_container(os.path.join(script_dir, './fake_cert.pem'), '/fake_cert.pem')
self.node.exec_in_container([
"bash",
"-c",
"python2 /http_server.py --data-path={tbl} --schema={schema} --host={host} --port={port} --cert-path=/fake_cert.pem".format(
tbl=path, schema=self._get_schema(), host=self.docker_hostname, port=self.http_port)
], detach=True)
self.ordered_names = structure.get_ordered_names()
self.prepared = True
def load_data(self, data, table_name):
if not data:
return
path = "/" + table_name + ".tsv"
for row in list(data):
sorted_row = []
for name in self.ordered_names:
sorted_row.append(str(row.data[name]))
str_data = '\t'.join(sorted_row)
self.node.exec_in_container(["bash", "-c", "echo \"{row}\" >> {fname}".format(row=str_data, fname=path)])
class SourceHTTP(SourceHTTPBase):
def _get_schema(self):
return "http"
class SourceHTTPS(SourceHTTPBase):
def _get_schema(self):
return "https"

View File

@ -0,0 +1,49 @@
-----BEGIN PRIVATE KEY-----
MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDDHnGYqN/ztiFE
rMQizbYiEpI/q/91bCDQ+xRes+gucKrr4qvQbosANYfpXgsaGizH24CpAXDvnFwC
oHqPmotHunJvG9uKiVvshy+tx1SNLZEN9DySri+8V+8fetn5PFxWQsKclMGCypyE
REV6H0vflPWmZRZWvAb5aaIxcRa2m3bTVUZPuY0wzCtc+ELPQ/sRc62gWH4bMlBo
0Wdai4+wcmpdcSR+rlZVDPt+ysxF/PcJFMAQ9CIRJRhXuK7Q/XCmAkagpH9tPPwY
SDMONTPhumXY7gCX4lmV9CflGJ6IpGmpEL04Rpr3gAcvz/w4JiMXgGpvtDjiJku9
qOdCYS/FAgMBAAECggEBAL/miULjlJ9VWZL5eE3ilGcebMhCmZUbK4td5cLenlRO
a0xkOydcEUm7XFihLboWVEScFgYibLi8x6Gtw9zI2oNJVJMCiwHN5qLSsonvqbDQ
SAG5XHnG5xwOQBht80O1ofsU3eKyS0AflaBgpRRfA3h6QL/OXBIiC5nx0ptd5kDh
HR0IHUcleBHt8I0d/PZbQE9oMOBlnMf8v2jGe80JXscQt2UabA/quCalDihhDt5J
qySfh4mDOrBOQEsmO/C1JCztQ6WZ2FVwRiITb/fRmsPadKJsIiMyy2w6NmP96v2a
V2ZqMvz9OZym8M2is4HR2pbn8XJ6vmW52fwNQhpWDgECgYEA8aiqF5df3j8YEDAX
XVAhIaubSLcS50qSk/p0/ZS9ETR1Uv8zjJDs6xBVBd4xXe/G2/XvvV6sGp4JcW3V
U66Ll3S1veMlnvCTjZUEi931EJbIdoyGACEG19QIVteSEhQkoSOk/Zx1lFSVm9UZ
hUV4JvWifQvLetS/v6MhnxSbTdUCgYEAzrK7+0gVT0a0szMs7CbeQVm80EWcqPea
p5jyLQHu+7vzcC8c9RRlqBPkxeG9BTt0sbBBJTrtvls15QaFoKCtTyjnrrLEHqu3
VZfIpjjrIhhvoRWP3A3r4DFMDGm/TOTUWEMSPJPXKe3uVm3buwVXWj4ipvhnAdr5
kJ+x1YqNIjECgYEAo0ISHzv53Vh8tjr3HehLacbYcmiUEcOUgPo8XTBGBsCM3pRg
S/+Av1FaT0uLyG17yBA/dYzm8liAAqxz6UPLNHf5bB5vxQ+8b3MUDjXWIO3s4gIP
aTjmuZqaQ6kBGsuW73H4PgmceagnJo7x3dJP2OoraxUz03i1Tg80YJd4UD0CgYBC
dzL/gJRpo6DjpuchIPaDKSoQBvJzWvt+PS5SzrZceHm1b1DudhqiS5NbFlXD4vSJ
VtX79NESTx4rgUdi+YgBVnP5tz5dZnZTrbU1zkO9+QGcWOSjrE5XD0MXEsITJdoq
b5bjp96eewYTAMyRfQwz1psp+eKVtCZgHRoAQsdTYQKBgQC7yBABJ4LDTie2C2n0
itO7SRT1tMfkNx8gK9RrgGawBUhD1EokmOKk+O1Ht6Cx7hqCd3Hsa4zc9se++jV1
Er+T8LW8FOFfAwtv8xggJtA8h6U8n6gIoq0EsSsWREJ4m9fDfZQnVTj8IPYvPHMr
Jv++IPqtFGG4O8IeWG+HY8mHxQ==
-----END PRIVATE KEY-----
-----BEGIN CERTIFICATE-----
MIIDYDCCAkigAwIBAgIJAKSJ3I0ORzjtMA0GCSqGSIb3DQEBCwUAMEUxCzAJBgNV
BAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRlcm5ldCBX
aWRnaXRzIFB0eSBMdGQwHhcNMTkwMjIyMDgxNTIzWhcNMjAwMjIyMDgxNTIzWjBF
MQswCQYDVQQGEwJBVTETMBEGA1UECAwKU29tZS1TdGF0ZTEhMB8GA1UECgwYSW50
ZXJuZXQgV2lkZ2l0cyBQdHkgTHRkMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB
CgKCAQEAwx5xmKjf87YhRKzEIs22IhKSP6v/dWwg0PsUXrPoLnCq6+Kr0G6LADWH
6V4LGhosx9uAqQFw75xcAqB6j5qLR7pybxvbiolb7IcvrcdUjS2RDfQ8kq4vvFfv
H3rZ+TxcVkLCnJTBgsqchERFeh9L35T1pmUWVrwG+WmiMXEWtpt201VGT7mNMMwr
XPhCz0P7EXOtoFh+GzJQaNFnWouPsHJqXXEkfq5WVQz7fsrMRfz3CRTAEPQiESUY
V7iu0P1wpgJGoKR/bTz8GEgzDjUz4bpl2O4Al+JZlfQn5RieiKRpqRC9OEaa94AH
L8/8OCYjF4Bqb7Q44iZLvajnQmEvxQIDAQABo1MwUTAdBgNVHQ4EFgQU6P39PMY3
jRgJM0svz9XpHH8z7xUwHwYDVR0jBBgwFoAU6P39PMY3jRgJM0svz9XpHH8z7xUw
DwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAdIKBKlCIprCDGTtn
xatBlcpkbys4hQhHwkWn5aAPKE2oZlUOTEe90xxLJuciK+vCXTwQ3mgjGFc+ioAo
B7m3VL1DLmHCw5DQ2T/g8TjVjlKoaCj+9SZZPga5ygYJChx5HKFO4eek9stWo6hA
BmXndKhdX7mphUoSqUnQ+RwQ9XA0n6eTPqXAThWVqvLQgDj7Msz1XeFfqFqyD9MN
RocFg87aASTtwxYneG3IZCOQudlbHaRuEflHjlty2V5mNPjzcS2QK598i/5vmIoD
ZiUBXg+P8n+dklEa4qnQplDKERD20GtDgWtgYrfmpspLWNv8/bZ4h4gmGsH0+3uz
dHQNQA==
-----END CERTIFICATE-----

View File

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
import argparse
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
import ssl
import csv
import os
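# Tiny HTTP/HTTPS server that streams a TabSeparated file back on every GET/POST;
# it backs the SourceHTTP/SourceHTTPS dictionary sources in the integration tests.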
def start_server(server_address, cert_path, data_path, schema):
class TSVHTTPHandler(BaseHTTPRequestHandler):
def _set_headers(self):
self.send_response(200)
self.send_header('Content-type', 'text/tsv')
self.end_headers()
def do_GET(self):
self._set_headers()
with open(data_path, 'r') as fl:
reader = csv.reader(fl, delimiter='\t')
for row in reader:
self.wfile.write('\t'.join(row) + '\n')
return
def do_POST(self):
return self.do_GET()
httpd = HTTPServer(server_address, TSVHTTPHandler)
if schema == 'https':
httpd.socket = ssl.wrap_socket(httpd.socket, certfile=cert_path, server_side=True)
httpd.serve_forever()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple HTTP server returns data from file")
parser.add_argument("--data-path", required=True)
parser.add_argument("--schema", choices=("http", "https"), required=True)
parser.add_argument("--host", default="localhost")
parser.add_argument("--port", default=5555, type=int)
parser.add_argument("--cert-path", default="./fake_cert.pem")
args = parser.parse_args()
start_server((args.host, args.port), args.cert_path, args.data_path, args.schema)

View File

@ -0,0 +1,264 @@
import pytest
import os
import time
from helpers.cluster import ClickHouseCluster
from dictionary import Field, Row, Dictionary, DictionaryStructure, Layout
from external_sources import SourceMySQL, SourceClickHouse, SourceFile, SourceExecutableCache, SourceExecutableHashed, SourceMongo
from external_sources import SourceHTTP, SourceHTTPS
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
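# Field sets for the three dictionary structure kinds: 'simple' (single UInt64 key),
# 'complex' (composite key) and 'ranged' (key plus a date range with min/max columns).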
FIELDS = {
"simple": [
Field("KeyField", 'UInt64', is_key=True, default_value_for_get=9999999),
Field("UInt8_", 'UInt8', default_value_for_get=55),
Field("UInt16_", 'UInt16', default_value_for_get=66),
Field("UInt32_", 'UInt32', default_value_for_get=77),
Field("UInt64_", 'UInt64', default_value_for_get=88),
Field("Int8_", 'Int8', default_value_for_get=-55),
Field("Int16_", 'Int16', default_value_for_get=-66),
Field("Int32_", 'Int32', default_value_for_get=-77),
Field("Int64_", 'Int64', default_value_for_get=-88),
Field("UUID_", 'UUID', default_value_for_get='550e8400-0000-0000-0000-000000000000'),
Field("Date_", 'Date', default_value_for_get='2018-12-30'),
Field("DateTime_", 'DateTime', default_value_for_get='2018-12-30 00:00:00'),
Field("String_", 'String', default_value_for_get='hi'),
Field("Float32_", 'Float32', default_value_for_get=555.11),
Field("Float64_", 'Float64', default_value_for_get=777.11),
Field("ParentKeyField", "UInt64", default_value_for_get=444, hierarchical=True)
],
"complex": [
Field("KeyField1", 'UInt64', is_key=True, default_value_for_get=9999999),
Field("KeyField2", 'String', is_key=True, default_value_for_get='xxxxxxxxx'),
Field("UInt8_", 'UInt8', default_value_for_get=55),
Field("UInt16_", 'UInt16', default_value_for_get=66),
Field("UInt32_", 'UInt32', default_value_for_get=77),
Field("UInt64_", 'UInt64', default_value_for_get=88),
Field("Int8_", 'Int8', default_value_for_get=-55),
Field("Int16_", 'Int16', default_value_for_get=-66),
Field("Int32_", 'Int32', default_value_for_get=-77),
Field("Int64_", 'Int64', default_value_for_get=-88),
Field("UUID_", 'UUID', default_value_for_get='550e8400-0000-0000-0000-000000000000'),
Field("Date_", 'Date', default_value_for_get='2018-12-30'),
Field("DateTime_", 'DateTime', default_value_for_get='2018-12-30 00:00:00'),
Field("String_", 'String', default_value_for_get='hi'),
Field("Float32_", 'Float32', default_value_for_get=555.11),
Field("Float64_", 'Float64', default_value_for_get=777.11),
],
"ranged": [
Field("KeyField1", 'UInt64', is_key=True),
Field("KeyField2", 'Date', is_range_key=True),
Field("StartDate", 'Date', range_hash_type='min'),
Field("EndDate", 'Date', range_hash_type='max'),
Field("UInt8_", 'UInt8', default_value_for_get=55),
Field("UInt16_", 'UInt16', default_value_for_get=66),
Field("UInt32_", 'UInt32', default_value_for_get=77),
Field("UInt64_", 'UInt64', default_value_for_get=88),
Field("Int8_", 'Int8', default_value_for_get=-55),
Field("Int16_", 'Int16', default_value_for_get=-66),
Field("Int32_", 'Int32', default_value_for_get=-77),
Field("Int64_", 'Int64', default_value_for_get=-88),
Field("UUID_", 'UUID', default_value_for_get='550e8400-0000-0000-0000-000000000000'),
Field("Date_", 'Date', default_value_for_get='2018-12-30'),
Field("DateTime_", 'DateTime', default_value_for_get='2018-12-30 00:00:00'),
Field("String_", 'String', default_value_for_get='hi'),
Field("Float32_", 'Float32', default_value_for_get=555.11),
Field("Float64_", 'Float64', default_value_for_get=777.11),
]
}
LAYOUTS = [
Layout("hashed"),
Layout("cache"),
Layout("flat"),
Layout("complex_key_hashed"),
Layout("complex_key_cache"),
Layout("range_hashed")
]
SOURCES = [
# Note: this source has had some troubles in these tests.
SourceMongo("MongoDB", "localhost", "27018", "mongo1", "27017", "root", "clickhouse"),
SourceMySQL("MySQL", "localhost", "3308", "mysql1", "3306", "root", "clickhouse"),
SourceClickHouse("RemoteClickHouse", "localhost", "9000", "clickhouse1", "9000", "default", ""),
SourceClickHouse("LocalClickHouse", "localhost", "9000", "node", "9000", "default", ""),
SourceFile("File", "localhost", "9000", "node", "9000", "", ""),
SourceExecutableHashed("ExecutableHashed", "localhost", "9000", "node", "9000", "", ""),
SourceExecutableCache("ExecutableCache", "localhost", "9000", "node", "9000", "", ""),
SourceHTTP("SourceHTTP", "localhost", "9000", "clickhouse1", "9000", "", ""),
SourceHTTPS("SourceHTTPS", "localhost", "9000", "clickhouse1", "9000", "", ""),
]
DICTIONARIES = []
cluster = None
node = None
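# Generate a dictionary config for every compatible (layout, source) pair and register
# them all with a single test node (plus an auxiliary 'clickhouse1' instance for the remote sources).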
def setup_module(module):
global DICTIONARIES
global cluster
global node
dict_configs_path = os.path.join(SCRIPT_DIR, 'configs/dictionaries')
for f in os.listdir(dict_configs_path):
os.remove(os.path.join(dict_configs_path, f))
for layout in LAYOUTS:
for source in SOURCES:
if source.compatible_with_layout(layout):
structure = DictionaryStructure(layout, FIELDS[layout.layout_type])
dict_name = source.name + "_" + layout.name
dict_path = os.path.join(dict_configs_path, dict_name + '.xml')
dictionary = Dictionary(dict_name, structure, source, dict_path, "table_" + dict_name)
dictionary.generate_config()
DICTIONARIES.append(dictionary)
else:
print "Source", source.name, "incompatible with layout", layout.name
main_configs = []
for fname in os.listdir(dict_configs_path):
main_configs.append(os.path.join(dict_configs_path, fname))
cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs'))
node = cluster.add_instance('node', main_configs=main_configs, with_mysql=True, with_mongo=True)
cluster.add_instance('clickhouse1')
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster.start()
for dictionary in DICTIONARIES:
print "Preparing", dictionary.name
dictionary.prepare_source(cluster)
print "Prepared"
yield cluster
finally:
cluster.shutdown()
def test_simple_dictionaries(started_cluster):
fields = FIELDS["simple"]
data = [
Row(fields,
[1, 22, 333, 4444, 55555, -6, -77,
-888, -999, '550e8400-e29b-41d4-a716-446655440003',
'1973-06-28', '1985-02-28 23:43:25', 'hello', 22.543, 3332154213.4, 0]),
Row(fields,
[2, 3, 4, 5, 6, -7, -8,
-9, -10, '550e8400-e29b-41d4-a716-446655440002',
'1978-06-28', '1986-02-28 23:42:25', 'hello', 21.543, 3222154213.4, 1]),
]
simple_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "simple"]
for dct in simple_dicts:
dct.load_data(data)
node.query("system reload dictionaries")
queries_with_answers = []
for dct in simple_dicts:
for row in data:
for field in fields:
if not field.is_key:
for query in dct.get_select_get_queries(field, row):
queries_with_answers.append((query, row.get_value_by_name(field.name)))
for query in dct.get_select_has_queries(field, row):
queries_with_answers.append((query, 1))
for query in dct.get_select_get_or_default_queries(field, row):
queries_with_answers.append((query, field.default_value_for_get))
for query in dct.get_hierarchical_queries(data[0]):
queries_with_answers.append((query, [1]))
for query in dct.get_hierarchical_queries(data[1]):
queries_with_answers.append((query, [2, 1]))
for query in dct.get_is_in_queries(data[0], data[1]):
queries_with_answers.append((query, 0))
for query in dct.get_is_in_queries(data[1], data[0]):
queries_with_answers.append((query, 1))
for query, answer in queries_with_answers:
print query
if isinstance(answer, list):
answer = str(answer).replace(' ', '')
assert node.query(query) == str(answer) + '\n'
def test_complex_dictionaries(started_cluster):
fields = FIELDS["complex"]
data = [
Row(fields,
[1, 'world', 22, 333, 4444, 55555, -6,
-77, -888, -999, '550e8400-e29b-41d4-a716-446655440003',
'1973-06-28', '1985-02-28 23:43:25',
'hello', 22.543, 3332154213.4]),
Row(fields,
[2, 'qwerty2', 52, 2345, 6544, 9191991, -2,
-717, -81818, -92929, '550e8400-e29b-41d4-a716-446655440007',
'1975-09-28', '2000-02-28 23:33:24',
'my', 255.543, 3332221.44]),
]
complex_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex"]
for dct in complex_dicts:
dct.load_data(data)
node.query("system reload dictionaries")
queries_with_answers = []
for dct in complex_dicts:
for row in data:
for field in fields:
if not field.is_key:
for query in dct.get_select_get_queries(field, row):
queries_with_answers.append((query, row.get_value_by_name(field.name)))
for query in dct.get_select_has_queries(field, row):
queries_with_answers.append((query, 1))
for query in dct.get_select_get_or_default_queries(field, row):
queries_with_answers.append((query, field.default_value_for_get))
for query, answer in queries_with_answers:
print query
assert node.query(query) == str(answer) + '\n'
def test_ranged_dictionaries(started_cluster):
fields = FIELDS["ranged"]
data = [
Row(fields,
[1, '2019-02-10', '2019-02-01', '2019-02-28',
22, 333, 4444, 55555, -6, -77, -888, -999,
'550e8400-e29b-41d4-a716-446655440003',
'1973-06-28', '1985-02-28 23:43:25', 'hello',
22.543, 3332154213.4]),
Row(fields,
[1, '2019-04-10', '2019-04-01', '2019-04-28',
11, 3223, 41444, 52515, -65, -747, -8388, -9099,
'550e8400-e29b-41d4-a716-446655440004',
'1973-06-29', '2002-02-28 23:23:25', '!!!!',
32.543, 3332543.4]),
]
ranged_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged"]
for dct in ranged_dicts:
dct.load_data(data)
node.query("system reload dictionaries")
queries_with_answers = []
for dct in ranged_dicts:
for row in data:
for field in fields:
if not field.is_key and not field.is_range:
for query in dct.get_select_get_queries(field, row):
queries_with_answers.append((query, row.get_value_by_name(field.name)))
for query, answer in queries_with_answers:
print query
assert node.query(query) == str(answer) + '\n'

View File

@ -0,0 +1,82 @@
<test>
<name>website</name>
<type>loop</type>
<preconditions>
<table_exists>hits_10m_single</table_exists>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<stop_conditions>
<all_of>
<total_time_ms>60000</total_time_ms>
<iterations>3</iterations>
</all_of>
<any_of>
<iterations>30</iterations>
</any_of>
</stop_conditions>
<main_metric>
<min_time/>
</main_metric>
<settings>
<max_memory_usage>20000000000</max_memory_usage>
</settings>
<substitutions>
<substitution>
<name>table</name>
<values>
<value>hits_10m_single</value>
<value>hits_100m_single</value>
</values>
</substitution>
</substitutions>
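<!-- Web-analytics benchmark query set; {table} is substituted with each value above. -->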
<query>SELECT count() FROM {table}</query>
<query>SELECT count() FROM {table} WHERE AdvEngineID != 0</query>
<query>SELECT sum(AdvEngineID), count(), avg(ResolutionWidth) FROM {table}</query>
<query>SELECT sum(UserID) FROM {table}</query>
<query>SELECT uniq(UserID) FROM {table}</query>
<query>SELECT uniq(SearchPhrase) FROM {table}</query>
<query>SELECT min(EventDate), max(EventDate) FROM {table}</query>
<query>SELECT AdvEngineID, count() FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count() DESC</query>
<query>SELECT RegionID, uniq(UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10</query>
<query>SELECT RegionID, sum(AdvEngineID), count() AS c, avg(ResolutionWidth), uniq(UserID) FROM {table} GROUP BY RegionID ORDER BY c DESC LIMIT 10</query>
<query>SELECT MobilePhoneModel, uniq(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10</query>
<query>SELECT MobilePhone, MobilePhoneModel, uniq(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10</query>
<query>SELECT SearchPhrase, count() AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10</query>
<query>SELECT SearchPhrase, uniq(UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10</query>
<query>SELECT SearchEngineID, SearchPhrase, count() AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10</query>
<query>SELECT UserID, count() FROM {table} GROUP BY UserID ORDER BY count() DESC LIMIT 10</query>
<query>SELECT UserID, SearchPhrase, count() FROM {table} GROUP BY UserID, SearchPhrase ORDER BY count() DESC LIMIT 10</query>
<query>SELECT UserID, SearchPhrase, count() FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10</query>
<query>SELECT UserID, toMinute(EventTime) AS m, SearchPhrase, count() FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count() DESC LIMIT 10</query>
<query>SELECT UserID FROM {table} WHERE UserID = 12345678901234567890</query>
<query>SELECT count() FROM {table} WHERE URL LIKE '%metrika%'</query>
<query>SELECT SearchPhrase, any(URL), count() AS c FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10</query>
<query>SELECT SearchPhrase, any(URL), any(Title), count() AS c, uniq(UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10</query>
<query>SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10</query>
<query>SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10</query>
<query>SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10</query>
<query>SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10</query>
<query>SELECT CounterID, avg(length(URL)) AS l, count() AS c FROM {table} WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25</query>
<query>SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count() AS c, any(Referer) FROM {table} WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25</query>
<query>SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table}</query>
<query>SELECT SearchEngineID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10</query>
<query>SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10</query>
<query>SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10</query>
<query>SELECT URL, count() AS c FROM {table} GROUP BY URL ORDER BY c DESC LIMIT 10</query>
<query>SELECT 1, URL, count() AS c FROM {table} GROUP BY 1, URL ORDER BY c DESC LIMIT 10</query>
<query>SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM {table} GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10</query>
<query>SELECT URL, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT DontCountHits AND NOT Refresh AND notEmpty(URL) GROUP BY URL ORDER BY PageViews DESC LIMIT 10</query>
<query>SELECT Title, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT DontCountHits AND NOT Refresh AND notEmpty(Title) GROUP BY Title ORDER BY PageViews DESC LIMIT 10</query>
<query>SELECT URL, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000</query>
<query>SELECT TraficSourceID, SearchEngineID, AdvEngineID, ((SearchEngineID = 0 AND AdvEngineID = 0) ? Referer : '') AS Src, URL AS Dst, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000</query>
<query>SELECT URLHash, EventDate, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = halfMD5('http://example.ru/') GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100</query>
<query>SELECT WindowClientWidth, WindowClientHeight, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT Refresh AND NOT DontCountHits AND URLHash = halfMD5('http://example.ru/') GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000</query>
<query>SELECT toStartOfMinute(EventTime) AS Minute, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-02' AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute</query>
</test>

View File

@ -1,6 +1,6 @@
FROM ubuntu:18.04
RUN apt-get update && apt-get -y install tzdata
RUN apt-get update && apt-get -y install tzdata python
ENV TZ=Europe/Moscow
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

View File

@ -9,7 +9,7 @@ COPY s3downloader /s3downloader
COPY run.sh /run.sh
ENV OPEN_DATASETS="hits"
ENV PRIVATE_DATASETS="hits_100m_single"
ENV PRIVATE_DATASETS="hits_100m_single hits_10m_single"
ENV DOWNLOAD_DATASETS=1
CMD /run.sh

View File

@ -0,0 +1,97 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import tarfile
import logging
import argparse
import requests
import tempfile
DEFAULT_URL = 'https://clickhouse-datasets.s3.yandex.net'
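# Dataset name -> archive file name; archives are fetched from <url-prefix>/<dataset>/partitions/.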
AVAILABLE_DATASETS = {
'hits': 'hits_v1.tar',
'visits': 'visits_v1.tar',
'hits_100m_single': 'hits_100m_single.tar',
'hits_1000m_single': 'hits_1000m_single.tar',
'hits_10m_single': 'hits_10m_single.tar',
'trips_mergetree': 'trips_mergetree.tar',
}
def _get_temp_file_name():
return os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()))
def build_url(base_url, dataset):
return os.path.join(base_url, dataset, 'partitions', AVAILABLE_DATASETS[dataset])
def download_with_progress(url, path):
logging.info("Downloading from %s to temp path %s", url, path)
with open(path, 'wb') as f:
response = requests.get(url, stream=True)
response.raise_for_status()
total_length = response.headers.get('content-length')
if total_length is None or int(total_length) == 0:
logging.info("No content-length, will download file without progress")
f.write(response.content)
else:
dl = 0
total_length = int(total_length)
logging.info("Content length is %ld bytes", total_length)
counter = 0
for data in response.iter_content(chunk_size=4096):
dl += len(data)
counter += 1
f.write(data)
done = int(50 * dl / total_length)
percent = int(100 * float(dl) / total_length)
if sys.stdout.isatty():
sys.stdout.write("\r[{}{}] {}%".format('=' * done, ' ' * (50-done), percent))
sys.stdout.flush()
elif counter % 1000 == 0:
sys.stdout.write("{}%".format(percent))
sys.stdout.flush()
sys.stdout.write("\n")
logging.info("Downloading finished")
def unpack_to_clickhouse_directory(tar_path, clickhouse_path):
logging.info("Will unpack data from temp path %s to clickhouse db %s", tar_path, clickhouse_path)
with tarfile.open(tar_path, 'r') as comp_file:
comp_file.extractall(path=clickhouse_path)
logging.info("Unpack finished")
if __name__ == "__main__":
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s %(levelname)s: %(message)s')
parser = argparse.ArgumentParser(
description="Simple tool for dowloading datasets for clickhouse from S3")
parser.add_argument('--dataset-names', required=True, nargs='+', choices=AVAILABLE_DATASETS.keys())
parser.add_argument('--url-prefix', default=DEFAULT_URL)
parser.add_argument('--clickhouse-data-path', default='/var/lib/clickhouse/')
args = parser.parse_args()
datasets = args.dataset_names
logging.info("Will fetch following datasets: %s", ', '.join(datasets))
for dataset in datasets:
logging.info("Processing %s", dataset)
temp_archive_path = _get_temp_file_name()
try:
download_url_for_dataset = build_url(args.url_prefix, dataset)
download_with_progress(download_url_for_dataset, temp_archive_path)
unpack_to_clickhouse_directory(temp_archive_path, args.clickhouse_data_path)
except Exception as ex:
logging.info("Some exception occured %s", str(ex))
raise
finally:
logging.info("Will remove dowloaded file %s from filesystem if it exists", temp_archive_path)
if os.path.exists(temp_archive_path):
os.remove(temp_archive_path)
logging.info("Processing of %s finished, table placed at", dataset)
logging.info("Fetch finished, enjoy your tables!")

View File

@ -94,7 +94,7 @@ PoolWithFailover::Entry PoolWithFailover::Get()
}
catch (const Poco::Exception & e)
{
if (e.displayText() == "mysqlxx::Pool is full") /// NOTE: String comparison is trashy code.
if (e.displayText().find("mysqlxx::Pool is full") != std::string::npos) /// NOTE: String comparison is trashy code.
{
full_pool = &pool;
}

View File

@ -94,7 +94,7 @@
</div>
<div id="announcement" class="colored-block">
<div class="page">
Upcoming ClickHouse Community Meetups: <a class="announcement-link" href="https://www.eventbrite.com/e/meetup-clickhouse-in-the-wild-deployment-success-stories-registration-55305051899" rel="external nofollow" target="_blank">San Francisco</a> on February 19 and <a class="announcement-link" href="https://www.eventbrite.com/e/clickhouse-meetup-in-madrid-registration-55376746339" rel="external nofollow" target="_blank">Madrid</a> on April 2
Upcoming ClickHouse Community Meetups: <a class="announcement-link" href="https://www.eventbrite.com/e/clickhouse-meetup-in-madrid-registration-55376746339" rel="external nofollow" target="_blank">Madrid</a> on April 2
</div>
</div>
<div class="page">