Merge branch 'ClickHouse:master' into master

commit 1cdd7361b0
simpleton, 2023-01-16 09:36:38 +08:00, committed by GitHub
88 changed files with 4303 additions and 880 deletions


@ -683,3 +683,4 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py


@ -169,3 +169,4 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py --check-approved


@ -4388,3 +4388,4 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py --check-approved

.gitmodules

@ -327,3 +327,6 @@
[submodule "contrib/aws-s2n-tls"]
path = contrib/aws-s2n-tls
url = https://github.com/ClickHouse/s2n-tls
[submodule "contrib/crc32-vpmsum"]
path = contrib/crc32-vpmsum
url = https://github.com/antonblanchard/crc32-vpmsum.git


@ -55,6 +55,7 @@ else ()
endif ()
add_contrib (miniselect-cmake miniselect)
add_contrib (pdqsort-cmake pdqsort)
add_contrib (crc32-vpmsum-cmake crc32-vpmsum)
add_contrib (sparsehash-c11-cmake sparsehash-c11)
add_contrib (abseil-cpp-cmake abseil-cpp)
add_contrib (magic-enum-cmake magic_enum)

contrib/crc32-vpmsum (submodule)

@ -0,0 +1 @@
Subproject commit 452155439389311fc7d143621eaf56a258e02476


@ -0,0 +1,14 @@
# The crc32-vpmsum module gets built along with the files vec_crc32.h and crc32_constants.h in crc32-vpmsum-cmake
# Please see README.md for information about how to generate crc32_constants.h
if (NOT ARCH_PPC64LE)
message (STATUS "crc32-vpmsum library is only supported on ppc64le")
return()
endif()
SET(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/crc32-vpmsum")
add_library(_crc32-vpmsum
"${LIBRARY_DIR}/vec_crc32.c"
)
target_include_directories(_crc32-vpmsum SYSTEM BEFORE PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
add_library(ch_contrib::crc32-vpmsum ALIAS _crc32-vpmsum)


@ -0,0 +1,9 @@
# To generate crc32_constants.h
- Run the makefile in the `../crc32-vpmsum` directory with the following options and CRC polynomial. These options must use the same polynomial and bit order as the Intel intrinsic functions:
```bash
make crc32_constants.h CRC="0x11EDC6F41" OPTIONS="-x -r -c"
```
- Move the generated `crc32_constants.h` into this directory.
- For more background, see: https://masterchef2209.wordpress.com/2020/06/17/guide-to-intel-sse4-2-crc-intrinisics-implementation-for-simde/
- Intel intrinsics reference: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64&ig_expand=1492,1493,1559

(File diff suppressed because it is too large.)


@ -0,0 +1,26 @@
#ifndef VEC_CRC32
#define VEC_CRC32
#if ! ((defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
# error PowerPC architecture is expected
#endif

/* Needed for assert(), size_t, uint32_t and uint64_t below. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif
unsigned int crc32_vpmsum(unsigned int crc, const unsigned char *p, unsigned long len);
static inline uint32_t crc32_ppc(uint64_t crc, unsigned char const *buffer, size_t len)
{
assert(buffer);
crc = crc32_vpmsum(crc, buffer, (unsigned long)len);
return crc;
}
#ifdef __cplusplus
}
#endif
#endif
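For orientation, here is a minimal sketch of how this wrapper is consumed, mirroring the `intHashCRC32` dispatch added later in this diff (the `hash_key` helper itself is hypothetical):

```cpp
#include <cstdint>

#include "vec_crc32.h"

// Hypothetical helper (ppc64le only): hash a 64-bit key through the
// vectorized PowerPC CRC32, seeded with -1U like the x86 _mm_crc32_u64 path.
static uint64_t hash_key(uint64_t key)
{
    return crc32_ppc(-1U, reinterpret_cast<const unsigned char *>(&key), sizeof(key));
}
```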


@ -5,6 +5,7 @@ set -x
# core.COMM.PID-TID
sysctl kernel.core_pattern='core.%e.%p-%P'
dmesg --clear ||:
set -e
set -u
@ -368,6 +369,7 @@ if [ -f core.zst ]; then
fi
rg --text -F '<Fatal>' server.log > fatal.log ||:
dmesg -T > dmesg.log ||:
zstd --threads=0 server.log
@ -396,6 +398,7 @@ p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-s
<a href="fuzzer.log">fuzzer.log</a>
<a href="server.log.zst">server.log.zst</a>
<a href="main.log">main.log</a>
<a href="dmesg.log">dmesg.log</a>
${CORE_LINK}
</p>
<table>


@ -136,3 +136,7 @@ DESCRIBE TABLE test_database.test_table;
│ data │ Nullable(String) │
└────────┴───────────────────┘
```
## Related content
- Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres)


@ -175,3 +175,6 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32)
- [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md)
- [Using PostgreSQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
## Related content
- Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres)


@ -120,5 +120,6 @@ Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec.
## Related Content
- [Extracting, converting, and querying data in local files using clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local)
- [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1)
- [Exploring massive, real-world data sets: 100+ Years of Weather Records in ClickHouse](https://clickhouse.com/blog/real-world-data-noaa-climate-data)


@ -57,6 +57,7 @@ ClickHouse-specific aggregate functions:
- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md)
- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md)
- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md)
- [uniqTheta](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md)
- [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md)
- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md)
- [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md)
@ -77,4 +78,6 @@ ClickHouse-specific aggregate functions:
- [contingency](./contingency.md)
- [cramersV](./cramersv.md)
- [cramersVBiasCorrected](./cramersvbiascorrected.md)
- [theilsU](./theilsu.md)
- [maxIntersections](./maxintersections.md)
- [maxIntersectionsPosition](./maxintersectionsposition.md)


@ -0,0 +1,64 @@
---
slug: /en/sql-reference/aggregate-functions/reference/maxintersections
sidebar_position: 360
title: maxIntersections
---
# maxIntersections
Aggregate function that calculates the maximum number of intervals that intersect at a common point (if the intervals intersect at least once).
The syntax is:
```sql
maxIntersections(start_column, end_column)
```
**Arguments**
- `start_column` - the numeric column that represents the start of each interval. If `start_column` is `NULL` or 0 then the interval will be skipped.
- `end_column` - the numeric column that represents the end of each interval. If `end_column` is `NULL` or 0 then the interval will be skipped.
**Returned value**
Returns the maximum number of intersecting intervals.
**Example**
```sql
CREATE TABLE my_events (
start UInt32,
end UInt32
)
Engine = MergeTree
ORDER BY tuple();
INSERT INTO my_events VALUES
(1, 3),
(1, 6),
(2, 5),
(3, 7);
```
The intervals look like the following:
```response
1 - 3
1 - - - - 6
2 - - 5
3 - - - 7
```
Three of these intervals share a common value (the value happens to be `4`, but which value they share is not important; we are measuring the count of intersections). The intervals `(1,3)` and `(3,7)` share an endpoint but are not considered intersecting by the `maxIntersections` function.
```sql
SELECT maxIntersections(start, end) FROM my_events;
```
Response:
```response
3
```
If you have multiple occurrences of the maximum interval, you can use the [`maxIntersectionsPosition` function](./maxintersectionsposition.md) to locate the number and location of those occurrences.
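As a side note, a minimal standalone sketch of the idea behind the function, assuming a sweep over interval endpoints (this is illustrative C++, not the ClickHouse implementation):

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Sweep over endpoints: +1 at each interval start, -1 at each end; the
// running maximum is the answer. At equal points, ends sort before starts,
// so touching intervals such as (1,3) and (3,7) do not count as intersecting.
uint64_t max_intersections(const std::vector<std::pair<uint32_t, uint32_t>> & intervals)
{
    std::vector<std::pair<uint32_t, int>> events;
    for (const auto & [start, end] : intervals)
    {
        events.emplace_back(start, +1);
        events.emplace_back(end, -1);
    }
    std::sort(events.begin(), events.end());
    int64_t current = 0;
    int64_t best = 0;
    for (const auto & event : events)
    {
        current += event.second;
        best = std::max(best, current);
    }
    return static_cast<uint64_t>(best);
}
```

For the table above, `max_intersections({{1, 3}, {1, 6}, {2, 5}, {3, 7}})` yields `3`.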


@ -0,0 +1,64 @@
---
slug: /en/sql-reference/aggregate-functions/reference/maxintersectionsposition
sidebar_position: 361
title: maxIntersectionsPosition
---
# maxIntersectionsPosition
Aggregate function that calculates the position at which the maximum returned by the [`maxIntersections` function](./maxintersections.md) first occurs.
The syntax is:
```sql
maxIntersectionsPosition(start_column, end_column)
```
**Arguments**
- `start_column` - the numeric column that represents the start of each interval. If `start_column` is `NULL` or 0 then the interval will be skipped.
- `end_column` - the numeric column that represents the end of each interval. If `end_column` is `NULL` or 0 then the interval will be skipped.
**Returned value**
Returns the start position of the maximum number of intersecting intervals.
**Example**
```sql
CREATE TABLE my_events (
start UInt32,
end UInt32
)
Engine = MergeTree
ORDER BY tuple();
INSERT INTO my_events VALUES
(1, 3),
(1, 6),
(2, 5),
(3, 7);
```
The intervals look like the following:
```response
1 - 3
1 - - - - 6
2 - - 5
3 - - - 7
```
Notice that three of these intervals have the value 4 in common, and that the overlap starts with the 2nd interval:
```sql
SELECT maxIntersectionsPosition(start, end) FROM my_events;
```
Response:
```response
2
```
In other words, the `(1,6)` row is the start of the 3 intervals that intersect, and 3 is the maximum number of intervals that intersect.


@ -6,6 +6,10 @@ sidebar_label: JSON
# JSON
:::warning
This feature is experimental and is not production ready. If you need to work with JSON documents, consider using [this guide](/docs/en/guides/developer/working-with-json/json-load-data.md) instead.
:::
Stores JavaScript Object Notation (JSON) documents in a single column.
`JSON` is an alias for `Object('json')`.


@ -39,3 +39,16 @@ SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64(
│ [68] │ -67417.0770 │ ('2080-03-12 14:17:31.269','110425e5-413f-10a6-05ba-fa6b3e929f15') │
└──────────┴──────────────┴────────────────────────────────────────────────────────────────────┘
```
```sql
CREATE TABLE random (a Array(Int8), d Decimal32(4), c Tuple(DateTime64(3), UUID)) engine=Memory;
INSERT INTO random SELECT * FROM generateRandom() LIMIT 2;
SELECT * FROM random;
```
```text
┌─a────────────────────────────┬────────────d─┬─c──────────────────────────────────────────────────────────────────┐
│ [] │ 68091.8197 │ ('2037-10-02 12:44:23.368','039ecab7-81c2-45ee-208c-844e5c6c5652') │
│ [8,-83,0,-22,65,9,-30,28,64] │ -186233.4909 │ ('2062-01-11 00:06:04.124','69563ea1-5ad1-f870-16d8-67061da0df25') │
└──────────────────────────────┴──────────────┴────────────────────────────────────────────────────────────────────┘
```


@ -214,6 +214,11 @@ IQueryTreeNode::Hash IQueryTreeNode::getTreeHash() const
}
QueryTreeNodePtr IQueryTreeNode::clone() const
{
return cloneAndReplace({});
}
QueryTreeNodePtr IQueryTreeNode::cloneAndReplace(const ReplacementMap & replacement_map) const
{
/** Clone tree with this node as root.
*
@ -236,11 +241,11 @@ QueryTreeNodePtr IQueryTreeNode::clone() const
const auto [node_to_clone, place_for_cloned_node] = nodes_to_clone.back();
nodes_to_clone.pop_back();
auto node_clone = node_to_clone->cloneImpl();
auto it = replacement_map.find(node_to_clone);
auto node_clone = it != replacement_map.end() ? it->second : node_to_clone->cloneImpl();
*place_for_cloned_node = node_clone;
node_clone->setAlias(node_to_clone->alias);
node_clone->setOriginalAST(node_to_clone->original_ast);
node_clone->children = node_to_clone->children;
node_clone->weak_pointers = node_to_clone->weak_pointers;


@ -110,6 +110,13 @@ public:
/// Get a deep copy of the query tree
QueryTreeNodePtr clone() const;
/** Get a deep copy of the query tree.
  * If a node to clone is a key in the replacement map, the value node
  * from the replacement map is used in its place instead of cloning it.
  */
using ReplacementMap = std::unordered_map<const IQueryTreeNode *, QueryTreeNodePtr>;
QueryTreeNodePtr cloneAndReplace(const ReplacementMap & replacement_map) const;
/// Returns true if node has alias, false otherwise
bool hasAlias() const
{
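A hedged usage sketch of the new API (the node names are hypothetical):

```cpp
// Replace every occurrence of `old_subtree` with `new_subtree` while
// deep-copying the rest of the tree, exactly as clone() would.
IQueryTreeNode::ReplacementMap replacement_map;
replacement_map.emplace(old_subtree.get(), new_subtree);
QueryTreeNodePtr copy = root->cloneAndReplace(replacement_map);
```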


@ -73,7 +73,7 @@ public:
if (!inner_function_node)
return;
auto & inner_function_arguments_nodes = inner_function_node->getArguments().getNodes();
const auto & inner_function_arguments_nodes = inner_function_node->getArguments().getNodes();
if (inner_function_arguments_nodes.size() != 2)
return;
@ -119,13 +119,15 @@ public:
{
lower_function_name = function_name_if_constant_is_negative;
}
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_arguments_nodes[1], lower_function_name);
auto inner_function = aggregate_function_arguments_nodes[0];
auto inner_function_right_argument = std::move(inner_function_arguments_nodes[1]);
aggregate_function_arguments_nodes = {inner_function_right_argument};
inner_function_arguments_nodes[1] = node;
node = std::move(inner_function);
auto inner_function_clone = inner_function_node->clone();
auto & inner_function_clone_arguments = inner_function_clone->as<FunctionNode &>().getArguments();
auto & inner_function_clone_arguments_nodes = inner_function_clone_arguments.getNodes();
auto inner_function_clone_right_argument = inner_function_clone_arguments_nodes[1];
aggregate_function_arguments_nodes = {inner_function_clone_right_argument};
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_clone_right_argument, lower_function_name);
inner_function_clone_arguments_nodes[1] = node;
node = std::move(inner_function_clone);
}
else if (right_argument_constant_node)
{
@ -136,18 +138,20 @@ public:
{
lower_function_name = function_name_if_constant_is_negative;
}
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_arguments_nodes[0], function_name_if_constant_is_negative);
auto inner_function = aggregate_function_arguments_nodes[0];
auto inner_function_left_argument = std::move(inner_function_arguments_nodes[0]);
aggregate_function_arguments_nodes = {inner_function_left_argument};
inner_function_arguments_nodes[0] = node;
node = std::move(inner_function);
auto inner_function_clone = inner_function_node->clone();
auto & inner_function_clone_arguments = inner_function_clone->as<FunctionNode &>().getArguments();
auto & inner_function_clone_arguments_nodes = inner_function_clone_arguments.getNodes();
auto inner_function_clone_left_argument = inner_function_clone_arguments_nodes[0];
aggregate_function_arguments_nodes = {inner_function_clone_left_argument};
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_clone_left_argument, lower_function_name);
inner_function_clone_arguments_nodes[0] = node;
node = std::move(inner_function_clone);
}
}
private:
static inline void resolveAggregateFunctionNode(FunctionNode & function_node, QueryTreeNodePtr & argument, const String & aggregate_function_name)
static inline void resolveAggregateFunctionNode(FunctionNode & function_node, const QueryTreeNodePtr & argument, const String & aggregate_function_name)
{
auto function_aggregate_function = function_node.getAggregateFunction();


@ -0,0 +1,124 @@
#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>
#include <Analyzer/ColumnNode.h>
#include <Analyzer/FunctionNode.h>
#include <Analyzer/HashUtils.h>
#include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/QueryNode.h>
#include <Analyzer/SortNode.h>
#include <Functions/IFunction.h>
namespace DB
{
namespace
{
class OptimizeRedundantFunctionsInOrderByVisitor : public InDepthQueryTreeVisitor<OptimizeRedundantFunctionsInOrderByVisitor>
{
public:
static bool needChildVisit(QueryTreeNodePtr & node, QueryTreeNodePtr & /*parent*/)
{
if (node->as<FunctionNode>())
return false;
return true;
}
void visitImpl(QueryTreeNodePtr & node)
{
auto * query = node->as<QueryNode>();
if (!query)
return;
if (!query->hasOrderBy())
return;
auto & order_by = query->getOrderBy();
for (auto & elem : order_by.getNodes())
{
auto * order_by_elem = elem->as<SortNode>();
if (order_by_elem->withFill())
return;
}
QueryTreeNodes new_order_by_nodes;
new_order_by_nodes.reserve(order_by.getNodes().size());
for (auto & elem : order_by.getNodes())
{
auto & order_by_expr = elem->as<SortNode>()->getExpression();
switch (order_by_expr->getNodeType())
{
case QueryTreeNodeType::FUNCTION:
{
if (isRedundantExpression(order_by_expr))
continue;
break;
}
case QueryTreeNodeType::COLUMN:
{
existing_keys.insert(order_by_expr);
break;
}
default:
break;
}
new_order_by_nodes.push_back(elem);
}
existing_keys.clear();
if (new_order_by_nodes.size() < order_by.getNodes().size())
order_by.getNodes() = std::move(new_order_by_nodes);
}
private:
QueryTreeNodePtrWithHashSet existing_keys;
bool isRedundantExpression(QueryTreeNodePtr function)
{
QueryTreeNodes nodes_to_process{ function };
while (!nodes_to_process.empty())
{
auto node = nodes_to_process.back();
nodes_to_process.pop_back();
// TODO: handle constants here
switch (node->getNodeType())
{
case QueryTreeNodeType::FUNCTION:
{
auto * function_node = node->as<FunctionNode>();
const auto & function_arguments = function_node->getArguments().getNodes();
if (function_arguments.empty())
return false;
const auto & function_base = function_node->getFunction();
if (!function_base || !function_base->isDeterministicInScopeOfQuery())
return false;
// Process arguments in order
for (auto it = function_arguments.rbegin(); it != function_arguments.rend(); ++it)
nodes_to_process.push_back(*it);
break;
}
case QueryTreeNodeType::COLUMN:
{
if (!existing_keys.contains(node))
return false;
break;
}
default:
return false;
}
}
return true;
}
};
}
void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr query_tree_node, ContextPtr /*context*/)
{
OptimizeRedundantFunctionsInOrderByVisitor().visit(query_tree_node);
}
}


@ -0,0 +1,23 @@
#pragma once
#include <Analyzer/IQueryTreePass.h>
namespace DB
{
/** If ORDER BY has an argument x followed by f(x), transform it to ORDER BY x.
  * Optimizes ORDER BY x, y, f(x), g(x, y), f(h(x)), t(f(x), g(x)) into ORDER BY x, y
  * when f(), g(), h(), t() are deterministic (in the scope of the query).
  * Does not optimize ORDER BY f(x), g(x), x even if f(x) is a bijection for x or g(x).
  */
class OptimizeRedundantFunctionsInOrderByPass final : public IQueryTreePass
{
public:
String getName() override { return "OptimizeRedundantFunctionsInOrderBy"; }
String getDescription() override { return "If ORDER BY has argument x followed by f(x) transforms it to ORDER BY x."; }
void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override;
};
}
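A minimal standalone model of the redundancy check the pass performs (the types and names here are illustrative, not the Analyzer's):

```cpp
#include <set>
#include <string>
#include <vector>

// Illustrative expression node: either a column leaf or a function call.
struct Expr
{
    std::string name;            // column name or function name
    bool is_function = false;
    bool deterministic = true;   // must hold in the scope of the query
    std::vector<Expr> args;
};

// An ORDER BY expression is redundant if it is a deterministic function
// tree whose column leaves are all earlier ORDER BY keys.
bool isRedundant(const Expr & expr, const std::set<std::string> & existing_keys)
{
    if (!expr.is_function)
        return existing_keys.count(expr.name) > 0;
    if (!expr.deterministic || expr.args.empty())
        return false;
    for (const auto & arg : expr.args)
        if (!isRedundant(arg, existing_keys))
            return false;
    return true;
}
```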


@ -1695,7 +1695,7 @@ void QueryAnalyzer::evaluateScalarSubqueryIfNeeded(QueryTreeNodePtr & node, size
subquery_context->setSettings(subquery_settings);
auto options = SelectQueryOptions(QueryProcessingStage::Complete, subquery_depth, true /*is_subquery*/);
auto interpreter = std::make_unique<InterpreterSelectQueryAnalyzer>(node, options, subquery_context);
auto interpreter = std::make_unique<InterpreterSelectQueryAnalyzer>(node, subquery_context, options);
auto io = interpreter->execute();
@ -2027,7 +2027,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveTableIdentifierFromDatabaseCatalog(con
auto storage_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout);
auto storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context);
return std::make_shared<TableNode>(std::move(storage), storage_lock, storage_snapshot);
return std::make_shared<TableNode>(std::move(storage), std::move(storage_lock), std::move(storage_snapshot));
}
/// Resolve identifier from compound expression


@ -77,11 +77,11 @@ public:
if (!nested_function || nested_function->getFunctionName() != "if")
return;
auto & nested_if_function_arguments_nodes = nested_function->getArguments().getNodes();
const auto & nested_if_function_arguments_nodes = nested_function->getArguments().getNodes();
if (nested_if_function_arguments_nodes.size() != 3)
return;
auto & cond_argument = nested_if_function_arguments_nodes[0];
const auto & cond_argument = nested_if_function_arguments_nodes[0];
const auto * if_true_condition_constant_node = nested_if_function_arguments_nodes[1]->as<ConstantNode>();
const auto * if_false_condition_constant_node = nested_if_function_arguments_nodes[2]->as<ConstantNode>();
@ -101,7 +101,7 @@ public:
/// Rewrite `sum(if(cond, 1, 0))` into `countIf(cond)`.
if (if_true_condition_value == 1 && if_false_condition_value == 0)
{
function_node_arguments_nodes[0] = std::move(nested_if_function_arguments_nodes[0]);
function_node_arguments_nodes[0] = nested_if_function_arguments_nodes[0];
function_node_arguments_nodes.resize(1);
resolveAsCountIfAggregateFunction(*function_node, function_node_arguments_nodes[0]->getResultType());
@ -120,7 +120,7 @@ public:
auto not_function = std::make_shared<FunctionNode>("not");
auto & not_function_arguments = not_function->getArguments().getNodes();
not_function_arguments.push_back(std::move(nested_if_function_arguments_nodes[0]));
not_function_arguments.push_back(nested_if_function_arguments_nodes[0]);
not_function->resolveAsFunction(FunctionFactory::instance().get("not", context)->build(not_function->getArgumentColumns()));


@ -15,6 +15,7 @@
#include <Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h>
#include <Analyzer/Passes/FuseFunctionsPass.h>
#include <Analyzer/Passes/IfTransformStringsToEnumPass.h>
#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
@ -91,7 +92,6 @@ public:
* TODO: Support setting optimize_move_functions_out_of_any.
* TODO: Support setting optimize_aggregators_of_group_by_keys.
* TODO: Support setting optimize_duplicate_order_by_and_distinct.
* TODO: Support setting optimize_redundant_functions_in_order_by.
* TODO: Support setting optimize_monotonous_functions_in_order_by.
* TODO: Support settings.optimize_or_like_chain.
* TODO: Add optimizations based on function semantics. Example: SELECT * FROM test_table WHERE id != id. (id is not nullable column).
@ -203,6 +203,9 @@ void addQueryTreePasses(QueryTreePassManager & manager)
if (settings.optimize_if_chain_to_multiif)
manager.addPass(std::make_unique<IfChainToMultiIfPass>());
if (settings.optimize_redundant_functions_in_order_by)
manager.addPass(std::make_unique<OptimizeRedundantFunctionsInOrderByPass>());
manager.addPass(std::make_unique<OrderByTupleEliminationPass>());
manager.addPass(std::make_unique<OrderByLimitByDuplicateEliminationPass>());


@ -364,6 +364,10 @@ if (TARGET ch_contrib::crc32_s390x)
target_link_libraries(clickhouse_common_io PUBLIC ch_contrib::crc32_s390x)
endif()
if (TARGET ch_contrib::crc32-vpmsum)
target_link_libraries(clickhouse_common_io PUBLIC ch_contrib::crc32-vpmsum)
endif()
dbms_target_link_libraries(PUBLIC ch_contrib::abseil_swiss_tables)
target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::abseil_swiss_tables)


@ -48,6 +48,10 @@ inline DB::UInt64 intHash64(DB::UInt64 x)
#include <arm_acle.h>
#endif
#if (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#include "vec_crc32.h"
#endif
#if defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
#include <crc32-s390x.h>
@ -87,6 +91,8 @@ inline DB::UInt64 intHashCRC32(DB::UInt64 x)
return _mm_crc32_u64(-1ULL, x);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cd(-1U, x);
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return crc32_ppc(-1U, reinterpret_cast<const unsigned char *>(&x), sizeof(x));
#elif defined(__s390x__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return s390x_crc32(-1U, x);
#else
@ -101,6 +107,8 @@ inline DB::UInt64 intHashCRC32(DB::UInt64 x, DB::UInt64 updated_value)
return _mm_crc32_u64(updated_value, x);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cd(static_cast<UInt32>(updated_value), x);
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return crc32_ppc(updated_value, reinterpret_cast<const unsigned char *>(&x), sizeof(x));
#elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
return s390x_crc32(updated_value, x);
#else


@ -91,14 +91,20 @@ static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, con
expected.dumpStructure()),
code);
if (isColumnConst(*actual.column) && isColumnConst(*expected.column))
if (isColumnConst(*actual.column) && isColumnConst(*expected.column)
&& !actual.column->empty() && !expected.column->empty()) /// don't check values in empty columns
{
Field actual_value = assert_cast<const ColumnConst &>(*actual.column).getField();
Field expected_value = assert_cast<const ColumnConst &>(*expected.column).getField();
if (actual_value != expected_value)
return onError<ReturnType>("Block structure mismatch in " + std::string(context_description) + " stream: different values of constants, actual: "
+ applyVisitor(FieldVisitorToString(), actual_value) + ", expected: " + applyVisitor(FieldVisitorToString(), expected_value),
return onError<ReturnType>(
fmt::format(
"Block structure mismatch in {} stream: different values of constants in column '{}': actual: {}, expected: {}",
context_description,
actual.name,
applyVisitor(FieldVisitorToString(), actual_value),
applyVisitor(FieldVisitorToString(), expected_value)),
code);
}


@ -86,6 +86,10 @@ if (TARGET ch_contrib::rapidjson)
list (APPEND PRIVATE_LIBS ch_contrib::rapidjson)
endif()
if (TARGET ch_contrib::crc32-vpmsum)
list (APPEND PUBLIC_LIBS ch_contrib::crc32-vpmsum)
endif()
add_subdirectory(GatherUtils)
list (APPEND PRIVATE_LIBS clickhouse_functions_gatherutils)


@ -14,6 +14,10 @@
#include <city.h>
#if (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#include "vec_crc32.h"
#endif
namespace DB
{
@ -36,6 +40,8 @@ struct Hash
return _mm_crc32_u64(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cd(static_cast<UInt32>(crc), val);
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return crc32_ppc(crc, reinterpret_cast<const unsigned char *>(&val), sizeof(val));
#elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
return s390x_crc32(crc, val);
#else
@ -49,6 +55,8 @@ struct Hash
return _mm_crc32_u32(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cw(crc, val);
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return crc32_ppc(crc, reinterpret_cast<const unsigned char *>(&val), sizeof(val));
#elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
return s390x_crc32_u32(crc, val);
#else
@ -62,6 +70,8 @@ struct Hash
return _mm_crc32_u16(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32ch(crc, val);
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return crc32_ppc(crc, reinterpret_cast<const unsigned char *>(&val), sizeof(val));
#elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
return s390x_crc32_u16(crc, val);
#else
@ -75,6 +85,8 @@ struct Hash
return _mm_crc32_u8(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cb(crc, val);
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return crc32_ppc(crc, reinterpret_cast<const unsigned char *>(&val), sizeof(val));
#elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
return s390x_crc32_u8(crc, val);
#else


@ -24,6 +24,10 @@
# include <arm_acle.h>
#endif
#if (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#include "vec_crc32.h"
#endif
namespace DB
{
/** Distance function implementation.
@ -70,6 +74,8 @@ struct NgramDistanceImpl
return _mm_crc32_u64(code_points[2], combined) & 0xFFFFu;
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32cd(code_points[2], combined) & 0xFFFFu;
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return crc32_ppc(code_points[2], reinterpret_cast<const unsigned char *>(&combined), sizeof(combined)) & 0xFFFFu;
#elif defined(__s390x__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return s390x_crc32(code_points[2], combined) & 0xFFFFu;
#else


@ -320,11 +320,18 @@ void PocoHTTPClient::makeRequestInternal(
const std::string & query = target_uri.getRawQuery();
const std::string reserved = "?#:;+@&=%"; /// Poco::URI::RESERVED_QUERY_PARAM without '/' plus percent sign.
Poco::URI::encode(target_uri.getPath(), reserved, path_and_query);
if (!query.empty())
{
path_and_query += '?';
path_and_query += query;
}
/// `target_uri.getPath()` could return an empty string, but a proper HTTP request must
/// always contain a non-empty URI in its first line (e.g. "POST / HTTP/1.1").
if (path_and_query.empty())
path_and_query = "/";
poco_request.setURI(path_and_query);
switch (request.GetMethod())
@ -366,11 +373,12 @@ void PocoHTTPClient::makeRequestInternal(
if (enable_s3_requests_logging)
LOG_TEST(log, "Writing request body.");
if (attempt > 0) /// rewind content body buffer.
{
request.GetContentBody()->clear();
request.GetContentBody()->seekg(0);
}
/// Rewind content body buffer.
/// NOTE: we should do that always (even if `attempt == 0`) because the same request can be retried also by AWS,
/// see retryStrategy in Aws::Client::ClientConfiguration.
request.GetContentBody()->clear();
request.GetContentBody()->seekg(0);
auto size = Poco::StreamCopier::copyStream(*request.GetContentBody(), request_body_stream);
if (enable_s3_requests_logging)
LOG_TEST(log, "Written {} bytes to request body", size);
@ -385,8 +393,16 @@ void PocoHTTPClient::makeRequestInternal(
int status_code = static_cast<int>(poco_response.getStatus());
if (enable_s3_requests_logging)
LOG_TEST(log, "Response status: {}, {}", status_code, poco_response.getReason());
if (status_code >= SUCCESS_RESPONSE_MIN && status_code <= SUCCESS_RESPONSE_MAX)
{
if (enable_s3_requests_logging)
LOG_TEST(log, "Response status: {}, {}", status_code, poco_response.getReason());
}
else
{
/// Error statuses are more important so we show them even if `enable_s3_requests_logging == false`.
LOG_INFO(log, "Response status: {}, {}", status_code, poco_response.getReason());
}
if (poco_response.getStatus() == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT)
{


@ -13,6 +13,7 @@
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <Core/SortDescription.h>
#include <Planner/PlannerActionsVisitor.h>
#include <stack>
#include <base/sort.h>
@ -216,6 +217,22 @@ const ActionsDAG::Node & ActionsDAG::addFunction(
all_const);
}
const ActionsDAG::Node & ActionsDAG::addCast(const Node & node_to_cast, const DataTypePtr & cast_type)
{
Field cast_type_constant_value(cast_type->getName());
ColumnWithTypeAndName column;
column.name = calculateConstantActionNodeName(cast_type_constant_value);
column.column = DataTypeString().createColumnConst(0, cast_type_constant_value);
column.type = std::make_shared<DataTypeString>();
const auto * cast_type_constant_node = &addColumn(std::move(column));
ActionsDAG::NodeRawConstPtrs children = {&node_to_cast, cast_type_constant_node};
FunctionOverloadResolverPtr func_builder_cast = CastInternalOverloadResolver<CastType::nonAccurate>::createImpl();
return addFunction(func_builder_cast, std::move(children), node_to_cast.result_name);
}
const ActionsDAG::Node & ActionsDAG::addFunctionImpl(
const FunctionBasePtr & function_base,
NodeRawConstPtrs children,


@ -143,6 +143,7 @@ public:
const FunctionBasePtr & function_base,
NodeRawConstPtrs children,
std::string result_name);
const Node & addCast(const Node & node_to_cast, const DataTypePtr & cast_type);
/// Find first column by name in output nodes. This search is linear.
const Node & findInOutputs(const std::string & name) const;
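A hedged sketch of how the new helper is used (mirroring the JOIN planning change later in this diff; `dag` and the chosen type are stand-ins):

```cpp
// Wrap an existing DAG output node in CAST(node AS cast_type). The helper
// builds the type-name constant and the CAST function node internally,
// replacing the hand-rolled version previously inlined in PlannerJoinTree.cpp.
const ActionsDAG::Node * output_node = dag->getOutputs().front();
DataTypePtr cast_type = std::make_shared<DataTypeUInt64>();
output_node = &dag->addCast(*output_node, cast_type);
```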


@ -78,7 +78,7 @@ BlockIO InterpreterDescribeQuery::execute()
if (settings.allow_experimental_analyzer)
{
SelectQueryOptions select_query_options;
names_and_types = InterpreterSelectQueryAnalyzer(select_query, select_query_options, current_context).getSampleBlock().getNamesAndTypesList();
names_and_types = InterpreterSelectQueryAnalyzer(select_query, current_context, select_query_options).getSampleBlock().getNamesAndTypesList();
}
else
{


@ -423,7 +423,7 @@ QueryPipeline InterpreterExplainQuery::executeImpl()
if (getContext()->getSettingsRef().allow_experimental_analyzer)
{
InterpreterSelectQueryAnalyzer interpreter(ast.getExplainedQuery(), options, getContext());
InterpreterSelectQueryAnalyzer interpreter(ast.getExplainedQuery(), getContext(), options);
context = interpreter.getContext();
plan = std::move(interpreter).extractQueryPlan();
}
@ -469,7 +469,7 @@ QueryPipeline InterpreterExplainQuery::executeImpl()
if (getContext()->getSettingsRef().allow_experimental_analyzer)
{
InterpreterSelectQueryAnalyzer interpreter(ast.getExplainedQuery(), options, getContext());
InterpreterSelectQueryAnalyzer interpreter(ast.getExplainedQuery(), getContext(), options);
context = interpreter.getContext();
plan = std::move(interpreter).extractQueryPlan();
}


@ -126,7 +126,7 @@ std::unique_ptr<IInterpreter> InterpreterFactory::get(ASTPtr & query, ContextMut
if (query->as<ASTSelectQuery>())
{
if (context->getSettingsRef().allow_experimental_analyzer)
return std::make_unique<InterpreterSelectQueryAnalyzer>(query, options, context);
return std::make_unique<InterpreterSelectQueryAnalyzer>(query, context, options);
/// This is internal part of ASTSelectWithUnionQuery.
/// Even if there is SELECT without union, it is represented by ASTSelectWithUnionQuery with single ASTSelectQuery as a child.
@ -137,7 +137,7 @@ std::unique_ptr<IInterpreter> InterpreterFactory::get(ASTPtr & query, ContextMut
ProfileEvents::increment(ProfileEvents::SelectQuery);
if (context->getSettingsRef().allow_experimental_analyzer)
return std::make_unique<InterpreterSelectQueryAnalyzer>(query, options, context);
return std::make_unique<InterpreterSelectQueryAnalyzer>(query, context, options);
return std::make_unique<InterpreterSelectWithUnionQuery>(query, context, options);
}


@ -45,13 +45,17 @@ ASTPtr normalizeAndValidateQuery(const ASTPtr & query)
}
}
QueryTreeNodePtr buildQueryTreeAndRunPasses(const ASTPtr & query, const ContextPtr & context)
QueryTreeNodePtr buildQueryTreeAndRunPasses(const ASTPtr & query, const SelectQueryOptions & select_query_options, const ContextPtr & context)
{
auto query_tree = buildQueryTree(query, context);
QueryTreePassManager query_tree_pass_manager(context);
addQueryTreePasses(query_tree_pass_manager);
query_tree_pass_manager.run(query_tree);
if (select_query_options.ignore_ast_optimizations)
query_tree_pass_manager.run(query_tree, 1 /*up_to_pass_index*/);
else
query_tree_pass_manager.run(query_tree);
return query_tree;
}
@ -60,24 +64,24 @@ QueryTreeNodePtr buildQueryTreeAndRunPasses(const ASTPtr & query, const ContextP
InterpreterSelectQueryAnalyzer::InterpreterSelectQueryAnalyzer(
const ASTPtr & query_,
const SelectQueryOptions & select_query_options_,
ContextPtr context_)
const ContextPtr & context_,
const SelectQueryOptions & select_query_options_)
: query(normalizeAndValidateQuery(query_))
, query_tree(buildQueryTreeAndRunPasses(query, context_))
, context(Context::createCopy(context_))
, select_query_options(select_query_options_)
, context(std::move(context_))
, query_tree(buildQueryTreeAndRunPasses(query, select_query_options, context))
, planner(query_tree, select_query_options)
{
}
InterpreterSelectQueryAnalyzer::InterpreterSelectQueryAnalyzer(
const QueryTreeNodePtr & query_tree_,
const SelectQueryOptions & select_query_options_,
ContextPtr context_)
const ContextPtr & context_,
const SelectQueryOptions & select_query_options_)
: query(query_tree_->toAST())
, query_tree(query_tree_)
, context(Context::createCopy(context_))
, select_query_options(select_query_options_)
, context(std::move(context_))
, query_tree(query_tree_)
, planner(query_tree, select_query_options)
{
}
@ -122,4 +126,17 @@ void InterpreterSelectQueryAnalyzer::extendQueryLogElemImpl(QueryLogElement & el
elem.query_kind = "Select";
}
void InterpreterSelectQueryAnalyzer::setMergeTreeReadTaskCallbackAndClientInfo(MergeTreeReadTaskCallback && callback)
{
context->getClientInfo().collaborate_with_initiator = true;
context->setMergeTreeReadTaskCallback(std::move(callback));
}
void InterpreterSelectQueryAnalyzer::setProperClientInfo(size_t replica_number, size_t count_participating_replicas)
{
context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;
context->getClientInfo().number_of_current_replica = replica_number;
context->getClientInfo().count_participating_replicas = count_participating_replicas;
}
}


@ -3,11 +3,11 @@
#include <Interpreters/IInterpreter.h>
#include <Interpreters/SelectQueryOptions.h>
#include <Analyzer/QueryTreePassManager.h>
#include <Storages/MergeTree/RequestResponse.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <Interpreters/Context_fwd.h>
#include <Analyzer/QueryTreePassManager.h>
#include <Planner/Planner.h>
#include <Interpreters/Context_fwd.h>
namespace DB
{
@ -17,20 +17,15 @@ class InterpreterSelectQueryAnalyzer : public IInterpreter
public:
/// Initialize interpreter with query AST
InterpreterSelectQueryAnalyzer(const ASTPtr & query_,
const SelectQueryOptions & select_query_options_,
ContextPtr context_);
const ContextPtr & context_,
const SelectQueryOptions & select_query_options_);
/// Initialize interpreter with query tree
InterpreterSelectQueryAnalyzer(const QueryTreeNodePtr & query_tree_,
const SelectQueryOptions & select_query_options_,
ContextPtr context_);
const ContextPtr & context_,
const SelectQueryOptions & select_query_options_);
const ContextPtr & getContext() const
{
return context;
}
ContextPtr & getContext()
ContextPtr getContext() const
{
return context;
}
@ -51,11 +46,17 @@ public:
void extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, ContextPtr) const override;
/// Set merge tree read task callback in context and set collaborate_with_initiator in client info
void setMergeTreeReadTaskCallbackAndClientInfo(MergeTreeReadTaskCallback && callback);
/// Set number_of_current_replica and count_participating_replicas in client_info
void setProperClientInfo(size_t replica_number, size_t count_participating_replicas);
private:
ASTPtr query;
QueryTreeNodePtr query_tree;
ContextMutablePtr context;
SelectQueryOptions select_query_options;
ContextPtr context;
QueryTreeNodePtr query_tree;
Planner planner;
};
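For reference, a hedged sketch of the updated call shape (the context now precedes the options at every call site touched by this diff):

```cpp
// Query AST first, then context, then options.
auto interpreter = std::make_unique<InterpreterSelectQueryAnalyzer>(
    query, context, SelectQueryOptions(QueryProcessingStage::Complete));
auto io = interpreter->execute();
```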


@ -1,13 +1,17 @@
#include <Interpreters/LogicalExpressionsOptimizer.h>
#include <Interpreters/IdentifierSemantic.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <Core/Settings.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTIdentifier.h>
#include <Common/typeid_cast.h>
#include <deque>
#include <vector>
#include <base/sort.h>
@ -32,8 +36,9 @@ bool LogicalExpressionsOptimizer::OrWithExpression::operator<(const OrWithExpres
return std::tie(this->or_function, this->expression) < std::tie(rhs.or_function, rhs.expression);
}
LogicalExpressionsOptimizer::LogicalExpressionsOptimizer(ASTSelectQuery * select_query_, UInt64 optimize_min_equality_disjunction_chain_length)
: select_query(select_query_), settings(optimize_min_equality_disjunction_chain_length)
LogicalExpressionsOptimizer::LogicalExpressionsOptimizer(ASTSelectQuery * select_query_,
const TablesWithColumns & tables_with_columns_, UInt64 optimize_min_equality_disjunction_chain_length)
: select_query(select_query_), tables_with_columns(tables_with_columns_), settings(optimize_min_equality_disjunction_chain_length)
{
}
@ -196,13 +201,41 @@ inline ASTs & getFunctionOperands(const ASTFunction * or_function)
}
bool LogicalExpressionsOptimizer::isLowCardinalityEqualityChain(const std::vector<ASTFunction *> & functions) const
{
if (functions.size() > 1)
{
/// Check if identifier is LowCardinality type
auto & first_operands = getFunctionOperands(functions[0]);
const auto * identifier = first_operands[0]->as<ASTIdentifier>();
if (identifier)
{
auto pos = IdentifierSemantic::getMembership(*identifier);
if (!pos)
pos = IdentifierSemantic::chooseTableColumnMatch(*identifier, tables_with_columns, true);
if (pos)
{
if (auto data_type_and_name = tables_with_columns[*pos].columns.tryGetByName(identifier->shortName()))
{
if (typeid_cast<const DataTypeLowCardinality *>(data_type_and_name->type.get()))
return true;
}
}
}
}
return false;
}
bool LogicalExpressionsOptimizer::mayOptimizeDisjunctiveEqualityChain(const DisjunctiveEqualityChain & chain) const
{
const auto & equalities = chain.second;
const auto & equality_functions = equalities.functions;
/// We eliminate too short chains.
if (equality_functions.size() < settings.optimize_min_equality_disjunction_chain_length)
/// For a LowCardinality column, the dictionary is usually small while the index is relatively large.
/// In most cases, merging the OR-chain into IN is better than converting the LowCardinality column to a full column once per equality.
/// For non-LowCardinality columns, we still eliminate chains that are too short.
if (equality_functions.size() < settings.optimize_min_equality_disjunction_chain_length &&
!isLowCardinalityEqualityChain(equality_functions))
return false;
/// We check that the right-hand sides of all equalities have the same type.


@ -1,6 +1,7 @@
#pragma once
#include <Parsers/IAST.h>
#include <Interpreters/DatabaseAndTableWithAlias.h>
#include <string>
#include <vector>
@ -36,7 +37,7 @@ class LogicalExpressionsOptimizer final
public:
/// Constructor. Accepts the root of the query DAG.
LogicalExpressionsOptimizer(ASTSelectQuery * select_query_, UInt64 optimize_min_equality_disjunction_chain_length);
LogicalExpressionsOptimizer(ASTSelectQuery * select_query_, const TablesWithColumns & tables_with_columns_, UInt64 optimize_min_equality_disjunction_chain_length);
/** Replace all rather long homogeneous OR-chains expr = x1 OR ... OR expr = xN
  * with the expression `expr` IN (x1, ..., xN).
@ -79,6 +80,9 @@ private:
*/
bool mayOptimizeDisjunctiveEqualityChain(const DisjunctiveEqualityChain & chain) const;
/// Check whether the chain is a LowCardinality OR chain
bool isLowCardinalityEqualityChain(const std::vector<ASTFunction *> & functions) const;
/// Insert the IN expression into the OR chain.
static void addInExpression(const DisjunctiveEqualityChain & chain);
@ -96,6 +100,7 @@ private:
using ColumnToPosition = std::unordered_map<const IAST *, size_t>;
ASTSelectQuery * select_query;
const TablesWithColumns & tables_with_columns;
const ExtractedSettings settings;
/// Information about the OR-chains inside the query.
DisjunctiveEqualityChainsMap disjunctive_equality_chains_map;
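A small standalone model of the updated decision (hypothetical signature; the real check inspects AST nodes and table metadata):

```cpp
#include <cstddef>

// A chain expr = x1 OR ... OR expr = xN is rewritten to expr IN (x1, ..., xN)
// when it is long enough, or when `expr` is a LowCardinality column with more
// than one equality in the chain.
bool mayOptimizeChain(size_t chain_length, size_t min_chain_length, bool is_low_cardinality)
{
    return chain_length >= min_chain_length || (is_low_cardinality && chain_length > 1);
}
```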


@ -1298,7 +1298,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
ASTPtr & query,
TreeRewriterResult && result,
const SelectQueryOptions & select_options,
const std::vector<TableWithColumnNamesAndTypes> & tables_with_columns,
const TablesWithColumns & tables_with_columns,
const Names & required_result_columns,
std::shared_ptr<TableJoin> table_join,
bool is_parameterized_view,
@ -1344,7 +1344,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
translateQualifiedNames(query, *select_query, source_columns_set, tables_with_columns, parameter_values, parameter_types);
/// Optimizes logical expressions.
LogicalExpressionsOptimizer(select_query, settings.optimize_min_equality_disjunction_chain_length.value).perform();
LogicalExpressionsOptimizer(select_query, tables_with_columns, settings.optimize_min_equality_disjunction_chain_length.value).perform();
NameSet all_source_columns_set = source_columns_set;
if (table_join)


@ -17,7 +17,6 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int UNSUPPORTED_METHOD;
}
namespace
@ -104,9 +103,6 @@ void collectTableExpressionData(QueryTreeNodePtr & query_node, PlannerContext &
bool storage_is_remote = table_function_node->getStorage()->isRemote();
table_expression_data.setIsRemote(storage_is_remote);
}
if (table_expression_data.isRemote())
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Remote storages are not supported");
}
CollectSourceColumnsVisitor collect_source_columns_visitor(planner_context);

(File diff suppressed because it is too large.)


@ -16,17 +16,24 @@ using GlobalPlannerContextPtr = std::shared_ptr<GlobalPlannerContext>;
class PlannerContext;
using PlannerContextPtr = std::shared_ptr<PlannerContext>;
struct PlannerConfiguration
{
bool only_analyze = false;
};
class Planner
{
public:
/// Initialize planner with query tree after analysis phase
Planner(const QueryTreeNodePtr & query_tree_,
const SelectQueryOptions & select_query_options_);
const SelectQueryOptions & select_query_options_,
PlannerConfiguration planner_configuration_ = {});
/// Initialize planner with query tree after query analysis phase and global planner context
Planner(const QueryTreeNodePtr & query_tree_,
const SelectQueryOptions & select_query_options_,
GlobalPlannerContextPtr global_planner_context_);
GlobalPlannerContextPtr global_planner_context_,
PlannerConfiguration planner_configuration_ = {});
const QueryPlan & getQueryPlan() const
{
@ -48,10 +55,15 @@ public:
void addStorageLimits(const StorageLimitsList & limits);
private:
void buildPlanForUnionNode();
void buildPlanForQueryNode();
QueryTreeNodePtr query_tree;
QueryPlan query_plan;
SelectQueryOptions select_query_options;
PlannerContextPtr planner_context;
PlannerConfiguration planner_configuration;
StorageLimitsList storage_limits;
};


@ -3,7 +3,6 @@
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/CastOverloadResolver.h>
#include <Access/Common/AccessFlags.h>
#include <Access/ContextAccess.h>
@ -11,6 +10,7 @@
#include <Storages/IStorage.h>
#include <Storages/StorageDictionary.h>
#include <Analyzer/ConstantNode.h>
#include <Analyzer/ColumnNode.h>
#include <Analyzer/TableNode.h>
#include <Analyzer/TableFunctionNode.h>
@ -18,6 +18,7 @@
#include <Analyzer/UnionNode.h>
#include <Analyzer/JoinNode.h>
#include <Analyzer/ArrayJoinNode.h>
#include <Analyzer/Utils.h>
#include <Processors/Sources/NullSource.h>
#include <Processors/QueryPlan/SortingStep.h>
@ -49,6 +50,9 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
extern const int SYNTAX_ERROR;
extern const int ACCESS_DENIED;
extern const int PARAMETER_OUT_OF_BOUND;
extern const int TOO_MANY_COLUMNS;
extern const int UNSUPPORTED_METHOD;
}
namespace
@ -139,11 +143,17 @@ NameAndTypePair chooseSmallestColumnToReadFromStorage(const StoragePtr & storage
return result;
}
QueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expression,
SelectQueryInfo & select_query_info,
JoinTreeQueryPlan buildQueryPlanForTableExpression(const QueryTreeNodePtr & table_expression,
const SelectQueryInfo & select_query_info,
const SelectQueryOptions & select_query_options,
PlannerContextPtr & planner_context)
PlannerContextPtr & planner_context,
bool is_single_table_expression)
{
const auto & query_context = planner_context->getQueryContext();
const auto & settings = query_context->getSettingsRef();
QueryProcessingStage::Enum from_stage = QueryProcessingStage::Enum::FetchColumns;
auto * table_node = table_expression->as<TableNode>();
auto * table_function_node = table_expression->as<TableFunctionNode>();
auto * query_node = table_expression->as<QueryNode>();
@ -161,14 +171,93 @@ QueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expression,
auto table_expression_query_info = select_query_info;
table_expression_query_info.table_expression = table_expression;
size_t max_streams = settings.max_threads;
size_t max_threads_execute_query = settings.max_threads;
/** With distributed query processing, almost no computation is done in the threads;
  * they mostly wait for and receive data from remote servers.
  * If we have 20 remote servers and max_threads = 8, then it would not be efficient to
  * connect to and query only 8 servers at a time.
  * To query more remote servers simultaneously,
  * max_distributed_connections is used instead of max_threads.
  */
bool is_remote = table_expression_data.isRemote();
if (is_remote)
{
max_streams = settings.max_distributed_connections;
max_threads_execute_query = settings.max_distributed_connections;
}
UInt64 max_block_size = settings.max_block_size;
auto & main_query_node = select_query_info.query_tree->as<QueryNode &>();
if (is_single_table_expression)
{
size_t limit_length = 0;
if (main_query_node.hasLimit())
{
/// Constness of limit is validated during query analysis stage
limit_length = main_query_node.getLimit()->as<ConstantNode &>().getValue().safeGet<UInt64>();
}
size_t limit_offset = 0;
if (main_query_node.hasOffset())
{
/// Constness of offset is validated during query analysis stage
limit_offset = main_query_node.getOffset()->as<ConstantNode &>().getValue().safeGet<UInt64>();
}
/** If DISTINCT, WHERE, GROUP BY, HAVING, ORDER BY, JOIN, LIMIT BY, and LIMIT WITH TIES are not specified,
  * but LIMIT is specified, and limit + offset < max_block_size,
  * then we use limit + offset as the block size (so as not to read more from the table than requested),
  * and also set the number of threads to 1.
  */
if (main_query_node.hasLimit() &&
!main_query_node.isDistinct() &&
!main_query_node.isLimitWithTies() &&
!main_query_node.hasPrewhere() &&
!main_query_node.hasWhere() &&
select_query_info.filter_asts.empty() &&
!main_query_node.hasGroupBy() &&
!main_query_node.hasHaving() &&
!main_query_node.hasOrderBy() &&
!main_query_node.hasLimitBy() &&
!select_query_info.need_aggregate &&
!select_query_info.has_window &&
limit_length <= std::numeric_limits<UInt64>::max() - limit_offset)
{
if (limit_length + limit_offset < max_block_size)
{
max_block_size = std::max<UInt64>(1, limit_length + limit_offset);
max_streams = 1;
max_threads_execute_query = 1;
}
if (limit_length + limit_offset < select_query_info.local_storage_limits.local_limits.size_limits.max_rows)
{
table_expression_query_info.limit = limit_length + limit_offset;
}
}
if (!max_block_size)
throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND,
"Setting 'max_block_size' cannot be zero");
}
if (max_streams == 0)
max_streams = 1;
/// If necessary, we request more sources than the number of threads - to distribute the work evenly over the threads.
if (max_streams > 1 && !is_remote)
max_streams = static_cast<size_t>(max_streams * settings.max_streams_to_max_threads_ratio);
if (table_node)
table_expression_query_info.table_expression_modifiers = table_node->getTableExpressionModifiers();
else
table_expression_query_info.table_expression_modifiers = table_function_node->getTableExpressionModifiers();
auto & query_context = planner_context->getQueryContext();
auto from_stage = storage->getQueryProcessingStage(query_context, select_query_options.to_stage, storage_snapshot, table_expression_query_info);
from_stage = storage->getQueryProcessingStage(query_context, select_query_options.to_stage, storage_snapshot, table_expression_query_info);
Names columns_names = table_expression_data.getColumnNames();
@ -183,6 +272,13 @@ QueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expression,
checkAccessRights(*table_node, column_names_with_aliases, planner_context->getQueryContext());
}
/// Limitation on the number of columns to read
if (settings.max_columns_to_read && columns_names.size() > settings.max_columns_to_read)
throw Exception(ErrorCodes::TOO_MANY_COLUMNS,
"Limit for number of columns to read exceeded. Requested: {}, maximum: {}",
columns_names.size(),
settings.max_columns_to_read);
if (columns_names.empty())
{
auto additional_column_to_read = chooseSmallestColumnToReadFromStorage(storage, storage_snapshot);
@ -191,9 +287,6 @@ QueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expression,
table_expression_data.addColumn(additional_column_to_read, column_identifier);
}
size_t max_block_size = query_context->getSettingsRef().max_block_size;
size_t max_streams = query_context->getSettingsRef().max_threads;
bool need_rewrite_query_with_final = storage->needRewriteQueryWithFinal(columns_names);
if (need_rewrite_query_with_final)
{
@ -217,9 +310,21 @@ QueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expression,
storage->read(query_plan, columns_names, storage_snapshot, table_expression_query_info, query_context, from_stage, max_block_size, max_streams);
/// Create step which reads from empty source if storage has no data.
if (!query_plan.isInitialized())
if (query_plan.isInitialized())
{
/** Specify the number of threads only if it wasn't specified in the storage.
  *
  * In the case of a remote query with prefer_localhost_replica=1 (the default),
  * the inner local query (which runs in the same process, without network
  * interaction) will call setMaxThreads earlier, and the distributed
  * query will not update it.
  */
if (!query_plan.getMaxThreads() || is_remote)
query_plan.setMaxThreads(max_threads_execute_query);
}
else
{
/// Create step which reads from empty source if storage has no data.
auto source_header = storage_snapshot->getSampleBlockForColumns(columns_names);
Pipe pipe(std::make_shared<NullSource>(source_header));
auto read_from_pipe = std::make_unique<ReadFromPreparedSource>(std::move(pipe));
@ -239,50 +344,52 @@ QueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expression,
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected table, table function, query or union. Actual {}", table_expression->formatASTForErrorMessage());
}
auto rename_actions_dag = std::make_shared<ActionsDAG>(query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName());
ActionsDAG::NodeRawConstPtrs updated_actions_dag_outputs;
for (auto & output_node : rename_actions_dag->getOutputs())
if (from_stage == QueryProcessingStage::FetchColumns)
{
const auto * column_identifier = table_expression_data.getColumnIdentifierOrNull(output_node->result_name);
if (!column_identifier)
continue;
auto rename_actions_dag = std::make_shared<ActionsDAG>(query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName());
ActionsDAG::NodeRawConstPtrs updated_actions_dag_outputs;
updated_actions_dag_outputs.push_back(&rename_actions_dag->addAlias(*output_node, *column_identifier));
for (auto & output_node : rename_actions_dag->getOutputs())
{
const auto * column_identifier = table_expression_data.getColumnIdentifierOrNull(output_node->result_name);
if (!column_identifier)
continue;
updated_actions_dag_outputs.push_back(&rename_actions_dag->addAlias(*output_node, *column_identifier));
}
rename_actions_dag->getOutputs() = std::move(updated_actions_dag_outputs);
auto rename_step = std::make_unique<ExpressionStep>(query_plan.getCurrentDataStream(), rename_actions_dag);
rename_step->setStepDescription("Change column names to column identifiers");
query_plan.addStep(std::move(rename_step));
}
rename_actions_dag->getOutputs() = std::move(updated_actions_dag_outputs);
auto rename_step = std::make_unique<ExpressionStep>(query_plan.getCurrentDataStream(), rename_actions_dag);
rename_step->setStepDescription("Change column names to column identifiers");
query_plan.addStep(std::move(rename_step));
return query_plan;
return {std::move(query_plan), from_stage};
}
QueryPlan buildQueryPlanForJoinNode(QueryTreeNodePtr join_tree_node,
SelectQueryInfo & select_query_info,
const SelectQueryOptions & select_query_options,
JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_expression,
JoinTreeQueryPlan left_join_tree_query_plan,
JoinTreeQueryPlan right_join_tree_query_plan,
const ColumnIdentifierSet & outer_scope_columns,
PlannerContextPtr & planner_context)
{
auto & join_node = join_tree_node->as<JoinNode &>();
auto & join_node = join_table_expression->as<JoinNode &>();
if (left_join_tree_query_plan.from_stage != QueryProcessingStage::FetchColumns)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
"JOIN {} left table expression expected to process query to fetch columns stage. Actual {}",
join_node.formatASTForErrorMessage(),
QueryProcessingStage::toString(left_join_tree_query_plan.from_stage));
ColumnIdentifierSet current_scope_columns = outer_scope_columns;
collectTopLevelColumnIdentifiers(join_tree_node, planner_context, current_scope_columns);
auto left_plan = buildQueryPlanForJoinTreeNode(join_node.getLeftTableExpression(),
select_query_info,
select_query_options,
current_scope_columns,
planner_context);
auto left_plan = std::move(left_join_tree_query_plan.query_plan);
auto left_plan_output_columns = left_plan.getCurrentDataStream().header.getColumnsWithTypeAndName();
if (right_join_tree_query_plan.from_stage != QueryProcessingStage::FetchColumns)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
"JOIN {} right table expression expected to process query to fetch columns stage. Actual {}",
join_node.formatASTForErrorMessage(),
QueryProcessingStage::toString(right_join_tree_query_plan.from_stage));
auto right_plan = buildQueryPlanForJoinTreeNode(join_node.getRightTableExpression(),
select_query_info,
select_query_options,
current_scope_columns,
planner_context);
auto right_plan = std::move(right_join_tree_query_plan.query_plan);
auto right_plan_output_columns = right_plan.getCurrentDataStream().header.getColumnsWithTypeAndName();
JoinClausesAndActions join_clauses_and_actions;
@ -291,7 +398,7 @@ QueryPlan buildQueryPlanForJoinNode(QueryTreeNodePtr join_tree_node,
std::optional<bool> join_constant;
if (join_node.getStrictness() == JoinStrictness::All)
join_constant = tryExtractConstantFromJoinNode(join_tree_node);
join_constant = tryExtractConstantFromJoinNode(join_table_expression);
if (join_constant)
{
@ -308,7 +415,7 @@ QueryPlan buildQueryPlanForJoinNode(QueryTreeNodePtr join_tree_node,
{
join_clauses_and_actions = buildJoinClausesAndActions(left_plan_output_columns,
right_plan_output_columns,
join_tree_node,
join_table_expression,
planner_context);
join_clauses_and_actions.left_join_expressions_actions->projectInput();
@ -365,22 +472,7 @@ QueryPlan buildQueryPlanForJoinNode(QueryTreeNodePtr join_tree_node,
continue;
const auto & cast_type = it->second;
auto cast_type_name = cast_type->getName();
Field cast_type_constant_value(cast_type_name);
ColumnWithTypeAndName column;
column.name = calculateConstantActionNodeName(cast_type_constant_value);
column.column = DataTypeString().createColumnConst(0, cast_type_constant_value);
column.type = std::make_shared<DataTypeString>();
const auto * cast_type_constant_node = &cast_actions_dag->addColumn(std::move(column));
FunctionCastBase::Diagnostic diagnostic = {output_node->result_name, output_node->result_name};
FunctionOverloadResolverPtr func_builder_cast
= CastInternalOverloadResolver<CastType::nonAccurate>::createImpl(std::move(diagnostic));
ActionsDAG::NodeRawConstPtrs children = {output_node, cast_type_constant_node};
output_node = &cast_actions_dag->addFunction(func_builder_cast, std::move(children), output_node->result_name);
output_node = &cast_actions_dag->addCast(*output_node, cast_type);
}
cast_actions_dag->projectInput();
@ -575,12 +667,10 @@ QueryPlan buildQueryPlanForJoinNode(QueryTreeNodePtr join_tree_node,
if (join_algorithm->isFilled())
{
size_t max_block_size = query_context->getSettingsRef().max_block_size;
auto filled_join_step = std::make_unique<FilledJoinStep>(
left_plan.getCurrentDataStream(),
join_algorithm,
max_block_size);
settings.max_block_size);
filled_join_step->setStepDescription("Filled JOIN");
left_plan.addStep(std::move(filled_join_step));
@ -645,16 +735,13 @@ QueryPlan buildQueryPlanForJoinNode(QueryTreeNodePtr join_tree_node,
add_sorting(right_plan, join_clause.key_names_right, JoinTableSide::Right);
}
size_t max_block_size = query_context->getSettingsRef().max_block_size;
size_t max_streams = query_context->getSettingsRef().max_threads;
JoinPipelineType join_pipeline_type = join_algorithm->pipelineType();
auto join_pipeline_type = join_algorithm->pipelineType();
auto join_step = std::make_unique<JoinStep>(
left_plan.getCurrentDataStream(),
right_plan.getCurrentDataStream(),
std::move(join_algorithm),
max_block_size,
max_streams,
settings.max_block_size,
settings.max_threads,
false /*optimize_read_in_order*/);
join_step->setStepDescription(fmt::format("JOIN {}", join_pipeline_type));
@ -690,22 +777,21 @@ QueryPlan buildQueryPlanForJoinNode(QueryTreeNodePtr join_tree_node,
drop_unused_columns_after_join_transform_step->setStepDescription("DROP unused columns after JOIN");
result_plan.addStep(std::move(drop_unused_columns_after_join_transform_step));
return result_plan;
return {std::move(result_plan), QueryProcessingStage::FetchColumns};
}
QueryPlan buildQueryPlanForArrayJoinNode(QueryTreeNodePtr table_expression,
SelectQueryInfo & select_query_info,
const SelectQueryOptions & select_query_options,
const ColumnIdentifierSet & outer_scope_columns,
JoinTreeQueryPlan buildQueryPlanForArrayJoinNode(const QueryTreeNodePtr & array_join_table_expression,
JoinTreeQueryPlan join_tree_query_plan,
PlannerContextPtr & planner_context)
{
auto & array_join_node = table_expression->as<ArrayJoinNode &>();
auto & array_join_node = array_join_table_expression->as<ArrayJoinNode &>();
if (join_tree_query_plan.from_stage != QueryProcessingStage::FetchColumns)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
"ARRAY JOIN {} table expression expected to process query to fetch columns stage. Actual {}",
array_join_node.formatASTForErrorMessage(),
QueryProcessingStage::toString(join_tree_query_plan.from_stage));
auto plan = buildQueryPlanForJoinTreeNode(array_join_node.getTableExpression(),
select_query_info,
select_query_options,
outer_scope_columns,
planner_context);
auto plan = std::move(join_tree_query_plan.query_plan);
auto plan_output_columns = plan.getCurrentDataStream().header.getColumnsWithTypeAndName();
ActionsDAGPtr array_join_action_dag = std::make_shared<ActionsDAG>(plan_output_columns);
@ -736,46 +822,89 @@ QueryPlan buildQueryPlanForArrayJoinNode(QueryTreeNodePtr table_expression,
array_join_step->setStepDescription("ARRAY JOIN");
plan.addStep(std::move(array_join_step));
return plan;
return {std::move(plan), QueryProcessingStage::FetchColumns};
}
}
QueryPlan buildQueryPlanForJoinTreeNode(QueryTreeNodePtr join_tree_node,
SelectQueryInfo & select_query_info,
JoinTreeQueryPlan buildJoinTreeQueryPlan(const QueryTreeNodePtr & query_node,
const SelectQueryInfo & select_query_info,
const SelectQueryOptions & select_query_options,
const ColumnIdentifierSet & outer_scope_columns,
PlannerContextPtr & planner_context)
{
auto join_tree_node_type = join_tree_node->getNodeType();
const auto & query_node_typed = query_node->as<QueryNode &>();
auto table_expressions_stack = buildTableExpressionsStack(query_node_typed.getJoinTree());
size_t table_expressions_stack_size = table_expressions_stack.size();
bool is_single_table_expression = table_expressions_stack_size == 1;
switch (join_tree_node_type)
std::vector<ColumnIdentifierSet> table_expressions_outer_scope_columns(table_expressions_stack_size);
ColumnIdentifierSet current_outer_scope_columns = outer_scope_columns;
for (Int64 i = table_expressions_stack_size - 1; i >= 0; --i)
{
case QueryTreeNodeType::TABLE:
[[fallthrough]];
case QueryTreeNodeType::TABLE_FUNCTION:
[[fallthrough]];
case QueryTreeNodeType::QUERY:
[[fallthrough]];
case QueryTreeNodeType::UNION:
table_expressions_outer_scope_columns[i] = current_outer_scope_columns;
if (table_expressions_stack[i]->getNodeType() == QueryTreeNodeType::JOIN)
collectTopLevelColumnIdentifiers(table_expressions_stack[i], planner_context, current_outer_scope_columns);
}
std::vector<JoinTreeQueryPlan> query_plans_stack;
for (size_t i = 0; i < table_expressions_stack_size; ++i)
{
const auto & table_expression = table_expressions_stack[i];
if (auto * array_join_node = table_expression->as<ArrayJoinNode>())
{
return buildQueryPlanForTableExpression(join_tree_node, select_query_info, select_query_options, planner_context);
if (query_plans_stack.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expected at least 1 query plan on stack before ARRAY JOIN processing");
auto query_plan = std::move(query_plans_stack.back());
query_plans_stack.back() = buildQueryPlanForArrayJoinNode(table_expression,
std::move(query_plan),
planner_context);
}
case QueryTreeNodeType::JOIN:
else if (auto * join_node = table_expression->as<JoinNode>())
{
return buildQueryPlanForJoinNode(join_tree_node, select_query_info, select_query_options, outer_scope_columns, planner_context);
size_t table_expressions_column_nodes_with_names_stack_size = query_plans_stack.size();
if (table_expressions_column_nodes_with_names_stack_size < 2)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expected at least 2 query plans on stack before JOIN processing. Actual {}",
table_expressions_column_nodes_with_names_stack_size);
auto right_query_plan = std::move(query_plans_stack.back());
query_plans_stack.pop_back();
auto left_query_plan = std::move(query_plans_stack.back());
query_plans_stack.pop_back();
query_plans_stack.push_back(buildQueryPlanForJoinNode(table_expression,
std::move(left_query_plan),
std::move(right_query_plan),
table_expressions_outer_scope_columns[i],
planner_context));
}
case QueryTreeNodeType::ARRAY_JOIN:
else
{
return buildQueryPlanForArrayJoinNode(join_tree_node, select_query_info, select_query_options, outer_scope_columns, planner_context);
}
default:
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expected table, table function, query, union, join or array join query node. Actual {}",
join_tree_node->formatASTForErrorMessage());
const auto & table_expression_data = planner_context->getTableExpressionDataOrThrow(table_expression);
if (table_expression_data.isRemote() && !is_single_table_expression)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
"JOIN with remote storages is unsuppored");
query_plans_stack.push_back(buildQueryPlanForTableExpression(table_expression,
select_query_info,
select_query_options,
planner_context,
is_single_table_expression));
}
}
if (query_plans_stack.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected at least 1 query plan for JOIN TREE");
return std::move(query_plans_stack.back());
}
}
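As an aside, the stack discipline above is easiest to see as postfix evaluation over the table-expression stack. A toy Python model (illustrative only, not ClickHouse code; the node list stands in for what buildTableExpressionsStack would produce):

# Leaves push a plan, ARRAY JOIN rewraps the top of the stack, and JOIN pops
# the right then the left plan and pushes the combined one.
stack = []
for node in ["t1", "t2", "JOIN", "ARRAY JOIN"]:
    if node == "JOIN":
        right = stack.pop()
        left = stack.pop()
        stack.append(f"({left} JOIN {right})")
    elif node == "ARRAY JOIN":
        stack.append(f"({stack.pop()} ARRAY JOIN arr)")
    else:
        stack.append(node)
print(stack[-1])  # ((t1 JOIN t2) ARRAY JOIN arr)

This is why the code requires at least two plans on the stack before a JOIN and at least one before an ARRAY JOIN.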

View File

@ -11,9 +11,15 @@
namespace DB
{
/// Build query plan for query JOIN TREE node
QueryPlan buildQueryPlanForJoinTreeNode(QueryTreeNodePtr join_tree_node,
SelectQueryInfo & select_query_info,
struct JoinTreeQueryPlan
{
QueryPlan query_plan;
QueryProcessingStage::Enum from_stage;
};
/// Build JOIN TREE query plan for query node
JoinTreeQueryPlan buildJoinTreeQueryPlan(const QueryTreeNodePtr & query_node,
const SelectQueryInfo & select_query_info,
const SelectQueryOptions & select_query_options,
const ColumnIdentifierSet & outer_scope_columns,
PlannerContextPtr & planner_context);

View File

@ -17,8 +17,6 @@
#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsConversion.h>
#include <Functions/CastOverloadResolver.h>
#include <Analyzer/FunctionNode.h>
#include <Analyzer/ConstantNode.h>
@ -465,40 +463,11 @@ JoinClausesAndActions buildJoinClausesAndActions(const ColumnsWithTypeAndName &
throw;
}
auto cast_type_name = common_type->getName();
Field cast_type_constant_value(cast_type_name);
ColumnWithTypeAndName cast_column;
cast_column.name = calculateConstantActionNodeName(cast_type_constant_value);
cast_column.column = DataTypeString().createColumnConst(0, cast_type_constant_value);
cast_column.type = std::make_shared<DataTypeString>();
const ActionsDAG::Node * cast_type_constant_node = nullptr;
if (!left_key_node->result_type->equals(*common_type))
{
cast_type_constant_node = &join_expression_actions->addColumn(cast_column);
FunctionCastBase::Diagnostic diagnostic = {left_key_node->result_name, left_key_node->result_name};
FunctionOverloadResolverPtr func_builder_cast
= CastInternalOverloadResolver<CastType::nonAccurate>::createImpl(diagnostic);
ActionsDAG::NodeRawConstPtrs children = {left_key_node, cast_type_constant_node};
left_key_node = &join_expression_actions->addFunction(func_builder_cast, std::move(children), {});
}
left_key_node = &join_expression_actions->addCast(*left_key_node, common_type);
if (!right_key_node->result_type->equals(*common_type))
{
if (!cast_type_constant_node)
cast_type_constant_node = &join_expression_actions->addColumn(cast_column);
FunctionCastBase::Diagnostic diagnostic = {right_key_node->result_name, right_key_node->result_name};
FunctionOverloadResolverPtr func_builder_cast
= CastInternalOverloadResolver<CastType::nonAccurate>::createImpl(std::move(diagnostic));
ActionsDAG::NodeRawConstPtrs children = {right_key_node, cast_type_constant_node};
right_key_node = &join_expression_actions->addFunction(func_builder_cast, std::move(children), {});
}
right_key_node = &join_expression_actions->addCast(*right_key_node, common_type);
}
join_expression_actions->addOrReplaceInOutputs(*left_key_node);

View File

@ -0,0 +1,91 @@
#pragma once
#include <Common/Exception.h>
#include <Core/QueryProcessingStage.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
class PlannerQueryProcessingInfo
{
public:
PlannerQueryProcessingInfo(QueryProcessingStage::Enum from_stage_, QueryProcessingStage::Enum to_stage_)
: from_stage(from_stage_)
, to_stage(to_stage_)
{
if (isIntermediateStage())
{
if (isFirstStage() || isSecondStage())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Query with intermediate stage cannot have any other stages");
}
if (isFromAggregationState())
{
if (isIntermediateStage() || isFirstStage() || isSecondStage())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Query with after aggregation stage cannot have any other stages");
}
}
QueryProcessingStage::Enum getFromStage() const
{
return from_stage;
}
QueryProcessingStage::Enum getToStage() const
{
return to_stage;
}
/** Do I need to perform the first part of the pipeline?
* Running on remote servers during distributed processing or if query is not distributed.
*
* Also note that with distributed_group_by_no_merge=1 or when there is
* only one remote server, it is equal to local query in terms of query
* stages (or when due to optimize_distributed_group_by_sharding_key the query was processed up to Complete stage).
*/
bool isFirstStage() const
{
return from_stage < QueryProcessingStage::WithMergeableState
&& to_stage >= QueryProcessingStage::WithMergeableState;
}
/** Do I need to execute the second part of the pipeline?
* Running on the initiating server during distributed processing or if query is not distributed.
*
* Also note that with distributed_group_by_no_merge=2 (i.e. when optimize_distributed_group_by_sharding_key takes place)
* the query on the remote server will be processed up to WithMergeableStateAfterAggregationAndLimit,
* so it will do a partial second stage (second_stage=true), and the initiator will do the final part.
*/
bool isSecondStage() const
{
return from_stage <= QueryProcessingStage::WithMergeableState
&& to_stage > QueryProcessingStage::WithMergeableState;
}
bool isIntermediateStage() const
{
return from_stage == QueryProcessingStage::WithMergeableState && to_stage == QueryProcessingStage::WithMergeableState;
}
bool isToAggregationState() const
{
return to_stage >= QueryProcessingStage::WithMergeableStateAfterAggregation;
}
bool isFromAggregationState() const
{
return from_stage >= QueryProcessingStage::WithMergeableStateAfterAggregation;
}
private:
QueryProcessingStage::Enum from_stage;
QueryProcessingStage::Enum to_stage;
};
}
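A small plain-Python model of the two stage predicates above may help; the numeric values are assumptions that only mimic the ordering of QueryProcessingStage, not the real enum.

# Model of isFirstStage/isSecondStage; values only mimic the enum ordering.
FETCH_COLUMNS, WITH_MERGEABLE_STATE, COMPLETE = 0, 1, 2

def is_first_stage(from_stage, to_stage):
    return from_stage < WITH_MERGEABLE_STATE and to_stage >= WITH_MERGEABLE_STATE

def is_second_stage(from_stage, to_stage):
    return from_stage <= WITH_MERGEABLE_STATE and to_stage > WITH_MERGEABLE_STATE

# A remote server runs only the first stage of a distributed query:
assert is_first_stage(FETCH_COLUMNS, WITH_MERGEABLE_STATE)
assert not is_second_stage(FETCH_COLUMNS, WITH_MERGEABLE_STATE)
# The initiator resumes from the mergeable state and runs the second stage:
assert not is_first_stage(WITH_MERGEABLE_STATE, COMPLETE)
assert is_second_stage(WITH_MERGEABLE_STATE, COMPLETE)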

View File

@ -8,6 +8,8 @@
#include <IO/WriteBufferFromString.h>
#include <Functions/FunctionFactory.h>
#include <Interpreters/Context.h>
#include <Analyzer/ConstantNode.h>
@ -308,4 +310,14 @@ bool queryHasWithTotalsInAnySubqueryInJoinTree(const QueryTreeNodePtr & query_no
return false;
}
QueryTreeNodePtr mergeConditionNodes(const QueryTreeNodes & condition_nodes, const ContextPtr & context)
{
auto function_node = std::make_shared<FunctionNode>("and");
auto and_function = FunctionFactory::instance().get("and", context);
function_node->getArguments().getNodes() = condition_nodes;
function_node->resolveAsFunction(and_function->build(function_node->getArgumentColumns()));
return function_node;
}
}
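mergeConditionNodes simply folds N condition nodes into a single `and` call; a toy sketch of the resulting shape (illustrative, not ClickHouse code):

# The merged node is equivalent to and(cond1, cond2, ...).
conditions = ["a > 1", "b = 2", "c IN (3, 4)"]
print("and({})".format(", ".join(conditions)))  # and(a > 1, b = 2, c IN (3, 4))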

View File

@ -59,4 +59,7 @@ bool queryHasArrayJoinInJoinTree(const QueryTreeNodePtr & query_node);
*/
bool queryHasWithTotalsInAnySubqueryInJoinTree(const QueryTreeNodePtr & query_node);
/// Returns `and` function node that has condition nodes as its arguments
QueryTreeNodePtr mergeConditionNodes(const QueryTreeNodes & condition_nodes, const ContextPtr & context);
}

View File

@ -3,6 +3,7 @@
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Interpreters/ActionsDAG.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Interpreters/InterpreterSelectQueryAnalyzer.h>
namespace DB
{
@ -48,26 +49,42 @@ std::unique_ptr<QueryPlan> createLocalPlan(
checkStackSize();
auto query_plan = std::make_unique<QueryPlan>();
/// Do not apply AST optimizations, because the query
/// is already optimized and some optimizations
/// can be applied only to non-distributed tables,
/// so we could produce a query inconsistent with the remote plans.
auto interpreter = InterpreterSelectQuery(
query_ast, context,
SelectQueryOptions(processed_stage)
.setShardInfo(shard_num, shard_count)
.ignoreASTOptimizations());
auto select_query_options = SelectQueryOptions(processed_stage)
.setShardInfo(shard_num, shard_count)
.ignoreASTOptimizations();
interpreter.setProperClientInfo(replica_num, replica_count);
if (coordinator)
auto update_interpreter = [&](auto & interpreter)
{
interpreter.setMergeTreeReadTaskCallbackAndClientInfo([coordinator](PartitionReadRequest request) -> std::optional<PartitionReadResponse>
interpreter.setProperClientInfo(replica_num, replica_count);
if (coordinator)
{
return coordinator->handleRequest(request);
});
interpreter.setMergeTreeReadTaskCallbackAndClientInfo([coordinator](PartitionReadRequest request) -> std::optional<PartitionReadResponse>
{
return coordinator->handleRequest(request);
});
}
};
if (context->getSettingsRef().allow_experimental_analyzer)
{
auto interpreter = InterpreterSelectQueryAnalyzer(query_ast, context, select_query_options);
update_interpreter(interpreter);
query_plan = std::make_unique<QueryPlan>(std::move(interpreter).extractQueryPlan());
}
else
{
auto interpreter = InterpreterSelectQuery(
query_ast, context,
select_query_options);
update_interpreter(interpreter);
interpreter.buildQueryPlan(*query_plan);
}
interpreter.buildQueryPlan(*query_plan);
addConvertingActions(*query_plan, header);
return query_plan;
}

View File

@ -952,7 +952,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
std::unordered_map<std::string, ColumnWithTypeAndName> node_name_to_input_node_column;
if (context->getSettingsRef().allow_experimental_analyzer)
if (settings.allow_experimental_analyzer)
{
const auto & table_expression_data = query_info.planner_context->getTableExpressionDataOrThrow(query_info.table_expression);
for (const auto & [column_identifier, column_name] : table_expression_data.getColumnIdentifierToColumnName())

View File

@ -216,7 +216,7 @@ void ReplicatedMergeTreeQueue::insertUnlocked(
{
auto entry_virtual_parts = entry->getVirtualPartNames(format_version);
LOG_TEST(log, "Insert entry {} to queue with type {}", entry->znode_name, entry->getDescriptionForLogs(format_version));
LOG_TRACE(log, "Insert entry {} to queue with type {}", entry->znode_name, entry->getDescriptionForLogs(format_version));
for (const String & virtual_part_name : entry_virtual_parts)
{

View File

@ -182,6 +182,9 @@ struct SelectQueryInfo
ASTPtr view_query; /// Optimized VIEW query
ASTPtr original_query; /// Unmodified query for projection analysis
/// Query tree
QueryTreeNodePtr query_tree;
/// Planner context
PlannerContextPtr planner_context;
@ -193,6 +196,9 @@ struct SelectQueryInfo
std::shared_ptr<const StorageLimitsList> storage_limits;
/// Local storage limits
StorageLimits local_storage_limits;
/// Cluster for the query.
ClusterPtr cluster;
/// Optimized cluster for the query.
@ -226,6 +232,9 @@ struct SelectQueryInfo
bool need_aggregate = false;
PrewhereInfoPtr prewhere_info;
/// If query has aggregate functions
bool has_aggregates = false;
ClusterPtr getCluster() const { return !optimized_cluster ? cluster : optimized_cluster; }
/// If not null, it means we choose a projection to execute current query.

View File

@ -38,6 +38,11 @@
#include <Parsers/parseQuery.h>
#include <Parsers/IAST.h>
#include <Analyzer/TableNode.h>
#include <Planner/Planner.h>
#include <Planner/Utils.h>
#include <Interpreters/ClusterProxy/SelectStreamFactory.h>
#include <Interpreters/ClusterProxy/executeQuery.h>
#include <Interpreters/Cluster.h>
@ -66,6 +71,7 @@
#include <Processors/QueryPlan/BuildQueryPipelineSettings.h>
#include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
#include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/Sources/NullSource.h>
#include <Processors/Sources/RemoteSource.h>
#include <Processors/Sinks/EmptySink.h>
@ -123,6 +129,7 @@ namespace ErrorCodes
extern const int DISTRIBUTED_TOO_MANY_PENDING_BYTES;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int TOO_LARGE_DISTRIBUTED_DEPTH;
extern const int UNSUPPORTED_METHOD;
}
namespace ActionLocks
@ -566,13 +573,14 @@ std::optional<QueryProcessingStage::Enum> StorageDistributed::getOptimizedQueryP
return {};
}
/// TODO: Analyzer syntax analyzer result
if (!query_info.syntax_analyzer_result)
return {};
// GROUP BY
const ASTPtr group_by = select.groupBy();
if (!query_info.syntax_analyzer_result->aggregates.empty() || group_by)
bool has_aggregates = query_info.has_aggregates;
if (query_info.syntax_analyzer_result)
has_aggregates = !query_info.syntax_analyzer_result->aggregates.empty();
if (has_aggregates || group_by)
{
if (!optimize_sharding_key_aggregation || !group_by || !expr_contains_sharding_key(group_by->children))
return {};
@ -651,6 +659,31 @@ StorageSnapshotPtr StorageDistributed::getStorageSnapshotForQuery(
return std::make_shared<StorageSnapshot>(*this, metadata_snapshot, object_columns, std::move(snapshot_data));
}
namespace
{
QueryTreeNodePtr buildQueryTreeDistributedTableReplacedWithLocalTable(const SelectQueryInfo & query_info, StorageID remote_storage_id)
{
const auto & query_context = query_info.planner_context->getQueryContext();
auto resolved_remote_storage_id = query_context->resolveStorageID(remote_storage_id);
auto storage = DatabaseCatalog::instance().tryGetTable(resolved_remote_storage_id, query_context);
if (!storage)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD,
"Distributed local table {} does not exists on coordinator",
remote_storage_id.getFullTableName());
auto storage_lock = storage->lockForShare(query_context->getInitialQueryId(), query_context->getSettingsRef().lock_acquire_timeout);
auto storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), query_context);
auto replacement_table_expression = std::make_shared<TableNode>(std::move(storage), std::move(storage_lock), std::move(storage_snapshot));
std::unordered_map<const IQueryTreeNode *, QueryTreeNodePtr> replacement_map;
replacement_map.emplace(query_info.table_expression.get(), std::move(replacement_table_expression));
return query_info.query_tree->cloneAndReplace(replacement_map);
}
}
void StorageDistributed::read(
QueryPlan & query_plan,
const Names &,
@ -665,12 +698,28 @@ void StorageDistributed::read(
if (select_query->final() && local_context->getSettingsRef().allow_experimental_parallel_reading_from_replicas)
throw Exception(ErrorCodes::ILLEGAL_FINAL, "Final modifier is not allowed together with parallel reading from replicas feature");
const auto & modified_query_ast = rewriteSelectQuery(
local_context, query_info.query,
remote_database, remote_table, remote_table_function_ptr);
Block header;
ASTPtr query_ast;
Block header =
InterpreterSelectQuery(query_info.query, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock();
if (local_context->getSettingsRef().allow_experimental_analyzer)
{
StorageID remote_storage_id{remote_database, remote_table};
auto query_tree_with_replaced_distributed_table = buildQueryTreeDistributedTableReplacedWithLocalTable(query_info, remote_storage_id);
query_ast = queryNodeToSelectQuery(query_tree_with_replaced_distributed_table);
Planner planner(query_tree_with_replaced_distributed_table, SelectQueryOptions(processed_stage), PlannerConfiguration{.only_analyze = true});
planner.buildQueryPlanIfNeeded();
header = planner.getQueryPlan().getCurrentDataStream().header;
}
else
{
header =
InterpreterSelectQuery(query_info.query, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock();
query_ast = query_info.query;
}
auto modified_query_ast = rewriteSelectQuery(
local_context, query_ast,
remote_database, remote_table, remote_table_function_ptr);
/// Return directly (with correct header) if no shard to query.
if (query_info.getCluster()->getShardsInfo().empty())
@ -718,6 +767,22 @@ void StorageDistributed::read(
/// This is a bug, it is possible only when there is no shards to query, and this is handled earlier.
if (!query_plan.isInitialized())
throw Exception("Pipeline is not initialized", ErrorCodes::LOGICAL_ERROR);
if (local_context->getSettingsRef().allow_experimental_analyzer)
{
Planner planner(query_info.query_tree, SelectQueryOptions(processed_stage), PlannerConfiguration{.only_analyze = true});
planner.buildQueryPlanIfNeeded();
auto expected_header = planner.getQueryPlan().getCurrentDataStream().header;
auto rename_actions_dag = ActionsDAG::makeConvertingActions(
query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName(),
expected_header.getColumnsWithTypeAndName(),
ActionsDAG::MatchColumnsMode::Position,
true /*ignore_constant_values*/);
auto rename_step = std::make_unique<ExpressionStep>(query_plan.getCurrentDataStream(), std::move(rename_actions_dag));
rename_step->setStepDescription("Change remote column names to local column names");
query_plan.addStep(std::move(rename_step));
}
}

View File

@ -1762,7 +1762,8 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry, bool need_to_che
if (!need_to_check_missing_part)
return false;
throw Exception("No active replica has part " + entry.new_part_name + " or covering part", ErrorCodes::NO_REPLICA_HAS_PART);
throw Exception(ErrorCodes::NO_REPLICA_HAS_PART, "No active replica has part {} or covering part (cannot execute {}: {})",
entry.new_part_name, entry.znode_name, entry.getDescriptionForLogs(format_version));
}
}
@ -5876,7 +5877,24 @@ void StorageReplicatedMergeTree::getStatus(ReplicatedTableStatus & res, bool wit
{
try
{
auto log_entries = zookeeper->getChildren(fs::path(zookeeper_path) / "log");
std::vector<std::string> paths;
paths.push_back(fs::path(zookeeper_path) / "log");
paths.push_back(fs::path(zookeeper_path) / "replicas");
auto children_result = zookeeper->getChildren(paths);
const auto & log_entries = children_result[0].names;
const auto & all_replicas = children_result[1].names;
paths.clear();
paths.push_back(fs::path(replica_path) / "log_pointer");
for (const String & replica : all_replicas)
paths.push_back(fs::path(zookeeper_path) / "replicas" / replica / "is_active");
auto get_result = zookeeper->tryGet(paths);
const auto & log_pointer_str = get_result[0].data;
if (get_result[0].error == Coordination::Error::ZNONODE)
throw zkutil::KeeperException(get_result[0].error);
if (!log_entries.empty())
{
@ -5884,17 +5902,14 @@ void StorageReplicatedMergeTree::getStatus(ReplicatedTableStatus & res, bool wit
res.log_max_index = parse<UInt64>(last_log_entry.substr(strlen("log-")));
}
String log_pointer_str = zookeeper->get(fs::path(replica_path) / "log_pointer");
res.log_pointer = log_pointer_str.empty() ? 0 : parse<UInt64>(log_pointer_str);
auto all_replicas = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas");
res.total_replicas = all_replicas.size();
for (const String & replica : all_replicas)
for (size_t i = 0, size = all_replicas.size(); i < size; ++i)
{
bool is_replica_active = zookeeper->exists(fs::path(zookeeper_path) / "replicas" / replica / "is_active");
bool is_replica_active = get_result[i + 1].error != Coordination::Error::ZNONODE;
res.active_replicas += static_cast<UInt8>(is_replica_active);
res.replica_is_active.emplace(replica, is_replica_active);
res.replica_is_active.emplace(all_replicas[i], is_replica_active);
}
}
catch (const Coordination::Exception &)

View File

@ -146,7 +146,7 @@ void StorageView::read(
if (context->getSettingsRef().allow_experimental_analyzer)
{
InterpreterSelectQueryAnalyzer interpreter(current_inner_query, options, getViewContext(context));
InterpreterSelectQueryAnalyzer interpreter(current_inner_query, getViewContext(context), options);
interpreter.addStorageLimits(*query_info.storage_limits);
query_plan = std::move(interpreter).extractQueryPlan();
}

View File

@ -10,6 +10,7 @@
#include <Common/typeid_cast.h>
#include <Databases/IDatabase.h>
#include <Processors/Sources/SourceFromSingleChunk.h>
#include <Common/getNumberOfPhysicalCPUCores.h>
namespace DB
@ -151,14 +152,31 @@ Pipe StorageSystemReplicas::read(
MutableColumns res_columns = storage_snapshot->metadata->getSampleBlock().cloneEmptyColumns();
for (size_t i = 0, size = col_database->size(); i < size; ++i)
size_t tables_size = col_database->size();
std::vector<ReplicatedTableStatus> statuses(tables_size);
size_t thread_pool_size = std::min(tables_size, static_cast<size_t>(getNumberOfPhysicalCPUCores()));
auto settings = context->getSettingsRef();
if (settings.max_threads != 0)
thread_pool_size = std::min(thread_pool_size, static_cast<size_t>(settings.max_threads));
ThreadPool thread_pool(thread_pool_size);
for (size_t i = 0; i < tables_size; ++i)
{
ReplicatedTableStatus status;
dynamic_cast<StorageReplicatedMergeTree &>(
thread_pool.scheduleOrThrowOnError([&, i=i]
{
dynamic_cast<StorageReplicatedMergeTree &>(
*replicated_tables
[(*col_database)[i].safeGet<const String &>()]
[(*col_table)[i].safeGet<const String &>()]).getStatus(status, with_zk_fields);
[(*col_table)[i].safeGet<const String &>()]).getStatus(statuses[i], with_zk_fields);
});
}
thread_pool.wait();
for (const auto & status: statuses)
{
size_t col_num = 3;
res_columns[col_num++]->insert(status.is_leader);
res_columns[col_num++]->insert(status.can_become_leader);
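As an aside, the bounded fan-out above (pool size capped by the table count, the physical core count, and max_threads) maps directly onto the standard-library pattern below; a minimal Python sketch, with the max_threads cap omitted:

import os
from concurrent.futures import ThreadPoolExecutor

# One task per work item; pool size capped by item count and CPU count.
items = list(range(8))
pool_size = min(len(items), os.cpu_count() or 1)
with ThreadPoolExecutor(pool_size) as pool:
    statuses = list(pool.map(lambda i: i * i, items))  # stand-in for getStatus
print(statuses)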

View File

@ -25,6 +25,7 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int LOGICAL_ERROR;
extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
}
void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, ContextPtr /*context*/)
@ -37,9 +38,7 @@ void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, Co
ASTs & args = args_func.at(0)->children;
if (args.empty())
throw Exception("Table function '" + getName() + "' requires at least one argument: "
" structure, [random_seed, max_string_length, max_array_length].",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
return;
if (args.size() > 4)
throw Exception("Table function '" + getName() + "' requires at most four arguments: "
@ -77,12 +76,23 @@ void TableFunctionGenerateRandom::parseArguments(const ASTPtr & ast_function, Co
ColumnsDescription TableFunctionGenerateRandom::getActualTableStructure(ContextPtr context) const
{
if (structure == "auto")
{
if (structure_hint.empty())
throw Exception(
ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE,
"Table function '{}' was used without structure argument but structure could not be determined automatically. Please, "
"provide structure manually",
getName());
return structure_hint;
}
return parseColumnsListFromString(structure, context);
}
StoragePtr TableFunctionGenerateRandom::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const
{
auto columns = getActualTableStructure(context);
ColumnsDescription columns = getActualTableStructure(context);
auto res = std::make_shared<StorageGenerateRandom>(
StorageID(getDatabaseName(), table_name), columns, String{}, max_array_length, max_string_length, random_seed);
res->startup();

View File

@ -5,7 +5,7 @@
namespace DB
{
/* generateRandom(structure, [max_array_length, max_string_length, random_seed])
/* generateRandom([structure, max_array_length, max_string_length, random_seed])
* - creates a temporary storage that generates columns with random data
*/
class TableFunctionGenerateRandom : public ITableFunction
@ -13,7 +13,11 @@ class TableFunctionGenerateRandom : public ITableFunction
public:
static constexpr auto name = "generateRandom";
std::string getName() const override { return name; }
bool hasStaticStructure() const override { return true; }
bool hasStaticStructure() const override { return structure != "auto"; }
bool needStructureHint() const override { return structure == "auto"; }
void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; }
private:
StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override;
const char * getStorageTypeName() const override { return "GenerateRandom"; }
@ -21,11 +25,11 @@ private:
ColumnsDescription getActualTableStructure(ContextPtr context) const override;
void parseArguments(const ASTPtr & ast_function, ContextPtr context) override;
String structure;
String structure = "auto";
UInt64 max_string_length = 10;
UInt64 max_array_length = 10;
std::optional<UInt64> random_seed;
ColumnsDescription structure_hint;
};

View File

@ -120,6 +120,7 @@ if __name__ == "__main__":
"fuzzer.log": os.path.join(workspace_path, "fuzzer.log"),
"report.html": os.path.join(workspace_path, "report.html"),
"core.zst": os.path.join(workspace_path, "core.zst"),
"dmesg.log": os.path.join(workspace_path, "dmesg.log"),
}
s3_helper = S3Helper()

205
tests/ci/merge_pr.py Normal file
View File

@ -0,0 +1,205 @@
#!/usr/bin/env python
"""Script to check if PR is mergeable and merge it"""
import argparse
import logging
from datetime import datetime
from os import getenv
from typing import Dict, List
from github.PullRequestReview import PullRequestReview
from commit_status_helper import get_commit_filtered_statuses
from get_robot_token import get_best_robot_token
from github_helper import GitHub, NamedUser, PullRequest
from pr_info import PRInfo
# The team name for accepted approvals
TEAM_NAME = getenv("GITHUB_TEAM_NAME", "core")
class Reviews:
STATES = ["CHANGES_REQUESTED", "APPROVED"]
def __init__(self, pr: PullRequest):
"""The reviews are proceed in the next logic:
- if review for an author does not exist, set it
- the review status can be changed from CHANGES_REQUESTED and APPROVED
only to either one
"""
logging.info("Checking the PR for approvals")
self.pr = pr
self.reviews = pr.get_reviews()
# the reviews are ordered by time
self._review_per_user = {} # type: Dict[NamedUser, PullRequestReview]
self.approved_at = datetime.fromtimestamp(0)
for r in self.reviews:
user = r.user
if self._review_per_user.get(user):
if r.state in self.STATES:
self._review_per_user[user] = r
if r.state == "APPROVED":
self.approved_at = max(r.submitted_at, self.approved_at)
continue
self._review_per_user[user] = r
def is_approved(self, team: List[NamedUser]) -> bool:
"""Checks if the PR is approved, and no changes made after the last approval"""
if not self.reviews:
logging.info("There aren't reviews for PR #%s", self.pr.number)
return False
# We consider reviews only from the given list of users
statuses = {
r.state
for user, r in self._review_per_user.items()
if r.state == "CHANGES_REQUESTED"
or (r.state == "APPROVED" and user in team)
}
if "CHANGES_REQUESTED" in statuses:
logging.info(
"The following users requested changes for the PR: %s",
", ".join(
user.login
for user, r in self._review_per_user.items()
if r.state == "CHANGES_REQUESTED"
),
)
return False
if "APPROVED" in statuses:
logging.info(
"The following users from %s team approved the PR: %s",
TEAM_NAME,
", ".join(
user.login
for user, r in self._review_per_user.items()
if r.state == "APPROVED" and user in team
),
)
# The only reliable source of a 100% accurate last_modified value is
# the moment the commit was pushed to GitHub. It is available as the
# 'last-modified' header of /{org}/{repo}/commits/{sha}.
# Unfortunately, it's formatted as 'Wed, 04 Jan 2023 11:05:13 GMT'
commit = self.pr.head.repo.get_commit(self.pr.head.sha)
if commit.stats.last_modified is None:
logging.warning(
"Unable to get info about the commit %s", self.pr.head.sha
)
return False
last_changed = datetime.strptime(
commit.stats.last_modified, "%a, %d %b %Y %H:%M:%S GMT"
)
if self.approved_at < last_changed:
logging.info(
"There are changes after approve at %s",
self.approved_at.isoformat(),
)
return False
return True
logging.info("The PR #%s is not approved", self.pr.number)
return False
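A standalone illustration of the 'last-modified' parsing used above; the sample value mirrors the format quoted in the comment:

from datetime import datetime

last_modified = "Wed, 04 Jan 2023 11:05:13 GMT"
last_changed = datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S GMT")
approved_at = datetime.fromtimestamp(0)  # as initialized in __init__
# An approval older than the last push means the changes are unreviewed:
print(approved_at < last_changed)  # True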
def parse_args() -> argparse.Namespace:
pr_info = PRInfo()
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Script to merge the given PR. Additional checks for approved "
"status and green commit statuses could be done",
)
parser.add_argument(
"--check-approved",
action="store_true",
help="if set, checks that the PR is approved and no changes required",
)
parser.add_argument("--check-green", default=True, help=argparse.SUPPRESS)
parser.add_argument(
"--no-check-green",
dest="check_green",
action="store_false",
default=argparse.SUPPRESS,
help="(dangerous) if set, skip check commit to having all green statuses",
)
parser.add_argument(
"--repo",
default=pr_info.repo_full_name,
help="PR number to check",
)
parser.add_argument(
"--pr",
type=int,
default=pr_info.number,
help="PR number to check",
)
parser.add_argument(
"--token",
type=str,
default="",
help="a token to use for GitHub API requests, will be received from SSM "
"if empty",
)
args = parser.parse_args()
args.pr_info = pr_info
return args
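The --check-green/--no-check-green pair is a deliberate argparse pattern: the hidden positive flag supplies the default, and the negative flag flips the same destination without reintroducing one. A minimal, runnable reproduction:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--check-green", default=True, help=argparse.SUPPRESS)
parser.add_argument(
    "--no-check-green",
    dest="check_green",
    action="store_false",
    default=argparse.SUPPRESS,  # don't override the default set above
    help="skip the green-status check",
)
print(parser.parse_args([]).check_green)                    # True
print(parser.parse_args(["--no-check-green"]).check_green)  # False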
def main():
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
args = parse_args()
logging.info("Going to process PR #%s in repo %s", args.pr, args.repo)
token = args.token or get_best_robot_token()
gh = GitHub(token, per_page=100)
repo = gh.get_repo(args.repo)
# An ugly workaround to patch the wrong organization URL,
# see https://github.com/PyGithub/PyGithub/issues/2395#issuecomment-1378629710
# pylint: disable=protected-access
repo.organization._url.value = repo.organization.url.replace( # type: ignore
"/users/", "/orgs/", 1
)
# pylint: enable=protected-access
pr = repo.get_pull(args.pr)
if pr.is_merged():
logging.info("The PR #%s is already merged", pr.number)
return
not_ready_to_merge = pr.draft or "WIP" in pr.title
if not_ready_to_merge:
logging.info("The PR #%s is not ready for merge, stopping", pr.number)
return
if args.check_green:
logging.info("Checking that all PR's statuses are green")
commit = repo.get_commit(pr.head.sha)
failed_statuses = [
status.context
for status in get_commit_filtered_statuses(commit)
if status.state != "success"
]
if failed_statuses:
logging.warning(
"Some statuses aren't success:\n %s", ",\n ".join(failed_statuses)
)
return
if args.check_approved:
reviews = Reviews(pr)
team = repo.organization.get_team_by_slug(TEAM_NAME)
members = list(team.get_members())
if not reviews.is_approved(members):
logging.warning("We don't merge the PR")
return
logging.info("Merging the PR")
pr.merge()
if __name__ == "__main__":
main()

View File

@ -3,6 +3,7 @@ from ast import literal_eval
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple
from html import escape
import csv
import os
import datetime
@ -372,9 +373,10 @@ def create_test_html_report(
row += "</tr>"
rows_part += row
if test_result.raw_logs is not None:
raw_logs = escape(test_result.raw_logs)
row = (
'<tr class="failed-content">'
f'<td colspan="{colspan}"><pre>{test_result.raw_logs}</pre></td>'
f'<td colspan="{colspan}"><pre>{raw_logs}</pre></td>'
"</tr>"
)
rows_part += row
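Why the escape matters: raw test logs routinely contain characters such as '<' that would otherwise close the <pre> cell or inject markup into the report. A self-contained check:

from html import escape

raw_logs = "assert x < y</pre><script>alert(1)</script>"
print(escape(raw_logs))
# assert x &lt; y&lt;/pre&gt;&lt;script&gt;alert(1)&lt;/script&gt;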

View File

@ -232,17 +232,20 @@ def need_retry(args, stdout, stderr, total_time):
def get_processlist(args):
if args.replicated_database:
return clickhouse_execute_json(
args,
"""
SELECT materialize((hostName(), tcpPort())) as host, *
FROM clusterAllReplicas('test_cluster_database_replicated', system.processes)
WHERE query NOT LIKE '%system.processes%'
""",
)
else:
return clickhouse_execute_json(args, "SHOW PROCESSLIST")
try:
if args.replicated_database:
return clickhouse_execute_json(
args,
"""
SELECT materialize((hostName(), tcpPort())) as host, *
FROM clusterAllReplicas('test_cluster_database_replicated', system.processes)
WHERE query NOT LIKE '%system.processes%'
""",
)
else:
return clickhouse_execute_json(args, "SHOW PROCESSLIST")
except Exception as e:
return "Failed to get processlist: " + str(e)
def get_transactions_list(args):
@ -1955,19 +1958,22 @@ def main(args):
def find_binary(name):
if os.path.exists(name) and os.access(name, os.X_OK):
return True
if os.access(name, os.X_OK):
return name
paths = os.environ.get("PATH").split(":")
for path in paths:
if os.access(os.path.join(path, name), os.X_OK):
return True
bin_path = os.path.join(path, name)
if os.access(bin_path, os.X_OK):
return bin_path
# maybe it wasn't in PATH
if os.access(os.path.join("/usr/local/bin", name), os.X_OK):
return True
if os.access(os.path.join("/usr/bin", name), os.X_OK):
return True
return False
bin_path = os.path.join("/usr/local/bin", name)
if os.access(bin_path, os.X_OK):
return bin_path
bin_path = os.path.join("/usr/bin", name)
if os.access(bin_path, os.X_OK):
return bin_path
return None
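For reference, find_binary now follows the same contract as the standard-library helper (resolved path or None), plus the extra /usr/local/bin and /usr/bin fallbacks:

import shutil

print(shutil.which("ls"))  # e.g. /usr/bin/ls, or None if not found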
def get_additional_client_options(args):
@ -2007,9 +2013,8 @@ if __name__ == "__main__":
parser.add_argument(
"-b",
"--binary",
default="clickhouse",
help="Path to clickhouse"
"binary or name of binary in PATH",
default=find_binary("clickhouse"),
help="Path to clickhouse binary or name of binary in PATH",
)
parser.add_argument(
@ -2279,18 +2284,13 @@ if __name__ == "__main__":
if args.tmp is None:
args.tmp = args.queries
if args.client is None:
if find_binary(args.binary + "-client"):
args.client = args.binary + "-client"
print("Using " + args.client + " as client program")
elif find_binary(args.binary):
client_bin = find_binary(args.binary + "-client")
if client_bin is not None:
args.client = client_bin
print("Using {args.client} as client program")
elif args.binary:
args.client = args.binary + " client"
print(
"Using "
+ args.client
+ " as client program (expecting monolithic build)"
)
print(f"Using {args.client} as client program (expecting monolithic build)")
else:
print(
"No 'clickhouse' or 'clickhouse-client' client binary found",

View File

@ -1625,6 +1625,12 @@ def test_rename(start_cluster):
"""
)
# We want to check that after inserts some parts were moved to the external disk
# while some parts are still on the main disk; but because of merges, all parts
# might end up on the external disk.
node1.query("SYSTEM STOP MERGES default.renaming_table")
# jbod1 disk is 40mb
for _ in range(5):
data = []
for i in range(10):
@ -1635,8 +1641,14 @@ def test_rename(start_cluster):
)
)
disks = get_used_disks_for_table(node1, "renaming_table")
assert len(disks) > 1
# data is moved in the background, so check with retries
num_try = 0
while len(get_used_disks_for_table(node1, "renaming_table")) == 1:
time.sleep(1)
num_try += 1
if num_try == 20:
break
assert len(get_used_disks_for_table(node1, "renaming_table")) > 1
assert node1.query("SELECT COUNT() FROM default.renaming_table") == "50\n"
node1.query("RENAME TABLE default.renaming_table TO default.renaming_table1")

View File

@ -22,9 +22,6 @@ def started_single_node_cluster():
def test_move_and_s3_memory_usage(started_single_node_cluster):
pytest.skip("Test is too flaky. Disable it for now.")
if small_node.is_built_with_sanitizer() or small_node.is_debug_build():
pytest.skip("Disabled for debug and sanitizers. Too slow.")
@ -51,7 +48,9 @@ def test_move_and_s3_memory_usage(started_single_node_cluster):
)
small_node.query("system flush logs")
max_usage = small_node.query(
"select max(CurrentMetric_MemoryTracking) from system.metric_log"
"""select max(m.val - am.val * 4096) from
(select toStartOfMinute(event_time) as time, max(CurrentMetric_MemoryTracking) as val from system.metric_log group by time) as m join
(select toStartOfMinute(event_time) as time, min(value) as val from system.asynchronous_metric_log where metric='jemalloc.arenas.all.pdirty' group by time) as am using time"""
)
# 3G limit is a big one. However, we can hit it anyway with parallel s3 writes enabled.
# Also actual value can be bigger because of memory drift.

View File

@ -0,0 +1,13 @@
<test>
<create_query>DROP TABLE IF EXISTS test_lc_query</create_query>
<create_query>
CREATE TABLE test_lc_query (x UInt64, lc LowCardinality(Nullable(String))) ENGINE = MergeTree order by x
</create_query>
<fill_query>INSERT INTO test_lc_query SELECT number, toString(number % 100) FROM numbers(1e7)</fill_query>
<query>SELECT count() FROM test_lc_query WHERE lc = '12' OR lc = '34'</query>
<query>SELECT count() FROM test_lc_query WHERE lc = '12' OR lc = '34' OR lc = '56'</query>
<drop_query>DROP TABLE IF EXISTS test_lc_query</drop_query>
</test>

View File

@ -1,3 +1,6 @@
-- Tags: no-cpu-ppc64le
-- Tag no-cpu-ppc64le: The CRC32C function returns different hash values depending on the target platform, so this test should not run on PowerPC. Whenever a new test is added here, the same change has to be made in 01016_simhash_minhash_ppc.sql
SELECT ngramSimHash('');
SELECT ngramSimHash('what a cute cat.');
SELECT ngramSimHashCaseInsensitive('what a cute cat.');

View File

@ -0,0 +1,141 @@
18446744073709551615
1737075136
1737075136
4018781633
4018781633
1846985414
1846985414
1846985414
1846985414
(10693559443859979498,10693559443859979498)
(12279482788274235946,6436413987527322272)
(12279482788274235946,6436413987527322272)
(13257488272755813409,6436413987527322272)
(13257488272755813409,6436413987527322272)
(13762864994695140861,13762864994695140861)
(13762864994695140861,13762864994695140861)
(13762864994695140861,13762864994695140861)
(13762864994695140861,13762864994695140861)
3023525975
3040303199
3023509591
3023510623
3040303191
3040303191
3023510615
3023510615
1999952988
926211140
1999699532
1999683148
1999952988
926211140
1999699532
1999683148
(16071125717475221203,9592059329600248798)
(16071125717475221203,1914899959549098907)
(16071125717475221203,7986182634218042944)
(16071125717475221203,7986182634218042944)
(16071125717475221203,9592059329600248798)
(16071125717475221203,1914899959549098907)
(16071125717475221203,7986182634218042944)
(16071125717475221203,7986182634218042944)
(10576877560263640956,4278250516018530743)
(16211512098526494023,11479872370566432466)
(13515070557027359649,17725505493832406849)
(12589381623326290380,575343713614534202)
(10576877560263640956,4278250516018530743)
(16211512098526494023,11479872370566432466)
(13515070557027359649,17725505493832406849)
(12589381623326290380,575343713614534202)
uniqExact 6
ngramSimHash
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 1211135069
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 1546679389
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2293265501
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 3392173149
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 3627054169
ngramSimHashCaseInsensitive
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2291168349
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 3358618717
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 3425727581
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 3627054429
ngramSimHashUTF8
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 1211135069
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 1546679389
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2284876893
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 3459282013
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 3694163037
ngramSimHashCaseInsensitiveUTF8
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2291168349
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 3358618717
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 3425727581
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 3627054429
wordShingleSimHash
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 192157020
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 460591452
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 1492386136
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 1525941084
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2339636568
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 3401122928
wordShingleSimHashCaseInsensitive
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 183785812
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 1525943132
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2199148880
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2199148884
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 3400551536
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 3673512784
wordShingleSimHashUTF8
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 192157020
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 460591452
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 1492386136
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 1525941084
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2339636568
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 3401122928
wordShingleSimHashCaseInsensitiveUTF8
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 183785812
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 1525943132
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2199148880
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2199148884
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 3400551536
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 3673512784
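The groups above exercise the SimHash family, which yields a single UInt32 fingerprint per text, so near-duplicate inputs collapse onto one hash; the groups below exercise the MinHash family, which yields a pair of UInt64 hashes. As a minimal sketch of how rows of this shape can be regenerated — assuming a scratch table `defaults` with a String column `s` holding the test texts (placeholder names, not shown in this diff) — a query like the following groups the inputs by fingerprint:

```sql
-- Hypothetical regeneration query; `defaults` and `s` are placeholder names.
-- Output per row: the texts sharing a fingerprint (joined by ':::::::'),
-- how many texts share it, and the fingerprint itself.
SELECT
    arrayStringConcat(groupArray(s), '\n:::::::\n') AS texts,
    count() AS cnt,
    ngramSimHashUTF8(s) AS fingerprint
FROM defaults
GROUP BY fingerprint
ORDER BY fingerprint;
```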
ngramMinHash
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (2793448378579182412,5526633106516004292)
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (8530889421347045182,5150364204263408031)
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (8992738078100405992,5526633106516004292)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (15193387305258759701,5526633106516004292)
ngramMinHashCaseInsensitive
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (2793448378579182412,5526633106516004292)
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (8530889421347045182,5150364204263408031)
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (8992738078100405992,5526633106516004292)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (15193387305258759701,5526633106516004292)
ngramMinHashUTF8
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (2793448378579182412,5526633106516004292)
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (8530889421347045182,5150364204263408031)
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (8992738078100405992,5526633106516004292)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (15193387305258759701,5526633106516004292)
ngramMinHashCaseInsensitiveUTF8
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (2793448378579182412,5526633106516004292)
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (8530889421347045182,5150364204263408031)
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (8992738078100405992,5526633106516004292)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (15193387305258759701,5526633106516004292)
wordShingleMinHash
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (6579710252960108857,2848666928617645043)
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (16802224947162838854,4032169656367376737)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (16802224947162838854,17232647740399944031)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (17996725009512358105,9079979506678996383)
wordShingleMinHashCaseInsensitive
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (6579710252960108857,2848666928617645043)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (16802224947162838854,334416161876576673)
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (16802224947162838854,12756399179623007102)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (17996725009512358105,9385516997538506173)
wordShingleMinHashUTF8
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (6579710252960108857,2848666928617645043)
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (16802224947162838854,4032169656367376737)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (16802224947162838854,17232647740399944031)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (17996725009512358105,9079979506678996383)
wordShingleMinHashCaseInsensitiveUTF8
ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (6579710252960108857,2848666928617645043)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (16802224947162838854,334416161876576673)
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (16802224947162838854,12756399179623007102)
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (17996725009512358105,9385516997538506173)

View File

@ -0,0 +1,118 @@
-- Tags: no-cpu-x86_64, no-cpu-aarch64
-- Tags no-cpu-x86_64 and no-cpu-aarch64: the CRC32C function returns different hash values depending on the target platform, so this test should not run on x86_64 or ARM. Whenever a new test is added here, the same addition has to be made in 01016_simhash_minhash.sql
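-- Editorial sketch, not part of the original test: these functions hash
-- shingles with CRC32-C, which is implemented with different hardware
-- instructions per ISA (SSE4.2 on x86_64, dedicated CRC32 instructions on
-- aarch64, vpmsum on ppc64le), so the very same call, e.g.
--     SELECT ngramMinHash('what a cute cat.');
-- can return a different (UInt64, UInt64) tuple on each platform. That is
-- why this reference file is per-architecture and the CPU tags above exclude
-- the other platforms.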
SELECT ngramSimHash('');
SELECT ngramSimHash('what a cute cat.');
SELECT ngramSimHashCaseInsensitive('what a cute cat.');
SELECT ngramSimHashUTF8('what a cute cat.');
SELECT ngramSimHashCaseInsensitiveUTF8('what a cute cat.');
SELECT wordShingleSimHash('what a cute cat.');
SELECT wordShingleSimHashCaseInsensitive('what a cute cat.');
SELECT wordShingleSimHashUTF8('what a cute cat.');
SELECT wordShingleSimHashCaseInsensitiveUTF8('what a cute cat.');
SELECT ngramMinHash('');
SELECT ngramMinHash('what a cute cat.');
SELECT ngramMinHashCaseInsensitive('what a cute cat.');
SELECT ngramMinHashUTF8('what a cute cat.');
SELECT ngramMinHashCaseInsensitiveUTF8('what a cute cat.');
SELECT wordShingleMinHash('what a cute cat.');
SELECT wordShingleMinHashCaseInsensitive('what a cute cat.');
SELECT wordShingleMinHashUTF8('what a cute cat.');
SELECT wordShingleMinHashCaseInsensitiveUTF8('what a cute cat.');
DROP TABLE IF EXISTS defaults;
CREATE TABLE defaults
(
s String
)ENGINE = Memory();
INSERT INTO defaults values ('It is the latest occurrence of the Southeast European haze, the issue that occurs in constant intensity during every wet season. It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.') ('It is the latest occurrence of the Southeast Asian haze, the issue that occurs in constant intensity during every wet season. It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.');
SELECT ngramSimHash(s) FROM defaults;
SELECT ngramSimHashCaseInsensitive(s) FROM defaults;
SELECT ngramSimHashUTF8(s) FROM defaults;
SELECT ngramSimHashCaseInsensitiveUTF8(s) FROM defaults;
SELECT wordShingleSimHash(s) FROM defaults;
SELECT wordShingleSimHashCaseInsensitive(s) FROM defaults;
SELECT wordShingleSimHashUTF8(s) FROM defaults;
SELECT wordShingleSimHashCaseInsensitiveUTF8(s) FROM defaults;
SELECT ngramMinHash(s) FROM defaults;
SELECT ngramMinHashCaseInsensitive(s) FROM defaults;
SELECT ngramMinHashUTF8(s) FROM defaults;
SELECT ngramMinHashCaseInsensitiveUTF8(s) FROM defaults;
SELECT wordShingleMinHash(s) FROM defaults;
SELECT wordShingleMinHashCaseInsensitive(s) FROM defaults;
SELECT wordShingleMinHashUTF8(s) FROM defaults;
SELECT wordShingleMinHashCaseInsensitiveUTF8(s) FROM defaults;
TRUNCATE TABLE defaults;
INSERT INTO defaults SELECT arrayJoin(splitByString('\n\n',
'ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.
ClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won’t affect the system’s availability for both reads and writes.
ClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems.
ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.
ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system''s read and write availability.
ClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems.
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.
ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system''s read / write availability.
ClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.
ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.
ClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.
ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.
ClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.
ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.
ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.
ClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.'
));
SELECT 'uniqExact', uniqExact(s) FROM defaults;
SELECT 'ngramSimHash';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimHash(s) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'ngramSimHashCaseInsensitive';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimHashCaseInsensitive(s) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'ngramSimHashUTF8';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimHashUTF8(s) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'ngramSimHashCaseInsensitiveUTF8';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimHashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'wordShingleSimHash';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimHash(s, 2) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'wordShingleSimHashCaseInsensitive';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimHashCaseInsensitive(s, 2) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'wordShingleSimHashUTF8';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimHashUTF8(s, 2) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'wordShingleSimHashCaseInsensitiveUTF8';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimHashCaseInsensitiveUTF8(s, 2) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'ngramMinHash';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinHash(s) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'ngramMinHashCaseInsensitive';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinHashCaseInsensitive(s) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'ngramMinHashUTF8';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinHashUTF8(s) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'ngramMinHashCaseInsensitiveUTF8';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinHashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'wordShingleMinHash';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinHash(s, 2, 3) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'wordShingleMinHashCaseInsensitive';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinHashCaseInsensitive(s, 2, 3) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'wordShingleMinHashUTF8';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinHashUTF8(s, 2, 3) as h FROM defaults GROUP BY h ORDER BY h;
SELECT 'wordShingleMinHashCaseInsensitiveUTF8';
SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinHashCaseInsensitiveUTF8(s, 2, 3) as h FROM defaults GROUP BY h ORDER BY h;
SELECT wordShingleSimHash('foobar', 9223372036854775807); -- { serverError 69 }
SELECT wordShingleSimHash('foobar', 1001); -- { serverError 69 }
SELECT wordShingleSimHash('foobar', 0); -- { serverError 69 }
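-- Note (assumption about the error code, not stated in the diff):
-- serverError 69 is ARGUMENT_OUT_OF_BOUND, i.e. the shingle-size argument is
-- range-checked, so both oversized values and 0 are rejected above.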
DROP TABLE defaults;

View File

@ -1,5 +1,5 @@
SELECT * FROM generateRandom('i8', 1, 10, 10); -- { serverError 62 }
SELECT * FROM generateRandom; -- { serverError 60 }
SELECT * FROM generateRandom(); -- { serverError 42 }
SELECT * FROM generateRandom(); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE }
SELECT * FROM generateRandom('i8 UInt8', 1, 10, 10, 10, 10); -- { serverError 42 }
SELECT * FROM generateRandom('', 1, 10, 10); -- { serverError 62 }

View File

@ -1,6 +1,15 @@
[0,1,2]
[0,1,2]
[0,1,2]
[0,1,2]
[0,1,2]
[0,1,2]
0 0 0 0
0 1 1 1
2 2 2 2
3 3 3 3
4 0 0
5 0 0
0 0 0 0
0 1 1 1
2 2 2 2
@ -15,6 +24,14 @@
1 1
2 2
3 3
0 0
1 1
2 2
3 3
0 0
1 1
2 2
3 3
SELECT groupArray(x)
FROM
(
@ -22,6 +39,32 @@ FROM
FROM numbers(3)
ORDER BY x ASC
)
QUERY id: 0
PROJECTION COLUMNS
groupArray(x) Array(UInt64)
PROJECTION
LIST id: 1, nodes: 1
FUNCTION id: 2, function_name: groupArray, function_type: aggregate, result_type: Array(UInt64)
ARGUMENTS
LIST id: 3, nodes: 1
COLUMN id: 4, column_name: x, result_type: UInt64, source_id: 5
JOIN TREE
QUERY id: 5, is_subquery: 1
PROJECTION COLUMNS
x UInt64
PROJECTION
LIST id: 6, nodes: 1
COLUMN id: 7, column_name: number, result_type: UInt64, source_id: 8
JOIN TREE
TABLE_FUNCTION id: 8, table_function_name: numbers
ARGUMENTS
LIST id: 9, nodes: 1
CONSTANT id: 10, constant_value: UInt64_3, constant_value_type: UInt8
ORDER BY
LIST id: 11, nodes: 1
SORT id: 12, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 7, column_name: number, result_type: UInt64, source_id: 8
SELECT groupArray(x)
FROM
(
@ -29,6 +72,32 @@ FROM
FROM numbers(3)
ORDER BY x ASC
)
QUERY id: 0
PROJECTION COLUMNS
groupArray(x) Array(UInt64)
PROJECTION
LIST id: 1, nodes: 1
FUNCTION id: 2, function_name: groupArray, function_type: aggregate, result_type: Array(UInt64)
ARGUMENTS
LIST id: 3, nodes: 1
COLUMN id: 4, column_name: x, result_type: UInt64, source_id: 5
JOIN TREE
QUERY id: 5, is_subquery: 1
PROJECTION COLUMNS
x UInt64
PROJECTION
LIST id: 6, nodes: 1
COLUMN id: 7, column_name: number, result_type: UInt64, source_id: 8
JOIN TREE
TABLE_FUNCTION id: 8, table_function_name: numbers
ARGUMENTS
LIST id: 9, nodes: 1
CONSTANT id: 10, constant_value: UInt64_3, constant_value_type: UInt8
ORDER BY
LIST id: 11, nodes: 1
SORT id: 12, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 7, column_name: number, result_type: UInt64, source_id: 8
SELECT groupArray(x)
FROM
(
@ -38,6 +107,38 @@ FROM
exp(x) ASC,
x ASC
)
QUERY id: 0
PROJECTION COLUMNS
groupArray(x) Array(UInt64)
PROJECTION
LIST id: 1, nodes: 1
FUNCTION id: 2, function_name: groupArray, function_type: aggregate, result_type: Array(UInt64)
ARGUMENTS
LIST id: 3, nodes: 1
COLUMN id: 4, column_name: x, result_type: UInt64, source_id: 5
JOIN TREE
QUERY id: 5, is_subquery: 1
PROJECTION COLUMNS
x UInt64
PROJECTION
LIST id: 6, nodes: 1
COLUMN id: 7, column_name: number, result_type: UInt64, source_id: 8
JOIN TREE
TABLE_FUNCTION id: 8, table_function_name: numbers
ARGUMENTS
LIST id: 9, nodes: 1
CONSTANT id: 10, constant_value: UInt64_3, constant_value_type: UInt8
ORDER BY
LIST id: 11, nodes: 2
SORT id: 12, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
FUNCTION id: 13, function_name: exp, function_type: ordinary, result_type: Float64
ARGUMENTS
LIST id: 14, nodes: 1
COLUMN id: 7, column_name: number, result_type: UInt64, source_id: 8
SORT id: 15, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 7, column_name: number, result_type: UInt64, source_id: 8
SELECT
key,
a,
@ -52,6 +153,53 @@ ALL FULL OUTER JOIN test AS t USING (key)
ORDER BY
key ASC,
t.key ASC
QUERY id: 0
PROJECTION COLUMNS
key UInt64
a UInt8
b String
c Float64
PROJECTION
LIST id: 1, nodes: 4
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
COLUMN id: 4, column_name: a, result_type: UInt8, source_id: 5
COLUMN id: 6, column_name: b, result_type: String, source_id: 5
COLUMN id: 7, column_name: c, result_type: Float64, source_id: 5
JOIN TREE
JOIN id: 8, strictness: ALL, kind: FULL
LEFT TABLE EXPRESSION
QUERY id: 3, alias: s, is_subquery: 1
PROJECTION COLUMNS
key UInt64
PROJECTION
LIST id: 9, nodes: 1
FUNCTION id: 10, function_name: plus, function_type: ordinary, result_type: UInt64
ARGUMENTS
LIST id: 11, nodes: 2
COLUMN id: 12, column_name: number, result_type: UInt64, source_id: 13
CONSTANT id: 14, constant_value: UInt64_2, constant_value_type: UInt8
JOIN TREE
TABLE_FUNCTION id: 13, table_function_name: numbers
ARGUMENTS
LIST id: 15, nodes: 1
CONSTANT id: 16, constant_value: UInt64_4, constant_value_type: UInt8
RIGHT TABLE EXPRESSION
TABLE id: 5, alias: t, table_name: default.test
JOIN EXPRESSION
LIST id: 17, nodes: 1
COLUMN id: 18, column_name: key, result_type: UInt64, source_id: 8
EXPRESSION
LIST id: 19, nodes: 2
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
COLUMN id: 20, column_name: key, result_type: UInt64, source_id: 5
ORDER BY
LIST id: 21, nodes: 2
SORT id: 22, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 23, column_name: key, result_type: UInt64, source_id: 3
SORT id: 24, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 25, column_name: key, result_type: UInt64, source_id: 5
SELECT
key,
a
@ -59,6 +207,24 @@ FROM test
ORDER BY
key ASC,
a ASC
QUERY id: 0
PROJECTION COLUMNS
key UInt64
a UInt8
PROJECTION
LIST id: 1, nodes: 2
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
COLUMN id: 4, column_name: a, result_type: UInt8, source_id: 3
JOIN TREE
TABLE id: 3, table_name: default.test
ORDER BY
LIST id: 5, nodes: 2
SORT id: 6, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
SORT id: 7, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 4, column_name: a, result_type: UInt8, source_id: 3
SELECT
key,
a
@ -66,6 +232,81 @@ FROM test
ORDER BY
key ASC,
exp(key + a) ASC
QUERY id: 0
PROJECTION COLUMNS
key UInt64
a UInt8
PROJECTION
LIST id: 1, nodes: 2
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
COLUMN id: 4, column_name: a, result_type: UInt8, source_id: 3
JOIN TREE
TABLE id: 3, table_name: default.test
ORDER BY
LIST id: 5, nodes: 2
SORT id: 6, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
SORT id: 7, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
FUNCTION id: 8, function_name: exp, function_type: ordinary, result_type: Float64
ARGUMENTS
LIST id: 9, nodes: 1
FUNCTION id: 10, function_name: plus, function_type: ordinary, result_type: UInt64
ARGUMENTS
LIST id: 11, nodes: 2
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
COLUMN id: 4, column_name: a, result_type: UInt8, source_id: 3
QUERY id: 0
PROJECTION COLUMNS
key UInt64
PROJECTION
LIST id: 1, nodes: 1
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
JOIN TREE
TABLE id: 3, table_name: default.test
GROUP BY
LIST id: 4, nodes: 1
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
ORDER BY
LIST id: 5, nodes: 2
SORT id: 6, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
FUNCTION id: 7, function_name: avg, function_type: aggregate, result_type: Float64
ARGUMENTS
LIST id: 8, nodes: 1
COLUMN id: 9, column_name: a, result_type: UInt8, source_id: 3
SORT id: 10, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 2, column_name: key, result_type: UInt64, source_id: 3
QUERY id: 0
PROJECTION COLUMNS
t1.id UInt64
t2.id UInt64
PROJECTION
LIST id: 1, nodes: 2
COLUMN id: 2, column_name: id, result_type: UInt64, source_id: 3
COLUMN id: 4, column_name: id, result_type: UInt64, source_id: 5
JOIN TREE
JOIN id: 6, strictness: ALL, kind: INNER
LEFT TABLE EXPRESSION
TABLE id: 3, table_name: default.t1
RIGHT TABLE EXPRESSION
TABLE id: 5, table_name: default.t2
JOIN EXPRESSION
FUNCTION id: 7, function_name: equals, function_type: ordinary, result_type: UInt8
ARGUMENTS
LIST id: 8, nodes: 2
COLUMN id: 9, column_name: id, result_type: UInt64, source_id: 3
COLUMN id: 10, column_name: id, result_type: UInt64, source_id: 5
ORDER BY
LIST id: 11, nodes: 2
SORT id: 12, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 13, column_name: id, result_type: UInt64, source_id: 3
SORT id: 14, sort_direction: ASCENDING, with_fill: 0
EXPRESSION
COLUMN id: 15, column_name: id, result_type: UInt64, source_id: 5
[0,1,2]
[0,1,2]
[0,1,2]

View File

@ -6,17 +6,37 @@ INSERT INTO test SELECT number, number, toString(number), number from numbers(4)
set optimize_redundant_functions_in_order_by = 1;
SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY x, exp(x));
SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY x, exp(x)) SETTINGS allow_experimental_analyzer=1;
SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY x, exp(exp(x)));
SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY x, exp(exp(x))) SETTINGS allow_experimental_analyzer=1;
SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY exp(x), x);
SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY exp(x), x) SETTINGS allow_experimental_analyzer=1;
SELECT * FROM (SELECT number + 2 AS key FROM numbers(4)) s FULL JOIN test t USING(key) ORDER BY s.key, t.key;
SELECT * FROM (SELECT number + 2 AS key FROM numbers(4)) s FULL JOIN test t USING(key) ORDER BY s.key, t.key SETTINGS allow_experimental_analyzer=1;
SELECT key, a FROM test ORDER BY key, a, exp(key + a);
SELECT key, a FROM test ORDER BY key, a, exp(key + a) SETTINGS allow_experimental_analyzer=1;
SELECT key, a FROM test ORDER BY key, exp(key + a);
SELECT key, a FROM test ORDER BY key, exp(key + a) SETTINGS allow_experimental_analyzer=1;
EXPLAIN SYNTAX SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY x, exp(x));
EXPLAIN QUERY TREE run_passes=1 SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY x, exp(x));
EXPLAIN SYNTAX SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY x, exp(exp(x)));
EXPLAIN QUERY TREE run_passes=1 SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY x, exp(exp(x)));
EXPLAIN SYNTAX SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY exp(x), x);
EXPLAIN QUERY TREE run_passes=1 SELECT groupArray(x) from (SELECT number as x FROM numbers(3) ORDER BY exp(x), x);
EXPLAIN SYNTAX SELECT * FROM (SELECT number + 2 AS key FROM numbers(4)) s FULL JOIN test t USING(key) ORDER BY s.key, t.key;
EXPLAIN QUERY TREE run_passes=1 SELECT * FROM (SELECT number + 2 AS key FROM numbers(4)) s FULL JOIN test t USING(key) ORDER BY s.key, t.key;
EXPLAIN SYNTAX SELECT key, a FROM test ORDER BY key, a, exp(key + a);
EXPLAIN QUERY TREE run_passes=1 SELECT key, a FROM test ORDER BY key, a, exp(key + a);
EXPLAIN SYNTAX SELECT key, a FROM test ORDER BY key, exp(key + a);
EXPLAIN QUERY TREE run_passes=1 SELECT key, a FROM test ORDER BY key, exp(key + a);
EXPLAIN QUERY TREE run_passes=1 SELECT key FROM test GROUP BY key ORDER BY avg(a), key;
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
CREATE TABLE t1 (id UInt64) ENGINE = MergeTree() ORDER BY id;
CREATE TABLE t2 (id UInt64) ENGINE = MergeTree() ORDER BY id;
EXPLAIN QUERY TREE run_passes=1 SELECT * FROM t1 INNER JOIN t2 ON t1.id = t2.id ORDER BY t1.id, t2.id;
set optimize_redundant_functions_in_order_by = 0;
@ -33,4 +53,6 @@ EXPLAIN SYNTAX SELECT * FROM (SELECT number + 2 AS key FROM numbers(4)) s FULL J
EXPLAIN SYNTAX SELECT key, a FROM test ORDER BY key, a, exp(key + a);
EXPLAIN SYNTAX SELECT key, a FROM test ORDER BY key, exp(key + a);
DROP TABLE t1;
DROP TABLE t2;
DROP TABLE test;

View File

@ -1,5 +1,5 @@
#!/usr/bin/env bash
# Tags: long, distributed
# Tags: long, distributed, no-tsan
# These tests don't use the `current_database = currentDatabase()` condition, because the database name isn't propagated during remote queries.
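# Editorial sketch (hypothetical query, not from the diff): a filter such as
#   SELECT count() FROM system.query_log WHERE current_database = currentDatabase();
# would undercount here, because the remote side of a distributed query runs
# under its own default database and does not match the condition.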

View File

@ -0,0 +1,6 @@
SELECT a
FROM t_logical_expressions_optimizer_low_cardinality
WHERE a IN (\'x\', \'y\')
SELECT a
FROM t_logical_expressions_optimizer_low_cardinality
WHERE (b = 0) OR (b = 1)

View File

@ -0,0 +1,10 @@
DROP TABLE IF EXISTS t_logical_expressions_optimizer_low_cardinality;
set optimize_min_equality_disjunction_chain_length=3;
CREATE TABLE t_logical_expressions_optimizer_low_cardinality (a LowCardinality(String), b UInt32) ENGINE = Memory;
-- LowCardinality case: the optimize_min_equality_disjunction_chain_length limit is ignored and the optimizer is applied
EXPLAIN SYNTAX SELECT a FROM t_logical_expressions_optimizer_low_cardinality WHERE a = 'x' OR a = 'y';
-- Non-LowCardinality case: the optimizer is not applied for short chains
EXPLAIN SYNTAX SELECT a FROM t_logical_expressions_optimizer_low_cardinality WHERE b = 0 OR b = 1;
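-- Editorial note tying this to the reference output above: for the
-- LowCardinality column, a = 'x' OR a = 'y' is rewritten to a IN ('x', 'y')
-- even though the chain length (2) is below the configured limit (3), while
-- the plain UInt32 chain stays as (b = 0) OR (b = 1).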
DROP TABLE t_logical_expressions_optimizer_low_cardinality;

View File

@ -0,0 +1,4 @@
SET allow_experimental_analyzer = 1;
SET optimize_rewrite_sum_if_to_count_if = 1;
SELECT sum(if((number % 2) = 0 AS cond_expr, 1 AS one_expr, 0 AS zero_expr) AS if_expr), sum(cond_expr), sum(if_expr), one_expr, zero_expr FROM numbers(100);
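-- Editorial sketch (assumption about the rewrite, not stated in the diff):
-- with the setting above, sum(if(cond, 1, 0)) is expected to be rewritten to
-- countIf(cond); a minimal equivalent of the aggregate above would be
--     SELECT countIf((number % 2) = 0) FROM numbers(100); -- 50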

View File

@ -0,0 +1,14 @@
SET allow_experimental_analyzer = 1;
SET optimize_arithmetic_operations_in_aggregate_functions = 1;
DROP TABLE IF EXISTS test_table;
CREATE TABLE test_table
(
id UInt64,
value UInt64
) ENGINE=MergeTree ORDER BY id;
INSERT INTO test_table VALUES (1, 1);
INSERT INTO test_table VALUES (1, 1);
SELECT sum((2 * id) as func), func FROM test_table GROUP BY id;
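-- Editorial sketch (assumption, not from the diff): this optimization moves
-- arithmetic out of the aggregate, so sum(2 * id) can be evaluated as
-- 2 * sum(id); the alias `func` checks that references to the original
-- expression still resolve after the rewrite.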

View File

@ -0,0 +1,11 @@
SET join_use_nulls = 1;
SELECT b.id
FROM (
SELECT toLowCardinality(0 :: UInt32) AS id
GROUP BY []
) AS a
SEMI LEFT JOIN (
SELECT toLowCardinality(1 :: UInt64) AS id
) AS b
USING (id);

View File

@ -0,0 +1,6 @@
drop table if exists test;
create table test (x UInt32, y String) engine=Memory;
insert into test select * from generateRandom() limit 10;
select count() from test;
drop table test;

View File

@ -23,7 +23,7 @@ export CLICKHOUSE_TEST_UNIQUE_NAME="${CLICKHOUSE_TEST_NAME}_${CLICKHOUSE_DATABAS
[ -n "${CLICKHOUSE_DATABASE:-}" ] && CLICKHOUSE_BENCHMARK_OPT0+=" --database=${CLICKHOUSE_DATABASE} "
[ -n "${CLICKHOUSE_LOG_COMMENT:-}" ] && CLICKHOUSE_BENCHMARK_OPT0+=" --log_comment $(printf '%q' ${CLICKHOUSE_LOG_COMMENT}) "
export CLICKHOUSE_BINARY=${CLICKHOUSE_BINARY:="clickhouse"}
export CLICKHOUSE_BINARY=${CLICKHOUSE_BINARY:="$(command -v clickhouse)"}
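# Presumably "$(command -v clickhouse)" resolves to an absolute path so that
# the executability checks below ([ -x "$CLICKHOUSE_BINARY-client" ]) still
# work when the binary is only reachable via PATH.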
# client
[ -x "$CLICKHOUSE_BINARY-client" ] && CLICKHOUSE_CLIENT_BINARY=${CLICKHOUSE_CLIENT_BINARY:=$CLICKHOUSE_BINARY-client}
[ -x "$CLICKHOUSE_BINARY" ] && CLICKHOUSE_CLIENT_BINARY=${CLICKHOUSE_CLIENT_BINARY:=$CLICKHOUSE_BINARY client}