Merge branch 'master' into DOCAPI-3772-arraySort2

This commit is contained in:
ogorbacheva 2019-04-22 14:59:13 +03:00
commit 0573f7eb1f
28 changed files with 324 additions and 307 deletions

2
contrib/lz4 vendored

@ -1 +1 @@
Subproject commit c10863b98e1503af90616ae99725ecd120265dfb
Subproject commit 780aac520b69d6369f4e3995624c37e56d75498d

View File

@ -9,8 +9,7 @@ add_library (lz4
${LIBRARY_DIR}/xxhash.h
${LIBRARY_DIR}/lz4.h
${LIBRARY_DIR}/lz4hc.h
${LIBRARY_DIR}/lz4opt.h)
${LIBRARY_DIR}/lz4hc.h)
target_compile_definitions(lz4 PUBLIC LZ4_DISABLE_DEPRECATE_WARNINGS=1)

View File

@ -43,13 +43,13 @@ using Arenas = std::vector<ArenaPtr>;
* specifying which individual values should be destroyed and which ones should not.
* Clearly, this method would have a substantially non-zero price.
*/
class ColumnAggregateFunction final : public COWPtrHelper<IColumn, ColumnAggregateFunction>
class ColumnAggregateFunction final : public COWHelper<IColumn, ColumnAggregateFunction>
{
public:
using Container = PaddedPODArray<AggregateDataPtr>;
private:
friend class COWPtrHelper<IColumn, ColumnAggregateFunction>;
friend class COWHelper<IColumn, ColumnAggregateFunction>;
/// Memory pools. Aggregate states are allocated from them.
Arenas arenas;

View File

@ -13,10 +13,10 @@ namespace DB
* In memory, it is represented as one column of a nested type, whose size is equal to the sum of the sizes of all arrays,
* and as an array of offsets in it, which allows you to get each element.
*/
class ColumnArray final : public COWPtrHelper<IColumn, ColumnArray>
class ColumnArray final : public COWHelper<IColumn, ColumnArray>
{
private:
friend class COWPtrHelper<IColumn, ColumnArray>;
friend class COWHelper<IColumn, ColumnArray>;
/** Create an array column with specified values and offsets. */
ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && offsets_column);
@ -30,7 +30,7 @@ public:
/** Create immutable column using immutable arguments. These arguments may be shared with other columns.
* Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWPtrHelper<IColumn, ColumnArray>;
using Base = COWHelper<IColumn, ColumnArray>;
static Ptr create(const ColumnPtr & nested_column, const ColumnPtr & offsets_column)
{

View File

@ -18,10 +18,10 @@ namespace ErrorCodes
/** ColumnConst contains another column with single element,
* but looks like a column with arbitrary amount of same elements.
*/
class ColumnConst final : public COWPtrHelper<IColumn, ColumnConst>
class ColumnConst final : public COWHelper<IColumn, ColumnConst>
{
private:
friend class COWPtrHelper<IColumn, ColumnConst>;
friend class COWHelper<IColumn, ColumnConst>;
WrappedPtr data;
size_t s;

View File

@ -55,13 +55,13 @@ private:
/// A ColumnVector for Decimals
template <typename T>
class ColumnDecimal final : public COWPtrHelper<ColumnVectorHelper, ColumnDecimal<T>>
class ColumnDecimal final : public COWHelper<ColumnVectorHelper, ColumnDecimal<T>>
{
static_assert(IsDecimalNumber<T>);
private:
using Self = ColumnDecimal;
friend class COWPtrHelper<ColumnVectorHelper, Self>;
friend class COWHelper<ColumnVectorHelper, Self>;
public:
using Container = DecimalPaddedPODArray<T>;

View File

@ -13,10 +13,10 @@ namespace DB
/** A column of values of "fixed-length string" type.
* If you insert a smaller string, it will be padded with zero bytes.
*/
class ColumnFixedString final : public COWPtrHelper<ColumnVectorHelper, ColumnFixedString>
class ColumnFixedString final : public COWHelper<ColumnVectorHelper, ColumnFixedString>
{
public:
friend class COWPtrHelper<ColumnVectorHelper, ColumnFixedString>;
friend class COWHelper<ColumnVectorHelper, ColumnFixedString>;
using Chars = PaddedPODArray<UInt8>;

View File

@ -15,10 +15,10 @@ namespace DB
/** A column containing a lambda expression.
* Behaves like a constant-column. Contains an expression, but not input or output data.
*/
class ColumnFunction final : public COWPtrHelper<IColumn, ColumnFunction>
class ColumnFunction final : public COWHelper<IColumn, ColumnFunction>
{
private:
friend class COWPtrHelper<IColumn, ColumnFunction>;
friend class COWHelper<IColumn, ColumnFunction>;
ColumnFunction(size_t size, FunctionBasePtr function, const ColumnsWithTypeAndName & columns_to_capture);

View File

@ -14,9 +14,9 @@ namespace ErrorCodes
extern const int ILLEGAL_COLUMN;
}
class ColumnLowCardinality final : public COWPtrHelper<IColumn, ColumnLowCardinality>
class ColumnLowCardinality final : public COWHelper<IColumn, ColumnLowCardinality>
{
friend class COWPtrHelper<IColumn, ColumnLowCardinality>;
friend class COWHelper<IColumn, ColumnLowCardinality>;
ColumnLowCardinality(MutableColumnPtr && column_unique, MutableColumnPtr && indexes, bool is_shared = false);
ColumnLowCardinality(const ColumnLowCardinality & other) = default;
@ -25,7 +25,7 @@ public:
/** Create immutable column using immutable arguments. These arguments may be shared with other columns.
* Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWPtrHelper<IColumn, ColumnLowCardinality>;
using Base = COWHelper<IColumn, ColumnLowCardinality>;
static Ptr create(const ColumnPtr & column_unique_, const ColumnPtr & indexes_, bool is_shared = false)
{
return ColumnLowCardinality::create(column_unique_->assumeMutable(), indexes_->assumeMutable(), is_shared);

View File

@ -6,10 +6,10 @@
namespace DB
{
class ColumnNothing final : public COWPtrHelper<IColumnDummy, ColumnNothing>
class ColumnNothing final : public COWHelper<IColumnDummy, ColumnNothing>
{
private:
friend class COWPtrHelper<IColumnDummy, ColumnNothing>;
friend class COWHelper<IColumnDummy, ColumnNothing>;
ColumnNothing(size_t s_)
{

View File

@ -20,10 +20,10 @@ using ConstNullMapPtr = const NullMap *;
/// over a bitmap because columns are usually stored on disk as compressed
/// files. In this regard, using a bitmap instead of a byte map would
/// greatly complicate the implementation with little to no benefits.
class ColumnNullable final : public COWPtrHelper<IColumn, ColumnNullable>
class ColumnNullable final : public COWHelper<IColumn, ColumnNullable>
{
private:
friend class COWPtrHelper<IColumn, ColumnNullable>;
friend class COWHelper<IColumn, ColumnNullable>;
ColumnNullable(MutableColumnPtr && nested_column_, MutableColumnPtr && null_map_);
ColumnNullable(const ColumnNullable &) = default;
@ -32,7 +32,7 @@ public:
/** Create immutable column using immutable arguments. These arguments may be shared with other columns.
* Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWPtrHelper<IColumn, ColumnNullable>;
using Base = COWHelper<IColumn, ColumnNullable>;
static Ptr create(const ColumnPtr & nested_column_, const ColumnPtr & null_map_)
{
return ColumnNullable::create(nested_column_->assumeMutable(), null_map_->assumeMutable());

View File

@ -14,10 +14,10 @@ using ConstSetPtr = std::shared_ptr<const Set>;
* Behaves like a constant-column (because the set is one, not its own for each line).
* This column has a nonstandard value, so it can not be obtained via a normal interface.
*/
class ColumnSet final : public COWPtrHelper<IColumnDummy, ColumnSet>
class ColumnSet final : public COWHelper<IColumnDummy, ColumnSet>
{
private:
friend class COWPtrHelper<IColumnDummy, ColumnSet>;
friend class COWHelper<IColumnDummy, ColumnSet>;
ColumnSet(size_t s_, const ConstSetPtr & data_) : data(data_) { s = s_; }
ColumnSet(const ColumnSet &) = default;

View File

@ -18,14 +18,14 @@ namespace DB
/** Column for String values.
*/
class ColumnString final : public COWPtrHelper<IColumn, ColumnString>
class ColumnString final : public COWHelper<IColumn, ColumnString>
{
public:
using Char = UInt8;
using Chars = PaddedPODArray<UInt8>;
private:
friend class COWPtrHelper<IColumn, ColumnString>;
friend class COWHelper<IColumn, ColumnString>;
/// Maps i'th position to offset to i+1'th element. Last offset maps to the end of all chars (is the size of all chars).
Offsets offsets;

View File

@ -12,10 +12,10 @@ namespace DB
* Mixed constant/non-constant columns are prohibited in a tuple
* for implementation simplicity.
*/
class ColumnTuple final : public COWPtrHelper<IColumn, ColumnTuple>
class ColumnTuple final : public COWHelper<IColumn, ColumnTuple>
{
private:
friend class COWPtrHelper<IColumn, ColumnTuple>;
friend class COWHelper<IColumn, ColumnTuple>;
using TupleColumns = std::vector<WrappedPtr>;
TupleColumns columns;
@ -30,7 +30,7 @@ public:
/** Create immutable column using immutable arguments. These arguments may be shared with other columns.
* Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWPtrHelper<IColumn, ColumnTuple>;
using Base = COWHelper<IColumn, ColumnTuple>;
static Ptr create(const Columns & columns);
static Ptr create(const TupleColumns & columns);
static Ptr create(Columns && arg) { return create(arg); }

View File

@ -25,9 +25,9 @@ namespace ErrorCodes
}
template <typename ColumnType>
class ColumnUnique final : public COWPtrHelper<IColumnUnique, ColumnUnique<ColumnType>>
class ColumnUnique final : public COWHelper<IColumnUnique, ColumnUnique<ColumnType>>
{
friend class COWPtrHelper<IColumnUnique, ColumnUnique<ColumnType>>;
friend class COWHelper<IColumnUnique, ColumnUnique<ColumnType>>;
private:
explicit ColumnUnique(MutableColumnPtr && holder, bool is_nullable);

View File

@ -90,13 +90,13 @@ template <> struct CompareHelper<Float64> : public FloatCompareHelper<Float64> {
/** A template for columns that use a simple array to store.
*/
template <typename T>
class ColumnVector final : public COWPtrHelper<ColumnVectorHelper, ColumnVector<T>>
class ColumnVector final : public COWHelper<ColumnVectorHelper, ColumnVector<T>>
{
static_assert(!IsDecimalNumber<T>);
private:
using Self = ColumnVector;
friend class COWPtrHelper<ColumnVectorHelper, Self>;
friend class COWHelper<ColumnVectorHelper, Self>;
struct less;
struct greater;

View File

@ -1,7 +1,7 @@
#pragma once
#include <Core/Field.h>
#include <Common/COWPtr.h>
#include <Common/COW.h>
#include <Common/PODArray.h>
#include <Common/Exception.h>
#include <common/StringRef.h>
@ -24,13 +24,13 @@ class Arena;
class ColumnGathererStream;
/// Declares interface to store columns in memory.
class IColumn : public COWPtr<IColumn>
class IColumn : public COW<IColumn>
{
private:
friend class COWPtr<IColumn>;
friend class COW<IColumn>;
/// Creates the same column with the same data.
/// This is internal method to use from COWPtr.
/// This is internal method to use from COW.
/// It performs shallow copy with copy-ctor and not useful from outside.
/// If you want to copy column for modification, look at 'mutate' method.
virtual MutablePtr clone() const = 0;

View File

@ -10,10 +10,10 @@
*
* Usage:
class Column : public COWPtr<Column>
class Column : public COW<Column>
{
private:
friend class COWPtr<Column>;
friend class COW<Column>;
/// Leave all constructors in private section. They will be available through 'create' method.
Column();
@ -23,7 +23,7 @@
public:
/// Correctly use const qualifiers in your interface.
virtual ~IColumn() {}
virtual ~Column() {}
};
* It will provide 'create' and 'mutate' methods.
@ -63,7 +63,7 @@
* Actually it is, if your values are small or if copying is done implicitly.
* This is the case for string implementations.
*
* In contrast, COWPtr is intended for the cases when you need to share states of large objects,
* In contrast, COW is intended for the cases when you need to share states of large objects,
* (when you usually will use std::shared_ptr) but you also want precise control over modification
* of this shared state.
*
@ -73,7 +73,7 @@
* to use std::unique_ptr for it somehow.
*/
template <typename Derived>
class COWPtr : public boost::intrusive_ref_counter<Derived>
class COW : public boost::intrusive_ref_counter<Derived>
{
private:
Derived * derived() { return static_cast<Derived *>(this); }
@ -96,8 +96,8 @@ protected:
private:
using Base = IntrusivePtr<T>;
template <typename> friend class COWPtr;
template <typename, typename> friend class COWPtrHelper;
template <typename> friend class COW;
template <typename, typename> friend class COWHelper;
explicit mutable_ptr(T * ptr) : Base(ptr) {}
@ -115,7 +115,7 @@ protected:
mutable_ptr() = default;
mutable_ptr(const std::nullptr_t *) {}
mutable_ptr(std::nullptr_t) {}
};
public:
@ -128,8 +128,8 @@ protected:
private:
using Base = IntrusivePtr<const T>;
template <typename> friend class COWPtr;
template <typename, typename> friend class COWPtrHelper;
template <typename> friend class COW;
template <typename, typename> friend class COWHelper;
explicit immutable_ptr(const T * ptr) : Base(ptr) {}
@ -159,7 +159,7 @@ protected:
immutable_ptr() = default;
immutable_ptr(const std::nullptr_t *) {}
immutable_ptr(std::nullptr_t) {}
};
public:
@ -192,7 +192,7 @@ public:
MutablePtr assumeMutable() const
{
return const_cast<COWPtr*>(this)->getPtr();
return const_cast<COW*>(this)->getPtr();
}
Derived & assumeMutableRef() const
@ -244,7 +244,7 @@ public:
*
* NOTE:
* If you override 'mutate' method in inherited classes, don't forget to make it virtual in base class or to make it call a virtual method.
* (COWPtr itself doesn't force any methods to be virtual).
* (COW itself doesn't force any methods to be virtual).
*
* See example in "cow_compositions.cpp".
*/
@ -255,22 +255,22 @@ public:
/** Helper class to support inheritance.
* Example:
*
* class IColumn : public COWPtr<IColumn>
* class IColumn : public COW<IColumn>
* {
* friend class COWPtr<IColumn>;
* friend class COW<IColumn>;
* virtual MutablePtr clone() const = 0;
* virtual ~IColumn() {}
* };
*
* class ConcreteColumn : public COWPtrHelper<IColumn, ConcreteColumn>
* class ConcreteColumn : public COWHelper<IColumn, ConcreteColumn>
* {
* friend class COWPtrHelper<IColumn, ConcreteColumn>;
* friend class COWHelper<IColumn, ConcreteColumn>;
* };
*
* Here is complete inheritance diagram:
*
* ConcreteColumn
* COWPtrHelper<IColumn, ConcreteColumn>
* COWHelper<IColumn, ConcreteColumn>
* IColumn
* COW<IColumn>
* boost::intrusive_ref_counter<IColumn>
@ -278,7 +278,7 @@ public:
* See example in "cow_columns.cpp".
*/
template <typename Base, typename Derived>
class COWPtrHelper : public Base
class COWHelper : public Base
{
private:
Derived * derived() { return static_cast<Derived *>(this); }

View File

@ -1,11 +1,11 @@
#include <Common/COWPtr.h>
#include <Common/COW.h>
#include <iostream>
class IColumn : public COWPtr<IColumn>
class IColumn : public COW<IColumn>
{
private:
friend class COWPtr<IColumn>;
friend class COW<IColumn>;
virtual MutablePtr clone() const = 0;
public:
@ -22,10 +22,10 @@ public:
using ColumnPtr = IColumn::Ptr;
using MutableColumnPtr = IColumn::MutablePtr;
class ConcreteColumn : public COWPtrHelper<IColumn, ConcreteColumn>
class ConcreteColumn : public COWHelper<IColumn, ConcreteColumn>
{
private:
friend class COWPtrHelper<IColumn, ConcreteColumn>;
friend class COWHelper<IColumn, ConcreteColumn>;
int data;
ConcreteColumn(int data) : data(data) {}

View File

@ -1,11 +1,11 @@
#include <Common/COWPtr.h>
#include <Common/COW.h>
#include <iostream>
class IColumn : public COWPtr<IColumn>
class IColumn : public COW<IColumn>
{
private:
friend class COWPtr<IColumn>;
friend class COW<IColumn>;
virtual MutablePtr clone() const = 0;
virtual MutablePtr deepMutate() const { return shallowMutate(); }
@ -24,10 +24,10 @@ public:
using ColumnPtr = IColumn::Ptr;
using MutableColumnPtr = IColumn::MutablePtr;
class ConcreteColumn : public COWPtrHelper<IColumn, ConcreteColumn>
class ConcreteColumn : public COWHelper<IColumn, ConcreteColumn>
{
private:
friend class COWPtrHelper<IColumn, ConcreteColumn>;
friend class COWHelper<IColumn, ConcreteColumn>;
int data;
ConcreteColumn(int data) : data(data) {}
@ -38,10 +38,10 @@ public:
void set(int value) override { data = value; }
};
class ColumnComposition : public COWPtrHelper<IColumn, ColumnComposition>
class ColumnComposition : public COWHelper<IColumn, ColumnComposition>
{
private:
friend class COWPtrHelper<IColumn, ColumnComposition>;
friend class COWHelper<IColumn, ColumnComposition>;
ConcreteColumn::WrappedPtr wrapped;

View File

@ -12,7 +12,7 @@
#include <Interpreters/ExpressionAnalyzer.h>
#include <Parsers/IAST.h>
#include <Storages/IStorage.h>
#include <Common/COWPtr.h>
#include <Common/COW.h>
#include <Common/FieldVisitors.h>
namespace DB

View File

@ -1,7 +1,7 @@
#pragma once
#include <memory>
#include <Common/COWPtr.h>
#include <Common/COW.h>
#include <boost/noncopyable.hpp>
#include <Core/Field.h>
@ -17,8 +17,8 @@ class IDataType;
struct FormatSettings;
class IColumn;
using ColumnPtr = COWPtr<IColumn>::Ptr;
using MutableColumnPtr = COWPtr<IColumn>::MutablePtr;
using ColumnPtr = COW<IColumn>::Ptr;
using MutableColumnPtr = COW<IColumn>::MutablePtr;
using DataTypePtr = std::shared_ptr<const IDataType>;
using DataTypes = std::vector<DataTypePtr>;

View File

@ -55,7 +55,7 @@ For very large clusters, you can use different ZooKeeper clusters for different
Replication is asynchronous and multi-master. `INSERT` queries (as well as `ALTER`) can be sent to any available server. Data is inserted on the server where the query is run, and then it is copied to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If part of the replicas are not available, the data is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network.
By default, an INSERT query waits for confirmation of writing the data from only one replica. If the data was successfully written to only one replica and the server with this replica ceases to exist, the stored data will be lost. Tp enable getting confirmation of data writes from multiple replicas, use the `insert_quorum` option.
By default, an INSERT query waits for confirmation of writing the data from only one replica. If the data was successfully written to only one replica and the server with this replica ceases to exist, the stored data will be lost. To enable getting confirmation of data writes from multiple replicas, use the `insert_quorum` option.
Each block of data is written atomically. The INSERT query is divided into blocks up to `max_insert_block_size = 1048576` rows. In other words, if the `INSERT` query has fewer than 1048576 rows, it is made atomically.

View File

@ -156,7 +156,7 @@ Here, a sample of 10% is taken from the second half of the data.
### ARRAY JOIN Clause {#select-array-join-clause}
Allows executing `JOIN` with an array or nested data structure. Allows you to perform `JOIN` both with the external array and with the inner array in the table. The intent is similar to the [arrayJoin](functions/array_functions.md#array_functions-join) function, but its functionality is broader.
Allows executing `JOIN` with an array or nested data structure. The intent is similar to the [arrayJoin](functions/array_join.md#functions_arrayjoin) function, but its functionality is broader.
``` sql
SELECT <expr_list>
@ -168,14 +168,14 @@ FROM <left_subquery>
You can specify only a single `ARRAY JOIN` clause in a query.
When running the `ARRAY JOIN`, there is an optimization of the query execution order. Although the `ARRAY JOIN` must be always specified before the `WHERE/PREWHERE` clause, it can be performed as before the `WHERE/PREWHERE` (if its result is needed in this clause), as after completing it (to reduce the volume of calculations). The processing order is controlled by the query optimizer.
The query execution order is optimized when running `ARRAY JOIN`. Although `ARRAY JOIN` must always be specified before the `WHERE/PREWHERE` clause, it can be performed either before `WHERE/PREWHERE` (if the result is needed in this clause), or after completing it (to reduce the volume of calculations). The processing order is controlled by the query optimizer.
Supported types of `ARRAY JOIN` are listed below:
- `ARRAY JOIN` - Executing `JOIN` with an array or nested data structure. Empty arrays are not included in the result.
- `LEFT ARRAY JOIN` - Unlike `ARRAY JOIN`, when using the `LEFT ARRAY JOIN` the result contains the rows with empty arrays. The value for an empty array is set to default value for an array element type (usually 0, empty string or NULL).
- `ARRAY JOIN` - In this case, empty arrays are not included in the result of `JOIN`.
- `LEFT ARRAY JOIN` - The result of `JOIN` contains rows with empty arrays. The value for an empty array is set to the default value for the array element type (usually 0, empty string or NULL).
Examples below demonstrate the usage of the `ARRAY JOIN` clause. Let's create a table with an [Array](../data_types/array.md) type column and insert values into it:
The examples below demonstrate the usage of the `ARRAY JOIN` and `LEFT ARRAY JOIN` clauses. Let's create a table with an [Array](../data_types/array.md) type column and insert values into it:
``` sql
CREATE TABLE arrays_test
@ -195,7 +195,7 @@ VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []);
└─────────────┴─────────┘
```
The first example shows using the `ARRAY JOIN` clause:
The example below uses the `ARRAY JOIN` clause:
``` sql
SELECT s, arr
@ -212,7 +212,7 @@ ARRAY JOIN arr;
└───────┴─────┘
```
The second example shows using the `LEFT ARRAY JOIN` clause:
The next example uses the `LEFT ARRAY JOIN` clause:
``` sql
SELECT s, arr
@ -230,7 +230,27 @@ LEFT ARRAY JOIN arr;
└─────────────┴─────┘
```
The next example demonstrates using the `ARRAY JOIN` with the external array:
#### Using Aliases
An alias can be specified for an array in the `ARRAY JOIN` clause. In this case, an array item can be accessed by this alias, but the array itself is accessed by the original name. Example:
``` sql
SELECT s, arr, a
FROM arrays_test
ARRAY JOIN arr AS a;
```
```
┌─s─────┬─arr─────┬─a─┐
│ Hello │ [1,2] │ 1 │
│ Hello │ [1,2] │ 2 │
│ World │ [3,4,5] │ 3 │
│ World │ [3,4,5] │ 4 │
│ World │ [3,4,5] │ 5 │
└───────┴─────────┴───┘
```
Using aliases, you can perform `ARRAY JOIN` with an external array. For example:
``` sql
SELECT s, arr_external
@ -252,27 +272,7 @@ ARRAY JOIN [1, 2, 3] AS arr_external;
└─────────────┴──────────────┘
```
#### Using Aliases
An alias can be specified for an array in the `ARRAY JOIN` clause. In this case, an array item can be accessed by this alias, but the array itself by the original name. Example:
``` sql
SELECT s, arr, a
FROM arrays_test
ARRAY JOIN arr AS a;
```
```
┌─s─────┬─arr─────┬─a─┐
│ Hello │ [1,2] │ 1 │
│ Hello │ [1,2] │ 2 │
│ World │ [3,4,5] │ 3 │
│ World │ [3,4,5] │ 4 │
│ World │ [3,4,5] │ 5 │
└───────┴─────────┴───┘
```
Multiple arrays of the same size can be comma-separated in the `ARRAY JOIN` clause. In this case, `JOIN` is performed with them simultaneously (the direct sum, not the cartesian product). Example:
Multiple arrays can be comma-separated in the `ARRAY JOIN` clause. In this case, `JOIN` is performed with them simultaneously (the direct sum, not the cartesian product). Note that all the arrays must have the same size. Example:
``` sql
SELECT s, arr, a, num, mapped
@ -290,6 +290,8 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS ma
└───────┴─────────┴───┴─────┴────────┘
```
The example below uses the [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate) function:
``` sql
SELECT s, arr, a, num, arrayEnumerate(arr)
FROM arrays_test
@ -308,7 +310,7 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num;
#### ARRAY JOIN With Nested Data Structure
`ARRAY JOIN` also works with [nested data structure](../data_types/nested_data_structures/nested.md). Example:
`ARRAY JOIN` also works with [nested data structures](../data_types/nested_data_structures/nested.md). Example:
``` sql
CREATE TABLE nested_test
@ -401,7 +403,7 @@ ARRAY JOIN nest AS n;
└───────┴─────┴─────┴─────────┴────────────┘
```
The example of using the [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate) function:
Example of using the [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate) function:
``` sql
SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num
@ -444,7 +446,7 @@ The table names can be specified instead of `<left_subquery>` and `<right_subque
- `FULL JOIN` (or `FULL OUTER JOIN`)
- `CROSS JOIN` (or `,` )
See standard [SQL JOIN](https://en.wikipedia.org/wiki/Join_(SQL)) description.
See the standard [SQL JOIN](https://en.wikipedia.org/wiki/Join_(SQL)) description.
**ANY or ALL strictness**

View File

@ -198,7 +198,7 @@ SELECT countEqual([1, 2, NULL, NULL], NULL)
└──────────────────────────────────────┘
```
## arrayEnumerate(arr)
## arrayEnumerate(arr) {#array_functions-arrayenumerate}
Возвращает массив \[1, 2, 3, ..., length(arr)\]

View File

@ -161,50 +161,53 @@ SAMPLE 1/10 OFFSET 1/2
### Секция ARRAY JOIN {#select-array-join-clause}
Позволяет выполнить JOIN с массивом или вложенной структурой данных. Смысл похож на функцию arrayJoin, но функциональность более широкая.
`ARRAY JOIN` - это, по сути, `INNER JOIN` с массивом. Пример:
Позволяет выполнить `JOIN` с массивом или вложенной структурой данных. Смысл похож на функцию [arrayJoin](functions/array_join.md#functions_arrayjoin), но функциональность более широкая.
``` sql
SELECT <expr_list>
FROM <left_subquery>
[LEFT] ARRAY JOIN <array>
[WHERE|PREWHERE <expr>]
...
```
:) CREATE TABLE arrays_test (s String, arr Array(UInt8)) ENGINE = Memory
В запросе может быть указано не более одной секции `ARRAY JOIN`.
При использовании `ARRAY JOIN`, порядок выполнения запроса оптимизируется. Несмотря на то что секция `ARRAY JOIN` всегда указывается перед выражением `WHERE / PREWHERE`, преобразование `JOIN` может быть выполнено как до выполнения выражения `WHERE / PREWHERE` (если результат необходим в этом выражении), так и после (чтобы уменьшить объем расчетов). Порядок обработки контролируется оптимизатором запросов.
Секция `ARRAY JOIN` поддерживает следующие формы записи:
- `ARRAY JOIN` — в этом случае результат `JOIN` не будет содержать пустые массивы;
- `LEFT ARRAY JOIN` — пустые массивы попадут в результат выполнения `JOIN`. В качестве значения для пустых массивов устанавливается значение по умолчанию. Обычно это 0, пустая строка или NULL, в зависимости от типа элементов массива.
Рассмотрим примеры использования `ARRAY JOIN` и `LEFT ARRAY JOIN`. Для начала создадим таблицу, содержащую столбец с типом [Array](../data_types/array.md), и добавим в него значение:
``` sql
CREATE TABLE arrays_test
(
s String,
arr Array(UInt8)
) ENGINE = Memory
) ENGINE = Memory;
Ok.
INSERT INTO arrays_test
VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []);
```
```
┌─s───────────┬─arr─────┐
│ Hello │ [1,2] │
│ World │ [3,4,5] │
│ Goodbye │ [] │
└─────────────┴─────────┘
```
0 rows in set. Elapsed: 0.001 sec.
:) INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', [])
INSERT INTO arrays_test VALUES
Ok.
3 rows in set. Elapsed: 0.001 sec.
:) SELECT * FROM arrays_test
SELECT *
FROM arrays_test
┌─s───────┬─arr─────┐
│ Hello │ [1,2] │
│ World │ [3,4,5] │
│ Goodbye │ [] │
└─────────┴─────────┘
3 rows in set. Elapsed: 0.001 sec.
:) SELECT s, arr FROM arrays_test ARRAY JOIN arr
В примере ниже используется `ARRAY JOIN`:
``` sql
SELECT s, arr
FROM arrays_test
ARRAY JOIN arr
ARRAY JOIN arr;
```
```
┌─s─────┬─arr─┐
│ Hello │ 1 │
│ Hello │ 2 │
@ -212,19 +215,37 @@ ARRAY JOIN arr
│ World │ 4 │
│ World │ 5 │
└───────┴─────┘
5 rows in set. Elapsed: 0.001 sec.
```
Для массива в секции ARRAY JOIN может быть указан алиас. В этом случае, элемент массива будет доступен под этим алиасом, а сам массив - под исходным именем. Пример:
Следующий пример использует `LEFT ARRAY JOIN`:
``` sql
SELECT s, arr
FROM arrays_test
LEFT ARRAY JOIN arr;
```
:) SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a
```
┌─s───────────┬─arr─┐
│ Hello │ 1 │
│ Hello │ 2 │
│ World │ 3 │
│ World │ 4 │
│ World │ 5 │
│ Goodbye │ 0 │
└─────────────┴─────┘
```
#### Использование алиасов
Для массива в секции `ARRAY JOIN` может быть указан алиас. В этом случае, элемент массива будет доступен под этим алиасом, а сам массив — под исходным именем. Пример:
``` sql
SELECT s, arr, a
FROM arrays_test
ARRAY JOIN arr AS a
ARRAY JOIN arr AS a;
```
```
┌─s─────┬─arr─────┬─a─┐
│ Hello │ [1,2] │ 1 │
│ Hello │ [1,2] │ 2 │
@ -232,19 +253,39 @@ ARRAY JOIN arr AS a
│ World │ [3,4,5] │ 4 │
│ World │ [3,4,5] │ 5 │
└───────┴─────────┴───┘
5 rows in set. Elapsed: 0.001 sec.
```
В секции ARRAY JOIN может быть указано несколько массивов одинаковых размеров через запятую. В этом случае, JOIN делается с ними одновременно (прямая сумма, а не прямое произведение). Пример:
Используя алиасы, можно выполнять `JOIN` с внешними массивами:
``` sql
SELECT s, arr_external
FROM arrays_test
ARRAY JOIN [1, 2, 3] AS arr_external;
```
```
:) SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped
┌─s───────────┬─arr_external─┐
│ Hello │ 1 │
│ Hello │ 2 │
│ Hello │ 3 │
│ World │ 1 │
│ World │ 2 │
│ World │ 3 │
│ Goodbye │ 1 │
│ Goodbye │ 2 │
│ Goodbye │ 3 │
└─────────────┴──────────────┘
```
В секции `ARRAY JOIN` можно указать через запятую сразу несколько массивов. В этом случае, `JOIN` делается с ними одновременно (прямая сумма, а не прямое произведение). Обратите внимание, массивы должны быть одинаковых размеров. Примеры:
``` sql
SELECT s, arr, a, num, mapped
FROM arrays_test
ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(lambda(tuple(x), plus(x, 1)), arr) AS mapped
ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped;
```
```
┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐
│ Hello │ [1,2] │ 1 │ 1 │ 2 │
│ Hello │ [1,2] │ 2 │ 2 │ 3 │
@ -252,15 +293,17 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(lambda(tuple(x), plus(
│ World │ [3,4,5] │ 4 │ 2 │ 5 │
│ World │ [3,4,5] │ 5 │ 3 │ 6 │
└───────┴─────────┴───┴─────┴────────┘
```
5 rows in set. Elapsed: 0.002 sec.
:) SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num
В примере ниже используется функция [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate):
``` sql
SELECT s, arr, a, num, arrayEnumerate(arr)
FROM arrays_test
ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num
ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num;
```
```
┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐
│ Hello │ [1,2] │ 1 │ 1 │ [1,2] │
│ Hello │ [1,2] │ 2 │ 2 │ [1,2] │
@ -268,54 +311,40 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num
│ World │ [3,4,5] │ 4 │ 2 │ [1,2,3] │
│ World │ [3,4,5] │ 5 │ 3 │ [1,2,3] │
└───────┴─────────┴───┴─────┴─────────────────────┘
5 rows in set. Elapsed: 0.002 sec.
```
ARRAY JOIN также работает с вложенными структурами данных. Пример:
#### ARRAY JOIN с вложенными структурами данных
```
:) CREATE TABLE nested_test (s String, nest Nested(x UInt8, y UInt32)) ENGINE = Memory
`ARRAY JOIN` также работает с [вложенными структурами данных](../data_types/nested_data_structures/nested.md). Пример:
``` sql
CREATE TABLE nested_test
(
s String,
nest Nested(
x UInt8,
y UInt32)
) ENGINE = Memory
) ENGINE = Memory;
Ok.
0 rows in set. Elapsed: 0.006 sec.
:) INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], [])
INSERT INTO nested_test VALUES
Ok.
3 rows in set. Elapsed: 0.001 sec.
:) SELECT * FROM nested_test
SELECT *
FROM nested_test
INSERT INTO nested_test
VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []);
```
```
┌─s───────┬─nest.x──┬─nest.y─────┐
│ Hello │ [1,2] │ [10,20] │
│ World │ [3,4,5] │ [30,40,50] │
│ Goodbye │ [] │ [] │
└─────────┴─────────┴────────────┘
```
3 rows in set. Elapsed: 0.001 sec.
:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest
``` sql
SELECT s, `nest.x`, `nest.y`
FROM nested_test
ARRAY JOIN nest
ARRAY JOIN nest;
```
```
┌─s─────┬─nest.x─┬─nest.y─┐
│ Hello │ 1 │ 10 │
│ Hello │ 2 │ 20 │
@ -323,19 +352,17 @@ ARRAY JOIN nest
│ World │ 4 │ 40 │
│ World │ 5 │ 50 │
└───────┴────────┴────────┘
5 rows in set. Elapsed: 0.001 sec.
```
При указании имени вложенной структуры данных в ARRAY JOIN, смысл такой же, как ARRAY JOIN со всеми элементами-массивами, из которых она состоит. Пример:
```
:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x, nest.y
При указании имени вложенной структуры данных в `ARRAY JOIN`, смысл такой же, как `ARRAY JOIN` со всеми элементами-массивами, из которых она состоит. Пример:
``` sql
SELECT s, `nest.x`, `nest.y`
FROM nested_test
ARRAY JOIN `nest.x`, `nest.y`
ARRAY JOIN `nest.x`, `nest.y`;
```
```
┌─s─────┬─nest.x─┬─nest.y─┐
│ Hello │ 1 │ 10 │
│ Hello │ 2 │ 20 │
@ -343,19 +370,17 @@ ARRAY JOIN `nest.x`, `nest.y`
│ World │ 4 │ 40 │
│ World │ 5 │ 50 │
└───────┴────────┴────────┘
5 rows in set. Elapsed: 0.001 sec.
```
Такой вариант тоже имеет смысл:
```
:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x
``` sql
SELECT s, `nest.x`, `nest.y`
FROM nested_test
ARRAY JOIN `nest.x`
ARRAY JOIN `nest.x`;
```
```
┌─s─────┬─nest.x─┬─nest.y─────┐
│ Hello │ 1 │ [10,20] │
│ Hello │ 2 │ [10,20] │
@ -363,19 +388,17 @@ ARRAY JOIN `nest.x`
│ World │ 4 │ [30,40,50] │
│ World │ 5 │ [30,40,50] │
└───────┴────────┴────────────┘
5 rows in set. Elapsed: 0.001 sec.
```
Алиас для вложенной структуры данных можно использовать, чтобы выбрать как результат JOIN-а, так и исходный массив. Пример:
```
:) SELECT s, n.x, n.y, nest.x, nest.y FROM nested_test ARRAY JOIN nest AS n
Алиас для вложенной структуры данных можно использовать, чтобы выбрать как результат `JOIN`-а, так и исходный массив. Пример:
``` sql
SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`
FROM nested_test
ARRAY JOIN nest AS n
ARRAY JOIN nest AS n;
```
```
┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐
│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │
│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │
@ -383,19 +406,17 @@ ARRAY JOIN nest AS n
│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │
│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │
└───────┴─────┴─────┴─────────┴────────────┘
5 rows in set. Elapsed: 0.001 sec.
```
Пример использования функции arrayEnumerate:
```
:) SELECT s, n.x, n.y, nest.x, nest.y, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(nest.x) AS num
Пример использования функции [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate):
``` sql
SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num
FROM nested_test
ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num
ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num;
```
```
┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐
│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │
│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │
@ -403,16 +424,8 @@ ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num
│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ 2 │
│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ 3 │
└───────┴─────┴─────┴─────────┴────────────┴─────┘
5 rows in set. Elapsed: 0.002 sec.
```
В запросе может быть указано не более одной секции ARRAY JOIN.
Соответствующее преобразование может выполняться как до секции WHERE/PREWHERE (если его результат нужен в этой секции), так и после выполнения WHERE/PREWHERE (чтобы уменьшить объём вычислений).
### Секция JOIN {#select-join}
Соединяет данные в привычном для [SQL JOIN](https://en.wikipedia.org/wiki/Join_(SQL)) смысле.

View File

@ -1,8 +1,8 @@
# MergeTree {#table_engines-mergetree}
Clickhouse 中最强大的表引擎当属 `MergeTree` (合并树)引擎及该家族`*MergeTree`)中的其他引擎。
ClickHouse 中最强大的表引擎当属 `MergeTree` (合并树)引擎及该系列(`*MergeTree`)中的其他引擎。
`MergeTree` 引擎家族的基本理念如下。当你有巨量数据要插入到表中,你要高效地一批批写入数据片,并希望这些数据片在后台按照一定规则合并。相比在插入时不断修改(重写)数据进存储,这种策略会高效很多。
`MergeTree` 引擎系列的基本理念如下。当你有巨量数据要插入到表中,你要高效地一批批写入数据片,并希望这些数据片在后台按照一定规则合并。相比在插入时不断修改(重写)数据进存储,这种策略会高效很多。
主要特点:
@ -16,14 +16,14 @@ Clickhouse 中最强大的表引擎当属 `MergeTree` (合并树)引擎及
- 支持数据副本。
`ReplicatedMergeTree` 家族的表便是用于此。更多信息,请参阅 [数据副本](replication.md) 一节。
`ReplicatedMergeTree` 系列的表便是用于此。更多信息,请参阅 [数据副本](replication.md) 一节。
- 支持数据采样。
需要的话,你可以给表设置一个采样方法。
!!! 注意
[Merge](merge.md) 引擎并不属于 `*MergeTree` 家族
[Merge](merge.md) 引擎并不属于 `*MergeTree` 系列
## 建表 {#table_engine-mergetree-creating-a-table}
@ -70,8 +70,8 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
- `SETTINGS` — 影响 `MergeTree` 性能的额外参数:
- `index_granularity` — 索引粒度。即索引中相邻『标记』间的数据行数。默认值:8192 。该列表中所有可用的参数可以从这里查看 [MergeTreeSettings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h) 。
- `use_minimalistic_part_header_in_zookeeper` — 数据片头在 ZooKeeper 中的存储方式。如果设置了 `use_minimalistic_part_header_in_zookeeper=1` ZooKeeper 会存储更少的数据。更多信息参考『服务配置参数』这章中的 [设置描述](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) 。
- `min_merge_bytes_to_use_direct_io` — 使用直接 I/O 来操作磁盘的合并操作时要求的最小数据量。合并数据片时ClickHouse 会计算要被合并的所有数据的总存储空间。如果大小超过了 `min_merge_bytes_to_use_direct_io` 设置的字节数,则 ClickHouse 将使用直接 I/O 接口(`O_DIRECT` 选项)对磁盘读写。如果设置 `min_merge_bytes_to_use_direct_io = 0` ,则会禁用直接 I/O。默认值`10 * 1024 * 1024 * 1024` 字节。
- `use_minimalistic_part_header_in_zookeeper` — 数据片头在 ZooKeeper 中的存储方式。如果设置了 `use_minimalistic_part_header_in_zookeeper=1` ZooKeeper 会存储更少的数据。更多信息参考『服务配置参数』这章中的 [设置描述](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) 。
- `min_merge_bytes_to_use_direct_io` — 使用直接 I/O 来操作磁盘的合并操作时要求的最小数据量。合并数据片时,ClickHouse 会计算要被合并的所有数据的总存储空间。如果大小超过了 `min_merge_bytes_to_use_direct_io` 设置的字节数,则 ClickHouse 将使用直接 I/O 接口(`O_DIRECT` 选项)对磁盘读写。如果设置 `min_merge_bytes_to_use_direct_io = 0` ,则会禁用直接 I/O。默认值:`10 * 1024 * 1024 * 1024` 字节。
**示例配置**
@ -117,13 +117,13 @@ MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)
## 数据存储
表由按主键排序的数据 *片* 组成。
表由按主键排序的数据 *片段* 组成。
当数据被插入到表中时,会分成数据片并按主键的字典序排序。例如,主键是 `(CounterID, Date)` 时,片中数据按 `CounterID` 排序,具有相同 `CounterID` 的部分按 `Date` 排序。
当数据被插入到表中时,会分成数据片并按主键的字典序排序。例如,主键是 `(CounterID, Date)` 时,片中数据按 `CounterID` 排序,具有相同 `CounterID` 的部分按 `Date` 排序。
不同分区的数据会被分成不同的ClickHouse 在后台合并数据片以便更高效存储。不会合并来自不同分区的数据片。这个合并机制并不保证相同主键的所有行都会合并到同一个数据片中。
不同分区的数据会被分成不同的片,ClickHouse 在后台合并数据片以便更高效存储。不会合并来自不同分区的数据片。这个合并机制并不保证相同主键的所有行都会合并到同一个数据片中。
ClickHouse 会为每个数据片创建一个索引文件,索引文件包含每个索引行(『标记』)的主键值。索引行号定义为 `n * index_granularity` 。最大的 `n` 等于总行数除以 `index_granularity` 的值的整数部分。对于每列,跟主键相同的索引行处也会写入『标记』。这些『标记』让你可以直接找到数据所在的列。
ClickHouse 会为每个数据片创建一个索引文件,索引文件包含每个索引行(『标记』)的主键值。索引行号定义为 `n * index_granularity` 。最大的 `n` 等于总行数除以 `index_granularity` 的值的整数部分。对于每列,跟主键相同的索引行处也会写入『标记』。这些『标记』让你可以直接找到数据所在的列。
你可以只用一个单一的大表,并不断地一块块往里面加入数据 – `MergeTree` 引擎就是为了这样的场景。
@ -166,7 +166,7 @@ ClickHouse 不要求主键惟一。所以,你可以插入多条具有相同主
- 改善数据压缩。
ClickHouse 以主键排序片数据,所以,数据的一致性越高,压缩越好。
ClickHouse 以主键排序片数据,所以,数据的一致性越高,压缩越好。
- [CollapsingMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) 和 [SummingMergeTree](summingmergetree.md) 引擎里,数据合并时,会有额外的处理逻辑。
@ -177,7 +177,7 @@ ClickHouse 不要求主键惟一。所以,你可以插入多条具有相同主
### 选择跟排序键不一样主键
指定一个跟排序键(用于排序数据片中行的表达式)
指定一个跟排序键(用于排序数据片中行的表达式)
不一样的主键(用于计算写到索引文件的每个标记值的表达式)是可以的。
这种情况下,主键表达式元组必须是排序键表达式元组的一个前缀。
@ -192,7 +192,7 @@ ClickHouse 不要求主键惟一。所以,你可以插入多条具有相同主
这种情况下,主键中仅预留少量列保证高效范围扫描,
剩下的维度列放到排序键元组里。这样是合理的。
[排序键的修改](../../query_language/alter.md) 是轻量级的操作,因为一个新列同时被加入到表里和排序键后时,已存在的数据片并不需要修改。由于旧的排序键是新排序键的前缀,并且刚刚添加的列中没有数据,因此在表修改时的数据对于新旧的排序键来说都是有序的。
[排序键的修改](../../query_language/alter.md) 是轻量级的操作,因为一个新列同时被加入到表里和排序键后时,已存在的数据片并不需要修改。由于旧的排序键是新排序键的前缀,并且刚刚添加的列中没有数据,因此在表修改时的数据对于新旧的排序键来说都是有序的。
### 索引和分区在查询中的应用
@ -238,7 +238,7 @@ SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%'
INDEX index_name expr TYPE type(...) GRANULARITY granularity_value
```
`*MergeTree` 家族的表都能指定跳数索引。
`*MergeTree` 系列的表都能指定跳数索引。
这些索引是由数据块按粒度分割后的每部分在指定表达式上汇总信息 `granularity_value` 组成(粒度大小用表引擎里 `index_granularity` 的指定)。
这些汇总信息有助于用 `where` 语句跳过大片不满足的数据,从而减少 `SELECT` 查询从磁盘读取的数据量,
@ -292,7 +292,7 @@ INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY
## 并发数据访问
应对表的并发访问,我们使用多版本机制。换言之,当同时读和更新表时,数据从当前查询到的一组片中读取。没有冗长的锁。插入不会阻碍读取。
应对表的并发访问,我们使用多版本机制。换言之,当同时读和更新表时,数据从当前查询到的一组片中读取。没有冗长的锁。插入不会阻碍读取。
对表的读操作是自动并行的。

View File

@ -1,27 +1,28 @@
# Data Replication {#table_engines-replication}
# 数据副本 {#table_engines-replication}
Replication is only supported for tables in the MergeTree family:
只有 MergeTree 系列里的表可支持副本:
- ReplicatedMergeTree
- ReplicatedSummingMergeTree
- ReplicatedReplacingMergeTree
- ReplicatedAggregatingMergeTree
- ReplicatedCollapsingMergeTree
- ReplicatedVersionedCollapsingMergeTree
- ReplicatedGraphiteMergeTree
Replication works at the level of an individual table, not the entire server. A server can store both replicated and non-replicated tables at the same time.
副本是表级别的,不是整个服务器级的。所以,服务器里可以同时有复制表和非复制表。
Replication does not depend on sharding. Each shard has its own independent replication.
副本不依赖分片。每个分片有它自己的独立副本。
Compressed data for `INSERT` and `ALTER` queries is replicated (for more information, see the documentation for [ALTER](../../query_language/alter.md#query_language_queries_alter)).
对于 `INSERT` 和 `ALTER` 语句操作的数据,会在压缩的情况下被复制(更多信息,看 [ALTER](../../query_language/alter.md#query_language_queries_alter) )。
`CREATE`, `DROP`, `ATTACH`, `DETACH` and `RENAME` queries are executed on a single server and are not replicated:
`CREATE`、`DROP`、`ATTACH`、`DETACH` 和 `RENAME` 语句只会在单个服务器上执行,不会被复制。
- `The CREATE TABLE` query creates a new replicatable table on the server where the query is run. If this table already exists on other servers, it adds a new replica.
- `The DROP TABLE` query deletes the replica located on the server where the query is run.
- `The RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas.
- `CREATE TABLE` 在运行此语句的服务器上创建一个新的可复制表。如果此表已存在其他服务器上,则给该表添加新副本。
- `DROP TABLE` 删除运行此查询的服务器上的副本。
- `RENAME` 重命名一个副本。换句话说,可复制表不同的副本可以有不同的名称。
To use replication, set the addresses of the ZooKeeper cluster in the config file. Example:
要使用副本,需在配置文件中设置 ZooKeeper 集群的地址。例如:
```xml
<zookeeper>
@ -40,43 +41,44 @@ To use replication, set the addresses of the ZooKeeper cluster in the config fil
</zookeeper>
```
Use ZooKeeper version 3.4.5 or later.
需要 ZooKeeper 3.4.5 或更高版本。
You can specify any existing ZooKeeper cluster and the system will use a directory on it for its own data (the directory is specified when creating a replicatable table).
你可以配置任何现有的 ZooKeeper 集群,系统会使用里面的目录来存取元数据(该目录在创建可复制表时指定)。
If ZooKeeper isn't set in the config file, you can't create replicated tables, and any existing replicated tables will be read-only.
如果配置文件中没有设置 ZooKeeper ,则无法创建复制表,并且任何现有的复制表都将变为只读。
ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables. When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max_replica_delay_for_distributed_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../settings/settings.md).
`SELECT` 查询并不需要借助 ZooKeeper,因为副本并不影响 `SELECT` 的性能,查询复制表与非复制表的速度是一样的。查询分布式表时,ClickHouse 的处理方式可通过设置 [max_replica_delay_for_distributed_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) 和 [fallback_to_stale_replicas_for_distributed_queries](../settings/settings.md) 修改。
For each `INSERT` query, approximately ten entries are added to ZooKeeper through several transactions. (To be more precise, this is for each inserted block of data; an INSERT query contains one block or one block per `max_insert_block_size = 1048576` rows.) This leads to slightly longer latencies for `INSERT` compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one `INSERT` per second, it doesn't create any problems. The entire ClickHouse cluster used for coordinating one ZooKeeper cluster has a total of several hundred `INSERTs` per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data.
对于每个 `INSERT` 语句,会通过几个事务将十来个记录添加到 ZooKeeper。(确切地说,这是针对每个插入的数据块;每个 INSERT 语句的每 `max_insert_block_size = 1048576` 行和最后剩余的都各算作一个块。)相比非复制表,写 ZooKeeper 会导致 `INSERT` 的延迟略长一些。但只要你按照建议每秒不超过一个 `INSERT` 地批量插入数据,不会有任何问题。一个 ZooKeeper 集群能给整个 ClickHouse 集群支撑协调每秒几百个 `INSERT`。数据插入的吞吐量(每秒的行数)可以跟不用复制的数据一样高。
For very large clusters, you can use different ZooKeeper clusters for different shards. However, this hasn't proven necessary on the Yandex.Metrica cluster (approximately 300 servers).
对于非常大的集群,你可以把不同的 ZooKeeper 集群用于不同的分片。然而,即使 Yandex.Metrica 集群(大约300台服务器)也证明还不需要这么做。
Replication is asynchronous and multi-master. `INSERT` queries (as well as `ALTER`) can be sent to any available server. Data is inserted on the server where the query is run, and then it is copied to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If part of the replicas are not available, the data is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network.
复制是多主异步。 `INSERT` 语句(以及 `ALTER` )可以发给任意可用的服务器。数据会先插入到执行该语句的服务器上,然后被复制到其他服务器。由于它是异步的,在其他副本上最近插入的数据会有一些延迟。如果部分副本不可用,则数据在其可用时再写入。副本可用的情况下,则延迟时长是通过网络传输压缩数据块所需的时间。
By default, an INSERT query waits for confirmation of writing the data from only one replica. If the data was successfully written to only one replica and the server with this replica ceases to exist, the stored data will be lost. To enable getting confirmation of data writes from multiple replicas, use the `insert_quorum` option.
默认情况下,INSERT 语句仅等待一个副本写入成功后返回。如果数据只成功写入一个副本后该副本所在的服务器不再存在,则存储的数据会丢失。要启用数据写入多个副本才确认返回,使用 `insert_quorum` 选项。
Each block of data is written atomically. The INSERT query is divided into blocks up to `max_insert_block_size = 1048576` rows. In other words, if the `INSERT` query has less than 1048576 rows, it is made atomically.
单个数据块写入是原子的。 INSERT 的数据按每块最多 `max_insert_block_size = 1048576` 行进行分块,换句话说,如果 `INSERT` 插入的行少于 1048576,则该 INSERT 是原子的。
Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is in case of network failures when the client application doesn't know if the data was written to the DB, so the `INSERT` query can simply be repeated. It doesn't matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. Deduplication parameters are controlled by [merge_tree](../server_settings/settings.md) server settings.
数据块会去重。对于被多次写的相同数据块(大小相同且具有相同顺序的相同行的数据块),该块仅会写入一次。这样设计的原因是万一在网络故障时客户端应用程序不知道数据是否成功写入 DB,此时可以简单地重复 `INSERT` 。把相同的数据发送给多个副本 INSERT 并不会有问题。因为这些 `INSERT` 是幂等的(会被去重)。去重参数参看服务器设置 [merge_tree](../server_settings/settings.md) 。(注意:Replicated\*MergeTree 才会去重,不需要 ZooKeeper 的、不带 Replicated 前缀的 MergeTree 不会去重)
During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.)
在复制期间,只有要插入的源数据通过网络传输。进一步的数据转换(合并)会在所有副本上以相同的方式进行处理执行。这样可以最大限度地减少网络使用,这意味着即使副本在不同的数据中心,数据同步也能工作良好。(能在不同数据中心中的同步数据是副本机制的主要目标。)
You can have any number of replicas of the same data. Yandex.Metrica uses double replication in production. Each server uses RAID-5 or RAID-6, and RAID-10 in some cases. This is a relatively reliable and convenient solution.
你可以给数据做任意多的副本。Yandex.Metrica 在生产中使用双副本。每台服务器都使用 RAID-5 或 RAID-6,某些情况下使用 RAID-10。这是一种相对可靠和方便的解决方案。
The system monitors data synchronicity on replicas and is able to recover after a failure. Failover is automatic (for small differences in data) or semi-automatic (when data differs too much, which may indicate a configuration error).
系统会监视副本数据同步情况,并能在发生故障后恢复。故障转移是自动的(对于小的数据差异)或半自动的(当数据差异很大时,这可能意味是有配置错误)。
## Creating Replicated Tables
## 创建复制表 {#creating-replicated-tables}
The `Replicated` prefix is added to the table engine name. For example:`ReplicatedMergeTree`.
**Replicated\*MergeTree parameters**
在表引擎名称上加上 `Replicated` 前缀。例如:`ReplicatedMergeTree`。
- `zoo_path` — The path to the table in ZooKeeper.
- `replica_name` — The replica name in ZooKeeper.
**Replicated\*MergeTree 参数**
Example:
- `zoo_path` — ZooKeeper 中该表的路径。
- `replica_name` — ZooKeeper 中的该表的副本名称。
示例:
```sql
CREATE TABLE table_name
@ -90,7 +92,7 @@ ORDER BY (CounterID, EventDate, intHash32(UserID))
SAMPLE BY intHash32(UserID)
```
Example in deprecated syntax:
已弃用的建表语法示例:
```sql
CREATE TABLE table_name
@ -101,7 +103,7 @@ CREATE TABLE table_name
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/hits', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192)
```
As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the 'macros' section of the configuration file. Example:
如上例所示,这些参数可以包含宏替换的占位符,即大括号的部分。它们会被替换为配置文件里 'macros' 那部分配置的值。示例:
```xml
<macros>
@ -111,92 +113,93 @@ As the example shows, these parameters can contain substitutions in curly bracke
</macros>
```
The path to the table in ZooKeeper should be unique for each replicated table. Tables on different shards should have different paths.
In this case, the path consists of the following parts:
“ZooKeeper 中该表的路径”对每个可复制表都要是唯一的。不同分片上的表要有不同的路径。
这种情况下,路径包含下面这些部分:
`/clickhouse/tables/` is the common prefix. We recommend using exactly this one.
`/clickhouse/tables/` 是公共前缀,我们推荐使用这个。
`{layer}-{shard}` is the shard identifier. In this example it consists of two parts, since the Yandex.Metrica cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier.
`{layer}-{shard}` 是分片标识部分。在此示例中,由于 Yandex.Metrica 集群使用了两级分片,所以它是由两部分组成的。但对于大多数情况来说,你只需保留 {shard} 占位符即可,它会替换展开为分片标识。
`hits` is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it doesn't change after a RENAME query.
`hits` 是该表在 ZooKeeper 中的名称。使其与 ClickHouse 中的表名相同比较好。 这里它被明确定义,跟 ClickHouse 表名不一样,它并不会被 RENAME 语句修改。
The replica name identifies different replicas of the same table. You can use the server name for this, as in the example. The name only needs to be unique within each shard.
副本名称用于标识同一个表分片的不同副本。你可以使用服务器名称,如上例所示。同个分片中不同副本的副本名称要唯一。
You can define the parameters explicitly instead of using substitutions. This might be convenient for testing and for configuring small clusters. However, you can't use distributed DDL queries (`ON CLUSTER`) in this case.
你也可以显式指定这些参数,而不是使用宏替换。对于测试和配置小型集群这可能会很方便。但是,这种情况下,则不能使用分布式 DDL 语句(`ON CLUSTER`)。
When working with large clusters, we recommend using substitutions because they reduce the probability of error.
使用大型集群时,我们建议使用宏替换,因为它可以降低出错的可能性。
Run the `CREATE TABLE` query on each replica. This query creates a new replicated table, or adds a new replica to an existing one.
在每个副本服务器上运行 `CREATE TABLE` 查询。将创建新的复制表,或给现有表添加新副本。
If you add a new replica after the table already contains some data on other replicas, the data will be copied from the other replicas to the new one after running the query. In other words, the new replica syncs itself with the others.
如果其他副本上已包含了某些数据,在表上添加新副本,则在运行语句后,数据会从其他副本复制到新副本。换句话说,新副本会与其他副本同步。
To delete a replica, run `DROP TABLE`. However, only one replica is deleted – the one that resides on the server where you run the query.
要删除副本,使用 `DROP TABLE`。但它只删除那个 位于运行该语句的服务器上的副本。
## Recovery After Failures
## 故障恢复
If ZooKeeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ZooKeeper.
如果服务器启动时 ZooKeeper 不可用,则复制表会切换为只读模式。系统会定期尝试去连接 ZooKeeper。
If ZooKeeper is unavailable during an `INSERT`, or an error occurs when interacting with ZooKeeper, an exception is thrown.
如果在 `INSERT` 期间 ZooKeeper 不可用,或者在与 ZooKeeper 交互时发生错误,则抛出异常。
After connecting to ZooKeeper, the system checks whether the set of data in the local file system matches the expected set of data (ZooKeeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas.
连接到 ZooKeeper 后,系统会检查本地文件系统中的数据集是否与预期的数据集( ZooKeeper 存储此信息)一致。如果存在轻微的不一致,系统会通过与副本同步数据来解决。
If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ZooKeeper), it moves them to the 'detached' subdirectory (they are not deleted). Any missing parts are copied from the replicas.
如果系统检测到损坏的数据片段(文件大小错误)或无法识别的片段(写入文件系统但未记录在 ZooKeeper 中的部分),则会把它们移动到 'detached' 子目录(不会删除)。任何缺失的数据片段都会从其他副本复制过来。
Note that ClickHouse does not perform any destructive actions such as automatically deleting a large amount of data.
注意:ClickHouse 不会执行任何破坏性操作,例如自动删除大量数据。
When the server starts (or establishes a new session with ZooKeeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a `SELECT` query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary.
当服务器启动(或与 ZooKeeper 建立新会话)时,它只检查所有文件的数量和大小。 如果文件大小一致但中间某处已有字节被修改过,不会立即被检测到,只有在尝试读取 `SELECT` 查询的数据时才会检测到。该查询会引发校验和不匹配或压缩块大小不一致的异常。这种情况下,数据片段会添加到验证队列中,并在必要时从其他副本中复制。
If the local set of data differs too much from the expected one, a safety mechanism is triggered. The server enters this in the log and refuses to launch. The reason for this is that this case may indicate a configuration error, such as if a replica on a shard was accidentally configured like a replica on a different shard. However, the thresholds for this mechanism are set fairly low, and this situation might occur during normal failure recovery. In this case, data is restored semi-automatically - by "pushing a button".
如果本地数据集与预期数据的差异太大,则会触发安全机制。服务器在日志中记录此内容并拒绝启动。这种情况很可能是配置错误,例如,一个分片上的副本意外配置为别的分片上的副本。然而,此机制的阈值设置得相当低,在正常故障恢复期间可能会出现这种情况。在这种情况下,数据恢复则是半自动模式,通过用户主动操作触发。
To start recovery, create the node `/path_to_table/replica_name/flags/force_restore_data` in ZooKeeper with any content, or run the command to restore all replicated tables:
要触发启动恢复,可在 ZooKeeper 中创建节点 `/path_to_table/replica_name/flags/force_restore_data`,节点值可以是任何内容,或运行命令来恢复所有的可复制表:
```bash
sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data
```
Then restart the server. On start, the server deletes these flags and starts recovery.
然后重启服务器。启动时,服务器会删除这些标志并开始恢复。
## Recovery After Complete Data Loss
## 在数据完全丢失后的恢复
If all data and metadata disappeared from one of the servers, follow these steps for recovery:
如果其中一个服务器的所有数据和元数据都消失了,请按照以下步骤进行恢复:
1. Install ClickHouse on the server. Define substitutions correctly in the config file that contains the shard identifier and replicas, if you use them.
2. If you had unreplicated tables that must be manually duplicated on the servers, copy their data from a replica (in the directory `/var/lib/clickhouse/data/db_name/table_name/`).
3. Copy table definitions located in `/var/lib/clickhouse/metadata/` from a replica. If a shard or replica identifier is defined explicitly in the table definitions, correct it so that it corresponds to this replica. (Alternatively, start the server and make all the `ATTACH TABLE` queries that should have been in the .sql files in `/var/lib/clickhouse/metadata/`.)
4. To start recovery, create the ZooKeeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data`
1. 在服务器上安装 ClickHouse。在包含分片标识符和副本的配置文件中正确定义宏配置(如果有用到的话)。
2. 如果服务器上有非复制表则必须手动复制,可以从副本服务器上(在 `/var/lib/clickhouse/data/db_name/table_name/` 目录中)复制它们的数据。
3. 从副本服务器上复制位于 `/var/lib/clickhouse/metadata/` 中的表定义信息。如果在表定义信息中显式指定了分片或副本标识符,请更正它以使其对应于该副本。(或者,启动服务器,然后执行所有本应位于 `/var/lib/clickhouse/metadata/` 中的 .sql 文件里的 `ATTACH TABLE` 语句。)
4. 要开始恢复,在 ZooKeeper 中创建节点 `/path_to_table/replica_name/flags/force_restore_data`,节点内容不限,或运行命令来恢复所有复制的表:`sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data`
Then start the server (restart, if it is already running). Data will be downloaded from replicas.
然后启动服务器(如果它已运行则重启)。数据会从副本中下载。
An alternative recovery option is to delete information about the lost replica from ZooKeeper (`/path_to_table/replica_name`), then create the replica again as described in "[Creating replicated tables](#creating-replicated-tables)".
另一种恢复方式是从 ZooKeeper(`/path_to_table/replica_name`)中删除数据丢失的副本的所有元信息,然后再按照“[创建复制表](#creating-replicated-tables)”中的描述重新创建副本。
There is no restriction on network bandwidth during recovery. Keep this in mind if you are restoring many replicas at once.
恢复期间的网络带宽没有限制。特别注意这一点,尤其是要一次恢复很多副本。
## Converting from MergeTree to ReplicatedMergeTree
## MergeTree 转换为 ReplicatedMergeTree
We use the term `MergeTree` to refer to all table engines in the ` MergeTree family`, the same as for `ReplicatedMergeTree`.
我们使用 `MergeTree` 来表示 `MergeTree系列` 中的所有表引擎,`ReplicatedMergeTree` 同理。
If you had a `MergeTree` table that was manually replicated, you can convert it to a replicatable table. You might need to do this if you have already collected a large amount of data in a `MergeTree` table and now you want to enable replication.
如果你有一个手动同步的 `MergeTree` 表,你可以将其转换为可复制表。如果你已经在 `MergeTree` 表中收集了大量数据,并且现在要启用复制,则可以执行这些操作。
If the data differs on various replicas, first sync it, or delete this data on all the replicas except one.
如果各个副本上的数据不一致,则首先对其进行同步,或者除保留的一个副本外,删除其他所有副本上的数据。
Rename the existing MergeTree table, then create a `ReplicatedMergeTree` table with the old name.
Move the data from the old table to the 'detached' subdirectory inside the directory with the new table data (`/var/lib/clickhouse/data/db_name/table_name/`).
Then run `ALTER TABLE ATTACH PARTITION` on one of the replicas to add these data parts to the working set.
重命名现有的 MergeTree 表,然后使用旧名称创建 `ReplicatedMergeTree` 表。
将数据从旧表移动到新表(`/var/lib/clickhouse/data/db_name/table_name/`)目录内的 'detached' 目录中。
然后在其中一个副本上运行`ALTER TABLE ATTACH PARTITION`,将这些数据片段添加到工作集中。
## Converting from ReplicatedMergeTree to MergeTree
## ReplicatedMergeTree 转换为 MergeTree
Create a MergeTree table with a different name. Move all the data from the directory with the `ReplicatedMergeTree` table data to the new table's data directory. Then delete the `ReplicatedMergeTree` table and restart the server.
使用其他名称创建 MergeTree 表。将具有`ReplicatedMergeTree`表数据的目录中的所有数据移动到新表的数据目录中。然后删除`ReplicatedMergeTree`表并重新启动服务器。
如果你想在不启动服务器的情况下清除 `ReplicatedMergeTree` 表:
If you want to get rid of a `ReplicatedMergeTree` table without launching the server:
- Delete the corresponding `.sql` file in the metadata directory (`/var/lib/clickhouse/metadata/`).
- Delete the corresponding path in ZooKeeper (`/path_to_table/replica_name`).
- 删除元数据目录中的相应 `.sql` 文件(`/var/lib/clickhouse/metadata/`)。
- 删除 ZooKeeper 中的相应路径(`/path_to_table/replica_name`)。
After this, you can launch the server, create a `MergeTree` table, move the data to its directory, and then restart the server.
之后,你可以启动服务器,创建一个 `MergeTree` 表,将数据移动到其目录,然后重新启动服务器。
## Recovery When Metadata in The ZooKeeper Cluster is Lost or Damaged
## 当 ZooKeeper 集群中的元数据丢失或损坏时恢复方法
If the data in ZooKeeper was lost or damaged, you can save data by moving it to an unreplicated table as described above.
如果 ZooKeeper 中的数据丢失或损坏,如上所述,你可以通过将数据转移到非复制表来保存数据。
[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/replication/) <!--hide-->
[来源文章](https://clickhouse.yandex/docs/en/operations/table_engines/replication/) <!--hide-->