Merge remote-tracking branch 'upstream/master'

BayoNet 2017-04-28 10:37:07 +03:00
commit 802a4ac731
163 changed files with 8321 additions and 157 deletions

View File

@ -1,6 +1,6 @@
#This strings autochanged from release_lib.sh :
set(VERSION_DESCRIBE v1.1.54229-testing)
set(VERSION_REVISION 54229)
set(VERSION_DESCRIBE v1.1.54230-testing)
set(VERSION_REVISION 54230)
#===end of autochange
set (VERSION_MAJOR 1)

View File

@ -1,15 +1,23 @@
#include <DataStreams/CastEnumBlockInputStream.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeString.h>
#include <Interpreters/ExpressionActions.h>
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
namespace DB
{
CastEnumBlockInputStream::CastEnumBlockInputStream(
Context & context_,
BlockInputStreamPtr input_,
const Block & in_sample_,
const Block & out_sample_)
: context(context_)
{
collectEnums(in_sample_, out_sample_);
cast_functions.resize(in_sample_.columns());
children.push_back(input_);
}
@ -42,15 +50,44 @@ Block CastEnumBlockInputStream::readImpl()
if (bool(enum_types[i]))
{
const auto & type = static_cast<const IDataTypeEnum *>(enum_types[i]->type.get());
ColumnPtr new_column = type->createColumn();
Block temporary_block
{
{
elem.column,
elem.type,
elem.name
},
{
std::make_shared<ColumnConstString>(1, type->getName()),
std::make_shared<DataTypeString>(),
""
},
{
nullptr,
enum_types[i]->type,
""
}
};
size_t column_size = elem.column->size();
new_column->reserve(column_size);
for (size_t j = 0; j < column_size; ++j)
new_column->insert(type->castToValue((*elem.column)[j]));
FunctionPtr & cast_function = cast_functions[i];
/// Initialize function.
if (!cast_function)
{
cast_function = FunctionFactory::instance().get("CAST", context);
DataTypePtr unused_return_type;
ColumnsWithTypeAndName arguments{ temporary_block.getByPosition(0), temporary_block.getByPosition(1) };
std::vector<ExpressionAction> unused_prerequisites;
/// Prepares the function for execution. TODO: this is not obvious.
cast_function->getReturnTypeAndPrerequisites(arguments, unused_return_type, unused_prerequisites);
}
cast_function->execute(temporary_block, {0, 1}, 2);
res.insert({
new_column,
temporary_block.getByPosition(2).column,
enum_types[i]->type,
enum_types[i]->name});
}

View File

@ -5,14 +5,18 @@
#include <experimental/optional>
#include <vector>
namespace DB
{
class IFunction;
/// Implicitly converts string and numeric values to Enum.
class CastEnumBlockInputStream : public IProfilingBlockInputStream
{
public:
CastEnumBlockInputStream(BlockInputStreamPtr input_,
CastEnumBlockInputStream(Context & context_,
BlockInputStreamPtr input_,
const Block & in_sample_,
const Block & out_sample_);
@ -27,7 +31,9 @@ private:
void collectEnums(const Block & in_sample, const Block & out_sample);
private:
Context & context;
std::vector<std::experimental::optional<NameAndTypePair>> enum_types;
std::vector<std::shared_ptr<IFunction>> cast_functions; /// Used to perform type conversions.
};
}

View File

@ -2,7 +2,7 @@
#include <string>
#include <DataStreams/IBlockOutputStream.h>
#include <Core/Block.h>
namespace DB
{

View File

@ -3,12 +3,11 @@
#include <mysqlxx/Row.h>
#include <mysqlxx/Null.h>
#include <mysqlxx/Manip.h>
#include <common/MetrikaTypes.h>
#include <Core/Field.h>
#include <Core/FieldVisitors.h>
#include <IO/WriteHelpers.h>
/// This is for Yandex.Metrica code.
namespace mysqlxx

View File

@ -124,7 +124,7 @@ BlockIO InterpreterInsertQuery::execute()
res.in = interpreter_select.execute().in;
res.in = std::make_shared<NullableAdapterBlockInputStream>(res.in, res.in_sample, res.out_sample);
res.in = std::make_shared<CastEnumBlockInputStream>(res.in, res.in_sample, res.out_sample);
res.in = std::make_shared<CastEnumBlockInputStream>(context, res.in, res.in_sample, res.out_sample);
res.in = std::make_shared<NullAndDoCopyBlockInputStream>(res.in, out);
}

View File

@ -968,7 +968,7 @@ struct AdderNonJoined<ASTTableJoin::Strictness::All, Mapped>
class NonJoinedBlockInputStream : public IProfilingBlockInputStream
{
public:
NonJoinedBlockInputStream(const Join & parent_, Block & left_sample_block, size_t max_block_size_)
NonJoinedBlockInputStream(const Join & parent_, const Block & left_sample_block, size_t max_block_size_)
: parent(parent_), max_block_size(max_block_size_)
{
/** left_sample_block contains keys and "left" columns.
@ -981,39 +981,43 @@ public:
result_sample_block = left_sample_block;
// std::cerr << result_sample_block.dumpStructure() << "\n";
/// Add new columns to the block.
/// Add columns from the right-side table to the block.
for (size_t i = 0; i < num_columns_right; ++i)
{
const ColumnWithTypeAndName & src_column = parent.sample_block_with_columns_to_add.safeGetByPosition(i);
ColumnWithTypeAndName new_column = src_column.cloneEmpty();
result_sample_block.insert(std::move(new_column));
const ColumnWithTypeAndName & src_column = parent.sample_block_with_columns_to_add.getByPosition(i);
result_sample_block.insert(src_column.cloneEmpty());
}
column_numbers_left.reserve(num_columns_left);
column_numbers_keys_and_right.reserve(num_keys + num_columns_right);
column_indices_left.reserve(num_columns_left);
column_indices_keys_and_right.reserve(num_keys + num_columns_right);
std::vector<bool> is_key_column_in_left_block(num_keys + num_columns_left, false);
for (const std::string & key : parent.key_names_left)
{
size_t key_pos = left_sample_block.getPositionByName(key);
is_key_column_in_left_block[key_pos] = true;
/// Here we establish the mapping between key columns of the left- and right-side tables.
/// key_pos index is inserted in the position corresponding to key column in parent.blocks
/// (saved blocks of the right-side table) and points to the same key column
/// in the left_sample_block and thus in the result_sample_block.
column_indices_keys_and_right.push_back(key_pos);
}
for (size_t i = 0; i < num_keys + num_columns_left; ++i)
{
const String & name = left_sample_block.safeGetByPosition(i).name;
auto found_key_column = std::find(parent.key_names_left.begin(), parent.key_names_left.end(), name);
if (parent.key_names_left.end() == found_key_column)
column_numbers_left.push_back(i);
else
column_numbers_keys_and_right.push_back(found_key_column - parent.key_names_left.begin());
if (!is_key_column_in_left_block[i])
column_indices_left.push_back(i);
}
for (size_t i = 0; i < num_columns_right; ++i)
column_numbers_keys_and_right.push_back(num_keys + num_columns_left + i);
column_indices_keys_and_right.push_back(num_keys + num_columns_left + i);
/// If use_nulls, convert left columns to Nullable.
if (parent.use_nulls)
{
for (size_t i = 0; i < num_columns_left; ++i)
{
convertColumnToNullable(result_sample_block.getByPosition(column_numbers_left[i]));
convertColumnToNullable(result_sample_block.getByPosition(column_indices_left[i]));
}
}
@ -1050,9 +1054,14 @@ private:
size_t max_block_size;
Block result_sample_block;
ColumnNumbers column_numbers_left;
ColumnNumbers column_numbers_keys_and_right;
/// Indices of columns in result_sample_block that come from the left-side table (except key columns).
ColumnNumbers column_indices_left;
/// Indices of key columns in result_sample_block or columns that come from the right-side table.
/// Order is significant: it is the same as the order of columns in the blocks of the right-side table that are saved in parent.blocks.
ColumnNumbers column_indices_keys_and_right;
/// Columns of the current output block corresponding to column_indices_left.
ColumnPlainPtrs columns_left;
/// Columns of the current output block corresponding to column_indices_keys_and_right.
ColumnPlainPtrs columns_keys_and_right;
std::unique_ptr<void, std::function<void(void *)>> position; /// type erasure
@ -1063,19 +1072,19 @@ private:
{
Block block = result_sample_block.cloneEmpty();
size_t num_columns_left = column_numbers_left.size();
size_t num_columns_right = column_numbers_keys_and_right.size();
size_t num_columns_left = column_indices_left.size();
size_t num_columns_right = column_indices_keys_and_right.size();
for (size_t i = 0; i < num_columns_left; ++i)
{
auto & column_with_type_and_name = block.safeGetByPosition(column_numbers_left[i]);
auto & column_with_type_and_name = block.safeGetByPosition(column_indices_left[i]);
column_with_type_and_name.column = column_with_type_and_name.type->createColumn();
columns_left[i] = column_with_type_and_name.column.get();
}
for (size_t i = 0; i < num_columns_right; ++i)
{
auto & column_with_type_and_name = block.safeGetByPosition(column_numbers_keys_and_right[i]);
auto & column_with_type_and_name = block.safeGetByPosition(column_indices_keys_and_right[i]);
column_with_type_and_name.column = column_with_type_and_name.type->createColumn();
columns_keys_and_right[i] = column_with_type_and_name.column.get();
columns_keys_and_right[i]->reserve(column_with_type_and_name.column->size());

View File

@ -360,9 +360,9 @@ private:
ASTTableJoin::Kind kind;
ASTTableJoin::Strictness strictness;
/// Names of key columns (columns for equi-JOIN) in "left" table.
/// Names of key columns (columns for equi-JOIN) in "left" table (in the order they appear in USING clause).
const Names key_names_left;
/// Names of key columns (columns for equi-JOIN) in "right" table.
/// Names of key columns (columns for equi-JOIN) in "right" table (in the order they appear in USING clause).
const Names key_names_right;
/// Substitute NULLs for non-JOINed rows.
@ -387,7 +387,9 @@ private:
Sizes key_sizes;
/// Block with columns from the right-side table except key columns.
Block sample_block_with_columns_to_add;
/// Block with key columns in the same order they appear in the right-side table.
Block sample_block_with_keys;
Poco::Logger * log;

View File

@ -80,8 +80,9 @@ public:
*/
void add(const LogElement & element)
{
/// We could lock here in case of queue overflow. Maybe better to throw an exception or even don't do logging in that case.
queue.push({false, element});
/// Without try we could block here in case of queue overflow.
if (!queue.tryPush({false, element}))
LOG_ERROR(log, "SystemLog queue is full");
}
private:
@ -215,7 +216,7 @@ void SystemLog<LogElement>::flush()
{
try
{
LOG_TRACE(log, "Flushing query log");
LOG_TRACE(log, "Flushing system log");
if (!is_prepared) /// BTW, flush method is called from single thread.
prepareTable();
@ -224,6 +225,10 @@ void SystemLog<LogElement>::flush()
for (const LogElement & elem : data)
elem.appendToBlock(block);
/// Clear the queue early, because insertion into the table could lead to the generation of more log entries,
/// and pushing them to an already full queue would lead to a deadlock.
data.clear();
/// We write to table indirectly, using InterpreterInsertQuery.
/// This is needed to support DEFAULT-columns in table.
@ -242,11 +247,10 @@ void SystemLog<LogElement>::flush()
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
/// In case of exception, also clean accumulated data - to avoid locking.
data.clear();
}
}
template <typename LogElement>

View File

@ -443,10 +443,12 @@ int Server::main(const std::vector<std::string> & args)
listen_hosts.emplace_back(config().getString(key));
}
bool try_listen = false;
if (listen_hosts.empty())
{
listen_hosts.emplace_back("::1");
listen_hosts.emplace_back("127.0.0.1");
try_listen = true;
}
auto make_socket_address = [&](const std::string & host, std::uint16_t port) {
@ -479,7 +481,8 @@ int Server::main(const std::vector<std::string> & args)
for (const auto & listen_host : listen_hosts)
{
/// For testing purposes, user may omit tcp_port or http_port or https_port in configuration file.
try
{
/// HTTP
if (config().has("http_port"))
{
@ -527,7 +530,6 @@ int Server::main(const std::vector<std::string> & args)
LOG_INFO(log, "Listening tcp: " + tcp_address.toString());
}
/// At least one of TCP and HTTP servers must be created.
if (servers.empty())
throw Exception("No 'tcp_port' and 'http_port' is specified in configuration file.", ErrorCodes::NO_ELEMENTS_IN_CONFIG);
@ -548,6 +550,18 @@ int Server::main(const std::vector<std::string> & args)
LOG_INFO(log, "Listening interserver: " + interserver_address.toString());
}
}
catch (const Poco::Net::NetException & e)
{
if (try_listen && e.code() == POCO_EPROTONOSUPPORT)
LOG_ERROR(log, "Listen [" << listen_host << "]: " << e.what() << ": " << e.message());
else
throw;
}
}
if (servers.empty())
throw Exception("No servers started (add valid listen_host and 'tcp_port' or 'http_port' to configuration file.)", ErrorCodes::NO_ELEMENTS_IN_CONFIG);
for (auto & server : servers)
server->start();

View File

@ -3,6 +3,7 @@
#include <Poco/Util/AbstractConfiguration.h>
#include <Core/Defines.h>
#include <Core/Types.h>
#include <Common/Exception.h>
namespace DB

View File

@ -38,3 +38,8 @@ Hello [0,1,2]
3
4
5
1 2 3 aaa
2 3 4 bbb ccc
5 6 7 ddd
2 3 4 bbb ccc
5 6 7 ddd

View File

@ -9,3 +9,16 @@ SELECT k, x FROM (SELECT arrayJoin([1, 2, 3]) AS k, 'Hello' AS x) ANY RIGHT JOIN
SELECT k, y FROM (SELECT arrayJoin([1, 2, 3]) AS k, 'Hello' AS x) ANY RIGHT JOIN (SELECT range(k) AS y, arrayJoin([3, 4, 5]) AS k) USING k WHERE k < 10 ORDER BY k;
SELECT x, y FROM (SELECT arrayJoin([1, 2, 3]) AS k, 'Hello' AS x) ANY RIGHT JOIN (SELECT range(k) AS y, arrayJoin([3, 4, 5]) AS k) USING k WHERE k < 10 ORDER BY k;
SELECT k FROM (SELECT arrayJoin([1, 2, 3]) AS k, 'Hello' AS x) ANY RIGHT JOIN (SELECT range(k) AS y, arrayJoin([3, 4, 5]) AS k) USING k WHERE k < 10 ORDER BY k;
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
CREATE TABLE t1 (k1 UInt32, k2 UInt32, k3 UInt32, val_t1 String) ENGINE=TinyLog;
CREATE TABLE t2 (val_t2 String, k3 UInt32, k2_alias UInt32, k1 UInt32) ENGINE=TinyLog;
INSERT INTO t1 VALUES (1, 2, 3, 'aaa'), (2, 3, 4, 'bbb');
INSERT INTO t2 VALUES ('ccc', 4, 3, 2), ('ddd', 7, 6, 5);
SELECT k1, k2, k3, val_t1, val_t2 FROM t1 ANY FULL JOIN t2 USING (k3, k1, k2 AS k2_alias) ORDER BY k1, k2, k3;
SELECT k1, k2, k3, val_t1, val_t2 FROM t1 ANY RIGHT JOIN t2 USING (k3, k1, k2 AS k2_alias) ORDER BY k1, k2, k3;

docs/en/access_rights.rst Normal file
View File

@ -0,0 +1,69 @@
Access rights
=============
Users and access rights are set up in the user config. This is usually ``users.xml``.
Users are recorded in the ``users`` section. Let's look at part of the ``users.xml`` file:
.. code-block:: xml
<!-- Users and ACL. -->
<users>
<!-- If the username is not specified, the default user is used. -->
<default>
<!-- Password (in plaintext). May be empty. -->
<password></password>
<!-- List of networks that access is allowed from. Each list item has one of the following forms:
<ip> IP address or subnet mask. For example, 222.111.222.3 or 10.0.0.1/8 or 2a02:6b8::3 or 2a02:6b8::3/64.
<host> Host name. Example: example01. A DNS query is made for verification, and all received addresses are compared to the client address.
<host_regexp> Regex for host names. For example, ^example\d\d-\d\d-\d\.yandex\.ru$
A DNS PTR query is made to verify the client address and the regex is applied to the result.
Then another DNS query is made for the result of the PTR query, and all received addresses are compared to the client address.
We strongly recommend that the regex ends with \.yandex\.ru$. If you are installing ClickHouse independently, here you should specify:
<networks>
<ip>::/0</ip>
</networks> -->
<networks incl="networks" />
<!-- Settings profile for the user. -->
<profile>default</profile>
<!-- Quota for the user. -->
<quota>default</quota>
</default>
<!-- For queries from the user interface. -->
<web>
<password></password>
<networks incl="networks" />
<profile>web</profile>
<quota>default</quota>
</web>
Here we can see that two users are declared: ``default`` and ``web``. We added the ``web`` user ourselves.
The ``default`` user is chosen in cases when the username is not passed, so this user must be present in the config file. The ``default`` user is also used for distributed query processing - the system accesses remote servers under this username. So the ``default`` user must have an empty password and must not have substantial restrictions or quotas - otherwise, distributed queries will fail.
The password is specified in plain text directly in the config. In this regard, you should not consider these passwords as providing security against potential malicious attacks. Rather, they are necessary for protection from Yandex employees.
A list of networks is specified that access is allowed from. In this example, the list of networks for both users is loaded from a separate file (``/etc/metrika.xml``) containing the ``networks`` substitution. Here is a fragment of it:
.. code-block:: xml
<yandex>
...
<networks>
<ip>::/64</ip>
<ip>93.111.222.128/26</ip>
<ip>2a02:6b8:0:111::/64</ip>
...
</networks>
</yandex>
We could have defined this list of networks directly in ``users.xml``, or in a file in the ``users.d`` directory (for more information, see the section "Configuration files").
The config includes comments explaining how to open access from everywhere.
For use in production, only specify IP elements (IP addresses and their masks), since using ``host`` and ``host_regexp`` might cause extra latency.
Next, the user settings profile is specified (see the section "Settings profiles"). You can specify the default profile, ``default``. The profile can have any name. You can specify the same profile for different users. The most important thing you can write in the settings profile is ``readonly`` set to ``1``, which provides read-only access.
After this, the quota is defined (see the section "Quotas"). You can specify the default quota, ``default``. It is set in the config by default so that it only counts resource usage, but does not restrict it. The quota can have any name. You can specify the same quota for different users - in this case, resource usage is calculated for each user individually.

View File

@ -0,0 +1,284 @@
Aggregate functions
===================
count()
-------
Counts the number of rows. Accepts zero arguments and returns UInt64.
The syntax COUNT(DISTINCT x) is not supported. The separate 'uniq' aggregate function exists for this purpose.
A 'SELECT count() FROM table' query is not optimized, because the number of entries in the table is not stored separately. It will select some small column from the table and count the number of values in it.
any(x)
------
Selects the first encountered value.
The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate.
To get a determinate result, you can use the 'min' or 'max' function instead of 'any'.
In some cases, you can rely on the order of execution. This applies to cases when SELECT comes from a subquery that uses ORDER BY.
When a SELECT query has the GROUP BY clause or at least one aggregate function, ClickHouse (in contrast to MySQL) requires that all expressions in the SELECT, HAVING, and ORDER BY clauses be calculated from keys or from aggregate functions. That is, each column selected from the table must be used either in keys, or inside aggregate functions. To get behavior like in MySQL, you can put the other columns in the 'any' aggregate function.
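For instance, 'any' lets non-key columns pass through GROUP BY in a single scan. A hedged sketch; the ``hits`` table and its ``domain`` and ``title`` columns are hypothetical:
.. code-block:: sql
SELECT
    domain,
    any(title) AS sample_title, -- an arbitrary title from each group
    count() AS views
FROM hits
GROUP BY domain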
anyLast(x)
----------
Selects the last value encountered.
The result is just as indeterminate as for the 'any' function.
min(x)
------
Calculates the minimum.
max(x)
------
Calculates the maximum.
argMin(arg, val)
----------------
Calculates the 'arg' value for a minimal 'val' value. If there are several different values of 'arg' for minimal values of 'val', the first of these values encountered is output.
argMax(arg, val)
----------------
Calculates the 'arg' value for a maximum 'val' value. If there are several different values of 'arg' for maximum values of 'val', the first of these values encountered is output.
sum(x)
-------
Calculates the sum.
Only works for numbers.
avg(x)
------
Calculates the average.
Only works for numbers.
The result is always Float64.
uniq(x)
--------
Calculates the approximate number of different values of the argument. Works for numbers, strings, dates, and dates with times.
Uses an adaptive sampling algorithm: for the calculation state, it uses a sample of element hash values with a size up to 65535.
Compared with the widely known HyperLogLog algorithm, this algorithm is less effective in terms of accuracy and memory consumption (even up to proportionality), but it is adaptive. This means that with fairly high accuracy, it consumes less memory during simultaneous computation of cardinality for a large number of data sets whose cardinality has power law distribution (i.e. in cases when most of the data sets are small). This algorithm is also very accurate for data sets with small cardinality (up to 65536) and very efficient on CPU (when computing not too many of these functions, using 'uniq' is almost as fast as using other aggregate functions).
There is no compensation for the bias of an estimate, so for large data sets the results are systematically deflated. This function is normally used for computing the number of unique visitors in Yandex.Metrica, so this bias does not play a role.
The result is determinate (it doesn't depend on the order of query execution).
uniqCombined(x)
---------------
Approximately computes the number of different values of the argument. Works for numbers, strings, dates, date-with-time, for several arguments and arguments-tuples.
A combination of three algorithms is used: an array, a hash table and HyperLogLog with an error correction table. The memory consumption is several times smaller than the uniq function, and the accuracy is several times higher. The speed of operation is slightly lower than that of the uniq function, but sometimes it can be even higher - in the case of distributed queries, in which a large number of aggregation states are transmitted over the network. The maximum state size is 96 KiB (HyperLogLog of 2^17 6-bit cells).
The result is deterministic (it does not depend on the order of query execution).
The uniqCombined function is a good default choice for calculating the number of different values.
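As an illustration, the three estimators can be run side by side; the ``hits`` table and ``UserID`` column here are hypothetical:
.. code-block:: sql
SELECT
    uniq(UserID),         -- adaptive-sampling estimate
    uniqCombined(UserID), -- array + hash table + HyperLogLog estimate
    uniqExact(UserID)     -- exact value, unbounded state size
FROM hits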
uniqHLL12(x)
------------
Uses the HyperLogLog algorithm to approximate the number of different values of the argument. It uses 2^12 5-bit cells. The size of the state is slightly more than 2.5 KB.
The result is determinate (it doesn't depend on the order of query execution).
In most cases, use the 'uniq' function. You should only use this function if you understand its advantages well.
uniqExact(x)
------------
Calculates the number of different values of the argument, exactly.
There is no reason to fear approximations, so it's better to use the 'uniq' function.
You should use the 'uniqExact' function if you definitely need an exact result.
The 'uniqExact' function uses more memory than the 'uniq' function, because the size of the state has unbounded growth as the number of different values increases.
groupArray(x)
-------------
Creates an array of argument values.
Values can be added to the array in any (indeterminate) order.
In some cases, you can rely on the order of execution. This applies to cases when SELECT comes from a subquery that uses ORDER BY.
groupUniqArray(x)
-----------------
Creates an array from different argument values. Memory consumption is the same as for the 'uniqExact' function.
quantile(level)(x)
------------------
Approximates the 'level' quantile. 'level' is a constant, a floating-point number from 0 to 1. We recommend using a 'level' value in the range of 0.01 .. 0.99.
Don't use a 'level' value equal to 0 or 1 - use the 'min' and 'max' functions for these cases.
The algorithm is the same as for the 'median' function. Actually, 'quantile' and 'median' are internally the same function. You can use the 'quantile' function without parameters - in this case, it calculates the median, and you can use the 'median' function with parameters - in this case, it calculates the quantile of the set level.
When using multiple 'quantile' and 'median' functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the 'quantiles' function.
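A sketch of the difference (the ``hits`` table and ``RequestTime`` column are hypothetical): the first query keeps two separate internal states, while the second computes both levels from a single state:
.. code-block:: sql
-- Two separate states:
SELECT quantile(0.5)(RequestTime), quantile(0.9)(RequestTime) FROM hits;
-- One shared state:
SELECT quantiles(0.5, 0.9)(RequestTime) FROM hits;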
quantileDeterministic(level)(x, determinator)
----------------------------------------------
Calculates the quantile of 'level' using the same algorithm as the 'medianDeterministic' function.
quantileTiming(level)(x)
------------------------
Calculates the quantile of 'level' using the same algorithm as the 'medianTiming' function.
quantileTimingWeighted(level)(x, weight)
-----------------------------------------
Calculates the quantile of 'level' using the same algorithm as the 'medianTimingWeighted' function.
quantileExact(level)(x)
-----------------------
Computes the level quantile exactly. To do this, all the passed values are combined into an array, which is then partially sorted. Therefore, the function consumes O(n) memory, where n is the number of passed values. However, for a small number of values, the function is very efficient.
quantileExactWeighted(level)(x, weight)
----------------------------------------
Computes the level quantile exactly. Each value is counted with its weight, as if it were present 'weight' times. The arguments of the function can be considered as a histogram, where the value 'x' corresponds to a histogram "column" of height 'weight', and the function itself can be considered as a summation of histograms.
The algorithm is based on a hash table. Because of this, if the passed values are often repeated, the function consumes less RAM than quantileExact. You can use this function instead of quantileExact by specifying the number 1 as the weight.
quantileTDigest(level)(x)
-------------------------
Computes the level quantile approximately, using the t-digest algorithm. The maximum error is 1%. The memory consumption per state is proportional to the logarithm of the number of passed values.
The performance of the function is lower than that of 'quantile' and 'quantileTiming'. In terms of the ratio of state size to accuracy, the function is significantly better than 'quantile'.
The result depends on the order in which the query is executed, and is nondeterministic.
median
------
Approximates the median. Also see the similar 'quantile' function.
Works for numbers, dates, and dates with times.
For numbers it returns Float64, for dates - a date, and for dates with times - a date with time.
Uses reservoir sampling with a reservoir size up to 8192.
If necessary, the result is output with linear approximation from the two neighboring values.
This algorithm proved to be more practical than another well-known algorithm - QDigest.
The result depends on the order of running the query, and is nondeterministic.
quantiles(level1, level2, ...)(x)
---------------------------------
Approximates quantiles of all specified levels.
The result is an array containing the corresponding number of values.
varSamp(x)
----------
Calculates the value Σ((x - x̅)²) / (n - 1), where 'n' is the sample size and 'x̅' is the average value of 'x'.
It represents an unbiased estimate of the variance of a random variable, if the values passed to the function are a sample of this random variable.
Returns Float64. If n <= 1, it returns +∞.
varPop(x)
---------
Calculates the value Σ((x - x̅)²) / n, where 'n' is the sample size and 'x̅' is the average value of 'x'.
In other words, the variance for a set of values. Returns Float64.
stddevSamp(x)
-------------
The result is equal to the square root of 'varSamp(x)'.
stddevPop(x)
------------
The result is equal to the square root of 'varPop(x)'.
covarSamp(x, y)
---------------
Calculates the value of Σ((x - x̅)(y - y̅)) / (n - 1).
Returns Float64. If n <= 1, it returns +∞.
covarPop(x, y)
--------------
Calculates the value of Σ((x - x̅)(y - y̅)) / n.
corr(x, y)
----------
Calculates the Pearson correlation coefficient: Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)²) * Σ((y - y̅)²)).
Parametric aggregate functions
==============================
Some aggregate functions can accept not only argument columns (over which the aggregation is performed), but a set of parameters - constants for initialization. The syntax is two pairs of brackets instead of one. The first is for parameters, and the second is for arguments.
sequenceMatch(pattern)(time, cond1, cond2, ...)
-----------------------------------------------
Pattern matching for event chains.
'pattern' is a string containing a pattern to match. The pattern is similar to a regular expression.
'time' is the event time of the DateTime type.
'cond1, cond2 ...' are from one to 32 arguments of the UInt8 type that indicate whether an event condition was met.
The function collects a sequence of events in RAM. Then it checks whether this sequence matches the pattern.
It returns UInt8 - 0 if the pattern isn't matched, or 1 if it matches.
Example: sequenceMatch('(?1).*(?2)')(EventTime, URL LIKE '%company%', URL LIKE '%cart%')
- whether there was a chain of events in which pages with an address containing 'company' were visited earlier than pages with an address containing 'cart'.
This is a degenerate example. You could write it using other aggregate functions:
minIf(EventTime, URL LIKE '%company%') < maxIf(EventTime, URL LIKE '%cart%').
However, there is no such solution for more complex situations.
Pattern syntax:
``(?1)`` - Reference to a condition (any number in place of 1).
``.*`` - Any number of events.
``(?t>=1800)`` - Time condition.
Any quantity of any type of events is allowed over the specified time.
The operators <, >, <= may be used instead of >=.
Any number may be specified in place of 1800.
Events that occur during the same second may be put in the chain in any order. This may affect the result of the function.
sequenceCount(pattern)(time, cond1, cond2, ...)
-----------------------------------------------
Similar to the sequenceMatch function, but instead of returning whether the chain of events occurred, it returns UInt64 - the number of event chains found.
Chains are searched for without overlapping. That is, the next chain can start only after the end of the previous one.
uniqUpTo(N)(x)
--------------
Calculates the number of different argument values, if it is less than or equal to N.
If the number of different argument values is greater than N, it returns N + 1.
Recommended for use with small Ns, up to 10. The maximum N value is 100.
For the state of an aggregate function, it uses an amount of memory equal to 1 + N * (the size of one value) bytes.
For strings, it stores a non-cryptographic hash of 8 bytes. That is, the calculation is approximated for strings.
It works as fast as possible, except for cases when a large N value is used and the number of unique values is slightly less than N.
Usage example:
Problem: Generate a report that shows only keywords that produced at least 5 unique users.
Solution: Write in the query ``GROUP BY SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5``
Aggregate function combinators
==============================
The name of an aggregate function can have a suffix appended to it. This changes the way the aggregate function works.
There are ``If`` and ``Array`` combinators. See the sections below.
-If combinator. Conditional aggregate functions
-----------------------------------------------
The suffix ``-If`` can be appended to the name of any aggregate function. In this case, the aggregate function accepts an extra argument - a condition (UInt8 type). The aggregate function processes only the rows that trigger the condition. If the condition was not triggered even once, it returns a default value (usually zeros or empty strings).
Examples: ``sumIf(column, cond)``, ``countIf(cond)``, ``avgIf(x, cond)``, ``quantilesTimingIf(level1, level2)(x, cond)``, ``argMinIf(arg, val, cond)`` and so on.
You can use aggregate functions to calculate aggregates for multiple conditions at once, without using subqueries and JOINs.
For example, in Yandex.Metrica, we use conditional aggregate functions for implementing segment comparison functionality.
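For instance, one pass over the data can compute metrics for several segments at once. A hedged sketch; the ``hits`` table and its ``Referer`` and ``Duration`` columns are hypothetical:
.. code-block:: sql
SELECT
    countIf(Referer LIKE '%yandex%') AS hits_from_yandex,
    countIf(Referer LIKE '%google%') AS hits_from_google,
    avgIf(Duration, Referer LIKE '%yandex%') AS avg_duration_yandex
FROM hits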
-Array combinator. Aggregate functions for array arguments
----------------------------------------------------------
The -Array suffix can be appended to any aggregate function. In this case, the aggregate function takes arguments of the 'Array(T)' type (arrays) instead of 'T' type arguments. If the aggregate function accepts multiple arguments, these must be arrays of equal length. When processing arrays, the aggregate function works like the original aggregate function across all array elements.
Example 1: ``sumArray(arr)`` - Totals all the elements of all 'arr' arrays. In this example, it could have been written more simply: sum(arraySum(arr)).
Example 2: ``uniqArray(arr)`` - Count the number of unique elements in all 'arr' arrays. This could be done an easier way: ``uniq(arrayJoin(arr))``, but it's not always possible to add 'arrayJoin' to a query.
The ``-If`` and ``-Array`` combinators can be used together. However, 'Array' must come first, then 'If'.
Examples: ``uniqArrayIf(arr, cond)``, ``quantilesTimingArrayIf(level1, level2)(arr, cond)``. Due to this order, the 'cond' argument can't be an array.
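A minimal self-contained sketch: 'sumArray' totals the elements of every array, which for this data matches sum(arraySum(arr)):
.. code-block:: sql
SELECT sumArray(arr), sum(arraySum(arr)) -- both return 15
FROM (SELECT arrayJoin([[1, 2, 3], [4, 5]]) AS arr)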
-State combinator
-----------------
If this combinator is used, the aggregate function does not return the finished value (such as the number of unique values for the 'uniq' function), but an intermediate aggregation state (for 'uniq', the hash table used for calculating the number of unique values). The result has the type AggregateFunction(...) and can be used for further processing, or stored in a table for subsequent pre-aggregation - see the sections "AggregatingMergeTree" and "Functions for working with intermediate aggregation states".
-Merge combinator
-----------------
If this combinator is used, the aggregate function takes the intermediate aggregation state as an argument, combines (pre-aggregates) the states, and returns the finished value.
-MergeState combinator
----------------------
Merges the intermediate aggregation states, similar to the -Merge combinator, but returns an intermediate aggregation state rather than a finished value, similar to the -State combinator.
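A minimal sketch of the -State/-Merge pair using the built-in ``system.numbers`` table: the inner query produces an intermediate aggregation state, and the outer query finishes the computation:
.. code-block:: sql
SELECT uniqMerge(state) AS result -- 1000
FROM
(
    SELECT uniqState(number) AS state
    FROM system.numbers
    LIMIT 1000
)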

docs/en/conf.py Normal file
View File

@ -0,0 +1,260 @@
# -*- coding: utf-8 -*-
#
# ClickHouse documentation build configuration file, created by
# sphinx-quickstart on Tue Mar 21 13:05:32 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.mathjax',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'ClickHouse'
copyright = u'2017, Alexey Milovidov'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '1'
# The full version, including alpha/beta/rc tags.
release = '1'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'ClickHousedoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'ClickHouse.tex', u'ClickHouse Documentation',
u'Alexey Milovidov', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'clickhouse', u'ClickHouse Documentation',
[u'Alexey Milovidov'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'ClickHouse', u'ClickHouse Documentation',
u'Alexey Milovidov', 'ClickHouse', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False

View File

@ -0,0 +1,21 @@
Configuration files
======================
The main server config file is ``config.xml``. It resides in the ``/etc/clickhouse-server/`` directory.
Certain settings can be overridden in the ``*.xml`` and ``*.conf`` files from the ``conf.d`` and ``config.d`` directories next to the config.
The ``replace`` and ``remove`` attributes can be specified for the elements of these config files.
If neither is specified, it combines the contents of elements recursively, replacing values of duplicate children.
If ``replace`` is specified, it replaces the entire element with the specified one.
If ``remove`` is specified, it deletes the element.
The config can also define "substitutions". If an element has the ``incl`` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is ``/etc/metrika.xml``. This can be changed in the config in the ``include_from`` element. The substitution values are specified in ``/yandex/substitution_name`` elements of this file.
You can also perform substitutions from ZooKeeper nodes. To do that, add the ``from_zk="/path/to/node"`` attribute to a config element. Element contents will be substituted with the contents of the /path/to/node ZooKeeper node. The ZooKeeper node can contain a whole XML subtree, which will be inserted as a child of the substituted node.
The 'config.xml' file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the 'users_config' element. By default, it is 'users.xml'. If 'users_config' is omitted, the user settings, profiles, and quotas are specified directly in ``config.xml``. For ``users_config``, overrides and substitutions may also exist in files from the ``users_config.d`` directory (for example, ``users.d``).
For each config file, the server also generates ``file-preprocessed.xml`` files on launch. These files contain all the completed substitutions and overrides, and they are intended for informational use. If ZooKeeper substitutions were used in a config file and the ZooKeeper is unavailable during server startup, the configuration is loaded from the respective preprocessed file.
The server tracks changes to config files, as well as to the files and ZooKeeper nodes used for substitutions and overrides, and reloads the user and cluster configurations at runtime. That is, you can add or change users, clusters, and their settings without restarting the server.

View File

@ -0,0 +1,5 @@
Array(T)
--------
Array of T-type items. The T type can be any type, including an array.
We don't recommend using multidimensional arrays, because they are not well supported (for example, you can't store multidimensional arrays in tables with engines from MergeTree family).
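A quick illustration (note that arrays in ClickHouse are indexed starting from 1):
.. code-block:: sql
SELECT [1, 2, 3] AS arr, toTypeName(arr), arr[1] -- Array(UInt8), 1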

View File

@ -0,0 +1,4 @@
Boolean
---------------
There is no separate type for boolean values. For them, the type UInt8 is used, in which only the values 0 and 1 are used.
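For example, comparison operators already return such UInt8 values:
.. code-block:: sql
SELECT 1 < 2 AS cond, toTypeName(cond) -- returns 1 and 'UInt8'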

View File

@ -0,0 +1,7 @@
Date
----
A date. Stored in two bytes as the number of days since 1970-01-01 (unsigned). Allows storing values from just after the beginning of the Unix Epoch to the upper threshold defined by a constant at the compilation stage (currently, this is until the year 2038, but it may be expanded to 2106).
The minimum value is output as 0000-00-00.
The date is stored without the time zone.
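A small sketch; adding an integer to a Date shifts it by that many days:
.. code-block:: sql
SELECT toDate('2017-04-28') AS d, d + 7 AS week_later -- 2017-04-28, 2017-05-05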

View File

@ -0,0 +1,16 @@
DateTime
--------
Date with time. Stored in four bytes as a Unix timestamp (unsigned). Allows storing values in the same range as for the Date type. The minimal value is output as 0000-00-00 00:00:00. The time is stored with accuracy up to one second (without leap seconds).
Time zones
~~~~~~~~~~~~~
The date with time is converted from text (divided into component parts) to binary and back, using the system's time zone at the time the client or server starts. In text format, information about daylight savings is lost.
Note that by default the client adopts the server time zone at the beginning of the session. You can change this behaviour with the --use_client_time_zone command line switch.
Only those time zones are supported whose offset from UTC was never a fractional number of hours (leap seconds aside) over the entire time range you will be working with.
So when working with a textual date (for example, when saving text dumps), keep in mind that there may be ambiguity during changes for daylight savings time, and there may be problems matching data if the time zone changed.

View File

@ -0,0 +1,28 @@
Enum
----
Enum8 or Enum16. A set of enumerated string values that are stored as Int8 or Int16.
Example:
::
Enum8('hello' = 1, 'world' = 2)
- This data type has two possible values - 'hello' and 'world'.
The numeric values must be within -128..127 for ``Enum8`` and -32768..32767 for ``Enum16``. Each member of the enum must have a distinct number. The empty string is a valid value. The numbers do not need to be sequential and can be in any order.
In memory, the data is stored in the same way as the numeric types ``Int8`` and ``Int16``.
When reading in text format, the string is read and the corresponding numeric value is looked up. An exception will be thrown if it is not found.
When writing in text format, the stored number is looked up and the corresponding string is written out. An exception will be thrown if the number does not correspond to a known value.
In binary format, the information is saved in the same way as ``Int8`` and ``Int16``.
The implicit default value for an Enum is the value having the smallest numeric value.
In ORDER BY, GROUP BY, IN, DISTINCT, and so on, Enums behave like the corresponding numeric value; for example, they are sorted by the numeric value in an ``ORDER BY``. Equality and comparison operators behave on Enums the same way they do on the underlying numeric values.
Enum values cannot be compared to numbers, they must be compared to a string. If the string compared to is not a valid value for the Enum, an exception will be thrown. The ``IN`` operator is supported with the Enum on the left hand side and a set of strings on the right hand side.
Most numeric and string operations are not defined for Enum values, e.g. adding a number to an Enum or concatenating a string to an Enum. However, the toString function can be used to convert the Enum to its string value. Enum values are also convertible to numeric types using the ``toT`` function where ``T`` is a numeric type. When ``T`` corresponds to the enum's underlying numeric type, this conversion is zero-cost.
It is possible to add new members to the ``Enum`` using ``ALTER``. If the only change is to the set of values, the operation will be almost instant. It is also possible to remove members of the Enum using ALTER. Removing members is only safe if the removed value has never been used in the table. As a safeguard, changing the numeric value of a previously defined Enum member will throw an exception.
Using ``ALTER``, it is possible to change an ``Enum8`` to an ``Enum16`` or vice versa - just like changing an ``Int8`` to ``Int16``.
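A minimal end-to-end sketch (the table name ``enum_test`` is arbitrary):
.. code-block:: sql
CREATE TABLE enum_test (x Enum8('hello' = 1, 'world' = 2)) ENGINE = TinyLog;
INSERT INTO enum_test VALUES ('hello'), ('world');
SELECT x, toString(x), toInt8(x) FROM enum_test; -- string and numeric views of the same value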

View File

@ -0,0 +1,10 @@
FixedString(N)
--------------
A fixed-length string of N bytes (not characters or code points). N must be a strictly positive natural number.
When the server reads a string (for example, as input passed in an INSERT query) that contains fewer bytes, the string is padded to N bytes by appending null bytes on the right.
When the server reads a string that contains more bytes, an error message is returned.
When the server writes a string (for example, as output of a SELECT query), null bytes are not trimmed from the end of the string, but are output.
Note that this behavior differs from MySQL behavior for the CHAR type (where strings are padded with spaces, and the spaces are removed for output).
Fewer functions can work with the FixedString(N) type than with String, so it is less convenient to use.
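The padding is easy to observe, since length() of a FixedString(N) value is always N:
.. code-block:: sql
SELECT toFixedString('ab', 4) AS s, length(s) -- length(s) is 4: two null bytes were appended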

View File

@ -0,0 +1,7 @@
Float32, Float64
----------------
Floating-point numbers are just like 'float' and 'double' in the C language.
In contrast to standard SQL, floating-point numbers support 'inf', '-inf', and even 'nan's.
See the notes on sorting nans in "ORDER BY clause".
We do not recommend storing floating-point numbers in tables.

View File

@ -0,0 +1,8 @@
Data types
===========
.. toctree::
:glob:
*
*/index

View File

@ -0,0 +1,40 @@
UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64
--------------------------------------------------------
Fixed-length integers, with or without a sign.
Int ranges
"""""""""""""
.. table::
+--------+----------------------+-----------------------+
| Type   | From                 | To                    |
+========+======================+=======================+
| Int8   | -128                 | 127                   |
+--------+----------------------+-----------------------+
| Int16  | -32768               | 32767                 |
+--------+----------------------+-----------------------+
| Int32  | -2147483648          | 2147483647            |
+--------+----------------------+-----------------------+
| Int64  | -9223372036854775808 | 9223372036854775807   |
+--------+----------------------+-----------------------+
UInt ranges
""""""""""""""
.. table::
+--------+----------------------+-----------------------+
| Type   | From                 | To                    |
+========+======================+=======================+
| UInt8  | 0                    | 255                   |
+--------+----------------------+-----------------------+
| UInt16 | 0                    | 65535                 |
+--------+----------------------+-----------------------+
| UInt32 | 0                    | 4294967295            |
+--------+----------------------+-----------------------+
| UInt64 | 0                    | 18446744073709551615  |
+--------+----------------------+-----------------------+

View File

@ -0,0 +1,4 @@
AggregateFunction(name, types_of_arguments...)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The intermediate state of an aggregate function. To get it, use aggregate functions with the '-State' suffix. For more information, see "AggregatingMergeTree".

View File

@ -0,0 +1,7 @@
Nested data structures
--------------------------
.. toctree::
:glob:
*

View File

@ -0,0 +1,94 @@
Nested(Name1 Type1, Name2 Type2, ...)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A nested data structure is like a nested table. The parameters of a nested data structure - the column names and types - are specified the same way as in a CREATE query. Each table row can correspond to any number of rows in a nested data structure.
Example:
.. code-block:: sql
CREATE TABLE test.visits
(
CounterID UInt32,
StartDate Date,
Sign Int8,
IsNew UInt8,
VisitID UInt64,
UserID UInt64,
...
Goals Nested
(
ID UInt32,
Serial UInt32,
EventTime DateTime,
Price Int64,
OrderID String,
CurrencyID UInt32
),
...
) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign)
This example declares the 'Goals' nested data structure, which contains data about conversions (goals reached). Each row in the 'visits' table can correspond to zero or any number of conversions.
Only a single nesting level is supported. Nested structure columns with array type are equivalent to multidimensional arrays and thus their support is limited (storing such columns in tables with engines from MergeTree family is not supported).
In most cases, when working with a nested data structure, its individual columns are specified. To do this, the column names are separated by a dot. These columns make up an array of matching types. All the column arrays of a single nested data structure have the same length.
Example:
.. code-block:: sql
SELECT
Goals.ID,
Goals.EventTime
FROM test.visits
WHERE CounterID = 101500 AND length(Goals.ID) < 5
LIMIT 10
┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐
│ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │
│ [1073752] │ ['2014-03-17 00:28:25'] │
│ [1073752] │ ['2014-03-17 10:46:20'] │
│ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │
│ [] │ [] │
│ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │
│ [] │ [] │
│ [] │ [] │
│ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │
│ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │
└────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘
It is easiest to think of a nested data structure as a set of multiple column arrays of the same length.
The only place where a SELECT query can specify the name of an entire nested data structure instead of individual columns is the ARRAY JOIN clause. For more information, see "ARRAY JOIN clause". Example:
.. code-block:: sql
SELECT
Goal.ID,
Goal.EventTime
FROM test.visits
ARRAY JOIN Goals AS Goal
WHERE CounterID = 101500 AND length(Goals.ID) < 5
LIMIT 10
┌─Goal.ID─┬──────Goal.EventTime─┐
│ 1073752 │ 2014-03-17 16:38:10 │
│ 591325 │ 2014-03-17 16:38:48 │
│ 591325 │ 2014-03-17 16:42:27 │
│ 1073752 │ 2014-03-17 00:28:25 │
│ 1073752 │ 2014-03-17 10:46:20 │
│ 1073752 │ 2014-03-17 13:59:20 │
│ 591325 │ 2014-03-17 22:17:55 │
│ 591325 │ 2014-03-17 22:18:07 │
│ 591325 │ 2014-03-17 22:18:51 │
│ 1073752 │ 2014-03-17 11:37:06 │
└─────────┴─────────────────────┘
You can't perform SELECT for an entire nested data structure. You can only explicitly list individual columns that are part of it.
For an INSERT query, you should pass all the component column arrays of a nested data structure separately (as if they were individual column arrays). During insertion, the system checks that they have the same length.
For a DESCRIBE query, the columns in a nested data structure are listed separately in the same way.
The ALTER query is very limited for elements in a nested data structure.
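A minimal sketch (table and column names are arbitrary) showing that nested columns are inserted and selected as parallel arrays:
.. code-block:: sql
CREATE TABLE nested_test (s String, n Nested(x UInt32, y String)) ENGINE = TinyLog;
INSERT INTO nested_test VALUES ('row1', [1, 2], ['a', 'b']);
SELECT s, n.x, n.y FROM nested_test;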

View File

@ -0,0 +1,4 @@
Expression
~~~~~~~~~~
Used for representing lambda expressions in higher-order functions.

View File

@ -0,0 +1,9 @@
Special data types
----------------------
Special data type values can't be saved to a table or output in results, but are used as the intermediate result of running a query.
.. toctree::
:glob:
*

View File

@ -0,0 +1,4 @@
Set
~~~
Used for the right half of an IN expression.

View File

@ -0,0 +1,14 @@
String
------
Strings of an arbitrary length. The length is not limited. The value can contain an arbitrary set of bytes, including null bytes.
The String type replaces the types VARCHAR, BLOB, CLOB, and others from other DBMSs.
Encodings
~~~~~~~~~
ClickHouse doesn't have the concept of encodings. Strings can contain an arbitrary set of bytes, which are stored and output as-is.
If you need to store texts, we recommend using UTF-8 encoding. At the very least, if your terminal uses UTF-8 (as recommended), you can read and write your values without making conversions.
Similarly, certain functions for working with strings have separate variations that work under the assumption that the string contains a set of bytes representing a UTF-8 encoded text.
For example, the 'length' function calculates the string length in bytes, while the 'lengthUTF8' function calculates the string length in Unicode code points, assuming that the value is UTF-8 encoded.
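For example (a minimal illustration; the string below consists of 6 Cyrillic characters encoded as 12 bytes in UTF-8):

.. code-block:: sql

SELECT
    length('привет') AS bytes,      -- returns 12
    lengthUTF8('привет') AS points  -- returns 6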
Tuple(T1, T2, ...)
------------------
Tuples can't be written to tables (other than Memory tables). They are used for temporary column grouping. Columns can be grouped when an IN expression is used in a query, and for specifying certain formal parameters of lambda functions. For more information, see "IN operators" and "Higher order functions".
Tuples can be output as the result of running a query. In this case, for text formats other than JSON*, values are comma-separated in brackets. In JSON* formats, tuples are output as arrays (in square brackets).
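A minimal illustration:

.. code-block:: sql

SELECT (1, 'Hello') AS t, toTypeName(t)
-- returns the tuple and the string 'Tuple(UInt8, String)'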
External dictionaries
=====================
It is possible to add your own dictionaries from various data sources. The data source for a dictionary can be a file in the local file system, an executable, an HTTP resource, the ClickHouse server, or a MySQL server.
A dictionary can be stored completely in RAM and updated regularly, or it can be partially cached in RAM and dynamically load missing values.
The configuration of external dictionaries is in a separate file or files specified in the 'dictionaries_config' configuration parameter.
This parameter contains the absolute or relative path to the file with the dictionary configuration. A relative path is relative to the directory with the server config file. The path can contain wildcards * and ?, in which case all matching files are found. Example: dictionaries/*.xml.
The dictionary configuration, as well as the set of files with the configuration, can be updated without restarting the server. The server checks updates every 5 seconds. This means that dictionaries can be enabled dynamically.
Dictionaries can be created when starting the server, or at first use. This is defined by the 'dictionaries_lazy_load' parameter in the main server config file. This parameter is optional, 'true' by default. If set to 'true', each dictionary is created at first use. If dictionary creation failed, the function that was using the dictionary throws an exception. If 'false', all dictionaries are created when the server starts, and if there is an error, the server shuts down.
The dictionary config file has the following format:
.. code-block:: xml
<dictionaries>
<comment>Optional element with any content; completely ignored.</comment>
<!--You can set any number of different dictionaries. -->
<dictionary>
<!-- Dictionary name. The dictionary will be accessed for use by this name. -->
<name>os</name>
<!-- Data source. -->
<source>
<!-- Source is a file in the local file system. -->
<file>
<!-- Path on the local file system. -->
<path>/opt/dictionaries/os.tsv</path>
<!-- Which format to use for reading the file. -->
<format>TabSeparated</format>
</file>
<!-- or the source is a table on a MySQL server.
<mysql>
<!- - These parameters can be specified outside (common for all replicas) or inside a specific replica - ->
<port>3306</port>
<user>clickhouse</user>
<password>qwerty</password>
<!- - Specify from one to any number of replicas for fault tolerance. - ->
<replica>
<host>example01-1</host>
<priority>1</priority> <!- - The lower the value, the higher the priority. - ->
</replica>
<replica>
<host>example01-2</host>
<priority>1</priority>
</replica>
<db>conv_main</db>
<table>counters</table>
</mysql>
-->
<!-- or the source is a table on the ClickHouse server.
<clickhouse>
<host>example01-01-1</host>
<port>9000</port>
<user>default</user>
<password></password>
<db>default</db>
<table>counters</table>
</clickhouse>
<!- - If the address is similar to localhost, the request is made without network interaction. For fault tolerance, you can create a Distributed table on localhost and specify it here. - ->
-->
<!-- or the source is an executable. If the layout is complex_key_cache, the list of needed keys will be written to the program's STDIN -->
<executable>
<!-- Path on the local file system, or the name of a program located in one of the PATH directories. -->
<command>cat /opt/dictionaries/os.tsv</command>
<!-- Which format to use for reading/writing stream. -->
<format>TabSeparated</format>
</executable>
<!-- or the source is an HTTP server. If the layout is complex_key_cache, the list of needed keys will be sent as a POST request -->
<http>
<!-- Host. -->
<url>http://[::1]/os.tsv</url>
<!-- Which format to use for reading answer and making POST. -->
<format>TabSeparated</format>
</http>
</source>
<!-- Update interval for fully loaded dictionaries. 0 - never update. -->
<lifetime>
<min>300</min>
<max>360</max>
<!-- The update interval is selected uniformly randomly between min and max, in order to spread out the load when updating dictionaries on a large number of servers. -->
</lifetime>
<!-- or <!- - The update interval for fully loaded dictionaries or invalidation time for cached dictionaries. 0 - never update. - ->
<lifetime>300</lifetime>
-->
<layout> <!-- Method for storing in memory. -->
<flat />
<!-- or <hashed />
or
<cache>
<!- - Cache size in number of cells; rounded up to a power of two. - ->
<size_in_cells>1000000000</size_in_cells>
</cache> -->
</layout>
<!-- Structure. -->
<structure>
<!-- Description of the column that serves as the dictionary identifier (key). -->
<id>
<!-- Column name with ID. -->
<name>Id</name>
</id>
<attribute>
<!-- Column name. -->
<name>Name</name>
<!-- Column type. (How the column is understood when loading. For MySQL, a table can have TEXT, VARCHAR, and BLOB, but these are all loaded as String) -->
<type>String</type>
<!-- Value to use for a non-existing element. In the example, an empty string. -->
<null_value></null_value>
</attribute>
<!-- Any number of attributes can be specified. -->
<attribute>
<name>ParentID</name>
<type>UInt64</type>
<null_value>0</null_value>
<!-- Whether it defines a hierarchy - mapping to the parent ID (by default, false). -->
<hierarchical>true</hierarchical>
<!-- The mapping id -> attribute can be considered injective, in order to optimize GROUP BY. (by default, false) -->
<injective>true</injective>
</attribute>
</structure>
</dictionary>
</dictionaries>
The dictionary identifier (key attribute) should be a number that fits into UInt64. Also, you can use arbitrary tuples as keys (see section "Dictionaries with complex keys"). Note: you can use complex keys consisting of just one element. This allows using e.g. Strings as dictionary keys.
There are six ways to store dictionaries in memory.
flat
-----
This is the most efficient method. It works if all keys are smaller than ``500,000``. If a larger key is discovered when creating the dictionary, an exception is thrown and the dictionary is not created. The dictionary is loaded into RAM in its entirety. The dictionary uses an amount of memory proportional to the maximum key value. With the limit of 500,000, memory consumption is not likely to be high. All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.
hashed
-------
This method is slightly less efficient than the first one. The dictionary is also loaded into RAM in its entirety, and can contain any number of items with any identifiers. In practice, it makes sense to use up to tens of millions of items, as long as there is enough RAM.
All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.
cache
-------
This is the least efficient method. It is appropriate if the dictionary doesn't fit in RAM. It is a cache of a fixed number of cells, where frequently used data can be located. MySQL, ClickHouse, executable, and HTTP sources are supported; file sources are not.
When searching a dictionary, the cache is searched first. For each block of data, all keys not found in the cache (or expired keys) are collected into a batch, which is sent to the source with the query ``SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)``. The received data is then written to the cache.
range_hashed
------------
The table stores data for date ranges for each key, making it possible to extract the data for a given key on a given date.
Example: in the table there are discounts for each advertiser in the form:
::
advertiser id    discount start date    discount end date    value
123              2015-01-01             2015-01-15           0.15
123              2015-01-16             2015-01-31           0.25
456              2015-01-01             2015-01-15           0.05
To use it, specify ``range_hashed`` as the layout.
When using this layout, the structure must contain the elements range_min and range_max.
Example:
.. code-block:: xml
<structure>
<id>
<name>Id</name>
</id>
<range_min>
<name>first</name>
</range_min>
<range_max>
<name>last</name>
</range_max>
...
These columns must be of type Date. Other types are not yet supported.
The columns indicate a closed date range.
To work with such dictionaries, dictGetT functions must take one more argument - the date:
``dictGetT('dict_name', 'attr_name', id, date)``
The function retrieves the value for the given id for the date range that includes the passed date. If no id is found, or no range containing the date is found for that id, the default value for the dictionary is returned.
If there are overlapping ranges, any suitable one can be used.
If a range boundary is NULL or an incorrect date (such as 1900-01-01 or 2039-01-01), the range is considered open. A range can be open on both sides.
In RAM, the data is stored as a hash table with values in the form of an ordered array of ranges and their corresponding values.
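For illustration, a sketch of a query against a dictionary like the discounts example above (the dictionary name ``discounts``, the attribute name ``value``, and its Float64 type are assumptions):

.. code-block:: sql

-- Retrieves the discount for advertiser 123 that is valid on the given date.
SELECT dictGetFloat64('discounts', 'value', toUInt64(123), toDate('2015-01-10')) AS discount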
Example of a dictionary by ranges:
.. code-block:: xml
<dictionaries>
<dictionary>
<name>xxx</name>
<source>
<mysql>
<password>xxx</password>
<port>3306</port>
<user>xxx</user>
<replica>
<host>xxx</host>
<priority>1</priority>
</replica>
<db>dicts</db>
<table>xxx</table>
</mysql>
</source>
<lifetime>
<min>300</min>
<max>360</max>
</lifetime>
<layout>
<range_hashed />
</layout>
<structure>
<id>
<name>Abcdef</name>
</id>
<range_min>
<name>StartDate</name>
</range_min>
<range_max>
<name>EndDate</name>
</range_max>
<attribute>
<name>XXXType</name>
<type>String</type>
<null_value />
</attribute>
</structure>
</dictionary>
</dictionaries>
complex_key_hashed
------------------
The same as ``hashed``, but for complex keys.
complex_key_cache
-----------------
The same as ``cache``, but for complex keys.
Notes
----------
We recommend using the ``flat`` method when possible, or ``hashed``. The speed of the dictionaries is impeccable with this type of memory storage.
Use the cache method only in cases when it is unavoidable. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary only works normally for high enough hit rates (recommended 99% and higher). You can view the average hit rate in the system.dictionaries table. Set a large enough cache size. You will need to experiment to find the right number of cells - select a value, use a query to get the cache completely full, look at the memory consumption (this information is in the system.dictionaries table), then proportionally increase the number of cells so that a reasonable amount of memory is consumed. We recommend MySQL as the source for the cache, because ClickHouse doesn't handle requests with random reads very well.
In all cases, performance is better if you call the function for working with a dictionary after ``GROUP BY``, and if the attribute being fetched is marked as injective. For a dictionary cache, performance improves if you call the function after LIMIT. To do this, you can use a subquery with LIMIT, and call the function with the dictionary from the outside.
An attribute is called injective if different attribute values correspond to different keys. So when ``GROUP BY`` uses a function that fetches an attribute value by the key, this function is automatically taken out of ``GROUP BY``.
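For example, a sketch of applying a dictionary function after aggregation (using the ``os`` dictionary from the configuration example above; the table and column names are assumptions):

.. code-block:: sql

SELECT dictGetString('os', 'Name', toUInt64(OS)) AS os_name, c
FROM
(
    SELECT OS, count() AS c
    FROM test.hits
    GROUP BY OS
    ORDER BY c DESC
    LIMIT 10
)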
When updating dictionaries from a file, first the file modification time is checked, and it is loaded only if the file has changed.
When updating from MySQL, for flat and hashed dictionaries, first a ``SHOW TABLE STATUS`` query is made, and the table update time is checked. If it is not NULL, it is compared to the stored time. This works for MyISAM tables, but for InnoDB tables the update time is unknown, so loading from InnoDB is performed on each update.
For cache dictionaries, the expiration (lifetime) of data in the cache can be set. If more time than 'lifetime' has passed since loading the data in a cell, the cell's value is not used, and it is re-requested the next time it needs to be used.
If a dictionary couldn't be loaded even once, an attempt to use it throws an exception.
If an error occurred during a request to a cached source, an exception is thrown.
Dictionary updates (other than loading for first use) do not block queries. During updates, the old version of a dictionary is used. If an error occurs during an update, the error is written to the server log, and queries continue using the old version of dictionaries.
You can view the list of external dictionaries and their status in the system.dictionaries table.
To use external dictionaries, see the section "Functions for working with external dictionaries".
Note that you can convert values for a small dictionary by specifying all the contents of the dictionary directly in a ``SELECT`` query (see the section "transform function"). This functionality is not related to external dictionaries.
Dictionaries with complex keys
------------------------------
You can use tuples consisting of fields of arbitrary types as keys. Configure your dictionary with ``complex_key_hashed`` or ``complex_key_cache`` layout in this case.
Key structure is configured not in the ``<id>`` element but in the ``<key>`` element. Fields of the key tuple are configured analogously to dictionary attributes. Example:
.. code-block:: xml
<structure>
<key>
<attribute>
<name>field1</name>
<type>String</type>
</attribute>
<attribute>
<name>field2</name>
<type>UInt32</type>
</attribute>
...
</key>
...
When using such dictionary, use a Tuple of field values as a key in dictGet* functions. Example: ``dictGetString('dict_name', 'attr_name', tuple('field1_value', 123))``.
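As a complete query, this looks as follows (a sketch; the dictionary and attribute names are placeholders, and the tuple elements must match the configured key field types):

.. code-block:: sql

SELECT dictGetString('dict_name', 'attr_name', tuple('field1_value', toUInt32(123))) AS attr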
Dictionaries
============
A dictionary is a mapping (key -> attributes) that can be used in a query as functions. You can think of this as a more convenient and efficient type of JOIN with dimension tables.
There are built-in (internal) and add-on (external) dictionaries.
.. toctree::
:glob:
*
Internal dictionaries
---------------------
ClickHouse contains a built-in feature for working with a geobase.
This allows you to:
* Use a region's ID to get its name in the desired language.
* Use a region's ID to get the ID of a city, area, federal district, country, or continent.
* Check whether a region is part of another region.
* Get a chain of parent regions.
All the functions support "translocality," the ability to simultaneously use different perspectives on region ownership. For more information, see the section "Functions for working with Yandex.Metrica dictionaries".
The internal dictionaries are disabled in the default package.
To enable them, uncomment the parameters ``path_to_regions_hierarchy_file`` and ``path_to_regions_names_files`` in the server config file.
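Once enabled, the geobase functions can be used as follows (a minimal sketch; in the Yandex geobase, region 213 is Moscow and region 225 is Russia):

.. code-block:: sql

SELECT
    regionToName(toUInt32(213)) AS region,
    regionToName(regionToCountry(toUInt32(213))) AS country,
    regionIn(toUInt32(213), toUInt32(225)) AS is_in_russia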
The geobase is loaded from text files.
If you are a Yandex employee, use the following instructions to create them:
https://github.yandex-team.ru/raw/Metrika/ClickHouse_private/master/doc/create_embedded_geobase_dictionaries.txt
Put the regions_hierarchy*.txt files in the path_to_regions_hierarchy_file directory. This configuration parameter must contain the path to the regions_hierarchy.txt file (the default regional hierarchy), and the other files (regions_hierarchy_ua.txt) must be located in the same directory.
Put the regions_names_*.txt files in the path_to_regions_names_files directory.
You can also create these files yourself. The file format is as follows:
``regions_hierarchy*.txt``: TabSeparated (no header), columns:
* Region ID (UInt32)
* Parent region ID (UInt32)
* Region type (UInt8): 1 - continent, 3 - country, 4 - federal district, 5 - region, 6 - city; other types don't have values.
* Population (UInt32) - Optional column.
``regions_names_*.txt``: TabSeparated (no header), columns:
* Region ID (UInt32)
* Region name (String) - Can't contain tabs or line breaks, even escaped ones.
A flat array is used for storing in RAM. For this reason, IDs shouldn't be more than a million.
Dictionaries can be updated without the server restart. However, the set of available dictionaries is not updated. For updates, the file modification times are checked. If a file has changed, the dictionary is updated.
The interval to check for changes is configured in the 'builtin_dictionaries_reload_interval' parameter.
Dictionary updates (other than loading at first use) do not block queries. During updates, queries use the old versions of dictionaries. If an error occurs during an update, the error is written to the server log, while queries continue using the old version of dictionaries.
We recommend periodically updating the dictionaries with the geobase. During an update, generate new files and write them to a separate location. When everything is ready, rename them to the files used by the server.
There are also functions for working with OS identifiers and Yandex.Metrica search engines, but they shouldn't be used.
External data for query processing
====================================
ClickHouse allows sending a server the data that is needed for processing a query, together with a SELECT query. This data is put in a temporary table (see the section "Temporary tables") and can be used in the query (for example, in IN operators).
For example, if you have a text file with important user identifiers, you can upload it to the server along with a query that filters by this list.
If you need to run more than one query with a large volume of external data, don't use this feature. It is better to upload the data to the DB ahead of time.
External data can be uploaded using the command-line client (in non-interactive mode), or using the HTTP interface.
In the command-line client, you can specify a parameters section in the format
::
--external --file=... [--name=...] [--format=...] [--types=...|--structure=...]
You may have multiple sections like this, for the number of tables being transmitted.
**--external** - Marks the beginning of the section.
**--file** - Path to the file with the table dump, or ``-``, which refers to stdin
Only a single table can be retrieved from stdin.
The following parameters are optional:
**--name** - Name of the table. If omitted, ``_data`` is used.
**--format** - Data format in the file. If omitted, ``TabSeparated`` is used.
One of the following parameters is required:
**--types** - A comma-separated list of column types. For example, ``UInt64,String``. Columns will be named ``_1``, ``_2``, ...
**--structure** - Table structure, in the format ``UserID UInt64, URL String``. Defines the column names and types.
The files specified in ``file`` will be parsed by the format specified in ``format``, using the data types specified in ``types`` or ``structure``. The table will be uploaded to the server and accessible there as a temporary table with the name ``name``.
Examples:
::
echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8
849897
cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String'
/bin/sh 20
/bin/false 5
/bin/bash 4
/usr/sbin/nologin 1
/bin/sync 1
When using the HTTP interface, external data is passed in the multipart/form-data format. Each table is transmitted as a separate file. The table name is taken from the file name. The 'query_string' passes the parameters 'name_format', 'name_types', and 'name_structure', where name is the name of the table that these parameters correspond to. The meaning of the parameters is the same as when using the command-line client.
Example:
::
cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv
curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String'
/bin/sh 20
/bin/false 5
/bin/bash 4
/usr/sbin/nologin 1
/bin/sync 1
For distributed query processing, the temporary tables are sent to all the remote servers.
BlockTabSeparated
-----------------
Data is not written by row, but by column and block.
Each block consists of parts of columns, each of which is written on a separate line.
The values are tab-separated. The last value in a column part is followed by a line break instead of a tab.
Blocks are separated by a double line break.
The rest of the rules are the same as in the TabSeparated format.
This format is only appropriate for outputting a query result, not for parsing.
CSV
----
Comma separated values (`RFC <https://tools.ietf.org/html/rfc4180>`_).
String values are output in double quotes. A double quote inside a string is output as two consecutive double quotes; these are the only escaping rules. Date and DateTime values are output in double quotes. Numbers are output without quotes. Fields are delimited by commas. Rows are delimited by Unix newlines (LF). Arrays are output as follows: first, the array is serialized to String (as in the TabSeparated or Values formats), and then the resulting String value is output in double quotes. Tuples are narrowed and serialized as separate columns.
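For example, a minimal sketch of these escaping rules:

.. code-block:: sql

SELECT 'Hello, "world"' AS s, [1, 2] AS arr FORMAT CSV

This should output the line ``"Hello, ""world""","[1,2]"``.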
During parsing, values may or may not be enclosed in quotes; both single and double quotes are supported. In particular, strings can be represented without quotes, in which case they are parsed up to a comma or newline (CR or LF). Contrary to the RFC, when parsing strings without quotes, leading and trailing spaces and tabs are ignored. As the line delimiter, Unix (LF), Windows (CR LF), and Mac OS Classic (LF CR) variants are all supported.
CSV format supports output of totals and extremes similar to TabSeparated format.
CSVWithNames
------------
Also contains header, similar to ``TabSeparatedWithNames``.
Formats
=======
The format determines how data is returned to you (written by the server as output) after a SELECT, and how it is accepted (read by the server as input) for an INSERT.
.. toctree::
:glob:
*
JSON
-----
Outputs data in JSON format. Besides data tables, it also outputs column names and types, along with some additional information - the total number of output rows, and the number of rows that could have been output if there weren't a LIMIT. Example:
.. code-block:: sql
SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON
{
"meta":
[
{
"name": "SearchPhrase",
"type": "String"
},
{
"name": "c",
"type": "UInt64"
}
],
"data":
[
{
"SearchPhrase": "",
"c": "8267016"
},
{
"SearchPhrase": "интерьер ванной комнаты",
"c": "2166"
},
{
"SearchPhrase": "яндекс",
"c": "1655"
},
{
"SearchPhrase": "весна 2014 мода",
"c": "1549"
},
{
"SearchPhrase": "фриформ фото",
"c": "1480"
}
],
"totals":
{
"SearchPhrase": "",
"c": "8873898"
},
"extremes":
{
"min":
{
"SearchPhrase": "",
"c": "1480"
},
"max":
{
"SearchPhrase": "",
"c": "8267016"
}
},
"rows": 5,
"rows_before_limit_at_least": 141137
}
JSON is compatible with JavaScript. For this purpose, certain characters are additionally escaped: the forward slash ``/`` is escaped as ``\/``; alternative line breaks ``U+2028`` and ``U+2029``, which don't work in some browsers, are escaped as ``\uXXXX`` sequences. ASCII control characters are escaped: backspace, form feed, line feed, carriage return, and horizontal tab as ``\b``, ``\f``, ``\n``, ``\r``, and ``\t`` respectively, along with the remaining bytes in the 00-1F range using ``\uXXXX`` sequences. Invalid UTF-8 sequences are changed to the replacement character ``�``, so the output text will consist of valid UTF-8 sequences. UInt64 and Int64 numbers are output in double quotes for compatibility with JavaScript.
``rows`` - The total number of output rows.
``rows_before_limit_at_least`` - The minimal number of rows there would have been without a LIMIT. Output only if the query contains LIMIT.
If the query contains GROUP BY, ``rows_before_limit_at_least`` is the exact number of rows there would have been without a LIMIT.
``totals`` - Total values (when using WITH TOTALS).
``extremes`` - Extreme values (when extremes is set to 1).
This format is only appropriate for outputting a query result, not for parsing.
See JSONEachRow format for INSERT queries.
JSONCompact
-----------
Differs from ``JSON`` only in that data rows are output in arrays, not in objects.
Example:
::
{
"meta":
[
{
"name": "SearchPhrase",
"type": "String"
},
{
"name": "c",
"type": "UInt64"
}
],
"data":
[
["", "8267016"],
["bath interiors", "2166"],
["yandex", "1655"],
["spring 2014 fashion", "1549"],
["freeform photo", "1480"]
],
"totals": ["","8873898"],
"extremes":
{
"min": ["","1480"],
"max": ["","8267016"]
},
"rows": 5,
"rows_before_limit_at_least": 141137
}
This format is only appropriate for outputting a query result, not for parsing.
See ``JSONEachRow`` format for INSERT queries.
JSONEachRow
-----------
When used in a SELECT query, outputs data as newline-delimited JSON (JSON objects separated by a ``\n`` character).
When used in an INSERT query, expects data in this format as input.
::
{"SearchPhrase":"","count()":"8267016"}
{"SearchPhrase":"bathroom interior","count()":"2166"}
{"SearchPhrase":"yandex","count()":"1655"}
{"SearchPhrase":"spring 2014 fashion","count()":"1549"}
{"SearchPhrase":"free-form photo","count()":"1480"}
{"SearchPhrase":"Angelina Jolie","count()":"1245"}
{"SearchPhrase":"omsk","count()":"1112"}
{"SearchPhrase":"photos of dog breeds","count()":"1091"}
{"SearchPhrase":"curtain design","count()":"1064"}
{"SearchPhrase":"baku","count()":"1000"}
Unlike the JSON format, invalid UTF-8 sequences are not replaced, and a line can contain an arbitrary number of bytes.
This is done in order to avoid data loss during formatting. Values are output the same way as in the JSON format.
In INSERT queries, JSON data can be supplied with the columns (JSON key-value pairs) in arbitrary order. Values can also be omitted, in which case the default value of the column is inserted. N.B. when using the JSONEachRow format, complex default values are not supported, so an omitted column gets zeros or an empty string, depending on its type.
Whitespace between JSON objects is skipped. A comma between objects is ignored. A newline character is not a mandatory separator between objects.
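For example (a sketch, assuming a hypothetical table ``t`` with columns ``a UInt8`` and ``b String``; the first object omits ``a``, so its default value is inserted):

.. code-block:: sql

INSERT INTO t FORMAT JSONEachRow {"b": "Hello"} {"a": 1, "b": "World"}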
Native
------
The most efficient format. Data is written and read by blocks in binary format. For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is "columnar" - it doesn't convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients.
You can use this format to quickly generate dumps that can only be read by the ClickHouse DBMS. It doesn't make sense to work with this format yourself.
Null
----
Nothing is output. However, the query is processed, and when using the command-line client, data is transmitted to the client. This is used for tests, including performance testing. Obviously, this format is only appropriate for outputting a query result, not for parsing.
Pretty
------
Writes data as Unicode-art tables, also using ANSI-escape sequences for setting colors in the terminal.
A full grid of the table is drawn, and each row occupies two lines in the terminal. Each result block is output as a separate table. This is necessary so that blocks can be output without buffering results (buffering would be necessary in order to pre-calculate the visible width of all the values).
To avoid dumping too much data to the terminal, only the first 10,000 rows are printed. If the number of rows is greater than or equal to 10,000, the message "Showed first 10,000" is printed.
This format is only appropriate for outputting a query result, not for parsing.
The Pretty format supports outputting total values (when using WITH TOTALS) and extremes (when 'extremes' is set to 1). In these cases, total values and extreme values are output after the main data, in separate tables. Example (shown for the PrettyCompact format):
.. code-block:: sql
SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact
┌──EventDate─┬───────c─┐
│ 2014-03-17 │ 1406958 │
│ 2014-03-18 │ 1383658 │
│ 2014-03-19 │ 1405797 │
│ 2014-03-20 │ 1353623 │
│ 2014-03-21 │ 1245779 │
│ 2014-03-22 │ 1031592 │
│ 2014-03-23 │ 1046491 │
└────────────┴─────────┘
Totals:
┌──EventDate─┬───────c─┐
│ 0000-00-00 │ 8873898 │
└────────────┴─────────┘
Extremes:
┌──EventDate─┬───────c─┐
│ 2014-03-17 │ 1031592 │
│ 2014-03-23 │ 1406958 │
└────────────┴─────────┘
PrettyCompact
-------------
Differs from ``Pretty`` in that the grid is drawn between rows and the result is more compact. This format is used by default in the command-line client in interactive mode.
PrettyCompactMonoBlock
----------------------
Differs from ``PrettyCompact`` in that up to 10,000 rows are buffered, then output as a single table, not by blocks.
PrettyNoEscapes
---------------
Differs from Pretty in that ANSI-escape sequences aren't used. This is necessary for displaying this format in a browser, as well as for using the 'watch' command-line utility.
Example:
::
watch -n1 "clickhouse-client --query='SELECT * FROM system.events FORMAT PrettyCompactNoEscapes'"
You can use the HTTP interface for displaying in the browser.
PrettyCompactNoEscapes
----------------------
The same, but for the ``PrettyCompact`` format.
PrettySpaceNoEscapes
--------------------
The same, but for the ``PrettySpace`` format.
PrettySpace
-----------
Differs from ``PrettyCompact`` in that whitespace (space characters) is used instead of the grid.
RowBinary
---------
Writes data by row in binary format. Rows and values are listed consecutively, without separators.
This format is less efficient than the Native format, since it is row-based.
Numbers are written in little-endian, fixed-width format. For example, UInt64 takes 8 bytes.
DateTime is written as a UInt32 containing the Unix timestamp.
Date is written as a UInt16 containing the number of days since 1970-01-01.
String is written as its length in varint (unsigned `LEB128 <https://en.wikipedia.org/wiki/LEB128>`_) format, followed by the bytes of the string.
FixedString is written as just its bytes.
Array is written as its length in varint (unsigned `LEB128 <https://en.wikipedia.org/wiki/LEB128>`_) format, followed by all of its elements, contiguously.
TabSeparated
------------
In TabSeparated format, data is written by row. Each row contains values separated by tabs. Each value is followed by a tab, except the last value in the row, which is followed by a line break. Strictly Unix line breaks are assumed everywhere. The last row must also contain a line break at the end. Values are written in text format, without enclosing quotation marks, and with special characters escaped.
Numbers are written in decimal form. A number may contain an extra "+" character at the beginning (it is ignored when parsing and not written when formatting). Non-negative numbers can't contain the negative sign. When parsing, an empty string may be parsed as a zero, or (for signed types) a string consisting of just a minus sign as a zero. Numbers that do not fit into the corresponding data type may be parsed as a different number, without an error message.
Floating-point numbers are formatted in decimal form. The dot is used as the decimal separator. Exponential entries are supported, as are 'inf', '+inf', '-inf', and 'nan'. An entry of floating-point numbers may begin or end with a decimal point.
During formatting, accuracy may be lost on floating-point numbers.
During parsing, a result is not necessarily the nearest machine-representable number.
Dates are formatted in YYYY-MM-DD format and parsed in the same format, but with any characters as separators.
DateTimes are formatted in the format YYYY-MM-DD hh:mm:ss and parsed in the same format, but with any characters as separators.
This all occurs in the system time zone at the time the client or server starts (depending on which one formats data). For DateTimes, daylight saving time is not specified. So if a dump has times during daylight saving time, the dump does not unequivocally match the data, and parsing will select one of the two times.
During a parsing operation, incorrect dates and dates with times can be parsed with natural overflow or as null dates and times, without an error message.
As an exception, parsing DateTime is also supported in Unix timestamp format, if it consists of exactly 10 decimal digits. The result is not time zone-dependent. The formats ``YYYY-MM-DD hh:mm:ss`` and ``NNNNNNNNNN`` are differentiated automatically.
Strings are parsed and formatted with backslash-escaped special characters. The following escape sequences are used for formatting: ``\b``, ``\f``, ``\r``, ``\n``, ``\t``, ``\0``, ``\'``, and ``\\``. Parsing also supports the sequences ``\a``, ``\v``, and ``\xHH`` (hex escape sequences), as well as any sequence of the form ``\c`` where ``c`` is any character (such sequences are converted to ``c``). This means that parsing supports formats where a line break can be written either as ``\n`` or as a backslash followed by a line break. For example, the string 'Hello world' with a line break between the words instead of a space can be parsed in any of the following variations:
::
Hello\nworld
Hello\
world
The second variant is supported because MySQL uses it when writing tab-separated dumps.
Only a small set of characters are escaped. You can easily stumble onto a string value that your terminal will ruin in output.
The minimum set of characters that you must escape in the TabSeparated format is tab, line feed (LF), and backslash.
Arrays are formatted as a list of comma-separated values in square brackets. Numbers in the array are formatted as usual, but dates, dates with times, and strings are enclosed in single quotes, with the same escaping rules as above.
The TabSeparated format is convenient for processing data using custom programs and scripts. It is used by default in the HTTP interface, and in the command-line client's batch mode. This format also allows transferring data between different DBMSs. For example, you can get a dump from MySQL and upload it to ClickHouse, or vice versa.
The TabSeparated format supports outputting total values (when using WITH TOTALS) and extreme values (when 'extremes' is set to 1). In these cases, the total values and extremes are output after the main data. The main result, total values, and extremes are separated from each other by an empty line. Example:
``SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated``
.. code-block:: sql
2014-03-17 1406958
2014-03-18 1383658
2014-03-19 1405797
2014-03-20 1353623
2014-03-21 1245779
2014-03-22 1031592
2014-03-23 1046491
0000-00-00 8873898
2014-03-17 1031592
2014-03-23 1406958
It's also available as ``TSV``.
TabSeparatedRaw
---------------
Differs from the ``TabSeparated`` format in that the rows are formatted without escaping.
This format is only appropriate for outputting a query result, but not for parsing data to insert into a table.
It's also available as ``TSVRaw``.
TabSeparatedWithNames
---------------------
Differs from the TabSeparated format in that the column names are output in the first row.
For parsing, the first row is completely ignored. You can't use column names to determine their position or to check their correctness.
(Support for using header while parsing could be added in future.)
It's also available as ``TSVWithNames``.
TabSeparatedWithNamesAndTypes
-----------------------------
Differs from the ``TabSeparated`` format in that the column names are output to the first row, while the column types are in the second row.
For parsing, the first and second rows are completely ignored.
It's also available as ``TSVWithNamesAndTypes``.
TSKV
-----
Similar to TabSeparated, but outputs data in name=value format. Names are escaped the same way as in the TabSeparated format, and the ``=`` symbol is also escaped.
::
SearchPhrase= count()=8267016
SearchPhrase=bathroom interior count()=2166
SearchPhrase=yandex count()=1655
SearchPhrase=spring 2014 fashion count()=1549
SearchPhrase=free-form photo count()=1480
SearchPhrase=Angelina Jolie count()=1245
SearchPhrase=omsk count()=1112
SearchPhrase=photos of dog breeds count()=1091
SearchPhrase=curtain design count()=1064
SearchPhrase=baku count()=1000
When there are many small columns, this format is clearly inefficient, and there is usually no reason to use it. It is supported because it is used for some cases in Yandex.
The format is supported for both input and output. In INSERT queries, data can be supplied with the columns in arbitrary order. Values can also be omitted, in which case the default value of the column is inserted. N.B. when using the TSKV format, complex default values are not supported, so an omitted column gets zeros or an empty string, depending on its type.
Values
------
Prints every row in parentheses. Rows are separated by commas. There is no comma after the last row. The values inside the parentheses are also comma-separated. Numbers are output in decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. The escaping and parsing rules are the same as in the TabSeparated format. During formatting, extra spaces aren't inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed).
The minimum set of characters that you must escape in the Values format is single quote and backslash.
This is the format that is used in ``INSERT INTO t VALUES ...``,
but you can also use it for a query result.
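For example (a sketch, assuming a hypothetical table ``t`` with a numeric, a string, and an array column):

.. code-block:: sql

-- The single quote inside the string is escaped with a backslash.
INSERT INTO t VALUES (1, 'It\'s a test', [1, 2, 3]), (2, '', [])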
Vertical
--------
Prints each value on a separate line with the column name specified. This format is convenient for printing just one or a few rows, if each row consists of a large number of columns.
This format is only appropriate for outputting a query result, not for parsing.
XML
----
XML format is supported only for displaying data, not for INSERT queries. Example:
.. code-block:: xml
<?xml version='1.0' encoding='UTF-8' ?>
<result>
<meta>
<columns>
<column>
<name>SearchPhrase</name>
<type>String</type>
</column>
<column>
<name>count()</name>
<type>UInt64</type>
</column>
</columns>
</meta>
<data>
<row>
<SearchPhrase></SearchPhrase>
<field>8267016</field>
</row>
<row>
<SearchPhrase>bathroom interior</SearchPhrase>
<field>2166</field>
</row>
<row>
<SearchPhrase>yandex</SearchPhrase>
<field>1655</field>
</row>
<row>
<SearchPhrase>spring 2014 fashion</SearchPhrase>
<field>1549</field>
</row>
<row>
<SearchPhrase>free-form photo</SearchPhrase>
<field>1480</field>
</row>
<row>
<SearchPhrase>Angelina Jolie</SearchPhrase>
<field>1245</field>
</row>
<row>
<SearchPhrase>omsk</SearchPhrase>
<field>1112</field>
</row>
<row>
<SearchPhrase>photos of dog breeds</SearchPhrase>
<field>1091</field>
</row>
<row>
<SearchPhrase>curtain design</SearchPhrase>
<field>1064</field>
</row>
<row>
<SearchPhrase>baku</SearchPhrase>
<field>1000</field>
</row>
</data>
<rows>10</rows>
<rows_before_limit_at_least>141137</rows_before_limit_at_least>
</result>
If a column name does not have an acceptable format, 'field' is used as the element name. In other respects, the XML structure follows the JSON structure.
As in the case of JSON, invalid UTF-8 sequences are replaced by the replacement character ``�``, so the output text will consist only of valid UTF-8 sequences.
In string values ``<`` and ``&`` are displayed as ``&lt;`` and ``&amp;``.
Arrays are displayed as ``<array><elem>Hello</elem><elem>World</elem>...</array>``,
and tuples as ``<tuple><elem>Hello</elem><elem>World</elem>...</tuple>``.
Arithmetic functions
======================
For all arithmetic functions, the result type is calculated as the smallest number type that the result fits in, if there is such a type. The minimum is taken simultaneously based on the number of bits, whether the type is signed, and whether it is floating point. If there are not enough bits, the type with the most bits is taken.
Example:
.. code-block:: sql
:) SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0)
┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐
│ UInt8 │ UInt16 │ UInt32 │ UInt64 │
└───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘
Arithmetic functions work for any pair of types from UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, or Float64.
Overflow is produced the same way as in C++.
plus(a, b), a + b operator
--------------------------
Calculates the sum of the numbers.
You can also add integer numbers to a date or a date with time. In the case of a date, adding an integer means adding the corresponding number of days. For a date with time, it means adding the corresponding number of seconds.
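For example:

.. code-block:: sql

SELECT
    toDate('2017-04-28') + 7 AS next_week,                     -- 2017-05-05
    toDateTime('2017-04-28 10:00:00') + 60 AS one_minute_later -- 2017-04-28 10:01:00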
minus(a, b), a - b operator
---------------------------
Calculates the difference. The result is always signed.
You can also subtract integer numbers from a date or a date with time. The idea is the same - see 'plus' above.
multiply(a, b), a * b operator
------------------------------
Calculates the product of the numbers.
divide(a, b), a / b operator
-----------------------------
Calculates the quotient of the numbers. The result type is always a floating-point type.
It is not integer division. For integer division, use the 'intDiv' function.
When dividing by zero you get 'inf', '-inf', or 'nan'.
intDiv(a, b)
------------
Calculates the quotient of the numbers. Divides into integers, rounding down by absolute value (that is, truncating toward zero).
When dividing by zero or when dividing a minimal negative number by minus one, an exception is thrown.
intDivOrZero(a, b)
------------------
Differs from 'intDiv' in that it returns zero when dividing by zero or when dividing a minimal negative number by minus one.
modulo(a, b), a % b operator
----------------------------
Calculates the remainder after division.
If arguments are floating-point numbers, they are pre-converted to integers by dropping the decimal portion. The remainder is taken in the same sense as in C++. Truncated division is used for negative numbers.
An exception is thrown when dividing by zero or when dividing a minimal negative number by minus one.
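A quick illustration of the rounding behavior described above (expected results: 3, -3, and -1):

.. code-block:: sql

SELECT intDiv(7, 2) AS a, intDiv(-7, 2) AS b, modulo(-7, 2) AS c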
negate(a), -a operator
----------------------
Calculates a number with the reverse sign. The result is always signed.
abs(a)
------
Calculates the absolute value of the number 'a'. That is, if a < 0, it returns -a.
For unsigned types, it doesn't do anything. For signed integer types, it returns an unsigned number.
Functions for working with arrays
---------------------------------
empty
~~~~~
Returns 1 for an empty array, or 0 for a non-empty array.
The result type is UInt8.
The function also works for strings.
notEmpty
~~~~~~~~
Returns 0 for an empty array, or 1 for a non-empty array.
The result type is UInt8.
The function also works for strings.
length
~~~~~~
Returns the number of items in the array.
The result type is UInt64.
The function also works for strings.
emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64
~~~~~~~~~~~~~~
emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64
~~~~~~~~~~~~~~~
emptyArrayFloat32, emptyArrayFloat64
~~~~~~~~~~~~~~~
emptyArrayDate, emptyArrayDateTime
~~~~~~~~~~~~~~
emptyArrayString
~~~~~~~~~~~~
Accepts zero arguments and returns an empty array of the appropriate type.
emptyArrayToSingle
~~~~~~~~~~~~~~
Accepts an empty array as argument and returns an array of one element equal to the default value.
range(N)
~~~~~~~
Returns an array of numbers from 0 to N-1.
Just in case, an exception is thrown if arrays with a total length of more than 100,000,000 elements are created in a data block.
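For example:

.. code-block:: sql

SELECT range(5) AS r  -- returns [0,1,2,3,4]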
array(x1, ...), operator [x1, ...]
~~~~~~~~~~~~
Creates an array from the function arguments.
The arguments must be constants and have types for which there is a smallest common type. At least one argument must be passed, because otherwise it isn't clear which type of array to create. That is, you can't use this function to create an empty array (to do that, use the 'emptyArray*' functions described above).
Returns an 'Array(T)' type result, where 'T' is the smallest common type out of the passed arguments.
arrayElement(arr, n), operator arr[n]
~~~~~~~~~~~~
Get the element with the index 'n' from the array 'arr'.
'n' can be any integer type.
Indexes in an array begin from one.
Negative indexes are supported - in this case, it selects the corresponding element numbered from the end. For example, 'arr[-1]' is the last item in the array.
If the index goes beyond the array bounds:
- if both arguments are constants, an exception is thrown.
- otherwise, a default value is returned (0 for numbers, an empty string for strings, etc.).
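For example (including a negative index):

.. code-block:: sql

SELECT [10, 20, 30] AS arr, arr[2] AS second, arr[-1] AS last
-- returns 20 for arr[2] and 30 for arr[-1]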
has(arr, elem)
~~~~~~~~~~~
Checks whether the 'arr' array has the 'elem' element.
Returns 0 if the element is not in the array, or 1 if it is.
'elem' must be a constant.
indexOf(arr, x)
~~~~~~~~~~
Returns the index of the 'x' element (starting from 1) if it is in the array, or 0 if it is not.
countEqual(arr, x)
~~~~~~~~
Returns the number of elements in the array equal to 'x'. Equivalent to ``arrayCount(elem -> elem = x, arr)``.
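For example:

.. code-block:: sql

SELECT
    has([1, 2, 3], 2) AS h,        -- 1
    indexOf([1, 2, 3], 3) AS i,    -- 3
    countEqual([2, 2, 3], 2) AS c  -- 2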
arrayEnumerate(arr)
~~~~~~~~~
Returns the array ``[1, 2, 3, ..., length(arr)]``
This function is normally used together with ARRAY JOIN. It allows counting something just once for each array after applying ARRAY JOIN. Example:
.. code-block:: sql
SELECT
count() AS Reaches,
countIf(num = 1) AS Hits
FROM test.hits
ARRAY JOIN
GoalsReached,
arrayEnumerate(GoalsReached) AS num
WHERE CounterID = 160656
LIMIT 10
┌─Reaches─┬──Hits─┐
│ 95606 │ 31406 │
└─────────┴───────┘
In this example, Reaches is the number of conversions (the rows received after applying ARRAY JOIN), and Hits is the number of pageviews (rows before ARRAY JOIN). In this particular case, you can get the same result in an easier way:
.. code-block:: sql
SELECT
sum(length(GoalsReached)) AS Reaches,
count() AS Hits
FROM test.hits
WHERE (CounterID = 160656) AND notEmpty(GoalsReached)
┌─Reaches─┬──Hits─┐
│ 95606 │ 31406 │
└─────────┴───────┘
This function can also be used in higher-order functions. For example, you can use it to get array indexes for elements that match a condition.
arrayEnumerateUniq(arr, ...)
~~~~~~~~~~
Returns an array the same size as the source array, indicating for each element what its position is among elements with the same value.
For example: ``arrayEnumerateUniq([10, 20, 10, 30]) = [1, 1, 2, 1]``.
This function is useful when using ARRAY JOIN and aggregation of array elements. Example:
.. code-block:: sql
SELECT
Goals.ID AS GoalID,
sum(Sign) AS Reaches,
sumIf(Sign, num = 1) AS Visits
FROM test.visits
ARRAY JOIN
Goals,
arrayEnumerateUniq(Goals.ID) AS num
WHERE CounterID = 160656
GROUP BY GoalID
ORDER BY Reaches DESC
LIMIT 10
┌──GoalID─┬─Reaches─┬─Visits─┐
│ 53225 │ 3214 │ 1097 │
│ 2825062 │ 3188 │ 1097 │
│ 56600 │ 2803 │ 488 │
│ 1989037 │ 2401 │ 365 │
│ 2830064 │ 2396 │ 910 │
│ 1113562 │ 2372 │ 373 │
│ 3270895 │ 2262 │ 812 │
│ 1084657 │ 2262 │ 345 │
│ 56599 │ 2260 │ 799 │
│ 3271094 │ 2256 │ 812 │
└─────────┴─────────┴────────┘
In this example, each goal ID has a calculation of the number of conversions (each element in the Goals nested data structure is a goal that was reached, which we refer to as a conversion) and the number of sessions.
Without ARRAY JOIN, we would have counted the number of sessions as ``sum(Sign)``. But in this particular case, the rows were multiplied by the nested Goals structure, so in order to count each session one time after this,
we apply a condition to the value of the ``arrayEnumerateUniq(Goals.ID)`` function.
The arrayEnumerateUniq function can take multiple arrays of the same size as arguments. In this case, uniqueness is considered for tuples of elements in the same positions in all the arrays.
.. code-block:: sql
SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res
┌─res───────────┐
│ [1,2,1,1,2,1] │
└───────────────┘
This is necessary when using ARRAY JOIN with a nested data structure and further aggregation across multiple elements in this structure.
arrayUniq(arr, ...)
~~~~~~~~~~~~~~~~~~~
If a single array is passed, returns the number of unique elements in that array.
If multiple arrays of the same size are passed, returns the number of unique tuples of elements at the same positions across all the arrays.
If you need an array of the unique elements, you can use ``arrayReduce('groupUniqArray', arr)``.
arrayJoin(arr)
~~~~~~~~
A special function. See the section "arrayJoin function".
arrayJoin function
------------------
This is a very unusual function.
Normal functions don't change a set of rows, but just change the values in each row (map). Aggregate functions compress a set of rows (fold or reduce).
The 'arrayJoin' function takes each row and generates a set of rows (unfold).
This function takes an array as an argument, and propagates the source row to multiple rows for the number of elements in the array.
All the values in columns are simply copied, except the values in the column where this function is applied - it is replaced with the corresponding array value.
A query can use multiple 'arrayJoin' functions. In this case, the transformation is performed multiple times.
Note the ARRAY JOIN syntax in the SELECT query, which provides broader possibilities.
Example:
.. code-block:: sql
:) SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src
SELECT
arrayJoin([1, 2, 3] AS src) AS dst,
'Hello',
src
┌─dst─┬─\'Hello\'─┬─src─────┐
│ 1 │ Hello │ [1,2,3] │
│ 2 │ Hello │ [1,2,3] │
│ 3 │ Hello │ [1,2,3] │
└─────┴───────────┴─────────┘
Bit functions
---------------
Bit functions work for any pair of types from UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, or Float64.
The result type is an integer with bits equal to the maximum bits of its arguments. If at least one of the arguments is signed, the result is a signed number. If an argument is a floating-point number, it is cast to Int64.
bitAnd(a, b)
~~~~~~~~~~~~
bitOr(a, b)
~~~~~~~~~~~
bitXor(a, b)
~~~~~~~~~~~~
bitNot(a)
~~~~~~~~~
bitShiftLeft(a, b)
~~~~~~~~~~~~~~~~~~
bitShiftRight(a, b)
~~~~~~~~~~~~~~~~~~
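For example, combining several of the functions above:

.. code-block:: sql

SELECT
    bitAnd(12, 10) AS a,     -- 8
    bitOr(12, 10) AS b,      -- 14
    bitXor(12, 10) AS x,     -- 6
    bitShiftLeft(1, 4) AS s  -- 16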
Comparison functions
--------------------
Comparison functions always return 0 or 1 (UInt8).
The following types can be compared:
* numbers
* strings and fixed strings
* dates
* dates with times
within each group, but not between different groups.
For example, you can't compare a date with a string. You have to use a function to convert the string to a date, or vice versa.
Strings are compared by bytes. A shorter string is smaller than all strings that start with it and that contain at least one more character.
Note: before version 1.1.54134, signed and unsigned numbers were compared the same way as in C++. That is, you could get an incorrect result in cases like SELECT 9223372036854775807 > -1. Starting with version 1.1.54134, the behavior changed and numbers are compared in a mathematically correct way.
equals, a = b and a == b operator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
notEquals, a != b and a <> b operator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
less, < operator
~~~~~~~~~~~~~~~~~
greater, > operator
~~~~~~~~~~~~~~~~~~~
lessOrEquals, <= operator
~~~~~~~~~~~~~~~~~~~~~~~~
greaterOrEquals, >= operator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Conditional functions
---------------------
if(cond, then, else), operator cond ? then : else
~~~~~~~~~~~~~~~~~
Returns 'then' if 'cond != 0', or 'else' if 'cond = 0'.
'cond' must be UInt8, and 'then' and 'else' must be types that have a smallest common type.
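For example:

.. code-block:: sql

SELECT number, if(number % 2 = 0, 'even', 'odd') AS parity
FROM system.numbers
LIMIT 4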
Functions for working with dates and times
------------------------------------------
Time Zone Support
~~~~~~~~~~~~~~~~~
All functions for working with dates and times for which a time zone makes sense can take a second, optional argument: the time zone name, for example 'Asia/Yekaterinburg'. In this case, they use the specified time zone instead of the local (default) one.
.. code-block:: sql
SELECT
toDateTime('2016-06-15 23:00:00') AS time,
toDate(time) AS date_local,
toDate(time, 'Asia/Yekaterinburg') AS date_yekat,
toString(time, 'US/Samoa') AS time_samoa
┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐
│ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │
└─────────────────────┴────────────┴────────────┴─────────────────────┘
Only time zones that differ from UTC by an integer number of hours are supported.
toYear
~~~~~~~
Converts a date or date with time to a UInt16 number containing the year number (AD).
toMonth
~~~~~~~
Converts a date or date with time to a UInt8 number containing the month number (1-12).
toDayOfMonth
~~~~~~~
Converts a date or date with time to a UInt8 number containing the number of the day of the month (1-31).
toDayOfWeek
~~~~~~~
Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7).
toHour
~~~~~~~
Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23).
This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. (which is not always true - even in Moscow the clocks were once changed at a different time).
toMinute
~~~~~~~
Converts a date with time to a UInt8 number containing the number of the minute of the hour (0-59).
toSecond
~~~~~~~
Converts a date with time to a UInt8 number containing the number of the second in the minute (0-59).
Leap seconds are not accounted for.
toStartOfDay
~~~~~~~
Rounds down a date with time to the start of the day.
toMonday
~~~~~~~
Rounds down a date or date with time to the nearest Monday.
Returns the date.
toStartOfMonth
~~~~~~~
Rounds down a date or date with time to the first day of the month.
Returns the date.
toStartOfQuarter
~~~~~~~
Rounds down a date or date with time to the first day of the quarter.
The first day of the quarter is either 1 January, 1 April, 1 July, or 1 October. Returns the date.
toStartOfYear
~~~~~~~
Rounds down a date or date with time to the first day of the year.
Returns the date.
toStartOfMinute
~~~~~~~
Rounds down a date with time to the start of the minute.
toStartOfFiveMinute
~~~~~~~
Rounds down a date with time to the start of the five-minute interval (00:00, 00:05, 00:10, ...).
toStartOfHour
~~~~~~~
Rounds down a date with time to the start of the hour.
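For example, applying several of the rounding functions above to a single value:

.. code-block:: sql

SELECT
    toDateTime('2017-04-28 14:37:42') AS t,
    toStartOfHour(t) AS h,        -- 2017-04-28 14:00:00
    toStartOfFiveMinute(t) AS m5, -- 2017-04-28 14:35:00
    toMonday(t) AS mon            -- 2017-04-24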
toTime
~~~~~~~
Converts a date with time to some fixed date, while preserving the time.
toRelativeYearNum
~~~~~~~
Converts a date with time or date to the number of the year, starting from a certain fixed point in the past.
toRelativeMonthNum
~~~~~~~
Converts a date with time or date to the number of the month, starting from a certain fixed point in the past.
toRelativeWeekNum
~~~~~~~
Converts a date with time or date to the number of the week, starting from a certain fixed point in the past.
toRelativeDayNum
~~~~~~~
Converts a date with time or date to the number of the day, starting from a certain fixed point in the past.
toRelativeHourNum
~~~~~~~
Converts a date with time or date to the number of the hour, starting from a certain fixed point in the past.
toRelativeMinuteNum
~~~~~~~
Converts a date with time or date to the number of the minute, starting from a certain fixed point in the past.
toRelativeSecondNum
~~~~~~~
Converts a date with time or date to the number of the second, starting from a certain fixed point in the past.
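Since the fixed starting point cancels out, these functions are mainly useful for computing differences; a minimal sketch:
.. code-block:: sql
SELECT
    toRelativeDayNum(toDate('2016-06-15')) - toRelativeDayNum(toDate('2016-06-01')) AS days_between,  -- 14
    toRelativeHourNum(toDateTime('2016-06-15 23:00:00')) - toRelativeHourNum(toDateTime('2016-06-15 01:00:00')) AS hours_between  -- 22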
now
~~~~~~~
Accepts zero arguments and returns the current time at one of the moments of query execution.
This function returns a constant, even if the query takes a long time to complete.
today
~~~~~~~
Accepts zero arguments and returns the current date at one of the moments of query execution.
The same as 'toDate(now())'.
yesterday
~~~~~~~
Accepts zero arguments and returns yesterday's date at one of the moments of query execution.
The same as 'today() - 1'.
timeSlot
~~~~~~~
Rounds the time to the half hour.
This function is specific to Yandex.Metrica, since half an hour is the minimum amount of time for breaking a session into two sessions if a counter shows a single user's consecutive pageviews that differ in time by strictly more than this amount. This means that tuples (the counter number, user ID, and time slot) can be used to search for pageviews that are included in the corresponding session.
timeSlots(StartTime, Duration)
~~~~~~~
For a time interval starting at 'StartTime' and continuing for 'Duration' seconds, it returns an array of moments in time, consisting of points from this interval rounded down to the half hour.
For example, timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600)) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')].
This is necessary for searching for pageviews in the corresponding session.
Encoding functions
--------
hex
~~~~~
Accepts a string, number, date, or date with time. Returns a string containing the argument's hexadecimal representation. Uses uppercase letters A-F.
Doesn't use ``0x`` prefixes or ``h`` suffixes.
For strings, each byte is simply encoded as two hexadecimal digits. Numbers are converted to big endian ("human readable") format.
For numbers, leading zeros are trimmed, but only by entire bytes.
For example, ``hex(1) = '01'``. Dates are encoded as the number of days since the beginning of the Unix Epoch. Dates with times are encoded as the number of seconds since the beginning of the Unix Epoch.
unhex(str)
~~~~~~~
Accepts a string containing any number of hexadecimal digits, and returns a string containing the corresponding bytes. Supports both uppercase and lowercase letters A-F. The number of hexadecimal digits doesn't have to be even. If it is odd, the last digit is interpreted as the low half of the 00-0F byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn't thrown).
If you want to convert the result to a number, you can use the 'reverse' and 'reinterpretAsType' functions.
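For example, a hex/unhex round trip on a short string; a minimal sketch (expected values shown as comments):
.. code-block:: sql
SELECT
    hex('abc') AS h,  -- '616263'
    unhex(h) AS s,    -- 'abc'
    hex(1) AS one     -- '01'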
UUIDStringToNum(str)
~~~~~~~
Accepts a string containing the UUID in the text format (``123e4567-e89b-12d3-a456-426655440000``). Returns a binary representation of the UUID in ``FixedString(16)``.
UUIDNumToString(str)
~~~~~~~~
Accepts a FixedString(16) value containing the UUID in the binary format. Returns a readable string containing the UUID in the text format.
bitmaskToList(num)
~~~~~~~
Accepts an integer. Returns a string containing the list of powers of two that total the source number when summed. They are comma-separated without spaces in text format, in ascending order.
bitmaskToArray(num)
~~~~~~~~~
Accepts an integer. Returns an array of UInt64 numbers containing the list of powers of two that total the source number when summed. Numbers in the array are in ascending order.
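For example, 50 = 2 + 16 + 32, so both functions decompose it into those powers of two; a minimal sketch:
.. code-block:: sql
SELECT
    bitmaskToList(50) AS l,  -- '2,16,32'
    bitmaskToArray(50) AS a  -- [2,16,32]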
Functions for working with external dictionaries
-------
For more information, see the section "External dictionaries".
dictGetUInt8, dictGetUInt16, dictGetUInt32, dictGetUInt64
~~~~~~~~~
dictGetInt8, dictGetInt16, dictGetInt32, dictGetInt64
~~~~~~~~~~
dictGetFloat32, dictGetFloat64
~~~~~~~~~
dictGetDate, dictGetDateTime
~~~~~~~
dictGetString
~~~~~~
``dictGetT('dict_name', 'attr_name', id)``
- Gets the value of the 'attr_name' attribute from the 'dict_name' dictionary by the 'id' key.
'dict_name' and 'attr_name' are constant strings.
'id' must be UInt64.
If the 'id' key is not in the dictionary, it returns the default value set in the dictionary definition.
dictGetTOrDefault
~~~~~~~~
``dictGetTOrDefault('dict_name', 'attr_name', id, default)``
Similar to the functions dictGetT, but the default value is taken from the last argument of the function.
dictIsIn
~~~~~~
``dictIsIn('dict_name', child_id, ancestor_id)``
- For the 'dict_name' hierarchical dictionary, finds out whether the 'child_id' key is located inside 'ancestor_id' (or matches 'ancestor_id'). Returns UInt8.
dictGetHierarchy
~~~~~~~~
``dictGetHierarchy('dict_name', id)``
- For the 'dict_name' hierarchical dictionary, returns an array of dictionary keys starting from 'id' and continuing along the chain of parent elements. Returns Array(UInt64).
dictHas
~~~~~~
``dictHas('dict_name', id)``
- Checks the presence of a key in the dictionary. Returns a UInt8 value: 0 if the key is absent, 1 if it is present.
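As an illustration, assuming an external dictionary named 'countries' with a String attribute 'name' and UInt64 keys (the dictionary name, attribute, and key values here are hypothetical), the lookup functions compose like this:
.. code-block:: sql
SELECT
    dictHas('countries', toUInt64(42)) AS present,
    dictGetString('countries', 'name', toUInt64(42)) AS name,
    dictGetStringOrDefault('countries', 'name', toUInt64(999), 'unknown') AS name_or_default  -- the String instance of dictGetTOrDefault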
Hash functions
-------------
Hash functions can be used for deterministic pseudo-random shuffling of elements.
halfMD5
~~~~~~
Calculates the MD5 from a string. Then it takes the first 8 bytes of the hash and interprets them as UInt64 in big endian.
Accepts a String-type argument. Returns UInt64.
This function works fairly slowly (5 million short strings per second per processor core).
If you don't need MD5 in particular, use the 'sipHash64' function instead.
MD5
~~~
Calculates the MD5 from a string and returns the resulting set of bytes as FixedString(16).
If you don't need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the 'sipHash128' function instead.
If you need the same result as the 'md5sum' utility produces, write ``lower(hex(MD5(s)))``.
sipHash64
~~~~~~~
Calculates SipHash from a string.
Accepts a String-type argument. Returns UInt64.
SipHash is a cryptographic hash function. It works at least three times faster than MD5. For more information, see https://131002.net/siphash/
sipHash128
~~~~~
Calculates SipHash from a string.
Accepts a String-type argument. Returns FixedString(16).
Differs from sipHash64 in that the final xor-folding of the state is only done up to 128 bits.
cityHash64
~~~~~
Calculates CityHash64 from a string, or a similar hash function for any number of arguments of any type.
For String-type arguments, CityHash is used. This is a fast non-cryptographic hash function for strings with decent quality.
For other types of arguments, a decent implementation-specific fast non-cryptographic hash function is used.
If multiple arguments are passed, the function is calculated using the same rules, combining the per-argument hashes with the CityHash combinator.
For example, you can compute the checksum of an entire table with accuracy up to the row order: ``SELECT sum(cityHash64(*)) FROM table``.
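As a sketch of the deterministic pseudo-random shuffling mentioned at the top of this section, a hash can assign each value to a stable bucket (assuming a UInt64-hashable column such as UserID in the test dataset):
.. code-block:: sql
SELECT
    cityHash64(UserID) % 10 AS bucket,  -- stable bucket in 0..9 for each UserID
    count() AS c
FROM test.hits
GROUP BY bucket
ORDER BY bucket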
intHash32
~~~~~
Calculates a 32-bit hash code from any type of integer.
This is a relatively fast non-cryptographic hash function of average quality for numbers.
intHash64
~~~~~
Calculates a 64-bit hash code from any type of integer.
It works faster than intHash32. Average quality.
SHA1
~~~~
SHA224
~~~~~
SHA256
~~~~~
Calculates SHA-1, SHA-224, or SHA-256 from a string and returns the resulting set of bytes as FixedString(20), FixedString(28), or FixedString(32).
The function works fairly slowly (SHA-1 processes about 5 million short strings per second per processor core, while SHA-224 and SHA-256 process about 2.2 million).
We recommend using this function only in cases when you need a specific hash function and you can't select it.
Even in these cases, we recommend applying the function offline and pre-calculating values when inserting them into the table, instead of applying it in SELECTS.
URLHash(url[, N])
~~~~~~~~
A fast, decent-quality non-cryptographic hash function for a string obtained from a URL using some type of normalization.
``URLHash(s)`` - Calculates a hash from a string without one of the trailing symbols ``/``, ``?`` or ``#`` at the end, if present.
``URLHash(s, N)`` - Calculates a hash from a string up to level N in the URL hierarchy, without one of the trailing symbols ``/``, ``?`` or ``#`` at the end, if present.
Levels are the same as in URLHierarchy. This function is specific to Yandex.Metrica.
Higher-order functions
-----------------------
-> operator, lambda(params, expr) function
~~~~~~~~~~~~~~
Allows describing a lambda function for passing to a higher-order function. The left side of the arrow has a formal parameter - any ID, or multiple formal parameters - any IDs in a tuple. The right side of the arrow has an expression that can use these formal parameters, as well as any table columns.
Examples: ``x -> 2 * x``, ``str -> str != Referer``.
Higher-order functions can only accept lambda functions as their functional argument.
A lambda function that accepts multiple arguments can be passed to a higher-order function. In this case, the higher-order function is passed several arrays of identical length that these arguments will correspond to.
For all functions other than 'arrayMap' and 'arrayFilter', the first argument (the lambda function) can be omitted. In this case, the identity mapping is assumed.
arrayMap(func, arr1, ...)
~~~~~~~~~~~~
Returns an array obtained by applying the 'func' function to each element of the 'arr' array.
arrayFilter(func, arr1, ...)
~~~~~~~~~~~~~
Returns an array containing only the elements in 'arr1' for which 'func' returns something other than 0.
Examples:
.. code-block:: sql
SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res
┌─res───────────┐
│ ['abc World'] │
└───────────────┘
SELECT
arrayFilter(
(i, x) -> x LIKE '%World%',
arrayEnumerate(arr),
['Hello', 'abc World'] AS arr)
AS res
┌─res─┐
│ [2] │
└─────┘
arrayCount([func,] arr1, ...)
~~~~~~~~~
Returns the number of elements in 'arr' for which 'func' returns something other than 0. If 'func' is not specified, it returns the number of non-zero items in the array.
arrayExists([func,] arr1, ...)
~~~~~~~~~~
Returns 1 if there is at least one element in 'arr' for which 'func' returns something other than 0. Otherwise, it returns 0.
arrayAll([func,] arr1, ...)
~~~~~~~~~
Returns 1 if 'func' returns something other than 0 for all the elements in 'arr'. Otherwise, it returns 0.
arraySum([func,] arr1, ...)
~~~~~~~~~~~
Returns the sum of the 'func' values. If the function is omitted, it just returns the sum of the array elements.
arrayFirst(func, arr1, ...)
~~~~~~~~~
Returns the first element in the 'arr1' array for which 'func' returns something other than 0.
arrayFirstIndex(func, arr1, ...)
~~~~~~~
Returns the index of the first element in the 'arr1' array for which 'func' returns something other than 0.
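A minimal sketch of several of these functions on literal arrays (expected values shown as comments):
.. code-block:: sql
SELECT
    arrayMap(x -> 2 * x, [1, 2, 3]) AS doubled,           -- [2,4,6]
    arrayCount(x -> x % 2 = 1, [1, 2, 3, 4, 5]) AS odds,  -- 3
    arraySum(x -> x * x, [1, 2, 3]) AS sum_sq,            -- 14
    arrayFirst(x -> x > 1, [1, 2, 3]) AS first_gt,        -- 2
    arrayFirstIndex(x -> x > 1, [1, 2, 3]) AS first_idx   -- 2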
Functions for implementing the IN operator
---------------
in, notIn, globalIn, globalNotIn
~~~~~~~~~~~~~
See the section "IN operators".
tuple(x, y, ...), (x, y, ...) operator
~~~~~~~~~~~~~
A function that allows grouping multiple columns.
For columns with the types T1, T2, ..., it returns a Tuple(T1, T2, ...) type tuple containing these columns. There is no cost to execute the function.
Tuples are normally used as intermediate values for an argument of IN operators, or for creating a list of formal parameters of lambda functions. Tuples can't be written to a table.
tupleElement(tuple, n), x.N operator
~~~~~~~~~~~
A function that allows getting columns from a tuple.
'N' is the column index, starting from 1. 'N' must be a constant positive integer no greater than the size of the tuple.
There is no cost to execute the function.
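A minimal sketch of creating a tuple and extracting an element from it:
.. code-block:: sql
SELECT
    tuple(1, 'a') AS t,           -- (1,'a')
    tupleElement(t, 2) AS second  -- 'a'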
Functions
=========
There are at least* two types of functions - regular functions (they are just called "functions") and aggregate functions. These are completely different concepts. Regular functions work as if they are applied to each row separately (for each row, the result of the function doesn't depend on the other rows). Aggregate functions accumulate a set of values from various rows (i.e. they depend on the entire set of rows).
In this section we discuss regular functions. For aggregate functions, see the section "Aggregate functions".
* - There is a third type of function that the 'arrayJoin' function belongs to; table functions can also be mentioned separately.
.. toctree::
:glob:
*
*/index
Strong typing
~~~~~~~~~~~~~~~~~
In contrast to standard SQL, ClickHouse has strong typing. In other words, it doesn't make implicit conversions between types. Each function works for a specific set of types. This means that sometimes you need to use type conversion functions.
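For example, a string is not implicitly converted to a number; a minimal sketch of an explicit conversion:
.. code-block:: sql
SELECT toUInt32('42') + 1 AS x  -- 43; without the toUInt32 conversion, '42' + 1 is a type error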
Common subexpression elimination
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
All expressions in a query that have the same AST (the same record or the same result of syntactic parsing) are considered to have identical values. Such expressions are merged and executed once. Identical subqueries are also eliminated this way.
Types of results
~~~~~~~~~~~~~~~
All functions return a single value as the result (not several values, and not zero values). The type of the result is usually defined only by the types of the arguments, not by the values. Exceptions are the tupleElement function (the a.N operator) and the toFixedString function.
Constants
~~~~~~~~~
For simplicity, certain functions can only work with constants for some arguments. For example, the right argument of the LIKE operator must be a constant.
Almost all functions return a constant for constant arguments. The exception is functions that generate random numbers.
The 'now' function returns different values for queries that were run at different times, but the result is considered a constant, since constancy is only important within a single query.
A constant expression is also considered a constant (for example, the right half of the LIKE operator can be constructed from multiple constants).
Functions can be implemented in different ways for constant and non-constant arguments (different code is executed). But the results for a constant and for a true column containing only the same value should match each other.
Immutability
~~~~~~~~~~~~~~
Functions can't change the values of their arguments - any changes are returned as the result. Thus, the result of calculating separate functions does not depend on the order in which the functions are written in the query.
Error handling
~~~~~~~~~~~~~~~~
Some functions might throw an exception if the data is invalid. In this case, the query is canceled and an error text is returned to the client. For distributed processing, when an exception occurs on one of the servers, the other servers also attempt to abort the query.
Evaluation of argument expressions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In almost all programming languages, one of the arguments might not be evaluated for certain operators. This is usually for the operators ``&&``, ``||``, ``?:``.
But in ClickHouse, arguments of functions (operators) are always evaluated. This is because entire parts of columns are evaluated at once, instead of calculating each row separately.
Performing functions for distributed query processing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For distributed query processing, as many stages of query processing as possible are performed on remote servers, and the rest of the stages (merging intermediate results and everything after that) are performed on the requestor server.
This means that functions can be performed on different servers.
For example, in the query ``SELECT f(sum(g(x))) FROM distributed_table GROUP BY h(y)``,
- if distributed_table has at least two shards, the functions ``g`` and ``h`` are performed on remote servers, and the function ``f`` is performed on the requestor server.
- if distributed_table has only one shard, all the functions ``f``, ``g``, and ``h`` are performed on this shard's server.
The result of a function usually doesn't depend on which server it is performed on. However, sometimes this is important.
For example, functions that work with dictionaries use the dictionary that exists on the server they are running on.
Another example is the hostName function, which returns the name of the server it is running on so that you can GROUP BY server in a SELECT query.
If a function in a query is performed on the requestor server, but you need to perform it on remote servers, you can wrap it in an 'any' aggregate function or add it to a key in GROUP BY.
Functions for working with IP addresses
-------------------------
IPv4NumToString(num)
~~~~~~~~~~~~~
Takes a UInt32 number. Interprets it as an IPv4 address in big endian. Returns a string containing the corresponding IPv4 address in the format A.B.C.D (dot-separated numbers in decimal form).
IPv4StringToNum(s)
~~~~~~~~
The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0.
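A minimal round-trip sketch (expected values shown as comments):
.. code-block:: sql
SELECT
    IPv4StringToNum('192.168.0.1') AS num,  -- 3232235521
    IPv4NumToString(num) AS addr            -- '192.168.0.1'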
IPv4NumToStringClassC(num)
~~~~~~~~~~~
Similar to IPv4NumToString, but using ``xxx`` instead of the last octet.
Example:
.. code-block:: sql
SELECT
IPv4NumToStringClassC(ClientIP) AS k,
count() AS c
FROM test.hits
GROUP BY k
ORDER BY c DESC
LIMIT 10
┌─k──────────────┬─────c─┐
│ 83.149.9.xxx │ 26238 │
│ 217.118.81.xxx │ 26074 │
│ 213.87.129.xxx │ 25481 │
│ 83.149.8.xxx │ 24984 │
│ 217.118.83.xxx │ 22797 │
│ 78.25.120.xxx │ 22354 │
│ 213.87.131.xxx │ 21285 │
│ 78.25.121.xxx │ 20887 │
│ 188.162.65.xxx │ 19694 │
│ 83.149.48.xxx │ 17406 │
└────────────────┴───────┘
Since using ``'xxx'`` is highly unusual, this may be changed in the future. We recommend that you don't rely on the exact format of this fragment.
IPv6NumToString(x)
~~~~~~~~~~~~
Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format.
IPv6-mapped IPv4 addresses are output in the format ``::ffff:111.222.33.44``. Examples:
.. code-block:: sql
SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr
┌─addr─────────┐
│ 2a02:6b8::11 │
└──────────────┘
SELECT
IPv6NumToString(ClientIP6 AS k),
count() AS c
FROM hits_all
WHERE EventDate = today() AND substring(ClientIP6, 1, 12) != unhex('00000000000000000000FFFF')
GROUP BY k
ORDER BY c DESC
LIMIT 10
┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐
│ 2a02:2168:aaa:bbbb::2 │ 24695 │
│ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │
│ 2a02:6b8:0:fff::ff │ 16389 │
│ 2a01:4f8:111:6666::2 │ 16016 │
│ 2a02:2168:888:222::1 │ 15896 │
│ 2a01:7e00::ffff:ffff:ffff:222 │ 14774 │
│ 2a02:8109:eee:ee:eeee:eeee:eeee:eeee │ 14443 │
│ 2a02:810b:8888:888:8888:8888:8888:8888 │ 14345 │
│ 2a02:6b8:0:444:4444:4444:4444:4444 │ 14279 │
│ 2a01:7e00::ffff:ffff:ffff:ffff │ 13880 │
└─────────────────────────────────────────┴───────┘
SELECT
IPv6NumToString(ClientIP6 AS k),
count() AS c
FROM hits_all
WHERE EventDate = today()
GROUP BY k
ORDER BY c DESC
LIMIT 10
┌─IPv6NumToString(ClientIP6)─┬──────c─┐
│ ::ffff:94.26.111.111 │ 747440 │
│ ::ffff:37.143.222.4 │ 529483 │
│ ::ffff:5.166.111.99 │ 317707 │
│ ::ffff:46.38.11.77 │ 263086 │
│ ::ffff:79.105.111.111 │ 186611 │
│ ::ffff:93.92.111.88 │ 176773 │
│ ::ffff:84.53.111.33 │ 158709 │
│ ::ffff:217.118.11.22 │ 154004 │
│ ::ffff:217.118.11.33 │ 148449 │
│ ::ffff:217.118.11.44 │ 148243 │
└────────────────────────────┴────────┘
IPv6StringToNum(s)
~~~~~~~~
The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes.
Hex digits can be uppercase or lowercase.
Functions for working with JSON.
-------------------
In Yandex.Metrica, JSON is passed by users as session parameters. There are several functions for working with this JSON. (Although in most of the cases, the JSONs are additionally pre-processed, and the resulting values are put in separate columns in their processed format.) All these functions are based on strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done.
The following assumptions are made:
#. The field name (function argument) must be a constant.
#. The field name is somehow canonically encoded in JSON. For example, ``visitParamHas('{"abc":"def"}', 'abc') = 1``, but ``visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0``
#. Fields are searched for on any nesting level, indiscriminately. If there are multiple matching fields, the first occurrence is used.
#. JSON doesn't have space characters outside of string literals.
visitParamHas(params, name)
~~~~~~~
Checks whether there is a field with the 'name' name.
visitParamExtractUInt(params, name)
~~~~~~~~~
Parses UInt64 from the value of the field named 'name'. If this is a string field, it tries to parse a number from the beginning of the string. If the field doesn't exist, or it exists but doesn't contain a number, it returns 0.
visitParamExtractInt(params, name)
~~~~~~~
The same as for Int64.
visitParamExtractFloat(params, name)
~~~~~~~
The same as for Float64.
visitParamExtractBool(params, name)
~~~~~~~~
Parses a true/false value. The result is UInt8.
visitParamExtractRaw(params, name)
~~~~~~~
Returns the value of a field, including separators.
Examples:
::
visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"'
visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}'
visitParamExtractString(params, name)
~~~~~~~~~~~
Parses the string in double quotes. The value is unescaped. If unescaping failed, it returns an empty string.
Examples:
::
visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0'
visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺'
visitParamExtractString('{"abc":"\\u263"}', 'abc') = ''
visitParamExtractString('{"abc":"hello}', 'abc') = ''
Currently, there is no support for code points not from the basic multilingual plane written in the format ``\uXXXX\uYYYY`` (they are converted to CESU-8 instead of UTF-8).
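A minimal sketch combining several of these functions on a literal JSON string (expected values shown as comments):
.. code-block:: sql
SELECT
    visitParamHas('{"a":1,"b":"x"}', 'a') AS has_a,        -- 1
    visitParamExtractUInt('{"a":1,"b":"x"}', 'a') AS a,    -- 1
    visitParamExtractString('{"a":1,"b":"x"}', 'b') AS b,  -- 'x'
    visitParamExtractRaw('{"a":1,"b":"x"}', 'b') AS raw_b  -- '"x"'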
Logical functions
------------------
Logical functions accept any numeric types, but return a UInt8 number equal to 0 or 1.
Zero as an argument is considered "false", while any non-zero value is considered "true".
and, AND operator
~~~~~~~~~~~~~~~~~
or, OR operator
~~~~~~~~~~~~~~~
not, NOT operator
~~~~~~~~~~~~~~~
xor
~~~~~~~~~~~~~~~
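A minimal sketch (expected values shown as comments):
.. code-block:: sql
SELECT
    and(1, 0) AS a,  -- 0
    or(1, 0) AS o,   -- 1
    not(1) AS n,     -- 0
    xor(1, 0) AS x   -- 1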
Mathematical functions
---------------
All the functions return a Float64 number. The accuracy of the result is close to the maximum precision possible, but the result might not coincide with the machine representable number nearest to the corresponding real number.
e()
~~~~
Accepts zero arguments and returns a Float64 number close to the e number.
pi()
~~~~
Accepts zero arguments and returns a Float64 number close to π.
exp(x)
~~~~~
Accepts a numeric argument and returns a Float64 number close to the exponent of the argument.
log(x)
~~~~~~
Accepts a numeric argument and returns a Float64 number close to the natural logarithm of the argument.
exp2(x)
~~~~~~~
Accepts a numeric argument and returns a Float64 number close to 2^x.
log2(x)
~~~~~
Accepts a numeric argument and returns a Float64 number close to the binary logarithm of the argument.
exp10(x)
~~~~~~~
Accepts a numeric argument and returns a Float64 number close to 10^x.
log10(x)
~~~~~~~
Accepts a numeric argument and returns a Float64 number close to the decimal logarithm of the argument.
sqrt(x)
~~~~~~~~
Accepts a numeric argument and returns a Float64 number close to the square root of the argument.
cbrt(x)
~~~~~~~
Accepts a numeric argument and returns a Float64 number close to the cubic root of the argument.
erf(x)
~~~~~~~
If 'x' is non-negative, then erf(x / σ√2) is the probability that a random variable having a normal distribution with standard deviation 'σ' takes a value that is separated from the expected value by no more than 'x'.
Example (three sigma rule):
.. code-block:: sql
SELECT erf(3 / sqrt(2))
┌─erf(divide(3, sqrt(2)))─┐
│ 0.9973002039367398 │
└─────────────────────────┘
erfc(x)
~~~~~~
Accepts a numeric argument and returns a Float64 number close to 1 - erf(x), but without loss of precision for large 'x' values.
lgamma(x)
~~~~~~~
The logarithm of the gamma function.
tgamma(x)
~~~~~~
Gamma function.
sin(x)
~~~~~
The sine.
cos(x)
~~~~~
The cosine.
tan(x)
~~~~~~
The tangent.
asin(x)
~~~~~~
The arc sine.
acos(x)
~~~~~~
The arc cosine.
atan(x)
~~~~~
The arc tangent.
pow(x, y)
~~~~~~~
x^y.
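A minimal sketch (all results are Float64 and approximate):
.. code-block:: sql
SELECT
    pow(2, 10) AS kib,         -- 1024
    exp(log(10)) AS ten,       -- ~10
    sqrt(16) AS four,          -- 4
    round(e(), 3) AS e_approx  -- 2.718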
Other functions
-------------
hostName()
~~~~~~~
Returns a string with the name of the host that this function was performed on. For distributed processing, this is the name of the remote server host, if the function is performed on a remote server.
visibleWidth(x)
~~~~~~~~~
Calculates the approximate width when outputting values to the console in text format (tab-separated). This function is used by the system for implementing Pretty formats.
toTypeName(x)
~~~~~~~~
Gets the type name. Returns a string containing the type name of the passed argument.
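A minimal sketch (expected values shown as comments):
.. code-block:: sql
SELECT
    toTypeName(1) AS t1,     -- 'UInt8'
    toTypeName(1.0) AS t2,   -- 'Float64'
    toTypeName('a') AS t3,   -- 'String'
    toTypeName(now()) AS t4  -- 'DateTime'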
blockSize()
~~~~~~~~
Gets the size of the block.
In ClickHouse, queries are always run on blocks (sets of column parts). This function allows getting the size of the block that you called it for.
materialize(x)
~~~~~~~~
Turns a constant into a full column containing just one value.
In ClickHouse, full columns and constants are represented differently in memory. Functions work differently for constant arguments and normal arguments (different code is executed), although the result is almost always the same. This function is for debugging this behavior.
ignore(...)
~~~~~~~
A function that accepts any arguments and always returns 0.
However, the argument is still calculated. This can be used for benchmarks.
sleep(seconds)
~~~~~~~~~
Sleeps 'seconds' seconds on each data block. You can specify an integer or a floating-point number.
currentDatabase()
~~~~~~~~~~
Returns the name of the current database.
You can use this function in table engine parameters in a CREATE TABLE query where you need to specify the database.
isFinite(x)
~~~~~~~
Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is not infinite and not a NaN, otherwise 0.
isInfinite(x)
~~~~~~~
Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is infinite, otherwise 0.
Note that 0 is returned for a NaN.
isNaN(x)
~~~~~
Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is a NaN, otherwise 0.
hasColumnInTable('database', 'table', 'column')
~~~~~~~~
Accepts constant String columns - the database name, table name, and column name. Returns a constant UInt8 value: 1 if the column exists, 0 otherwise.
If the table doesn't exist, an exception is thrown.
For elements of a nested data structure, the function checks the existence of the column. For the nested data structure itself, it returns 0.
bar
~~~~~
Allows building a Unicode-art diagram.
``bar(x, min, max, width)`` - Draws a band with a width proportional to (x - min) and equal to 'width' characters when x == max.
``min, max`` - Integer constants. The value must fit in Int64.
``width`` - Constant, positive number, may be a fraction.
The band is drawn with accuracy to one eighth of a symbol. Example:
.. code-block:: sql
SELECT
toHour(EventTime) AS h,
count() AS c,
bar(c, 0, 600000, 20) AS bar
FROM test.hits
GROUP BY h
ORDER BY h ASC
┌──h─┬──────c─┬─bar────────────────┐
│ 0 │ 292907 │ █████████▋ │
│ 1 │ 180563 │ ██████ │
│ 2 │ 114861 │ ███▋ │
│ 3 │ 85069 │ ██▋ │
│ 4 │ 68543 │ ██▎ │
│ 5 │ 78116 │ ██▌ │
│ 6 │ 113474 │ ███▋ │
│ 7 │ 170678 │ █████▋ │
│ 8 │ 278380 │ █████████▎ │
│ 9 │ 391053 │ █████████████ │
│ 10 │ 457681 │ ███████████████▎ │
│ 11 │ 493667 │ ████████████████▍ │
│ 12 │ 509641 │ ████████████████▊ │
│ 13 │ 522947 │ █████████████████▍ │
│ 14 │ 539954 │ █████████████████▊ │
│ 15 │ 528460 │ █████████████████▌ │
│ 16 │ 539201 │ █████████████████▊ │
│ 17 │ 523539 │ █████████████████▍ │
│ 18 │ 506467 │ ████████████████▊ │
│ 19 │ 520915 │ █████████████████▎ │
│ 20 │ 521665 │ █████████████████▍ │
│ 21 │ 542078 │ ██████████████████ │
│ 22 │ 493642 │ ████████████████▍ │
│ 23 │ 400397 │ █████████████▎ │
└────┴────────┴────────────────────┘
transform
~~~~~~~
Transforms a value according to the explicitly defined mapping of some elements to other ones.
There are two variations of this function:
1. ``transform(x, array_from, array_to, default)``
``x`` - What to transform
``array_from`` - Constant array of values for converting.
``array_to`` - Constant array of values to convert the values in 'from' to.
``default`` - Constant. Which value to use if 'x' is not equal to one of the values in 'from'.
``'array_from'`` and ``'array_to'`` are arrays of the same size.
Types:
``transform(T, Array(T), Array(U), U) -> U``
``'T'`` and ``'U'`` can be numeric, string, or Date or DateTime types.
Where the same letter is indicated (T or U), for numeric types these might not be matching types, but types that have a common type.
For example, the first argument can have the Int64 type, while the second has the Array(UInt16) type.
If the 'x' value is equal to one of the elements in the 'array_from' array, it returns the matching element (with the same index) from the 'array_to' array. Otherwise, it returns 'default'. If there are multiple matching elements in 'array_from', it returns one of the matches.
Example:
.. code-block:: sql
SELECT
transform(SearchEngineID, [2, 3], ['Яндекс', 'Google'], 'Остальные') AS title,
count() AS c
FROM test.hits
WHERE SearchEngineID != 0
GROUP BY title
ORDER BY c DESC
┌─title─────┬──────c─┐
│ Яндекс │ 498635 │
│ Google │ 229872 │
│ Остальные │ 104472 │
└───────────┴────────┘
2. ``transform(x, array_from, array_to)``
Differs from the first variation in that the 'default' argument is omitted.
If the 'x' value is equal to one of the elements in the 'array_from' array, it returns the matching element (that is numbered the same) from the 'array_to' array. Otherwise, it returns 'x'.
Types:
``transform(T, Array(T), Array(T)) -> T``
Example:
.. code-block:: sql
SELECT
transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'ввв.яндекс.рф', 'example.com']) AS s,
count() AS c
FROM test.hits
GROUP BY domain(Referer)
ORDER BY count() DESC
LIMIT 10
┌─s──────────────┬───────c─┐
│ │ 2906259 │
│ www.yandex │ 867767 │
│ ███████.ru │ 313599 │
│ mail.yandex.ru │ 107147 │
│ ввв.яндекс.рф │ 105668 │
│ ██████.ru │ 100355 │
│ █████████.ru │ 65040 │
│ news.yandex.ru │ 64515 │
│ ██████.net │ 59141 │
│ example.com │ 57316 │
└────────────────┴─────────┘
formatReadableSize(x)
~~~~~~~~~~~
Accepts a size (a number of bytes). Returns a string containing the rounded size with a suffix (KiB, MiB, etc.).
Example:
.. code-block:: sql
SELECT
arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes,
formatReadableSize(filesize_bytes) AS filesize
┌─filesize_bytes─┬─filesize───┐
│ 1 │ 1.00 B │
│ 1024 │ 1.00 KiB │
│ 1048576 │ 1.00 MiB │
│ 192851925 │ 183.92 MiB │
└────────────────┴────────────┘
least(a, b)
~~~~~~
Returns the smaller of a and b.
greatest(a, b)
~~~~~~~~
Returns the larger of a and b.
uptime()
~~~~~~
Returns the server's uptime in seconds.
version()
~~~~~~~
Returns the server's version as a string.
rowNumberInAllBlocks()
~~~~~~~~~~
Returns an incremental row number within all blocks that were processed by this function.
runningDifference(x)
~~~~~~~~
Calculates the difference between consecutive values in the data block.
The result of the function depends on the order of the data in the blocks.
It works only within each processed block of data. The splitting of data into blocks is not explicitly controlled by the user.
If you specify ORDER BY in a subquery and call runningDifference outside of it, you can get the expected result.
Example:
.. code-block:: sql
SELECT
EventID,
EventTime,
runningDifference(EventTime) AS delta
FROM
(
SELECT
EventID,
EventTime
FROM events
WHERE EventDate = '2016-11-24'
ORDER BY EventTime ASC
LIMIT 5
)
┌─EventID─┬───────────EventTime─┬─delta─┐
│ 1106 │ 2016-11-24 00:00:04 │ 0 │
│ 1107 │ 2016-11-24 00:00:05 │ 1 │
│ 1108 │ 2016-11-24 00:00:05 │ 0 │
│ 1109 │ 2016-11-24 00:00:09 │ 4 │
│ 1110 │ 2016-11-24 00:00:10 │ 1 │
└─────────┴─────────────────────┴───────┘
Functions for generating pseudo-random numbers
----------------------
Non-cryptographic generators of pseudo-random numbers are used.
All the functions accept zero arguments or one argument.
If an argument is passed, it can be any type, and its value is not used for anything.
The only purpose of this argument is to prevent common subexpression elimination, so that two different instances of the same function return different columns with different random numbers.
rand
~~~~
Returns a pseudo-random UInt32 number, evenly distributed among all UInt32-type numbers.
Uses a linear congruential generator.
rand64
~~~~
Returns a pseudo-random UInt64 number, evenly distributed among all UInt64-type numbers.
Uses a linear congruential generator.
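A minimal sketch; passing distinct dummy arguments defeats common subexpression elimination, so each column gets independent random values:
.. code-block:: sql
SELECT
    rand() AS r,     -- one pseudo-random UInt32 per row
    rand(1) AS r1,   -- independent of r
    rand64() AS r64  -- a pseudo-random UInt64
FROM system.numbers
LIMIT 3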
Rounding functions
----------------
floor(x[, N])
~~~~~~~
Returns the largest round number that is less than or equal to 'x'.
A round number is a multiple of 1 / 10^N, or the nearest number of the appropriate data type if 1 / 10^N isn't exact.
'N' is an integer constant, an optional parameter. By default it is zero, which means rounding to an integer.
'N' may be negative.
Examples: ``floor(123.45, 1) = 123.4, floor(123.45, -1) = 120``.
'x' is any numeric type. The result is a number of the same type.
For integer arguments, it makes sense to round with a negative 'N' value (for non-negative 'N', the function doesn't do anything).
If rounding causes overflow (for example, ``floor(-128, -1)``), an implementation-specific result is returned.
ceil(x[, N])
~~~~~~
Returns the smallest round number that is greater than or equal to 'x'. In every other way, it is the same as the 'floor' function (see above).
round(x[, N])
~~~~~~~
Returns the round number nearest to 'x', which may be less than, greater than, or equal to 'x'.
If 'x' is exactly in the middle between the nearest round numbers, one of them is returned (implementation-specific).
The number '-0.' may or may not be considered round (implementation-specific).
In every other way, this function is the same as 'floor' and 'ceil' described above.
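A minimal sketch (expected values shown as comments):
.. code-block:: sql
SELECT
    floor(123.45, 1) AS f1,   -- 123.4
    floor(123.45, -1) AS f2,  -- 120
    ceil(123.45) AS c,        -- 124
    round(123.45) AS r        -- 123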
roundToExp2(num)
~~~~~~~~
Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to the nearest (whole non-negative) power of two.
roundDuration(num)
~~~~~~~~
Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. This function is specific to Yandex.Metrica and used for implementing the report on session length.
roundAge(num)
~~~~~~~
Accepts a number. If the number is less than 18, it returns 0. Otherwise, it rounds the number down to numbers from the set: 18, 25, 35, 45. This function is specific to Yandex.Metrica and used for implementing the report on user age.
Functions for splitting and merging strings and arrays
----------------
splitByChar(separator, s)
~~~~~~~~~~~~
Splits a string into substrings, using 'separator' as the separator.
'separator' must be a string constant consisting of exactly one character.
Returns an array of selected substrings. Empty substrings may be selected if the separator occurs at the beginning or end of the string, or if there are multiple consecutive separators.
splitByString(separator, s)
~~~~~~~~~~~
The same as above, but it uses a string of multiple characters as the separator. The string must be non-empty.
arrayStringConcat(arr[, separator])
~~~~~~~~~~~~~
Concatenates strings from the array elements, using 'separator' as the separator.
'separator' is a string constant, an optional parameter. By default it is an empty string.
Returns a string.
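A minimal round-trip sketch (expected values shown as comments):
.. code-block:: sql
SELECT
    splitByChar(',', 'a,b,,c') AS parts,     -- ['a','b','','c']
    arrayStringConcat(parts, '-') AS joined  -- 'a-b--c'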
alphaTokens(s)
~~~~~~~~~~
Selects substrings of consecutive bytes from the ranges a-z and A-Z.
Returns an array of selected substrings.
Functions for working with strings
------------------------------
empty
~~~~~
Returns 1 for an empty string or 0 for a non-empty string.
The result type is UInt8.
A string is considered non-empty if it contains at least one byte, even if this is a space or a null byte.
The function also works for arrays.
notEmpty
~~~~~~~~
Returns 0 for an empty string or 1 for a non-empty string.
The result type is UInt8.
The function also works for arrays.
length
~~~~~~
Returns the length of a string in bytes (not in characters, and not in code points).
The result type is UInt64.
The function also works for arrays.
lengthUTF8
~~~~~~~~~~
Returns the length of a string in Unicode code points (not in characters), assuming that the string contains a set of bytes that make up UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception).
The result type is UInt64.
lower
~~~~~~
Converts ASCII Latin symbols in a string to lowercase.
upper
~~~~~
Converts ASCII Latin symbols in a string to uppercase.
lowerUTF8
~~~~~~~~~
Converts a string to lowercase, assuming that the string contains a set of bytes that make up a UTF-8 encoded text. It doesn't detect the language, so for Turkish the result might not be exactly correct.
If the length of the UTF-8 byte sequence differs for the uppercase and lowercase of a code point, the result for that code point could be incorrect.
If the value contains invalid UTF-8, the behavior is unspecified.
upperUTF8
~~~~~~~~~
Converts a string to uppercase, assuming that the string contains a set of bytes that make up a UTF-8 encoded text. It doesn't detect the language, so for Turkish the result might not be exactly correct.
If the length of the UTF-8 byte sequence differs for the uppercase and lowercase of a code point, the result for that code point could be incorrect.
If the value contains invalid UTF-8, the behavior is unspecified.
reverse
~~~~~~~
Reverses the string (as a sequence of bytes).
reverseUTF8
~~~~~~~~~~~
Reverses a sequence of Unicode code points, assuming that the string contains a set of bytes representing a UTF-8 text. Otherwise, it does something else (it doesn't throw an exception).
concat(s1, s2, ...)
~~~~~~~~~~~~~~~~~~~
Concatenates strings from the function arguments, without a separator.
substring(s, offset, length)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Returns a substring starting with the byte from the 'offset' index that is 'length' bytes long. Character indexing starts from one (as in standard SQL). The 'offset' and 'length' arguments must be constants.
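A minimal sketch of a few basic string functions (expected values shown as comments):
.. code-block:: sql
SELECT
    concat('Hello', ', ', 'world') AS s,  -- 'Hello, world'
    length(s) AS bytes,                   -- 12
    substring(s, 1, 5) AS head,           -- 'Hello'
    upper(head) AS up                     -- 'HELLO'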
substringUTF8(s, offset, length)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The same as 'substring', but for Unicode code points. Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception).
appendTrailingCharIfAbsent(s, c)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the ``s`` string is non-empty and does not contain the ``c`` character at the end, it appends the ``c`` character to the end.
convertCharset(s, from, to)
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Returns the string 's' converted from the 'from' charset to the 'to' charset.
Functions for searching and replacing in strings
---------------------------------
replaceOne(haystack, pattern, replacement)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Replaces the first occurrence, if it exists, of the 'pattern' substring in 'haystack' with the 'replacement' substring.
Hereafter, 'pattern' and 'replacement' must be constants.
replaceAll(haystack, pattern, replacement)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Replaces all occurrences of the 'pattern' substring in 'haystack' with the 'replacement' substring.
replaceRegexpOne(haystack, pattern, replacement)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Replacement using the 'pattern' regular expression (a re2 regular expression). Replaces only the first occurrence, if it exists.
A pattern can be specified as 'replacement'. This pattern can include the substitutions ``\0``-``\9``.
The substitution ``\0`` includes the entire regular expression.
The substitutions ``\1``-``\9`` include the subpattern corresponding to the number.
In order to specify the ``\`` symbol in a pattern, you must use a ``\`` symbol to escape it.
Also keep in mind that a string literal requires an extra escape.
Example 1. Converting the date to American format:
.. code-block:: sql
SELECT DISTINCT
EventDate,
replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res
FROM test.hits
LIMIT 7
FORMAT TabSeparated
2014-03-17 03/17/2014
2014-03-18 03/18/2014
2014-03-19 03/19/2014
2014-03-20 03/20/2014
2014-03-21 03/21/2014
2014-03-22 03/22/2014
2014-03-23 03/23/2014
Example 2. Copy the string ten times:
.. code-block:: sql
SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res
┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
replaceRegexpAll(haystack, pattern, replacement)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This does the same thing, but replaces all the occurrences. Example:
.. code-block:: sql
SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res
┌─res────────────────────────┐
│ HHeelllloo,, WWoorrlldd!! │
└────────────────────────────┘
As an exception, if a regular expression worked on an empty substring, the replacement is not made more than once.
Example:
.. code-block:: sql
SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res
┌─res─────────────────┐
│ here: Hello, World! │
└─────────────────────┘
Functions for searching strings
------------------------
The search is case-sensitive in all these functions.
The search substring or regular expression must be a constant in all these functions.
position(haystack, needle)
~~~~~~~~~~~~~~~~~~~~~~~~~~
Searches for the 'needle' substring in the 'haystack' string.
Returns the position (in bytes) of the found substring, starting from 1, or returns 0 if the substring was not found.
There is also a positionCaseInsensitive function.
positionUTF8(haystack, needle)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The same as 'position', but the position is returned in Unicode code points. Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception).
There is also a positionCaseInsensitiveUTF8 function.
match(haystack, pattern)
~~~~~~~~~~~~~~~~~~~~~~~~
Checks whether the string matches the 'pattern' regular expression.
The regular expression is re2.
Returns 0 if it doesn't match, or 1 if it matches.
Note that the backslash symbol (``\``) is used for escaping in the regular expression. The same symbol is used for escaping in string literals.
So in order to escape the symbol in a regular expression, you must write two backslashes (``\\``) in a string literal.
The regular expression works with the string as if it is a set of bytes.
The regular expression can't contain null bytes.
For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster.
extract(haystack, pattern)
~~~~~~~~~~~~~~~~~~~~~~~~~~
Extracts a fragment of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. If the regex doesn't contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern.
extractAll(haystack, pattern)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Extracts all the fragments of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. Returns an array of strings consisting of all matches to the regex. In general, the behavior is the same as the 'extract' function (it takes the first subpattern, or the entire expression if there isn't a subpattern).
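A minimal sketch (expected values shown as comments):
.. code-block:: sql
SELECT
    position('Hello, world!', 'world') AS pos,       -- 8
    match('Hello, world!', 'w.rld') AS m,            -- 1
    extract('Hello, world!', 'w(\\w+)') AS frag,     -- 'orld'
    extractAll('Hello, world!', 'o') AS occurrences  -- ['o','o']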
like(haystack, pattern), haystack LIKE pattern operator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Checks whether a string matches a simple regular expression. The regular expression can contain the metasymbols ``%`` and ``_``.
``%`` indicates any quantity of any bytes (including zero characters).
``_`` indicates any one byte.
Use the backslash (``\``) for escaping metasymbols. See the note on escaping in the description of the 'match' function.
For patterns like ``%needle%``, the code is more optimal and works as fast as the 'position' function. For other regular expressions, the code is the same as for the 'match' function.
notLike(haystack, pattern), haystack NOT LIKE pattern operator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The same thing as 'like', but negative.
Type conversion functions
----------------------------
toUInt8, toUInt16, toUInt32, toUInt64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
toInt8, toInt16, toInt32, toInt64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
toFloat32, toFloat64
~~~~~~~~~~~~~~~~~~~~
toUInt8OrZero, toUInt16OrZero, toUInt32OrZero, toUInt64OrZero, toInt8OrZero, toInt16OrZero, toInt32OrZero, toInt64OrZero, toFloat32OrZero, toFloat64OrZero
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
toDate, toDateTime
~~~~~~~~~~~~~~~~~~
toString
~~~~~~~~
Functions for converting between numbers, strings (but not fixed strings), dates, and dates with times. All these functions accept one argument.
When converting to or from a string, the value is formatted or parsed using the same rules as for the TabSeparated format (and almost all other text formats). If the string can't be parsed, an exception is thrown and the query is canceled.
When converting dates to numbers or vice versa, the date corresponds to the number of days since the beginning of the Unix epoch.
When converting dates with times to numbers or vice versa, the date with time corresponds to the number of seconds since the beginning of the Unix epoch.
Formats of date and date with time for toDate/toDateTime functions are defined as follows:
::
YYYY-MM-DD
YYYY-MM-DD hh:mm:ss
As an exception, if converting from UInt32, Int32, UInt64, or Int64 type numbers to Date, and if the number is greater than or equal to 65536, the number is interpreted as a Unix timestamp (and not as the number of days) and is rounded to the date. This allows support for the common occurrence of writing 'toDate(unix_timestamp)', which otherwise would be an error and would require writing the more cumbersome 'toDate(toDateTime(unix_timestamp))'.
Conversion between a date and date with time is performed the natural way: by adding a null time or dropping the time.
Conversion between numeric types uses the same rules as assignments between different numeric types in C++.
To do transformations on DateTime in given time zone, pass second argument with time zone name:
.. code-block:: sql
SELECT
now() AS now_local,
toString(now(), 'Asia/Yekaterinburg') AS now_yekat
┌───────────now_local─┬─now_yekat───────────┐
│ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │
└─────────────────────┴─────────────────────┘
To format DateTime in given time zone:
::
toString(now(), 'Asia/Yekaterinburg')
To get unix timestamp for string with datetime in specified time zone:
::
toUnixTimestamp('2000-01-01 00:00:00', 'Asia/Yekaterinburg')
toFixedString(s, N)
~~~~~~~~~~~~~~~~~~~~
Converts a String type argument to a FixedString(N) type (a string with fixed length N). N must be a constant. If the string has fewer bytes than N, it is padded with null bytes on the right. If the string has more bytes than N, an exception is thrown.
toStringCutToZero(s)
~~~~~~~~~~~~~~~~~~~~
Accepts a String or FixedString argument. Returns a String truncated at the first null byte.
Example:
.. code-block:: sql
:) SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut
┌─s─────────────┬─s_cut─┐
│ foo\0\0\0\0\0 │ foo │
└───────────────┴───────┘
:) SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut
┌─s──────────┬─s_cut─┐
│ foo\0bar\0 │ foo │
└────────────┴───────┘
reinterpretAsUInt8, reinterpretAsUInt16, reinterpretAsUInt32, reinterpretAsUInt64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
reinterpretAsInt8, reinterpretAsInt16, reinterpretAsInt32, reinterpretAsInt64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
reinterpretAsFloat32, reinterpretAsFloat64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
reinterpretAsDate, reinterpretAsDateTime
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These functions accept a string and interpret the bytes placed at the beginning of the string as a number in host order (little endian). If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. A date is interpreted as the number of days since the beginning of the Unix Epoch, and a date with time is interpreted as the number of seconds since the beginning of the Unix Epoch.
reinterpretAsString
~~~~~~~~~~~~~~~~~~~
This function accepts a number or date or date with time, and returns a string containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long.
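A minimal round-trip sketch (expected values shown as comments):
.. code-block:: sql
SELECT
    reinterpretAsString(toUInt32(97)) AS s,  -- 'a' (0x61; trailing null bytes are dropped)
    reinterpretAsUInt32(s) AS n              -- 97 (missing bytes are padded with nulls)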
CAST(x, t)
~~~~~~~~~~
Casts x to the t data type.
The syntax ``CAST(x AS t)`` is also supported.
Example:
.. code-block:: sql
SELECT
'2016-06-15 23:00:00' AS timestamp,
CAST(timestamp AS DateTime) AS datetime,
CAST(timestamp AS Date) AS date,
CAST(timestamp, 'String') AS string,
CAST(timestamp, 'FixedString(22)') AS fixed_string
┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐
│ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │
└─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘
Casting to FixedString(N) works only for String and FixedString(N).
Functions for working with URLs
------------------------
None of these functions follow the RFC. They are maximally simplified for improved performance.
Functions that extract part of a URL
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the URL doesn't contain the relevant part, an empty string is returned.
protocol
""""""""
- Selects the protocol. Examples: http, ftp, mailto, magnet...
domain
"""""""
- Selects the domain.
domainWithoutWWW
""""""""""""
- Selects the domain and removes no more than one 'www.' from the beginning of it, if present.
topLevelDomain
"""""""""""
- Selects the top-level domain. Example: .ru.
firstSignificantSubdomain
""""""""""""""
- Selects the "first significant subdomain". This is a non-standard concept specific to Yandex.Metrica. The first significant subdomain is a second-level domain if it is 'com', 'net', 'org', or 'co'. Otherwise, it is a third-level domain. For example, firstSignificantSubdomain('https://news.yandex.ru/') = 'yandex', firstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex'. The list of "insignificant" second-level domains and other implementation details may change in the future.
cutToFirstSignificantSubdomain
""""""""""""""""
- Selects the part of the domain that includes top-level subdomains up to the "first significant subdomain" (see the explanation above).
For example, ``cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'``.
path
""""
- Selects the path. Example: ``/top/news.html``. The path does not include the query string.
pathFull
"""""""
- The same as above, but including query-string and fragment. Example: /top/news.html?page=2#comments
queryString
"""""""""
- Selects the query string. Example: page=1&lr=213. The query string does not include the initial question mark, or # and everything after #.
fragment
""""""
- Selects the fragment identifier. The fragment does not include the initial number sign (#).
queryStringAndFragment
"""""""""
- Selects the query-string and fragment identifier. Example: page=1#29390.
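A minimal sketch of the extraction functions above on a literal URL (expected values shown as comments):
.. code-block:: sql
SELECT
    protocol('https://news.example.com/top/news.html?page=1#c') AS proto,  -- 'https'
    domain('https://news.example.com/top/news.html?page=1#c') AS dom,      -- 'news.example.com'
    path('https://news.example.com/top/news.html?page=1#c') AS p,          -- '/top/news.html'
    queryString('https://news.example.com/top/news.html?page=1#c') AS q,   -- 'page=1'
    fragment('https://news.example.com/top/news.html?page=1#c') AS f       -- 'c'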
extractURLParameter(URL, name)
"""""""""
- Selects the value of the 'name' parameter in the URL, if present. Otherwise, selects an empty string. If there are many parameters with this name, it returns the first occurrence. This function works under the assumption that the parameter name is encoded in the URL in exactly the same way as in the argument passed.
extractURLParameters(URL)
""""""""""
- Gets an array of name=value strings corresponding to the URL parameters. The values are not decoded in any way.
extractURLParameterNames(URL)
""""""""
- Gets an array of name strings corresponding to the names of URL parameters. The names are not decoded in any way.
URLHierarchy(URL)
"""""""""
- Gets an array containing the URL trimmed to the ``/`` and ``?`` characters in the path and query string. Consecutive separator characters are counted as one. The cut is made in the position after all the consecutive separator characters. Example:
::
URLHierarchy('https://example.com/browse/CONV-6788') =
[
'https://example.com/',
'https://example.com/browse/',
'https://example.com/browse/CONV-6788'
]
URLPathHierarchy(URL)
""""""""
- The same thing, but without the protocol and host in the result. The / element (root) is not included. This function is used for implementing tree-view reports by URL in Yandex.Metrica. Example:
::
URLPathHierarchy('https://example.com/browse/CONV-6788') =
[
'/browse/',
'/browse/CONV-6788'
]
decodeURLComponent(URL)
"""""""""""
Returns a URL-decoded URL.
Example:
.. code-block:: sql
:) SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL;
┌─DecodedURL─────────────────────────────┐
│ http://127.0.0.1:8123/?query=SELECT 1; │
└────────────────────────────────────────┘
Functions that remove part of a URL.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the URL doesn't contain the relevant part, the URL remains unchanged.
cutWWW
"""""
Removes no more than one 'www.' from the beginning of the URL's domain, if present.
cutQueryString
""""""
Removes the query string. The question mark is also removed.
cutFragment
""""""""
Removes the fragment identifier. The number sign is also removed.
cutQueryStringAndFragment
""""""""""
Removes the query-string and fragment identifier. The question mark and number sign are also removed.
cutURLParameter(URL, name)
""""""""""
Removes the URL parameter named 'name', if present. This function works under the assumption that the parameter name is encoded in the URL exactly the same way as in the passed argument.
Functions for working with Yandex.Metrica dictionaries
----------------
In order for the functions below to work, the server config must specify the paths and addresses for getting all the Yandex.Metrica dictionaries. The dictionaries are loaded at the first call of any of these functions. If the reference lists can't be loaded, an exception is thrown.
For information about creating reference lists, see the section "Dictionaries".
Multiple geobases
~~~~~~~~~
ClickHouse supports working with multiple alternative geobases (regional hierarchies) simultaneously, in order to support various perspectives on which countries certain regions belong to.
The 'clickhouse-server' config specifies the file with the regional hierarchy:
``<path_to_regions_hierarchy_file>/opt/geo/regions_hierarchy.txt</path_to_regions_hierarchy_file>``
Besides this file, it also searches for files nearby that have the _ symbol and any suffix appended to the name (before the file extension).
For example, it will also find the file ``/opt/geo/regions_hierarchy_ua.txt``, if present.
``ua`` is called the dictionary key. For a dictionary without a suffix, the key is an empty string.
All the dictionaries are reloaded at runtime (once every certain number of seconds, as defined in the builtin_dictionaries_reload_interval config parameter, or once an hour by default). However, the list of available dictionaries is defined once, when the server starts.
All functions for working with regions have an optional argument at the end - the dictionary key. It is indicated as the geobase.
Example:
::
regionToCountry(RegionID) - Uses the default dictionary: /opt/geo/regions_hierarchy.txt
regionToCountry(RegionID, '') - Uses the default dictionary: /opt/geo/regions_hierarchy.txt
regionToCountry(RegionID, 'ua') - Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt
regionToCity(id[, geobase])
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accepts a UInt32 number - the region ID from the Yandex geobase. If this region is a city or part of a city, it returns the region ID for the appropriate city. Otherwise, returns 0.
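For example (an illustrative call, assuming the default geobase is configured; 213 is Moscow in the Yandex geobase):
.. code-block:: sql
SELECT regionToCity(toUInt32(213))  -- returns 213, since Moscow is itself a city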
regionToArea(id[, geobase])
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Converts a region to an area (type 5 in the geobase). In every other way, this function is the same as 'regionToCity'.
.. code-block:: sql
SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua'), 'en')
FROM system.numbers
LIMIT 15
┌─regionToName(regionToArea(toUInt32(number), \'ua\'), \'en\')─┐
│ │
│ Moscow and Moscow region │
│ Saint-Petersburg and Leningradskaya oblast │
│ Belgorod District │
│ Ivanovo district │
│ Kaluga District │
│ Kostroma District │
│ Kursk District │
│ Lipetsk District │
│ Orel District │
│ Ryazhan District │
│ Smolensk District │
│ Tambov District │
│ Tver District │
│ Tula District │
└──────────────────────────────────────────────────────────────┘
regionToDistrict(id[, geobase])
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Converts a region to a federal district (type 4 in the geobase). In every other way, this function is the same as 'regionToCity'.
.. code-block:: sql
SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua'), 'en')
FROM system.numbers
LIMIT 15
┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'), \'en\')─┐
│ │
│ Central │
│ Northwest │
│ South │
│ North Kavkaz │
│ Volga Region │
│ Ural │
│ Siberian │
│ Far East │
│ Scotland │
│ Faroe Islands │
│ Flemish Region │
│ Brussels-Capital Region │
│ Wallonia │
│ Federation of Bosnia and Herzegovina │
└──────────────────────────────────────────────────────────────────┘
regionToCountry(id[, geobase])
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Converts a region to a country. In every other way, this function is the same as 'regionToCity'.
Example: ``regionToCountry(toUInt32(213)) = 225`` converts ``Moscow (213)`` to ``Russia (225)``.
regionToContinent(id[, geobase])
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Converts a region to a continent. In every other way, this function is the same as 'regionToCity'.
Example: ``regionToContinent(toUInt32(213)) = 10001`` converts Moscow (213) to Eurasia (10001).
regionToPopulation(id[, geobase])
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Gets the population for a region.
The population can be recorded in files with the geobase. See the section "External dictionaries".
If the population is not recorded for the region, it returns 0.
In the Yandex geobase, the population might be recorded for child regions, but not for parent regions.
regionIn(lhs, rhs[, geobase])
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Checks whether a 'lhs' region belongs to a 'rhs' region. Returns a UInt8 number equal to 1 if it belongs, or 0 if it doesn't belong.
The relationship is reflexive - any region also belongs to itself.
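For example (illustrative; in the Yandex geobase, 213 is Moscow and 225 is Russia):
.. code-block:: sql
SELECT regionIn(toUInt32(213), toUInt32(225))  -- 1: Moscow belongs to Russia
SELECT regionIn(toUInt32(213), toUInt32(213))  -- 1: the relationship is reflexive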
regionHierarchy(id[, geobase])
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accepts a UInt32 number - the region ID from the Yandex geobase. Returns an array of region IDs consisting of the passed region and all parents along the chain.
Example: ``regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]``.
regionToName(id[, lang])
~~~~~~~~~~~~~~~~~~~~~~~~
Accepts a UInt32 number - the region ID from the Yandex geobase. A string with the name of the language can be passed as a second argument. Supported languages are: ru, en, ua, uk, by, kz, tr. If the second argument is omitted, the language 'ru' is used. If the language is not supported, an exception is thrown. Returns a string - the name of the region in the corresponding language. If the region with the specified ID doesn't exist, an empty string is returned.
``ua`` and ``uk`` mean the same thing - Ukrainian.
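For example (illustrative, assuming the default geobase):
.. code-block:: sql
SELECT regionToName(toUInt32(213), 'en')  -- returns 'Moscow'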

docs/en/getting_started.md Normal file

@ -0,0 +1,133 @@
Getting started
===============
System requirements
-------------------
This is not a cross-platform system. It requires Linux Ubuntu no older than Precise (12.04); the x86_64 architecture with support for the SSE 4.2 instruction set.
To check for SSE 4.2 support, run:
::
grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported"
We recommend using Ubuntu Trusty, Ubuntu Xenial, or Ubuntu Precise.
The terminal must use UTF-8 encoding (the default in Ubuntu).
Installation
-----------------
For testing and development, the system can be installed on a single server or on a desktop computer.
Installing from packages
~~~~~~~~~~~~~~~~~~~~~~~~
In `/etc/apt/sources.list` (or in a separate `/etc/apt/sources.list.d/clickhouse.list` file), add the repository:
::
deb http://repo.yandex.ru/clickhouse/trusty stable main
On other Ubuntu versions, replace `trusty` with `xenial` or `precise`.
Then run:
::
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 # optional
sudo apt-get update
sudo apt-get install clickhouse-client clickhouse-server-common
You can also download and install packages manually from here:
http://repo.yandex.ru/clickhouse/trusty/pool/main/c/clickhouse/,
http://repo.yandex.ru/clickhouse/xenial/pool/main/c/clickhouse/,
http://repo.yandex.ru/clickhouse/precise/pool/main/c/clickhouse/.
ClickHouse contains access restriction settings. They are located in the users.xml file (next to config.xml).
By default, access is allowed from everywhere for the 'default' user without a password. See the users/default/networks section.
For more information, see the section "Configuration files".
Installing from source
~~~~~~~~~~~~~~~~~~~~~~~
To build, follow the instructions in build.md.
You can compile packages and install them.
You can also use the programs without installing packages.
Client: dbms/src/Client/
Server: dbms/src/Server/
For the server, create directories for data, for example:
::
/opt/clickhouse/data/default/
/opt/clickhouse/metadata/default/
(Configurable in the server config.)
Run 'chown' for the desired user.
Note the path to logs in the server config (src/dbms/src/Server/config.xml).
Other installation methods
~~~~~~~~~~~~~~~~~~~~~~~~~~
Docker image: https://hub.docker.com/r/yandex/clickhouse-server/
Gentoo overlay: https://github.com/kmeaw/clickhouse-overlay
Launch
-------
To start the server (as a daemon), run:
::
sudo service clickhouse-server start
See the logs in the `/var/log/clickhouse-server/` directory.
If the server doesn't start, check the configuration in the file `/etc/clickhouse-server/config.xml`.
You can also start the server from the console:
::
clickhouse-server --config-file=/etc/clickhouse-server/config.xml
In this case, the log is printed to the console, which is convenient during development.
If the configuration file is in the current directory, you don't need to specify the --config-file parameter; by default, the file ./config.xml is used.
You can connect to the server using the command-line client:
::
clickhouse-client
The default parameters mean connecting to localhost:9000 as the 'default' user without a password.
The client can be used for connecting to a remote server. Example:
::
clickhouse-client --host=example.com
For more information, see the section "Command-line client".
Checking that the system works:
::
milovidov@milovidov-Latitude-E6320:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client
ClickHouse client version 0.0.18749.
Connecting to localhost:9000.
Connected to ClickHouse server version 0.0.18749.
:) SELECT 1
SELECT 1
┌─1─┐
│ 1 │
└───┘
1 rows in set. Elapsed: 0.003 sec.
:)
Congratulations, the system works!
Test data
---------------
If you are a Yandex employee, you can use Yandex.Metrica test data to explore the system's capabilities.
Instructions for loading the test data are given here.
If you are an external user, you can use the publicly available data; ways to load it are described here.
If you have questions
---------------------
If you are a Yandex employee, use the internal ClickHouse mailing list.
You can subscribe to this list to get announcements, stay informed about new developments, and see questions that other users have.
Otherwise, you can ask questions on Stack Overflow, participate in discussions on Google Groups, or send a private message to the developers at clickhouse-feedback@yandex-team.com.

docs/en/getting_started.rst Normal file

@ -0,0 +1,130 @@
Getting started
===============
System requirements
-------------------
This is not a cross-platform system. It requires Linux Ubuntu Precise (12.04) or newer, x86_64 architecture with SSE 4.2 instruction set.
To test for SSE 4.2 support, do:
::
grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported"
We recommend using Ubuntu Trusty or Ubuntu Xenial or Ubuntu Precise.
The terminal must use UTF-8 encoding (the default in Ubuntu).
Installation
-----------------
For testing and development, the system can be installed on a single server or on a desktop computer.
Installing from packages
~~~~~~~~~~~~~~~~~~~~~~~~
In `/etc/apt/sources.list` (or in a separate `/etc/apt/sources.list.d/clickhouse.list` file), add the repository:
::
deb http://repo.yandex.ru/clickhouse/trusty stable main
For other Ubuntu versions, replace `trusty` with `xenial` or `precise`.
Then run:
::
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 # optional
sudo apt-get update
sudo apt-get install clickhouse-client clickhouse-server-common
You can also download and install packages manually from here:
http://repo.yandex.ru/clickhouse/trusty/pool/main/c/clickhouse/,
http://repo.yandex.ru/clickhouse/xenial/pool/main/c/clickhouse/,
http://repo.yandex.ru/clickhouse/precise/pool/main/c/clickhouse/.
ClickHouse contains access restriction settings. They are located in the 'users.xml' file (next to 'config.xml').
By default, access is allowed from everywhere for the 'default' user without a password. See 'users/default/networks'. For more information, see the section "Configuration files".
Installing from source
~~~~~~~~~~~~~~~~~~~~~~~
To build, follow the instructions in build.md (for Linux) or in build_osx.md (for Mac OS X).
You can compile packages and install them. You can also use programs without installing packages.
::
Client: dbms/src/Client/
Server: dbms/src/Server/
For the server, create directories for data, such as:
::
/opt/clickhouse/data/default/
/opt/clickhouse/metadata/default/
(Configured in the server config.)
Run 'chown' for the desired user.
Note the path to logs in the server config (src/dbms/src/Server/config.xml).
Other methods of installation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Docker image is located here: https://hub.docker.com/r/yandex/clickhouse-server/
There is a Gentoo overlay located here: https://github.com/kmeaw/clickhouse-overlay
Launch
-------
To start the server (as a daemon), run:
::
sudo service clickhouse-server start
View the logs in the directory `/var/log/clickhouse-server/`.
If the server doesn't start, check the configuration in the file `/etc/clickhouse-server/config.xml`.
You can also launch the server from the console:
::
clickhouse-server --config-file=/etc/clickhouse-server/config.xml
In this case, the log will be printed to the console, which is convenient during development. If the configuration file is in the current directory, you don't need to specify the '--config-file' parameter. By default, it uses './config.xml'.
You can use the command-line client to connect to the server:
::
clickhouse-client
The default parameters mean connecting to localhost:9000 as the user 'default' without a password.
The client can be used for connecting to a remote server. For example:
::
clickhouse-client --host=example.com
For more information, see the section "Command-line client".
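For instance, combining the documented parameters, a non-interactive query against a remote server might look like this (example.com is a placeholder host):
::
clickhouse-client --host=example.com --port=9000 --user=default --query="SELECT 1"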
Checking the system:
::
milovidov@milovidov-Latitude-E6320:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client
ClickHouse client version 0.0.18749.
Connecting to localhost:9000.
Connected to ClickHouse server version 0.0.18749.
:) SELECT 1
SELECT 1
┌─1─┐
│ 1 │
└───┘
1 rows in set. Elapsed: 0.003 sec.
:)
Congratulations, it works!
Test data
---------------
If you are a Yandex employee, you can use Yandex.Metrica test data to explore the system's capabilities. You can find instructions for using the test data here.
Otherwise, you can use one of the available public datasets, described here.
If you have questions
---------------------
If you are a Yandex employee, use the internal ClickHouse mailing list.
You can subscribe to this list to get announcements, information on new developments, and questions that other users have.
Otherwise, you can ask questions on Stack Overflow, discuss in Google Groups, or send a private message to the developers at clickhouse-feedback@yandex-team.com.

docs/en/index.rst Normal file

@ -0,0 +1,25 @@
Documentation
-----------------
.. toctree::
:maxdepth: 6
introduction/index
getting_started
interfaces/index
query_language/index
external_data
table_engines/index
system_tables/index
table_functions/index
formats/index
data_types/index
operators/index
functions/index
agg_functions/index
dicts/index
settings/index
configuration_files
access_rights
quotas


@ -0,0 +1,93 @@
Command-line client
-----------------------
To work from the command line, you can use clickhouse-client:
::
$ clickhouse-client
ClickHouse client version 0.0.26176.
Connecting to localhost:9000.
Connected to ClickHouse server version 0.0.26176.
:) SELECT 1
The ``clickhouse-client`` program accepts the following parameters, which are all optional:
``--host, -h`` - server name, by default - localhost.
You can use either the name or the IPv4 or IPv6 address.
``--port`` - The port to connect to, by default - 9000.
Note that the HTTP interface and the native interface use different ports.
``--user, -u`` - The username, by default - default.
``--password`` - The password, by default - empty string.
``--query, -q`` - Query to process when using non-interactive mode.
``--database, -d`` - Select the current default database, by default - the current DB from the server settings (by default, the 'default' DB).
``--multiline, -m`` - If specified, allow multiline queries (do not send request on Enter).
``--multiquery, -n`` - If specified, allow processing multiple queries separated by semicolons.
Only works in non-interactive mode.
``--format, -f`` - Use the specified default format to output the result.
``--vertical, -E`` - If specified, use the Vertical format by default to output the result. This is the same as '--format=Vertical'. In this format, each value is printed on a separate line, which is helpful when displaying wide tables.
``--time, -t`` - If specified, print the query execution time to 'stderr' in non-interactive mode.
``--stacktrace`` - If specified, also prints the stack trace if an exception occurs.
``--config-file`` - Name of the configuration file that has additional settings or changed defaults for the settings listed above.
By default, files are searched for in this order:
./clickhouse-client.xml
~/.clickhouse-client/config.xml
/etc/clickhouse-client/config.xml
Settings are only taken from the first file found.
You can also specify any settings that will be used for processing queries. For example, ``clickhouse-client --max_threads=1``. For more information, see the section "Settings".
The client can be used in interactive and non-interactive (batch) mode.
To use batch mode, specify the 'query' parameter, or send data to 'stdin' (it verifies that 'stdin' is not a terminal), or both.
Similar to the HTTP interface, when using the 'query' parameter and sending data to 'stdin', the request is a concatenation of the 'query' parameter, a line break, and the data in 'stdin'. This is convenient for large INSERT queries.
Examples of inserting data via clickhouse-client:
::
echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV";
cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV";
3, 'some text', '2016-08-14 00:00:00'
4, 'some more text', '2016-08-14 00:00:01'
_EOF
cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV";
In batch mode, the default data format is TabSeparated. You can set the format in the FORMAT clause of the query.
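For example, to get CSV output from a batch query:
::
clickhouse-client --query="SELECT number FROM system.numbers LIMIT 3 FORMAT CSV"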
By default, you can only process a single query in batch mode. To make multiple queries from a "script," use the 'multiquery' parameter. This works for all queries except INSERT. Query results are output consecutively without additional separators.
Similarly, to process a large number of queries, you can run 'clickhouse-client' for each query. Note that it may take tens of milliseconds to launch the 'clickhouse-client' program.
In interactive mode, you get a command line where you can enter queries.
If 'multiline' is not specified (the default):
To run a query, press Enter. The semicolon is not necessary at the end of the query. To enter a multiline query, enter a backslash ``\`` before the line break - after you press Enter, you will be asked to enter the next line of the query.
If 'multiline' is specified:
To run a query, end it with a semicolon and press Enter. If the semicolon was omitted at the end of the entered line, you will be asked to enter the next line of the query.
You can specify ``\G`` instead of or after the semicolon. This indicates using Vertical format. In this format, each value is printed on a separate line, which is convenient for wide tables. This unusual feature was added for compatibility with the MySQL CLI.
The command line is based on 'readline' (and 'history') (or 'libedit', or even nothing, depending on the build). In other words, it uses the familiar keyboard shortcuts and keeps a history. The history is written to ~/.clickhouse-client-history.
By default, the format used is PrettyCompact. You can change the format in the FORMAT clause of the query, or by specifying '\G' at the end of the query, using the '--format' or '--vertical' argument in the command line, or using the client configuration file.
To exit the client, press Ctrl+D (or Ctrl+C), or enter one of the following instead of a query:
"exit", "quit", "logout", "учше", "йгше", "дщпщге", "exit;", "quit;", "logout;", "учшеж", "йгшеж", "дщпщгеж", "q", "й", "\q", "\Q", ":q", "\й", "\Й", "Жй"
When processing a query, the client shows:
#. Progress, which is updated no more than 10 times per second (by default). For quick queries, the progress might not have time to be displayed.
#. The formatted query after parsing, for debugging.
#. The result in the specified format.
#. The number of lines in the result, the time passed, and the average speed of query processing.
To cancel a lengthy query, press Ctrl+C. However, you will still need to wait a little for the server to abort the request. It is not possible to cancel a query at certain stages. If you don't wait and press Ctrl+C a second time, the client will exit.
The command-line client allows passing external data (external temporary tables) for querying. For more information, see the section "External data for request processing".


@ -0,0 +1,204 @@
HTTP interface
==============
The HTTP interface lets you use ClickHouse on any platform from any programming language. We use it for working from Java and Perl, as well as shell scripts. In other departments, the HTTP interface is used from Perl, Python, and Go. The HTTP interface is more limited than the native interface, but it has better compatibility.
By default, clickhouse-server listens for HTTP on port 8123 (this can be changed in the config).
If you make a GET / request without parameters, it returns the string "Ok" (with a line break at the end). You can use this in health-check scripts.
.. code-block:: bash
$ curl 'http://localhost:8123/'
Ok.
Send the request as a URL 'query' parameter, or as a POST. Or send the beginning of the query in the 'query' parameter, and the rest in the POST (we'll explain later why this is necessary). URL length is limited to 16 KB; keep this limit in mind when sending long queries in the 'query' parameter.
If successful, you receive the 200 response code and the result in the response body.
If an error occurs, you receive the 500 response code and an error description text in the response body.
When using the GET method, 'readonly' is set. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body, or in the URL parameter.
Examples:
.. code-block:: bash
$ curl 'http://localhost:8123/?query=SELECT%201'
1
$ wget -O- -q 'http://localhost:8123/?query=SELECT 1'
1
$ GET 'http://localhost:8123/?query=SELECT 1'
1
$ echo -ne 'GET /?query=SELECT%201 HTTP/1.0\r\n\r\n' | nc localhost 8123
HTTP/1.0 200 OK
Connection: Close
Date: Fri, 16 Nov 2012 19:21:50 GMT
1
As you can see, curl is not very convenient because spaces have to be URL-escaped. Although wget escapes everything on its own, we don't recommend it because it doesn't work well over HTTP 1.1 when using keep-alive and Transfer-Encoding: chunked.
.. code-block:: bash
$ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @-
1
$ echo 'SELECT 1' | curl 'http://localhost:8123/?query=' --data-binary @-
1
$ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @-
1
If part of the query is sent in the parameter, and part in the POST, a line break is inserted between these two data parts.
Example (this won't work):
.. code-block:: bash
$ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @-
Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL
ECT 1
, expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception
By default, data is returned in TabSeparated format (for more information, see the "Formats" section).
You use the FORMAT clause of the query to request any other format.
.. code-block:: bash
$ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @-
┏━━━┓
┃ 1 ┃
┡━━━┩
│ 1 │
└───┘
The POST method of transmitting data is necessary for INSERT queries. In this case, you can write the beginning of the query in the URL parameter, and use POST to pass the data to insert. The data to insert could be, for example, a tab-separated dump from MySQL. In this way, the INSERT query replaces LOAD DATA LOCAL INFILE from MySQL.
Examples:
Creating a table:
.. code-block:: bash
echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | POST 'http://localhost:8123/'
Using the familiar INSERT query for data insertion:
.. code-block:: bash
echo 'INSERT INTO t VALUES (1),(2),(3)' | POST 'http://localhost:8123/'
Data can be sent separately from the query:
.. code-block:: bash
echo '(4),(5),(6)' | POST 'http://localhost:8123/?query=INSERT INTO t VALUES'
You can specify any data format. The 'Values' format is the same as what is used when writing INSERT INTO t VALUES:
.. code-block:: bash
echo '(7),(8),(9)' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT Values'
To insert data from a tab-separated dump, specify the corresponding format:
.. code-block:: bash
echo -ne '10\n11\n12\n' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT TabSeparated'
Reading the table contents. Data is output in random order due to parallel query processing:
.. code-block:: bash
$ GET 'http://localhost:8123/?query=SELECT a FROM t'
7
8
9
10
11
12
1
2
3
4
5
6
Deleting the table.
.. code-block:: bash
POST 'http://localhost:8123/?query=DROP TABLE t'
For successful requests that don't return a data table, an empty response body is returned.
You can use compression when transmitting data. The compressed data has a non-standard format, and you will need to use a special compressor program to work with it (`sudo apt-get install compressor-metrika-yandex`).
If you specified 'compress=1' in the URL, the server will compress the data it sends you.
If you specified 'decompress=1' in the URL, the server will decompress the same data that you pass in the POST method.
You can use this to reduce network traffic when transmitting a large amount of data, or for creating dumps that are immediately compressed.
You can use the 'database' URL parameter to specify the default database.
.. code-block:: bash
$ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @-
0
1
2
3
4
5
6
7
8
9
By default, the database that is registered in the server settings is used as the default database. By default, this is the database called 'default'. Alternatively, you can always specify the database using a dot before the table name.
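For example, here is an illustrative query that achieves the same effect with a dot-qualified table name instead of the 'database' parameter:
.. code-block:: bash
$ echo 'SELECT number FROM system.numbers LIMIT 3' | curl 'http://localhost:8123/' --data-binary @-
0
1
2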
The username and password can be indicated in one of three ways:
1. Using HTTP Basic Authentication. Example: ::
echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @-
2. In the 'user' and 'password' URL parameters. Example: ::
echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @-
3. Using 'X-ClickHouse-User' and 'X-ClickHouse-Key' headers. Example: ::
echo 'SELECT 1' | curl -H "X-ClickHouse-User: user" -H "X-ClickHouse-Key: password" 'http://localhost:8123/' -d @-
If the user name is not indicated, the username 'default' is used. If the password is not indicated, an empty password is used.
You can also use the URL parameters to specify any settings for processing a single query, or entire profiles of settings. Example:
`http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1`
For more information, see the section "Settings".
.. code-block:: bash
$ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @-
0
1
2
3
4
5
6
7
8
9
For information about other parameters, see the section "SET".
In contrast to the native interface, the HTTP interface does not support the concept of sessions or session settings, does not allow aborting a query (to be exact, it allows this in only a few cases), and does not show the progress of query processing. Parsing and data formatting are performed on the server side, and using the network might be inefficient.
The optional 'query_id' parameter can be passed as the query ID (any string). For more information, see the section "Settings, replace_running_query".
The optional 'quota_key' parameter can be passed as the quota key (any string). It can also be passed as 'X-ClickHouse-Quota' header. For more information, see the section "Quotas".
The HTTP interface allows passing external data (external temporary tables) for querying. For more information, see the section "External data for query processing".


@ -0,0 +1,9 @@
Interfaces
==========
To explore the system's capabilities, load data into tables, or run queries manually, use the clickhouse-client program.
.. toctree::
:glob:
*


@ -0,0 +1,4 @@
JDBC driver
------------
There is an official JDBC driver for ClickHouse. See `here <https://github.com/yandex/clickhouse-jdbc>`_.


@ -0,0 +1,4 @@
Native interface (TCP)
----------------------
The native interface is used by the "clickhouse-client" command-line client, for communication between servers during distributed query processing, and also in C++ programs. We will only cover the command-line client.


@ -0,0 +1,30 @@
Third-party client libraries
--------------------------------------
There are third-party client libraries for ClickHouse:
* Python:
- `infi.clickhouse_orm <https://github.com/Infinidat/infi.clickhouse_orm>`_
- `sqlalchemy-clickhouse <https://github.com/cloudflare/sqlalchemy-clickhouse>`_
* PHP
- `clickhouse-php-client <https://github.com/8bitov/clickhouse-php-client>`_
- `PhpClickHouseClient <https://github.com/SevaCode/PhpClickHouseClient>`_
- `phpClickHouse <https://github.com/smi2/phpClickHouse>`_
* Go
- `clickhouse <https://github.com/kshvakov/clickhouse/>`_
- `go-clickhouse <https://github.com/roistat/go-clickhouse>`_
* NodeJs
- `clickhouse <https://github.com/TimonKK/clickhouse>`_
- `node-clickhouse <https://github.com/apla/node-clickhouse>`_
* Perl
- `perl-DBD-ClickHouse <https://github.com/elcamlost/perl-DBD-ClickHouse>`_
- `HTTP-ClickHouse <https://metacpan.org/release/HTTP-ClickHouse>`_
- `AnyEvent-ClickHouse <https://metacpan.org/release/AnyEvent-ClickHouse>`_
* Ruby
- `clickhouse <https://github.com/archan937/clickhouse>`_
* R
- `clickhouse-r <https://github.com/hannesmuehleisen/clickhouse-r>`_
* .NET
- `ClickHouse-Net <https://github.com/killwort/ClickHouse-Net>`_
These libraries have not been tested by us. The ordering is arbitrary.


@ -0,0 +1,15 @@
Third-party GUI
------------------------------
There is an open source project `Tabix <https://github.com/smi2/tabix.ui>`_ from the SMI2 company, which implements a graphical web interface for ClickHouse.
Tabix key features:
- works with ClickHouse from the browser directly, without installing additional software;
- query editor that supports highlighting of SQL syntax ClickHouse, auto-completion for all objects, including dictionaries and context-sensitive help for built-in functions.
- graphs, charts and geo-referenced for mapping query results;
- interactive designer PivotTables (pivot) for query results;
- graphical tools for analysis ClickHouse;
- two color theme: light and dark.
`Tabix documentation <https://tabix.io/doc/>`_


@ -0,0 +1,62 @@
Distinctive features of ClickHouse
===================================
1. True column-oriented DBMS.
---------------------------------
In a true column-oriented DBMS, there isn't any "garbage" stored with the values. For example, constant-length values must be supported, to avoid storing the length "number" next to the values. As an example, a billion UInt8-type values should consume around 1 GB uncompressed; if this isn't the case, it strongly affects CPU use. It is very important to store data compactly (without any "garbage") even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data.
This is worth noting because there are systems that can store values of separate columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. In these systems, you will get throughput of around a hundred thousand rows per second, but not hundreds of millions of rows per second.
Also note that ClickHouse is a DBMS, not a single database. ClickHouse allows creating tables and databases at runtime, loading data, and running queries without reconfiguring and restarting the server.
2. Data compression.
-----------------
Some column-oriented DBMSs (InfiniDB CE and MonetDB) do not use data compression. However, data compression really improves performance.
3. Disk storage of data.
----------------------------
Many column-oriented DBMSs (SAP HANA, and Google PowerDrill) can only work in RAM. But even on thousands of servers, the RAM is too small for storing all the pageviews and sessions in Yandex.Metrica.
4. Parallel processing on multiple cores.
---------------------------------------------------------------
Large queries are parallelized in a natural way.
5. Distributed processing on multiple servers.
-----------------------------------------------
Almost none of the columnar DBMSs listed above have support for distributed processing.
In ClickHouse, data can reside on different shards. Each shard can be a group of replicas that are used for fault tolerance. The query is processed on all the shards in parallel. This is transparent for the user.
6. SQL support.
---------------
If you are familiar with standard SQL, we can't really talk about SQL support.
NULLs are not supported. All the functions have different names. However, this is a declarative query language based on SQL that can't be differentiated from SQL in many instances.
JOINs are supported. Subqueries are supported in FROM, IN, JOIN clauses; and scalar subqueries.
Correlated subqueries are not supported.
7. Vector engine.
-----------------
Data is not only stored by columns, but is processed by vectors - parts of columns. This allows us to achieve high CPU performance.
8. Real-time data updates.
-----------------------
ClickHouse supports primary key tables. In order to quickly perform queries on the range of the primary key, the data is sorted incrementally using the merge tree. Due to this, data can continually be added to the table. There is no locking when adding data.
9. Indexes.
-----------------
Having a primary key allows, for example, extracting data for specific clients (Metrica counters) for a specific time range, with latency of less than a few dozen milliseconds.
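A sketch of the kind of query this makes fast (the table and column names here are hypothetical):
.. code-block:: sql
SELECT count()
FROM hits
WHERE CounterID = 34 AND EventDate >= today() - 7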
10. Suitable for online queries.
------------------
This lets us use the system as the back-end for a web interface. Low latency means queries can be processed without delay, while the Yandex.Metrica interface page is loading (in online mode).
11. Support for approximated calculations.
-----------------
#. The system contains aggregate functions for approximated calculation of the number of various values, medians, and quantiles.
#. Supports running a query based on a part (sample) of data and getting an approximated result. In this case, proportionally less data is retrieved from the disk.
#. Supports running an aggregation for a limited number of random keys, instead of for all keys. Under certain conditions for key distribution in the data, this provides a reasonably accurate result while using fewer resources.
12. Data replication and support for data integrity on replicas.
-----------------
Uses asynchronous multimaster replication. After being written to any available replica, data is distributed to all the remaining replicas. The system maintains identical data on different replicas. Data is restored automatically after a failure, or using a "button" for complex cases.
For more information, see the section "Data replication".


@ -0,0 +1,6 @@
ClickHouse features that can be considered disadvantages
------------------------------------------------------------
#. No transactions.
#. For aggregation, query results must fit in the RAM on a single server. However, the volume of source data for a query may be indefinitely large.
#. Lack of full-fledged UPDATE/DELETE implementation.
