Merge branch 'master' into try-to-remove-dry-run

Nikolai Kochetov 2019-08-08 18:19:51 +03:00
commit 3df0d66058
73 changed files with 1403 additions and 221 deletions

contrib/fastops vendored

@ -1 +1 @@
Subproject commit d2c85c5d6549cfd648a7f31ef7b14341881ff8ae
Subproject commit 88752a5e03cf34639a4a37a4b41d8b463fffd2b5


@ -3,9 +3,8 @@ set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/fastops)
set(SRCS "")
if(HAVE_AVX)
set (SRCS ${SRCS} ${LIBRARY_DIR}/fastops/avx/ops_avx.cpp ${LIBRARY_DIR}/fastops/core/FastIntrinsics.cpp)
set (SRCS ${SRCS} ${LIBRARY_DIR}/fastops/avx/ops_avx.cpp)
set_source_files_properties(${LIBRARY_DIR}/fastops/avx/ops_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -DNO_AVX2")
set_source_files_properties(${LIBRARY_DIR}/fastops/core/FastIntrinsics.cpp PROPERTIES COMPILE_FLAGS "-mavx -DNO_AVX2")
endif()
if(HAVE_AVX2)


@ -77,7 +77,7 @@ void MySQLHandler::run()
if (!connection_context.mysql.max_packet_size)
connection_context.mysql.max_packet_size = MAX_PACKET_LENGTH;
LOG_DEBUG(log, "Capabilities: " << handshake_response.capability_flags
/* LOG_TRACE(log, "Capabilities: " << handshake_response.capability_flags
<< "\nmax_packet_size: "
<< handshake_response.max_packet_size
<< "\ncharacter_set: "
@ -91,7 +91,7 @@ void MySQLHandler::run()
<< "\ndatabase: "
<< handshake_response.database
<< "\nauth_plugin_name: "
<< handshake_response.auth_plugin_name);
<< handshake_response.auth_plugin_name);*/
client_capability_flags = handshake_response.capability_flags;
if (!(client_capability_flags & CLIENT_PROTOCOL_41))


@ -696,6 +696,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
LOG_INFO(log, "Listening https://" + address.toString());
#else
UNUSED(port);
throw Exception{"HTTPS protocol is disabled because Poco library was built without NetSSL support.",
ErrorCodes::SUPPORT_IS_DISABLED};
#endif
@ -732,6 +733,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
new Poco::Net::TCPServerParams));
LOG_INFO(log, "Listening for connections with secure native protocol (tcp_secure): " + address.toString());
#else
UNUSED(port);
throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.",
ErrorCodes::SUPPORT_IS_DISABLED};
#endif
@ -768,6 +770,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
LOG_INFO(log, "Listening for secure replica communication (interserver) https://" + address.toString());
#else
UNUSED(port);
throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.",
ErrorCodes::SUPPORT_IS_DISABLED};
#endif
@ -788,6 +791,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
LOG_INFO(log, "Listening for MySQL compatibility protocol: " + address.toString());
#else
UNUSED(port);
throw Exception{"SSL support for MySQL protocol is disabled because Poco library was built without NetSSL support.",
ErrorCodes::SUPPORT_IS_DISABLED};
#endif


@ -1,5 +1,6 @@
#pragma once
#include <algorithm>
#include <roaring/roaring.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
@ -454,6 +455,44 @@ public:
return count;
}
/**
* Return a new set restricted to [range_start, range_end); range_end itself is not included
*/
UInt64 rb_range(UInt32 range_start, UInt32 range_end, RoaringBitmapWithSmallSet& r1) const
{
UInt64 count = 0;
if (range_start >= range_end)
return count;
if (isSmall())
{
std::vector<T> ans;
for (const auto & x : small)
{
T val = x.getValue();
if ((UInt32)val >= range_start && (UInt32)val < range_end)
{
r1.add(val);
count++;
}
}
}
else
{
roaring_uint32_iterator_t iterator;
roaring_init_iterator(rb, &iterator);
roaring_move_uint32_iterator_equalorlarger(&iterator, range_start);
while (iterator.has_value)
{
if ((UInt32)iterator.current_value >= range_end)
break;
r1.add(iterator.current_value);
roaring_advance_uint32_iterator(&iterator);
count++;
}
}
return count;
}
private:
/// To read and write the DB Buffer directly, migrate code from CRoaring
void db_roaring_bitmap_add_many(DB::ReadBuffer & dbBuf, roaring_bitmap_t * r, size_t n_args)
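For clarity, a minimal usage sketch of the new rb_range helper (a hedged illustration, not part of the commit; the small-set threshold template argument and the setup values are assumptions):

// Illustrative only; assumes this header (AggregateFunctionGroupBitmapData.h) is included
// and that the small-set threshold template argument is 32.
RoaringBitmapWithSmallSet<UInt32, 32> src, dst;
for (UInt32 i = 0; i < 100; ++i)
    src.add(i);                                  // fill 0..99
UInt64 copied = src.rb_range(10, 20, dst);       // dst now holds 10..19, copied == 10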


@ -26,10 +26,12 @@ namespace ErrorCodes
template <typename T>
int ColumnDecimal<T>::compareAt(size_t n, size_t m, const IColumn & rhs_, int) const
{
auto other = static_cast<const Self &>(rhs_);
auto & other = static_cast<const Self &>(rhs_);
const T & a = data[n];
const T & b = other.data[m];
if (scale == other.scale)
return a > b ? 1 : (a < b ? -1 : 0);
return decimalLess<T>(b, a, other.scale, scale) ? 1 : (decimalLess<T>(a, b, scale, other.scale) ? -1 : 0);
}
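Two notes on this hunk: binding `other` by reference (instead of `auto`, which copied the column) avoids a copy on every comparison, and the decimalLess path is what makes decimals with different scales compare correctly. A worked example with assumed values:

// Illustrative values: a = 120 with scale 2 (i.e. 1.20), b = 12 with scale 1 (i.e. 1.2).
// Comparing the raw integers would wrongly yield a > b, so compareAt defers to decimalLess,
// which rescales before comparing:
//   decimalLess(b = 12,  a = 120, scale 1, scale 2) == false   // is 1.2  < 1.20 ? no
//   decimalLess(a = 120, b = 12,  scale 2, scale 1) == false   // is 1.20 < 1.2  ? no
// so compareAt returns 0: the two values are equal despite different raw representations.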


@ -67,13 +67,13 @@ public:
int fd = ::open(path.c_str(), O_RDWR | O_CREAT, 0666);
if (-1 == fd)
DB::throwFromErrno("Cannot open file " + path, DB::ErrorCodes::CANNOT_OPEN_FILE);
DB::throwFromErrnoWithPath("Cannot open file " + path, path, DB::ErrorCodes::CANNOT_OPEN_FILE);
try
{
int flock_ret = flock(fd, LOCK_EX);
if (-1 == flock_ret)
DB::throwFromErrno("Cannot lock file " + path, DB::ErrorCodes::CANNOT_OPEN_FILE);
DB::throwFromErrnoWithPath("Cannot lock file " + path, path, DB::ErrorCodes::CANNOT_OPEN_FILE);
if (!file_doesnt_exists)
{
@ -141,7 +141,7 @@ public:
int fd = ::open(path.c_str(), O_RDWR | O_CREAT, 0666);
if (-1 == fd)
DB::throwFromErrno("Cannot open file " + path, DB::ErrorCodes::CANNOT_OPEN_FILE);
DB::throwFromErrnoWithPath("Cannot open file " + path, path, DB::ErrorCodes::CANNOT_OPEN_FILE);
try
{


@ -9,6 +9,9 @@
#include <IO/ReadBufferFromString.h>
#include <common/demangle.h>
#include <Common/config_version.h>
#include <Common/formatReadable.h>
#include <Storages/MergeTree/DiskSpaceMonitor.h>
#include <filesystem>
namespace DB
{
@ -52,6 +55,11 @@ void throwFromErrno(const std::string & s, int code, int e)
throw ErrnoException(s + ", " + errnoToString(code, e), code, e);
}
void throwFromErrnoWithPath(const std::string & s, const std::string & path, int code, int the_errno)
{
throw ErrnoException(s + ", " + errnoToString(code, the_errno), code, the_errno, path);
}
void tryLogCurrentException(const char * log_name, const std::string & start_of_message)
{
tryLogCurrentException(&Logger::get(log_name), start_of_message);
@ -68,7 +76,52 @@ void tryLogCurrentException(Poco::Logger * logger, const std::string & start_of_
}
}
std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded_stacktrace)
void getNoSpaceLeftInfoMessage(std::filesystem::path path, std::string & msg)
{
path = std::filesystem::absolute(path);
/// It's possible to get ENOSPC for a non-existent file (e.g. if there are no free inodes and creat() fails),
/// so try to get info for an existing parent directory instead.
while (!std::filesystem::exists(path) && path.has_relative_path())
path = path.parent_path();
auto fs = DiskSpaceMonitor::getStatVFS(path);
msg += "\nTotal space: " + formatReadableSizeWithBinarySuffix(fs.f_blocks * fs.f_bsize)
+ "\nAvailable space: " + formatReadableSizeWithBinarySuffix(fs.f_bavail * fs.f_bsize)
+ "\nTotal inodes: " + formatReadableQuantity(fs.f_files)
+ "\nAvailable inodes: " + formatReadableQuantity(fs.f_favail);
auto mount_point = DiskSpaceMonitor::getMountPoint(path).string();
msg += "\nMount point: " + mount_point;
#if defined(__linux__)
msg += "\nFilesystem: " + DiskSpaceMonitor::getFilesystemName(mount_point);
#endif
}
std::string getExtraExceptionInfo(const std::exception & e)
{
String msg;
try
{
if (auto file_exception = dynamic_cast<const Poco::FileException *>(&e))
{
if (file_exception->code() == ENOSPC)
getNoSpaceLeftInfoMessage(file_exception->message(), msg);
}
else if (auto errno_exception = dynamic_cast<const DB::ErrnoException *>(&e))
{
if (errno_exception->getErrno() == ENOSPC && errno_exception->getPath())
getNoSpaceLeftInfoMessage(errno_exception->getPath().value(), msg);
}
}
catch (...)
{
msg += "\nCannot print extra info: " + getCurrentExceptionMessage(false, false, false);
}
return msg;
}
std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded_stacktrace /*= false*/, bool with_extra_info /*= true*/)
{
std::stringstream stream;
@ -78,7 +131,9 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded
}
catch (const Exception & e)
{
stream << getExceptionMessage(e, with_stacktrace, check_embedded_stacktrace) << " (version " << VERSION_STRING << VERSION_OFFICIAL << ")";
stream << getExceptionMessage(e, with_stacktrace, check_embedded_stacktrace)
<< (with_extra_info ? getExtraExceptionInfo(e) : "")
<< " (version " << VERSION_STRING << VERSION_OFFICIAL << ")";
}
catch (const Poco::Exception & e)
{
@ -86,7 +141,8 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded
{
stream << "Poco::Exception. Code: " << ErrorCodes::POCO_EXCEPTION << ", e.code() = " << e.code()
<< ", e.displayText() = " << e.displayText()
<< " (version " << VERSION_STRING << VERSION_OFFICIAL << ")";
<< (with_extra_info ? getExtraExceptionInfo(e) : "")
<< " (version " << VERSION_STRING << VERSION_OFFICIAL;
}
catch (...) {}
}
@ -100,7 +156,9 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded
if (status)
name += " (demangling status: " + toString(status) + ")";
stream << "std::exception. Code: " << ErrorCodes::STD_EXCEPTION << ", type: " << name << ", e.what() = " << e.what() << ", version = " << VERSION_STRING << VERSION_OFFICIAL;
stream << "std::exception. Code: " << ErrorCodes::STD_EXCEPTION << ", type: " << name << ", e.what() = " << e.what()
<< (with_extra_info ? getExtraExceptionInfo(e) : "")
<< ", version = " << VERSION_STRING << VERSION_OFFICIAL;
}
catch (...) {}
}
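A hedged sketch of how the pieces introduced in this file fit together at a call site (the path variable, the logger and the surrounding error handling are illustrative assumptions, not part of the commit):

try
{
    int fd = ::open(path.c_str(), O_WRONLY | O_CREAT, 0666);   // `path` is an assumed std::string
    if (-1 == fd)
        DB::throwFromErrnoWithPath("Cannot open file " + path, path, DB::ErrorCodes::CANNOT_OPEN_FILE);
}
catch (...)
{
    /// with_extra_info defaults to true, so on ENOSPC the message is extended with
    /// total/available space, inode counts, mount point and (on Linux) filesystem name.
    LOG_ERROR(log, DB::getCurrentExceptionMessage(/* with_stacktrace = */ false));
}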


@ -52,16 +52,18 @@ private:
class ErrnoException : public Exception
{
public:
ErrnoException(const std::string & msg, int code, int saved_errno_)
: Exception(msg, code), saved_errno(saved_errno_) {}
ErrnoException(const std::string & msg, int code, int saved_errno_, const std::optional<std::string> & path_ = {})
: Exception(msg, code), saved_errno(saved_errno_), path(path_) {}
ErrnoException * clone() const override { return new ErrnoException(*this); }
void rethrow() const override { throw *this; }
int getErrno() const { return saved_errno; }
const std::optional<std::string> getPath() const { return path; }
private:
int saved_errno;
std::optional<std::string> path;
const char * name() const throw() override { return "DB::ErrnoException"; }
const char * className() const throw() override { return "DB::ErrnoException"; }
@ -73,6 +75,8 @@ using Exceptions = std::vector<std::exception_ptr>;
std::string errnoToString(int code, int the_errno = errno);
[[noreturn]] void throwFromErrno(const std::string & s, int code, int the_errno = errno);
[[noreturn]] void throwFromErrnoWithPath(const std::string & s, const std::string & path, int code,
int the_errno = errno);
/** Try to write an exception to the log (and forget about it).
@ -87,7 +91,8 @@ void tryLogCurrentException(Poco::Logger * logger, const std::string & start_of_
* check_embedded_stacktrace - if DB::Exception has embedded stacktrace then
* only this stack trace will be printed.
*/
std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded_stacktrace = false);
std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded_stacktrace = false,
bool with_extra_info = true);
/// Returns error code from ErrorCodes
int getCurrentExceptionCode();


@ -3,6 +3,8 @@
#include <Core/Types.h>
#include <Common/UInt128.h>
#include <type_traits>
/** Hash functions that are better than the trivial function std::hash.
*
@ -57,8 +59,6 @@ inline DB::UInt64 intHashCRC32(DB::UInt64 x)
}
template <typename T> struct DefaultHash;
template <typename T>
inline size_t DefaultHash64(T key)
{
@ -72,28 +72,18 @@ inline size_t DefaultHash64(T key)
return intHash64(u.out);
}
#define DEFINE_HASH(T) \
template <> struct DefaultHash<T>\
{\
size_t operator() (T key) const\
{\
return DefaultHash64<T>(key);\
}\
template <typename T, typename Enable = void>
struct DefaultHash;
template <typename T>
struct DefaultHash<T, std::enable_if_t<std::is_arithmetic_v<T>>>
{
size_t operator() (T key) const
{
return DefaultHash64<T>(key);
}
};
DEFINE_HASH(DB::UInt8)
DEFINE_HASH(DB::UInt16)
DEFINE_HASH(DB::UInt32)
DEFINE_HASH(DB::UInt64)
DEFINE_HASH(DB::Int8)
DEFINE_HASH(DB::Int16)
DEFINE_HASH(DB::Int32)
DEFINE_HASH(DB::Int64)
DEFINE_HASH(DB::Float32)
DEFINE_HASH(DB::Float64)
#undef DEFINE_HASH
template <typename T> struct HashCRC32;
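A short sketch of what the enable_if-based specialization buys (hedged; assumes this header is included and that DefaultHash is visible at this scope):

// Every arithmetic type now resolves to the single generic specialization,
// so the per-type DEFINE_HASH macro instantiations are no longer needed.
static_assert(std::is_arithmetic_v<DB::UInt16>);
DefaultHash<DB::UInt16> hasher;
size_t h = hasher(DB::UInt16{42});   // forwards to DefaultHash64<DB::UInt16>(42)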


@ -51,7 +51,7 @@ StatusFile::StatusFile(const std::string & path_)
fd = ::open(path.c_str(), O_WRONLY | O_CREAT, 0666);
if (-1 == fd)
throwFromErrno("Cannot open file " + path, ErrorCodes::CANNOT_OPEN_FILE);
throwFromErrnoWithPath("Cannot open file " + path, path, ErrorCodes::CANNOT_OPEN_FILE);
try
{
@ -61,14 +61,14 @@ StatusFile::StatusFile(const std::string & path_)
if (errno == EWOULDBLOCK)
throw Exception("Cannot lock file " + path + ". Another server instance in same directory is already running.", ErrorCodes::CANNOT_OPEN_FILE);
else
throwFromErrno("Cannot lock file " + path, ErrorCodes::CANNOT_OPEN_FILE);
throwFromErrnoWithPath("Cannot lock file " + path, path, ErrorCodes::CANNOT_OPEN_FILE);
}
if (0 != ftruncate(fd, 0))
throwFromErrno("Cannot ftruncate " + path, ErrorCodes::CANNOT_TRUNCATE_FILE);
throwFromErrnoWithPath("Cannot ftruncate " + path, path, ErrorCodes::CANNOT_TRUNCATE_FILE);
if (0 != lseek(fd, 0, SEEK_SET))
throwFromErrno("Cannot lseek " + path, ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
throwFromErrnoWithPath("Cannot lseek " + path, path, ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
/// Write information about current server instance to the file.
{


@ -26,16 +26,19 @@ void createHardLink(const String & source_path, const String & destination_path)
struct stat destination_descr;
if (0 != lstat(source_path.c_str(), &source_descr))
throwFromErrno("Cannot stat " + source_path, ErrorCodes::CANNOT_STAT);
throwFromErrnoWithPath("Cannot stat " + source_path, source_path, ErrorCodes::CANNOT_STAT);
if (0 != lstat(destination_path.c_str(), &destination_descr))
throwFromErrno("Cannot stat " + destination_path, ErrorCodes::CANNOT_STAT);
throwFromErrnoWithPath("Cannot stat " + destination_path, destination_path, ErrorCodes::CANNOT_STAT);
if (source_descr.st_ino != destination_descr.st_ino)
throwFromErrno("Destination file " + destination_path + " is already exist and have different inode.", ErrorCodes::CANNOT_LINK, link_errno);
throwFromErrnoWithPath(
"Destination file " + destination_path + " is already exist and have different inode.",
destination_path, ErrorCodes::CANNOT_LINK, link_errno);
}
else
throwFromErrno("Cannot link " + source_path + " to " + destination_path, ErrorCodes::CANNOT_LINK);
throwFromErrnoWithPath("Cannot link " + source_path + " to " + destination_path, destination_path,
ErrorCodes::CANNOT_LINK);
}
}


@ -9,6 +9,7 @@ void registerFunctionsBitmap(FunctionFactory & factory)
{
factory.registerFunction<FunctionBitmapBuild>();
factory.registerFunction<FunctionBitmapToArray>();
factory.registerFunction<FunctionBitmapSubsetInRange>();
factory.registerFunction<FunctionBitmapSelfCardinality>();
factory.registerFunction<FunctionBitmapAndCardinality>();


@ -30,6 +30,9 @@ namespace ErrorCodes
* Convert bitmap to integer array:
* bitmapToArray: bitmap -> integer[]
*
* Return the subset in the specified range (range_end itself is not included):
* bitmapSubsetInRange: bitmap,integer,integer -> bitmap
*
* Two bitmap and calculation:
* bitmapAnd: bitmap,bitmap -> bitmap
*
@ -240,6 +243,119 @@ private:
}
};
class FunctionBitmapSubsetInRange : public IFunction
{
public:
static constexpr auto name = "bitmapSubsetInRange";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapSubsetInRange>(); }
String getName() const override { return name; }
bool isVariadic() const override { return false; }
size_t getNumberOfArguments() const override { return 3; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
const DataTypeAggregateFunction * bitmap_type = typeid_cast<const DataTypeAggregateFunction *>(arguments[0].get());
if (!(bitmap_type && bitmap_type->getFunctionName() == AggregateFunctionGroupBitmapData<UInt32>::name()))
throw Exception(
"First argument for function " + getName() + " must be an bitmap but it has type " + arguments[0]->getName() + ".",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
auto arg_type1 = typeid_cast<const DataTypeNumber<UInt32> *>(arguments[1].get());
if (!(arg_type1))
throw Exception(
"Second argument for function " + getName() + " must be UInt32 but it has type " + arguments[1]->getName() + ".",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
auto arg_type2 = typeid_cast<const DataTypeNumber<UInt32> *>(arguments[2].get());
if (!(arg_type2))
throw Exception(
"Third argument for function " + getName() + " must be UInt32 but it has type " + arguments[2]->getName() + ".",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return arguments[0];
}
bool useDefaultImplementationForConstants() const override { return true; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
const IDataType * from_type = block.getByPosition(arguments[0]).type.get();
const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(from_type);
WhichDataType which(aggr_type->getArgumentsDataTypes()[0]);
if (which.isUInt8())
executeIntType<UInt8>(block, arguments, result, input_rows_count);
else if (which.isUInt16())
executeIntType<UInt16>(block, arguments, result, input_rows_count);
else if (which.isUInt32())
executeIntType<UInt32>(block, arguments, result, input_rows_count);
else if (which.isUInt64())
executeIntType<UInt64>(block, arguments, result, input_rows_count);
else
throw Exception(
"Unexpected type " + from_type->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
private:
using ToType = UInt64;
template <typename T>
void executeIntType(
Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count)
const
{
const IColumn * columns[3];
bool is_column_const[3];
const ColumnAggregateFunction * colAggFunc;
const PaddedPODArray<AggregateDataPtr> * container0;
const PaddedPODArray<UInt32> * container1, * container2;
for (size_t i = 0; i < 3; ++i)
{
columns[i] = block.getByPosition(arguments[i]).column.get();
is_column_const[i] = isColumnConst(*columns[i]);
}
if (is_column_const[0])
{
colAggFunc = typeid_cast<const ColumnAggregateFunction*>(typeid_cast<const ColumnConst*>(columns[0])->getDataColumnPtr().get());
}
else
{
colAggFunc = typeid_cast<const ColumnAggregateFunction*>(columns[0]);
}
container0 = &colAggFunc->getData();
if (is_column_const[1])
container1 = &typeid_cast<const ColumnUInt32*>(typeid_cast<const ColumnConst*>(columns[1])->getDataColumnPtr().get())->getData();
else
container1 = &typeid_cast<const ColumnUInt32*>(columns[1])->getData();
if (is_column_const[2])
container2 = &typeid_cast<const ColumnUInt32*>(typeid_cast<const ColumnConst*>(columns[2])->getDataColumnPtr().get())->getData();
else
container2 = &typeid_cast<const ColumnUInt32*>(columns[2])->getData();
auto col_to = ColumnAggregateFunction::create(colAggFunc->getAggregateFunction());
col_to->reserve(input_rows_count);
for (size_t i = 0; i < input_rows_count; ++i)
{
const AggregateDataPtr dataPtr0 = is_column_const[0] ? (*container0)[0] : (*container0)[i];
const AggregateFunctionGroupBitmapData<T>& bd0
= *reinterpret_cast<const AggregateFunctionGroupBitmapData<T>*>(dataPtr0);
const UInt32 range_start = is_column_const[1] ? (*container1)[0] : (*container1)[i];
const UInt32 range_end = is_column_const[2] ? (*container2)[0] : (*container2)[i];
auto bd2 = new AggregateFunctionGroupBitmapData<T>();
bd0.rbs.rb_range(range_start, range_end, bd2->rbs);
col_to->insertFrom(reinterpret_cast<ConstAggregateDataPtr>(bd2));
}
block.getByPosition(result).column = std::move(col_to);
}
};
template <typename Name>
class FunctionBitmapSelfCardinalityImpl : public IFunction
{


@ -36,6 +36,10 @@ const UInt8 geohash_base32_decode_lookup_table[256] = {
const size_t BITS_PER_SYMBOL = 5;
const size_t MAX_PRECISION = 12;
const size_t MAX_BITS = MAX_PRECISION * BITS_PER_SYMBOL * 1.5;
const Float64 LON_MIN = -180;
const Float64 LON_MAX = 180;
const Float64 LAT_MIN = -90;
const Float64 LAT_MAX = 90;
using Encoded = std::array<UInt8, MAX_BITS>;
@ -64,7 +68,7 @@ inline Encoded encodeCoordinate(Float64 coord, Float64 min, Float64 max, UInt8 b
for (size_t i = 0; i < bits; ++i)
{
Float64 mid = (max + min) / 2;
const Float64 mid = (max + min) / 2;
if (coord >= mid)
{
result[i] = 1;
@ -148,7 +152,7 @@ inline void base32Encode(const Encoded & binary, UInt8 precision, char * out)
{
extern const char geohash_base32_encode_lookup_table[32];
for (UInt8 i = 0; i < precision * BITS_PER_SYMBOL; i += 5)
for (UInt8 i = 0; i < precision * BITS_PER_SYMBOL; i += BITS_PER_SYMBOL)
{
UInt8 v = binary[i];
v <<= 1;
@ -187,24 +191,38 @@ inline Encoded base32Decode(const char * encoded_string, size_t encoded_length)
return result;
}
inline Float64 getMaxSpan(CoordType type)
{
if (type == LONGITUDE)
{
return LON_MAX - LON_MIN;
}
return LAT_MAX - LAT_MIN;
}
namespace DB
inline Float64 getSpan(UInt8 precision, CoordType type)
{
const auto bits = singleCoordBitsPrecision(precision, type);
// Since every extra bit of precision halves the span, divide the maximum span by 2^bits.
return ldexp(getMaxSpan(type), -1 * bits);
}
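A worked instance of the ldexp arithmetic in getSpan (the bit count of 8 is an assumed value; in real calls singleCoordBitsPrecision supplies it):

// Every extra bit of precision halves the span, so with 8 longitude bits
// each grid cell spans (LON_MAX - LON_MIN) / 2^8 degrees:
const Float64 cell_span = ldexp(LON_MAX - LON_MIN, -8);   // 360 / 256 = 1.40625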
namespace GeoUtils
{
size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char *& out)
inline UInt8 geohashPrecision(UInt8 precision)
{
if (precision == 0 || precision > MAX_PRECISION)
{
precision = MAX_PRECISION;
}
return precision;
}
inline size_t geohashEncodeImpl(Float64 longitude, Float64 latitude, UInt8 precision, char * out)
{
const Encoded combined = merge(
encodeCoordinate(longitude, -180, 180, singleCoordBitsPrecision(precision, LONGITUDE)),
encodeCoordinate(latitude, -90, 90, singleCoordBitsPrecision(precision, LATITUDE)),
encodeCoordinate(longitude, LON_MIN, LON_MAX, singleCoordBitsPrecision(precision, LONGITUDE)),
encodeCoordinate(latitude, LAT_MIN, LAT_MAX, singleCoordBitsPrecision(precision, LATITUDE)),
precision);
base32Encode(combined, precision, out);
@ -212,9 +230,28 @@ size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char
return precision;
}
}
namespace DB
{
namespace ErrorCodes
{
extern const int ARGUMENT_OUT_OF_BOUND;
}
namespace GeoUtils
{
size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char * out)
{
precision = geohashPrecision(precision);
return geohashEncodeImpl(longitude, latitude, precision, out);
}
void geohashDecode(const char * encoded_string, size_t encoded_len, Float64 * longitude, Float64 * latitude)
{
const UInt8 precision = std::min(encoded_len, MAX_PRECISION);
const UInt8 precision = std::min(encoded_len, static_cast<size_t>(MAX_PRECISION));
if (precision == 0)
{
return;
@ -223,8 +260,89 @@ void geohashDecode(const char * encoded_string, size_t encoded_len, Float64 * lo
Encoded lat_encoded, lon_encoded;
std::tie(lon_encoded, lat_encoded) = split(base32Decode(encoded_string, precision), precision);
*longitude = decodeCoordinate(lon_encoded, -180, 180, singleCoordBitsPrecision(precision, LONGITUDE));
*latitude = decodeCoordinate(lat_encoded, -90, 90, singleCoordBitsPrecision(precision, LATITUDE));
*longitude = decodeCoordinate(lon_encoded, LON_MIN, LON_MAX, singleCoordBitsPrecision(precision, LONGITUDE));
*latitude = decodeCoordinate(lat_encoded, LAT_MIN, LAT_MAX, singleCoordBitsPrecision(precision, LATITUDE));
}
GeohashesInBoxPreparedArgs geohashesInBoxPrepare(const Float64 longitude_min,
const Float64 latitude_min,
const Float64 longitude_max,
const Float64 latitude_max,
UInt8 precision)
{
precision = geohashPrecision(precision);
if (longitude_max < longitude_min || latitude_max < latitude_min)
{
return {};
}
const auto lon_step = getSpan(precision, LONGITUDE);
const auto lat_step = getSpan(precision, LATITUDE);
// Align max to the right (or upper) border of the geohash grid cell to ensure that the cell is included in the result.
Float64 lon_min = floor(longitude_min / lon_step) * lon_step;
Float64 lat_min = floor(latitude_min / lat_step) * lat_step;
Float64 lon_max = ceil(longitude_max / lon_step) * lon_step;
Float64 lat_max = ceil(latitude_max / lat_step) * lat_step;
const auto lon_span = lon_max - lon_min;
const auto lat_span = lat_max - lat_min;
// in case of a very small (or zero) span, produce at least 1 item.
const auto items_count = std::max(size_t{1}, static_cast<size_t>(ceil(lon_span/lon_step * lat_span/lat_step)));
return GeohashesInBoxPreparedArgs{
items_count,
precision,
lon_min,
lat_min,
lon_max,
lat_max,
lon_step,
lat_step
};
}
UInt64 geohashesInBox(const GeohashesInBoxPreparedArgs & args, char * out)
{
if (args.items_count == 0
|| args.precision == 0
|| args.precision > MAX_PRECISION
|| args.latitude_min > args.latitude_max
|| args.longitude_min > args.longitude_max
|| args.longitude_step <= 0
|| args.latitude_step <= 0)
{
return 0;
}
UInt64 items = 0;
for (auto lon = args.longitude_min; lon < args.longitude_max; lon += args.longitude_step)
{
for (auto lat = args.latitude_min; lat < args.latitude_max; lat += args.latitude_step)
{
assert(items <= args.items_count);
size_t l = geohashEncodeImpl(lon, lat, args.precision, out);
out += l;
*out = '\0';
++out;
++items;
}
}
if (items == 0 && args.items_count != 0)
{
size_t l = geohashEncodeImpl(args.longitude_min, args.latitude_min, args.precision, out);
out += l;
*out = '\0';
++out;
++items;
}
return items;
}
}
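A minimal caller-side sketch of the prepare/encode pair defined above (the box coordinates, precision and buffer handling are illustrative assumptions; the SQL-facing function later in this diff does the same with column buffers):

// Assumes #include <Functions/GeoUtils.h> and <vector>.
const auto args = GeoUtils::geohashesInBoxPrepare(24.48, 56.36, 24.52, 56.40, /* precision = */ 4);
std::vector<char> buf(args.items_count * (args.precision + 1));   // each hash: precision chars + '\0'
const UInt64 written = GeoUtils::geohashesInBox(args, buf.data());
// `buf` now holds `written` zero-terminated geohash strings packed back to back.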


@ -706,10 +706,33 @@ std::string serialize(Polygon && polygon)
return result;
}
size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char *& out);
size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char * out);
void geohashDecode(const char * encoded_string, size_t encoded_len, Float64 * longitude, Float64 * latitude);
std::vector<std::pair<Float64, Float64>> geohashCoverBox(Float64 longitude_min, Float64 latitude_min, Float64 longitude_max, Float64 latitude_max, UInt8 precision, UInt32 max_items = 0);
struct GeohashesInBoxPreparedArgs
{
UInt64 items_count = 0;
UInt8 precision = 0;
Float64 longitude_min = 0.0;
Float64 latitude_min = 0.0;
Float64 longitude_max = 0.0;
Float64 latitude_max = 0.0;
Float64 longitude_step = 0.0;
Float64 latitude_step = 0.0;
};
GeohashesInBoxPreparedArgs geohashesInBoxPrepare(const Float64 longitude_min,
const Float64 latitude_min,
Float64 longitude_max,
Float64 latitude_max,
UInt8 precision);
UInt64 geohashesInBox(const GeohashesInBoxPreparedArgs & estimation, char * out);
} /// GeoUtils


@ -0,0 +1,169 @@
#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/GeoUtils.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <memory>
#include <string>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_LARGE_ARRAY_SIZE;
}
class FunctionGeohashesInBox : public IFunction
{
public:
static constexpr auto name = "geohashesInBox";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionGeohashesInBox>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 5; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
validateArgumentType(*this, arguments, 0, isFloat, "float");
validateArgumentType(*this, arguments, 1, isFloat, "float");
validateArgumentType(*this, arguments, 2, isFloat, "float");
validateArgumentType(*this, arguments, 3, isFloat, "float");
validateArgumentType(*this, arguments, 4, isUInt8, "integer");
if (!(arguments[0]->equals(*arguments[1]) &&
arguments[0]->equals(*arguments[2]) &&
arguments[0]->equals(*arguments[3])))
{
throw Exception("Illegal type of argument of " + getName() +
" all coordinate arguments must have the same type, instead they are:" +
arguments[0]->getName() + ", " +
arguments[1]->getName() + ", " +
arguments[2]->getName() + ", " +
arguments[3]->getName() + ".",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
}
bool useDefaultImplementationForConstants() const override { return true; }
template <typename LonAndLatType, typename PrecisionType>
void execute(const IColumn * lon_min_column,
const IColumn * lat_min_column,
const IColumn * lon_max_column,
const IColumn * lat_max_column,
const IColumn * precision_column,
ColumnPtr & result)
{
static constexpr size_t max_array_size = 10'000'000;
const auto * lon_min = checkAndGetColumn<ColumnVector<LonAndLatType>>(lon_min_column);
const auto * lat_min = checkAndGetColumn<ColumnVector<LonAndLatType>>(lat_min_column);
const auto * lon_max = checkAndGetColumn<ColumnVector<LonAndLatType>>(lon_max_column);
const auto * lat_max = checkAndGetColumn<ColumnVector<LonAndLatType>>(lat_max_column);
auto * precision = checkAndGetColumn<ColumnVector<PrecisionType>>(precision_column);
if (precision == nullptr)
{
precision = checkAndGetColumnConstData<ColumnVector<PrecisionType>>(precision_column);
}
if (!lon_min || !lat_min || !lon_max || !lat_max || !precision)
{
throw Exception("Unsupported argument types for function " + getName() + " : " +
lon_min_column->getName() + ", " +
lat_min_column->getName() + ", " +
lon_max_column->getName() + ", " +
lat_max_column->getName() + ".",
ErrorCodes::LOGICAL_ERROR);
}
const size_t total_rows = lat_min->size();
auto col_res = ColumnArray::create(ColumnString::create());
ColumnString & res_strings = typeid_cast<ColumnString &>(col_res->getData());
ColumnArray::Offsets & res_offsets = col_res->getOffsets();
ColumnString::Chars & res_strings_chars = res_strings.getChars();
ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets();
for (size_t row = 0; row < total_rows; ++row)
{
const Float64 lon_min_value = lon_min->getElement(row);
const Float64 lat_min_value = lat_min->getElement(row);
const Float64 lon_max_value = lon_max->getElement(row);
const Float64 lat_max_value = lat_max->getElement(row);
const auto prepared_args = GeoUtils::geohashesInBoxPrepare(
lon_min_value, lat_min_value, lon_max_value, lat_max_value,
precision->getElement(row % precision->size()));
if (prepared_args.items_count > max_array_size)
{
throw Exception(getName() + " would produce " + std::to_string(prepared_args.items_count) +
" array elements, which is bigger than the allowed maximum of " + std::to_string(max_array_size),
ErrorCodes::TOO_LARGE_ARRAY_SIZE);
}
res_strings_offsets.reserve(res_strings_offsets.size() + prepared_args.items_count);
res_strings_chars.resize(res_strings_chars.size() + prepared_args.items_count * (prepared_args.precision + 1));
const auto starting_offset = res_strings_offsets.empty() ? 0 : res_strings_offsets.back();
char * out = reinterpret_cast<char *>(res_strings_chars.data() + starting_offset);
// Actually write geohashes into preallocated buffer.
GeoUtils::geohashesInBox(prepared_args, out);
for (UInt8 i = 1; i <= prepared_args.items_count ; ++i)
{
res_strings_offsets.push_back(starting_offset + (prepared_args.precision + 1) * i);
}
res_offsets.push_back((res_offsets.empty() ? 0 : res_offsets.back()) + prepared_args.items_count);
}
if (!res_strings_offsets.empty() && res_strings_offsets.back() != res_strings_chars.size())
{
throw Exception("String column size mismatch (internal logical error)", ErrorCodes::LOGICAL_ERROR);
}
if (!res_offsets.empty() && res_offsets.back() != res_strings.size())
{
throw Exception("Arrary column size mismatch (internal logical error)" +
std::to_string(res_offsets.back()) + " != " + std::to_string(res_strings.size()),
ErrorCodes::LOGICAL_ERROR);
}
result = std::move(col_res);
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
const IColumn * lon_min = block.getByPosition(arguments[0]).column.get();
const IColumn * lat_min = block.getByPosition(arguments[1]).column.get();
const IColumn * lon_max = block.getByPosition(arguments[2]).column.get();
const IColumn * lat_max = block.getByPosition(arguments[3]).column.get();
const IColumn * prec = block.getByPosition(arguments[4]).column.get();
ColumnPtr & res = block.getByPosition(result).column;
if (checkColumn<ColumnVector<Float32>>(lon_min))
{
execute<Float32, UInt8>(lon_min, lat_min, lon_max, lat_max, prec, res);
}
else
{
execute<Float64, UInt8>(lon_min, lat_min, lon_max, lat_max, prec, res);
}
}
};
void registerFunctionGeohashesInBox(FunctionFactory & factory)
{
factory.registerFunction<FunctionGeohashesInBox>();
}
}
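To make the offset bookkeeping in execute() above concrete, a tiny illustration with assumed numbers (precision 4, three geohashes, empty result so far):

// Each geohash occupies precision + 1 bytes (4 characters plus the trailing '\0'),
// so the chars buffer grows by 3 * 5 = 15 bytes and the per-string offsets are:
std::vector<size_t> string_offsets;
for (size_t i = 1; i <= 3; ++i)
    string_offsets.push_back(0 + (4 + 1) * i);   // 5, 10, 15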


@ -10,6 +10,7 @@ void registerFunctionPointInEllipses(FunctionFactory & factory);
void registerFunctionPointInPolygon(FunctionFactory & factory);
void registerFunctionGeohashEncode(FunctionFactory & factory);
void registerFunctionGeohashDecode(FunctionFactory & factory);
void registerFunctionGeohashesInBox(FunctionFactory & factory);
#if USE_H3
void registerFunctionGeoToH3(FunctionFactory &);
@ -22,6 +23,7 @@ void registerFunctionsGeo(FunctionFactory & factory)
registerFunctionPointInPolygon(factory);
registerFunctionGeohashEncode(factory);
registerFunctionGeohashDecode(factory);
registerFunctionGeohashesInBox(factory);
#if USE_H3
registerFunctionGeoToH3(factory);


@ -1,9 +1,11 @@
#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnsCommon.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/WriteHelpers.h>
namespace DB
@ -13,6 +15,7 @@ namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int FUNCTION_THROW_IF_VALUE_IS_NON_ZERO;
}
@ -32,46 +35,70 @@ public:
return name;
}
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override
{
return 1;
return 0;
}
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isNativeNumber(arguments.front()))
const size_t number_of_arguments = arguments.size();
if (number_of_arguments < 1 || number_of_arguments > 2)
throw Exception{"Number of arguments for function " + getName() + " doesn't match: passed "
+ toString(number_of_arguments) + ", should be 1 or 2",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH};
if (!isNativeNumber(arguments[0]))
throw Exception{"Argument for function " + getName() + " must be number", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
if (number_of_arguments > 1 && !isString(arguments[1]))
throw Exception{"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
return std::make_shared<DataTypeUInt8>();
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
std::optional<String> custom_message;
if (arguments.size() == 2)
{
auto * msg_column = checkAndGetColumnConst<ColumnString>(block.getByPosition(arguments[1]).column.get());
if (!msg_column)
throw Exception{"Second argument for function " + getName() + " must be constant String", ErrorCodes::ILLEGAL_COLUMN};
custom_message = msg_column->getValue<String>();
}
const auto in = block.getByPosition(arguments.front()).column.get();
if ( !execute<UInt8>(block, in, result)
&& !execute<UInt16>(block, in, result)
&& !execute<UInt32>(block, in, result)
&& !execute<UInt64>(block, in, result)
&& !execute<Int8>(block, in, result)
&& !execute<Int16>(block, in, result)
&& !execute<Int32>(block, in, result)
&& !execute<Int64>(block, in, result)
&& !execute<Float32>(block, in, result)
&& !execute<Float64>(block, in, result))
if ( !execute<UInt8>(block, in, result, custom_message)
&& !execute<UInt16>(block, in, result, custom_message)
&& !execute<UInt32>(block, in, result, custom_message)
&& !execute<UInt64>(block, in, result, custom_message)
&& !execute<Int8>(block, in, result, custom_message)
&& !execute<Int16>(block, in, result, custom_message)
&& !execute<Int32>(block, in, result, custom_message)
&& !execute<Int64>(block, in, result, custom_message)
&& !execute<Float32>(block, in, result, custom_message)
&& !execute<Float64>(block, in, result, custom_message))
throw Exception{"Illegal column " + in->getName() + " of first argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
}
template <typename T>
bool execute(Block & block, const IColumn * in_untyped, const size_t result)
bool execute(Block & block, const IColumn * in_untyped, const size_t result, const std::optional<String> & message)
{
if (const auto in = checkAndGetColumn<ColumnVector<T>>(in_untyped))
{
const auto & in_data = in->getData();
if (!memoryIsZero(in_data.data(), in_data.size() * sizeof(in_data[0])))
throw Exception("Value passed to 'throwIf' function is non zero", ErrorCodes::FUNCTION_THROW_IF_VALUE_IS_NON_ZERO);
throw Exception{message.value_or("Value passed to '" + getName() + "' function is non zero"),
ErrorCodes::FUNCTION_THROW_IF_VALUE_IS_NON_ZERO};
/// We return non constant to avoid constant folding.
block.getByPosition(result).column = ColumnUInt8::create(in_data.size(), 0);


@ -29,7 +29,8 @@ void MMapReadBufferFromFile::open(const std::string & file_name)
fd = ::open(file_name.c_str(), O_RDONLY);
if (-1 == fd)
throwFromErrno("Cannot open file " + file_name, errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
throwFromErrnoWithPath("Cannot open file " + file_name, file_name,
errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
}


@ -54,7 +54,7 @@ ReadBufferAIO::ReadBufferAIO(const std::string & filename_, size_t buffer_size_,
if (fd == -1)
{
auto error_code = (errno == ENOENT) ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE;
throwFromErrno("Cannot open file " + filename, error_code);
throwFromErrnoWithPath("Cannot open file " + filename, filename, error_code);
}
}


@ -41,12 +41,13 @@ ReadBufferFromFile::ReadBufferFromFile(
fd = ::open(file_name.c_str(), flags == -1 ? O_RDONLY : flags);
if (-1 == fd)
throwFromErrno("Cannot open file " + file_name, errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
throwFromErrnoWithPath("Cannot open file " + file_name, file_name,
errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
#ifdef __APPLE__
if (o_direct)
{
if (fcntl(fd, F_NOCACHE, 1) == -1)
throwFromErrno("Cannot set F_NOCACHE on file " + file_name, ErrorCodes::CANNOT_OPEN_FILE);
throwFromErrno("Cannot set F_NOCACHE on file " + file_name, file_name, ErrorCodes::CANNOT_OPEN_FILE);
}
#endif
}


@ -61,7 +61,8 @@ bool ReadBufferFromFileDescriptor::nextImpl()
if (-1 == res && errno != EINTR)
{
ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadFailed);
throwFromErrno("Cannot read from file " + getFileName(), ErrorCodes::CANNOT_READ_FROM_FILE_DESCRIPTOR);
throwFromErrnoWithPath("Cannot read from file " + getFileName(), getFileName(),
ErrorCodes::CANNOT_READ_FROM_FILE_DESCRIPTOR);
}
if (res > 0)
@ -124,7 +125,8 @@ off_t ReadBufferFromFileDescriptor::doSeek(off_t offset, int whence)
pos = working_buffer.end();
off_t res = ::lseek(fd, new_pos, SEEK_SET);
if (-1 == res)
throwFromErrno("Cannot seek through file " + getFileName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
throwFromErrnoWithPath("Cannot seek through file " + getFileName(), getFileName(),
ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
pos_in_file = new_pos;
watch.stop();


@ -62,7 +62,7 @@ WriteBufferAIO::WriteBufferAIO(const std::string & filename_, size_t buffer_size
if (fd == -1)
{
auto error_code = (errno == ENOENT) ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE;
throwFromErrno("Cannot open file " + filename, error_code);
throwFromErrnoWithPath("Cannot open file " + filename, filename, error_code);
}
}
@ -96,7 +96,7 @@ void WriteBufferAIO::sync()
/// Ask OS to flush data to disk.
int res = ::fsync(fd);
if (res == -1)
throwFromErrno("Cannot fsync " + getFileName(), ErrorCodes::CANNOT_FSYNC);
throwFromErrnoWithPath("Cannot fsync " + getFileName(), getFileName(), ErrorCodes::CANNOT_FSYNC);
}
void WriteBufferAIO::nextImpl()
@ -173,7 +173,7 @@ void WriteBufferAIO::doTruncate(off_t length)
int res = ::ftruncate(fd, length);
if (res == -1)
throwFromErrno("Cannot truncate file " + filename, ErrorCodes::CANNOT_TRUNCATE_FILE);
throwFromErrnoWithPath("Cannot truncate file " + filename, filename, ErrorCodes::CANNOT_TRUNCATE_FILE);
}
void WriteBufferAIO::flush()
@ -427,7 +427,7 @@ void WriteBufferAIO::finalize()
/// Truncate the file to remove unnecessary zeros from it.
int res = ::ftruncate(fd, max_pos_in_file);
if (res == -1)
throwFromErrno("Cannot truncate file " + filename, ErrorCodes::CANNOT_TRUNCATE_FILE);
throwFromErrnoWithPath("Cannot truncate file " + filename, filename, ErrorCodes::CANNOT_TRUNCATE_FILE);
}
}


@ -44,13 +44,14 @@ WriteBufferFromFile::WriteBufferFromFile(
fd = ::open(file_name.c_str(), flags == -1 ? O_WRONLY | O_TRUNC | O_CREAT : flags, mode);
if (-1 == fd)
throwFromErrno("Cannot open file " + file_name, errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
throwFromErrnoWithPath("Cannot open file " + file_name, file_name,
errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
#ifdef __APPLE__
if (o_direct)
{
if (fcntl(fd, F_NOCACHE, 1) == -1)
throwFromErrno("Cannot set F_NOCACHE on file " + file_name, ErrorCodes::CANNOT_OPEN_FILE);
throwFromErrno("Cannot set F_NOCACHE on file " + file_name, file_name, ErrorCodes::CANNOT_OPEN_FILE);
}
#endif
}


@ -56,7 +56,8 @@ void WriteBufferFromFileDescriptor::nextImpl()
if ((-1 == res || 0 == res) && errno != EINTR)
{
ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteFailed);
throwFromErrno("Cannot write to file " + getFileName(), ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR);
throwFromErrnoWithPath("Cannot write to file " + getFileName(), getFileName(),
ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR);
}
if (res > 0)
@ -111,7 +112,7 @@ void WriteBufferFromFileDescriptor::sync()
/// Request OS to sync data with storage medium.
int res = fsync(fd);
if (-1 == res)
throwFromErrno("Cannot fsync " + getFileName(), ErrorCodes::CANNOT_FSYNC);
throwFromErrnoWithPath("Cannot fsync " + getFileName(), getFileName(), ErrorCodes::CANNOT_FSYNC);
}
@ -119,7 +120,8 @@ off_t WriteBufferFromFileDescriptor::doSeek(off_t offset, int whence)
{
off_t res = lseek(fd, offset, whence);
if (-1 == res)
throwFromErrno("Cannot seek through file " + getFileName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
throwFromErrnoWithPath("Cannot seek through file " + getFileName(), getFileName(),
ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
return res;
}
@ -128,7 +130,7 @@ void WriteBufferFromFileDescriptor::doTruncate(off_t length)
{
int res = ftruncate(fd, length);
if (-1 == res)
throwFromErrno("Cannot truncate file " + getFileName(), ErrorCodes::CANNOT_TRUNCATE_FILE);
throwFromErrnoWithPath("Cannot truncate file " + getFileName(), getFileName(), ErrorCodes::CANNOT_TRUNCATE_FILE);
}
}


@ -39,7 +39,8 @@ public:
off_t res = lseek(fd, 0, SEEK_SET);
if (-1 == res)
throwFromErrno("Cannot reread temporary file " + file_name, ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
throwFromErrnoWithPath("Cannot reread temporary file " + file_name, file_name,
ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
return std::make_shared<ReadBufferFromTemporaryWriteBuffer>(fd, file_name, std::move(origin->tmp_file));
}


@ -1,6 +1,7 @@
#include <IO/WriteBufferAIO.h>
#include <Core/Defines.h>
#include <functional>
#include <filesystem>
#include <iostream>
#include <fstream>


@ -565,7 +565,8 @@ void DistributedBlockOutputStream::writeToShard(const Block & block, const std::
}
if (link(first_file_tmp_path.data(), block_file_path.data()))
throwFromErrno("Could not link " + block_file_path + " to " + first_file_tmp_path, ErrorCodes::CANNOT_LINK);
throwFromErrnoWithPath("Could not link " + block_file_path + " to " + first_file_tmp_path, block_file_path,
ErrorCodes::CANNOT_LINK);
}
/** remove the temporary file, enabling the OS to reclaim inode after all threads


@ -145,9 +145,12 @@ namespace
}
}
void IStorage::check(const Names & column_names) const
void IStorage::check(const Names & column_names, bool include_virtuals) const
{
const NamesAndTypesList & available_columns = getColumns().getAllPhysical();
NamesAndTypesList available_columns = getColumns().getAllPhysical();
if (include_virtuals)
available_columns.splice(available_columns.end(), getColumns().getVirtuals());
const String list_of_columns = listOfColumns(available_columns);
if (column_names.empty())


@ -116,7 +116,7 @@ public: /// thread-unsafe part. lockStructure must be acquired
/// Verify that all the requested names are in the table and are set correctly:
/// list of names is not empty and the names do not repeat.
void check(const Names & column_names) const;
void check(const Names & column_names, bool include_virtuals = false) const;
/// Check that all the requested names are in the table and have the correct types.
void check(const NamesAndTypesList & columns) const;


@ -56,32 +56,42 @@ void ReadBufferFromKafkaConsumer::commit()
void ReadBufferFromKafkaConsumer::subscribe(const Names & topics)
{
{
String message = "Subscribed to topics:";
String message = "Already subscribed to topics:";
for (const auto & topic : consumer->get_subscription())
message += " " + topic;
LOG_TRACE(log, message);
}
{
String message = "Assigned to topics:";
String message = "Already assigned to topics:";
for (const auto & toppar : consumer->get_assignment())
message += " " + toppar.get_topic();
LOG_TRACE(log, message);
}
consumer->resume();
// While we wait for an assignment after subscription, we'll poll zero messages anyway.
// If we're doing a manual select, it's better to get something after a wait than nothing immediately.
if (consumer->get_subscription().empty())
// But due to the nature of async pause/resume/subscribe we can't guarantee any persistent state:
// see https://github.com/edenhill/librdkafka/issues/2455
while (consumer->get_subscription().empty())
{
consumer->pause(); // don't accidentally read any messages
consumer->subscribe(topics);
consumer->poll(5s);
consumer->resume();
stalled = false;
// FIXME: if we failed to receive "subscribe" response while polling and destroy consumer now, then we may hang up.
// see https://github.com/edenhill/librdkafka/issues/2077
try
{
consumer->subscribe(topics);
if (nextImpl())
break;
// FIXME: if we failed to receive "subscribe" response while polling and destroy consumer now, then we may hang up.
// see https://github.com/edenhill/librdkafka/issues/2077
}
catch (cppkafka::HandleException & e)
{
if (e.get_error() == RD_KAFKA_RESP_ERR__TIMED_OUT)
continue;
throw;
}
}
stalled = false;


@ -2,7 +2,15 @@
#include <mutex>
#include <sys/statvfs.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#if defined(__linux__)
#include <cstdio>
#include <mntent.h>
#endif
#include <memory>
#include <filesystem>
#include <boost/noncopyable.hpp>
#include <common/logger_useful.h>
#include <Common/Exception.h>
@ -23,6 +31,7 @@ namespace ErrorCodes
{
extern const int CANNOT_STATVFS;
extern const int NOT_ENOUGH_SPACE;
extern const int SYSTEM_ERROR;
}
@ -96,12 +105,18 @@ public:
using ReservationPtr = std::unique_ptr<Reservation>;
static UInt64 getUnreservedFreeSpace(const std::string & path)
inline static struct statvfs getStatVFS(const std::string & path)
{
struct statvfs fs;
if (statvfs(path.c_str(), &fs) != 0)
throwFromErrno("Could not calculate available disk space (statvfs)", ErrorCodes::CANNOT_STATVFS);
throwFromErrnoWithPath("Could not calculate available disk space (statvfs)", path,
ErrorCodes::CANNOT_STATVFS);
return fs;
}
static UInt64 getUnreservedFreeSpace(const std::string & path)
{
struct statvfs fs = getStatVFS(path);
UInt64 res = fs.f_bfree * fs.f_bsize;
@ -140,6 +155,62 @@ public:
return std::make_unique<Reservation>(size);
}
/// Returns the mount point of the filesystem where absolute_path (which must exist) is located
static std::filesystem::path getMountPoint(std::filesystem::path absolute_path)
{
if (absolute_path.is_relative())
throw Exception("Path is relative. It's a bug.", ErrorCodes::LOGICAL_ERROR);
absolute_path = std::filesystem::canonical(absolute_path);
const auto get_device_id = [](const std::filesystem::path & p)
{
struct stat st;
if (stat(p.c_str(), &st))
throwFromErrnoWithPath("Cannot stat " + p.string(), p.string(), ErrorCodes::SYSTEM_ERROR);
return st.st_dev;
};
/// If /some/path/to/dir/ and /some/path/to/ have different device id,
/// then the device which contains /some/path/to/dir/filename is mounted at /some/path/to/dir/
auto device_id = get_device_id(absolute_path);
while (absolute_path.has_relative_path())
{
auto parent = absolute_path.parent_path();
auto parent_device_id = get_device_id(parent);
if (device_id != parent_device_id)
return absolute_path;
absolute_path = parent;
device_id = parent_device_id;
}
return absolute_path;
}
/// Returns name of filesystem mounted to mount_point
#if !defined(__linux__)
[[noreturn]]
#endif
static std::string getFilesystemName([[maybe_unused]] const std::string & mount_point)
{
#if defined(__linux__)
auto mounted_filesystems = setmntent("/etc/mtab", "r");
if (!mounted_filesystems)
throw DB::Exception("Cannot open /etc/mtab to get name of filesystem", ErrorCodes::SYSTEM_ERROR);
mntent fs_info;
constexpr size_t buf_size = 4096; /// The same as buffer used for getmntent in glibc. It can happen that it's not enough
char buf[buf_size];
while (getmntent_r(mounted_filesystems, &fs_info, buf, buf_size) && fs_info.mnt_dir != mount_point)
;
endmntent(mounted_filesystems);
if (fs_info.mnt_dir != mount_point)
throw DB::Exception("Cannot find name of filesystem by mount point " + mount_point, ErrorCodes::SYSTEM_ERROR);
return fs_info.mnt_fsname;
#else
throw DB::Exception("Supported on linux only", ErrorCodes::NOT_IMPLEMENTED);
#endif
}
private:
static UInt64 reserved_bytes;
static UInt64 reservation_count;
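A hedged usage sketch of the new DiskSpaceMonitor helpers (the data path is an illustrative assumption; the code is presumed to run inside namespace DB with <Storages/MergeTree/DiskSpaceMonitor.h> included):

const std::string data_path = "/var/lib/clickhouse/data";                 // assumed path
const auto stats = DiskSpaceMonitor::getStatVFS(data_path);
const UInt64 available_bytes = stats.f_bavail * stats.f_bsize;
const auto mount_point = DiskSpaceMonitor::getMountPoint(data_path);      // walks parents until the device id changes
#if defined(__linux__)
const std::string fs_name = DiskSpaceMonitor::getFilesystemName(mount_point.string());   // reads /etc/mtab
#endif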


@ -413,7 +413,8 @@ void MergeTreeDataPart::remove() const
{
String path_to_remove = to + "/" + file;
if (0 != unlink(path_to_remove.c_str()))
throwFromErrno("Cannot unlink file " + path_to_remove, ErrorCodes::CANNOT_UNLINK);
throwFromErrnoWithPath("Cannot unlink file " + path_to_remove, path_to_remove,
ErrorCodes::CANNOT_UNLINK);
}
#if !__clang__
#pragma GCC diagnostic pop
@ -423,11 +424,12 @@ void MergeTreeDataPart::remove() const
{
String path_to_remove = to + "/" + file;
if (0 != unlink(path_to_remove.c_str()))
throwFromErrno("Cannot unlink file " + path_to_remove, ErrorCodes::CANNOT_UNLINK);
throwFromErrnoWithPath("Cannot unlink file " + path_to_remove, path_to_remove,
ErrorCodes::CANNOT_UNLINK);
}
if (0 != rmdir(to.c_str()))
throwFromErrno("Cannot rmdir file " + to, ErrorCodes::CANNOT_UNLINK);
throwFromErrnoWithPath("Cannot rmdir file " + to, to, ErrorCodes::CANNOT_UNLINK);
}
catch (...)
{


@ -493,7 +493,7 @@ ClusterPtr StorageDistributed::skipUnusedShards(ClusterPtr cluster, const Select
{
const auto & select = query_info.query->as<ASTSelectQuery &>();
if (!select.where())
if (!select.where() || !sharding_key_expr)
return nullptr;
const auto & blocks = evaluateExpressionOverConstantCondition(select.where(), sharding_key_expr);


@ -214,7 +214,8 @@ StorageStripeLog::StorageStripeLog(
{
/// create files if they do not exist
if (0 != mkdir(full_path.c_str(), S_IRWXU | S_IRWXG | S_IRWXO) && errno != EEXIST)
throwFromErrno("Cannot create directory " + full_path, ErrorCodes::CANNOT_CREATE_DIRECTORY);
throwFromErrnoWithPath("Cannot create directory " + full_path, full_path,
ErrorCodes::CANNOT_CREATE_DIRECTORY);
}
}


@ -343,7 +343,8 @@ StorageTinyLog::StorageTinyLog(
{
/// create files if they do not exist
if (0 != mkdir(full_path.c_str(), S_IRWXU | S_IRWXG | S_IRWXO) && errno != EEXIST)
throwFromErrno("Cannot create directory " + full_path, ErrorCodes::CANNOT_CREATE_DIRECTORY);
throwFromErrnoWithPath("Cannot create directory " + full_path, full_path,
ErrorCodes::CANNOT_CREATE_DIRECTORY);
}
for (const auto & col : getColumns().getAllPhysical())


@ -21,7 +21,7 @@ BlockInputStreams StorageValues::read(
size_t /*max_block_size*/,
unsigned /*num_streams*/)
{
check(column_names);
check(column_names, true);
return BlockInputStreams(1, std::make_shared<OneBlockInputStream>(res_block));
}


@ -69,3 +69,10 @@ TEST(TransformQueryForExternalDatabase, Like)
"SELECT \"column\" FROM \"test\".\"table\" WHERE \"column\" NOT LIKE 'w%rld'",
state().context, state().columns);
}
TEST(TransformQueryForExternalDatabase, Substring)
{
check("SELECT column FROM test.table WHERE left(column, 10) = RIGHT(column, 10) AND SUBSTRING(column FROM 1 FOR 2) = 'Hello'",
"SELECT \"column\" FROM \"test\".\"table\"",
state().context, state().columns);
}


@ -22,7 +22,7 @@ try
Poco::File("./test_dir/file").createFile();
if (0 != symlink("./test_dir", "./test_link"))
DB::throwFromErrno("Cannot create symlink", DB::ErrorCodes::SYSTEM_ERROR);
DB::throwFromErrnoWithPath("Cannot create symlink", "./test_link", DB::ErrorCodes::SYSTEM_ERROR);
Poco::File link("./test_link");
link.renameTo("./test_link2");


@ -122,6 +122,7 @@ def kafka_setup_teardown():
# Tests
@pytest.mark.timeout(60)
def test_kafka_settings_old_syntax(kafka_cluster):
instance.query('''
CREATE TABLE test.kafka (key UInt64, value UInt64)
@ -136,14 +137,15 @@ def test_kafka_settings_old_syntax(kafka_cluster):
kafka_produce('old', messages)
result = ''
for i in range(50):
result += instance.query('SELECT * FROM test.kafka')
while True:
result += instance.query('SELECT * FROM test.kafka', ignore_error=True)
if kafka_check_result(result):
break
time.sleep(0.5)
kafka_check_result(result, True)
@pytest.mark.skip(reason="fails for some reason")
@pytest.mark.timeout(60)
def test_kafka_settings_new_syntax(kafka_cluster):
instance.query('''
CREATE TABLE test.kafka (key UInt64, value UInt64)
@ -171,14 +173,15 @@ def test_kafka_settings_new_syntax(kafka_cluster):
kafka_produce('new', messages)
result = ''
for i in range(50):
result += instance.query('SELECT * FROM test.kafka')
while True:
result += instance.query('SELECT * FROM test.kafka', ignore_error=True)
if kafka_check_result(result):
break
time.sleep(0.5)
kafka_check_result(result, True)
@pytest.mark.timeout(60)
def test_kafka_csv_with_delimiter(kafka_cluster):
instance.query('''
CREATE TABLE test.kafka (key UInt64, value UInt64)
@ -196,14 +199,15 @@ def test_kafka_csv_with_delimiter(kafka_cluster):
kafka_produce('csv', messages)
result = ''
for i in range(50):
result += instance.query('SELECT * FROM test.kafka')
while True:
result += instance.query('SELECT * FROM test.kafka', ignore_error=True)
if kafka_check_result(result):
break
time.sleep(0.5)
kafka_check_result(result, True)
@pytest.mark.timeout(60)
def test_kafka_tsv_with_delimiter(kafka_cluster):
instance.query('''
CREATE TABLE test.kafka (key UInt64, value UInt64)
@ -221,14 +225,15 @@ def test_kafka_tsv_with_delimiter(kafka_cluster):
kafka_produce('tsv', messages)
result = ''
for i in range(50):
result += instance.query('SELECT * FROM test.kafka')
while True:
result += instance.query('SELECT * FROM test.kafka', ignore_error=True)
if kafka_check_result(result):
break
time.sleep(0.5)
kafka_check_result(result, True)
@pytest.mark.timeout(60)
def test_kafka_json_without_delimiter(kafka_cluster):
instance.query('''
CREATE TABLE test.kafka (key UInt64, value UInt64)
@ -250,14 +255,15 @@ def test_kafka_json_without_delimiter(kafka_cluster):
kafka_produce('json', [messages])
result = ''
for i in range(50):
result += instance.query('SELECT * FROM test.kafka')
while True:
result += instance.query('SELECT * FROM test.kafka', ignore_error=True)
if kafka_check_result(result):
break
time.sleep(0.5)
kafka_check_result(result, True)
@pytest.mark.timeout(60)
def test_kafka_protobuf(kafka_cluster):
instance.query('''
CREATE TABLE test.kafka (key UInt64, value String)
@ -274,14 +280,15 @@ def test_kafka_protobuf(kafka_cluster):
kafka_produce_protobuf_messages('pb', 21, 29)
result = ''
for i in range(50):
result += instance.query('SELECT * FROM test.kafka')
while True:
result += instance.query('SELECT * FROM test.kafka', ignore_error=True)
if kafka_check_result(result):
break
time.sleep(0.5)
kafka_check_result(result, True)
@pytest.mark.timeout(60)
def test_kafka_materialized_view(kafka_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view;
@ -305,19 +312,20 @@ def test_kafka_materialized_view(kafka_cluster):
messages.append(json.dumps({'key': i, 'value': i}))
kafka_produce('mv', messages)
for i in range(50):
while True:
result = instance.query('SELECT * FROM test.view')
if kafka_check_result(result):
break
time.sleep(0.5)
kafka_check_result(result, True)
instance.query('''
DROP TABLE test.consumer;
DROP TABLE test.view;
''')
@pytest.mark.skip(reason="Hungs")
kafka_check_result(result, True)
@pytest.mark.timeout(300)
def test_kafka_flush_on_big_message(kafka_cluster):
# Create batches of messages of size ~100 KB
kafka_messages = 1000
@ -354,15 +362,20 @@ def test_kafka_flush_on_big_message(kafka_cluster):
except kafka.errors.GroupCoordinatorNotAvailableError:
continue
for i in range(50):
while True:
result = instance.query('SELECT count() FROM test.view')
if int(result) == kafka_messages*batch_messages:
break
time.sleep(0.5)
instance.query('''
DROP TABLE test.consumer;
DROP TABLE test.view;
''')
assert int(result) == kafka_messages*batch_messages, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(60)
def test_kafka_virtual_columns(kafka_cluster):
instance.query('''
CREATE TABLE test.kafka (key UInt64, value UInt64)
@ -384,14 +397,15 @@ def test_kafka_virtual_columns(kafka_cluster):
kafka_produce('virt1', [messages])
result = ''
for i in range(50):
result += instance.query('SELECT _key, key, _topic, value, _offset FROM test.kafka')
while True:
result += instance.query('SELECT _key, key, _topic, value, _offset FROM test.kafka', ignore_error=True)
if kafka_check_result(result, False, 'test_kafka_virtual1.reference'):
break
time.sleep(0.5)
kafka_check_result(result, True, 'test_kafka_virtual1.reference')
@pytest.mark.timeout(60)
def test_kafka_virtual_columns_with_materialized_view(kafka_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view;
@ -415,18 +429,18 @@ def test_kafka_virtual_columns_with_materialized_view(kafka_cluster):
messages.append(json.dumps({'key': i, 'value': i}))
kafka_produce('virt2', messages)
for i in range(50):
while True:
result = instance.query('SELECT kafka_key, key, topic, value, offset FROM test.view')
if kafka_check_result(result, False, 'test_kafka_virtual2.reference'):
break
time.sleep(0.5)
kafka_check_result(result, True, 'test_kafka_virtual2.reference')
instance.query('''
DROP TABLE test.consumer;
DROP TABLE test.view;
''')
kafka_check_result(result, True, 'test_kafka_virtual2.reference')
if __name__ == '__main__':
cluster.start()

View File

@ -0,0 +1,30 @@
<test>
<tags>
<tag>sorting</tag>
<tag>comparison</tag>
</tags>
<type>loop</type>
<stop_conditions>
<all_of>
<iterations>5</iterations>
<min_time_not_changing_for_ms>10000</min_time_not_changing_for_ms>
</all_of>
<any_of>
<iterations>50</iterations>
<total_time_ms>60000</total_time_ms>
</any_of>
</stop_conditions>
<query>SELECT toInt32(number) AS n FROM numbers(1000000) ORDER BY n DESC</query>
<query>SELECT toDecimal32(number, 0) AS n FROM numbers(1000000) ORDER BY n</query>
<query>SELECT toDecimal32(number, 0) AS n FROM numbers(1000000) ORDER BY n DESC</query>
<query>SELECT toDecimal64(number, 8) AS n FROM numbers(1000000) ORDER BY n DESC</query>
<query>SELECT toDecimal128(number, 10) AS n FROM numbers(1000000) ORDER BY n DESC</query>
<main_metric>
<min_time/>
</main_metric>
</test>

View File

@ -1,2 +1,3 @@
1
1
1000000

View File

@ -3,7 +3,9 @@
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
exception_pattern="Value passed to 'throwIf' function is non zero"
default_exception_message="Value passed to 'throwIf' function is non zero"
custom_exception_message="Number equals 1000000"
${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000000) FROM system.numbers" 2>&1 | grep -cF "$exception_pattern"
${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000000) FROM system.numbers" 2>&1 | grep -cF "$default_exception_message"
${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT throwIf(number = 1000000, '$custom_exception_message') FROM system.numbers" 2>&1 | grep -cF "$custom_exception_message"
${CLICKHOUSE_CLIENT} --server_logs_file /dev/null --query="SELECT sum(x = 0) FROM (SELECT throwIf(number = 1000000) AS x FROM numbers(1000000))" 2>&1

View File

@ -59,3 +59,11 @@
1
0
1
[]
[]
[1]
[]
[5]
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33]
[30,31,32,33,100]
[100]

View File

@ -177,18 +177,37 @@ select bitmapHasAll(bitmapBuild([
-- bitmapContains:
---- Empty
SELECT bitmapContains(bitmapBuild(emptyArrayUInt32()), CAST(0, 'UInt32'));
SELECT bitmapContains(bitmapBuild(emptyArrayUInt16()), CAST(5, 'UInt32'));
SELECT bitmapContains(bitmapBuild(emptyArrayUInt32()), toUInt32(0));
SELECT bitmapContains(bitmapBuild(emptyArrayUInt16()), toUInt32(5));
---- Small
select bitmapContains(bitmapBuild([1,5,7,9]),CAST(0, 'UInt32'));
select bitmapContains(bitmapBuild([1,5,7,9]),CAST(9, 'UInt32'));
select bitmapContains(bitmapBuild([1,5,7,9]),toUInt32(0));
select bitmapContains(bitmapBuild([1,5,7,9]),toUInt32(9));
---- Large
select bitmapContains(bitmapBuild([
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,
100,200,500]),CAST(100, 'UInt32'));
100,200,500]),toUInt32(100));
select bitmapContains(bitmapBuild([
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,
100,200,500]),CAST(101, 'UInt32'));
100,200,500]),toUInt32(101));
select bitmapContains(bitmapBuild([
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,
100,200,500]),CAST(500, 'UInt32'));
100,200,500]),toUInt32(500));
-- bitmapSubsetInRange:
---- Empty
SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild(emptyArrayUInt32()), toUInt32(0), toUInt32(10)));
SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild(emptyArrayUInt16()), toUInt32(0), toUInt32(10)));
---- Small
select bitmapToArray(bitmapSubsetInRange(bitmapBuild([1,5,7,9]), toUInt32(0), toUInt32(4)));
select bitmapToArray(bitmapSubsetInRange(bitmapBuild([1,5,7,9]), toUInt32(10), toUInt32(10)));
select bitmapToArray(bitmapSubsetInRange(bitmapBuild([1,5,7,9]), toUInt32(3), toUInt32(7)));
---- Large
select bitmapToArray(bitmapSubsetInRange(bitmapBuild([
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,
100,200,500]), toUInt32(0), toUInt32(100)));
select bitmapToArray(bitmapSubsetInRange(bitmapBuild([
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,
100,200,500]), toUInt32(30), toUInt32(200)));
select bitmapToArray(bitmapSubsetInRange(bitmapBuild([
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,
100,200,500]), toUInt32(100), toUInt32(200)));

View File

@ -0,0 +1,2 @@
1000000
1000000

View File

@ -0,0 +1,14 @@
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
CREATE TABLE t1 (str String, dec Decimal64(8)) ENGINE = MergeTree ORDER BY str;
CREATE TABLE t2 (str String, dec Decimal64(8)) ENGINE = MergeTree ORDER BY dec;
INSERT INTO t1 SELECT toString(number), toDecimal64(number, 8) FROM system.numbers LIMIT 1000000;
SELECT count() FROM t1;
INSERT INTO t2 SELECT toString(number), toDecimal64(number, 8) FROM system.numbers LIMIT 1000000;
SELECT count() FROM t2;
DROP TABLE t1;
DROP TABLE t2;

View File

@ -0,0 +1,40 @@
center
['7zz','ebp','kpb','s00']
['7zzzm','7zzzq','7zzzr','7zzzt','7zzzv','7zzzw','7zzzx','7zzzy','7zzzz','ebpbj','ebpbm','ebpbn','ebpbp','ebpbq','ebpbr','ebpbt','ebpbw','ebpbx','kpbp2','kpbp3','kpbp6','kpbp8','kpbp9','kpbpb','kpbpc','kpbpd','kpbpf','s0000','s0001','s0002','s0003','s0004','s0006','s0008','s0009','s000d']
['7zzzz','ebpbp','kpbpb','s0000']
north pole
['bpb']
['gzz']
['upb']
['zzz']
south pole
['000']
['5bp']
['h00']
['pbp']
wrap point around equator
['rzz']
['xbp']
['2pb']
['800']
arbitrary values in all 4 quarters
['w1muy4','w1muy5','w1muy6','w1muy7','w1muyh','w1muyk']
['thym0','thym1','thym2','thym3','thym4','thym6','thym8','thym9','thymd']
['6gkzx5','6gkzx7','6gkzxh','6gkzxj','6gkzxk','6gkzxm']
['h927mu','h927mv','h927my','h927qh','h927qj','h927qn']
small range always produces array of length 1
zooming
['s7']
['s7w']
['s7w1','s7w3','s7w4','s7w5','s7w6','s7w7','s7w9','s7wc','s7wd','s7we','s7wf','s7wg','s7wh','s7wj','s7wk','s7wm','s7wn','s7wp','s7wq','s7wr','s7ws','s7wt','s7wu','s7wv','s7ww','s7wx','s7wy','s7wz']
['s7w1z','s7w3b','s7w3c','s7w3f','s7w3g','s7w3u','s7w4p','s7w4r','s7w4x','s7w4z','s7w5p','s7w60','s7w61','s7w62','s7w63','s7w64','s7w65','s7w66','s7w67','s7w68','s7w69','s7w6b','s7w6c','s7w6d','s7w6e','s7w6f','s7w6g','s7w6h','s7w6k','s7w6s','s7w6u','s7w70','s7w71','s7w74','s7w75','s7w7h']
['s7w1z0','s7w1z1','s7w1z2','s7w1z3','s7w1z4','s7w1z5','s7w1z6','s7w1z7','s7w1z8','s7w1z9','s7w1zb','s7w1zc','s7w1zd','s7w1ze','s7w1zf','s7w1zg','s7w1zh','s7w1zj','s7w1zk','s7w1zm','s7w1zn','s7w1zp','s7w1zq','s7w1zr','s7w1zs','s7w1zt','s7w1zu','s7w1zv','s7w1zw','s7w1zx','s7w1zy','s7w1zz','s7w3b0','s7w3b1','s7w3b2','s7w3b3','s7w3b4','s7w3b5','s7w3b6','s7w3b7','s7w3b8','s7w3b9','s7w3bd','s7w3be','s7w3bh','s7w3bj','s7w3bk','s7w3bm','s7w3bn','s7w3bp','s7w3bq','s7w3br','s7w3bs','s7w3bt','s7w3bw','s7w3bx','s7w4p0','s7w4p1','s7w4p2','s7w4p3','s7w4p4','s7w4p5','s7w4p6','s7w4p7','s7w4p8','s7w4p9','s7w4pb','s7w4pc','s7w4pd','s7w4pe','s7w4pf','s7w4pg','s7w4ph','s7w4pk','s7w4ps','s7w4pu','s7w600','s7w601','s7w602','s7w603','s7w604','s7w605','s7w606','s7w607','s7w608','s7w609','s7w60d','s7w60e','s7w60h','s7w60k','s7w60s']
['s7w1z0g','s7w1z0u','s7w1z0v','s7w1z0y','s7w1z0z','s7w1z15','s7w1z17','s7w1z1e','s7w1z1g','s7w1z1h','s7w1z1j','s7w1z1k','s7w1z1m','s7w1z1n','s7w1z1p','s7w1z1q','s7w1z1r','s7w1z1s','s7w1z1t','s7w1z1u','s7w1z1v','s7w1z1w','s7w1z1x','s7w1z1y','s7w1z1z','s7w1z2b','s7w1z2c','s7w1z2f','s7w1z30','s7w1z31','s7w1z32','s7w1z33','s7w1z34','s7w1z36','s7w1z38','s7w1z39','s7w1z3b','s7w1z3c','s7w1z3d','s7w1z3f','s7w1z45','s7w1z47','s7w1z4e','s7w1z4h','s7w1z4j','s7w1z4k','s7w1z4m','s7w1z4n','s7w1z4p','s7w1z4q','s7w1z4r','s7w1z4s','s7w1z4t','s7w1z4w','s7w1z4x','s7w1z60','s7w1z61','s7w1z62','s7w1z63','s7w1z64','s7w1z66','s7w1z68','s7w1z69','s7w1z6d']
['s7w1z0gs','s7w1z0gt','s7w1z0gu','s7w1z0gv','s7w1z0gw','s7w1z0gx','s7w1z0gy','s7w1z0gz','s7w1z0uh','s7w1z0uj','s7w1z0uk','s7w1z0um','s7w1z0un','s7w1z0up','s7w1z0uq','s7w1z0ur','s7w1z158','s7w1z159','s7w1z15b','s7w1z15c','s7w1z15d','s7w1z15f','s7w1z1h0','s7w1z1h1','s7w1z1h2','s7w1z1h3','s7w1z1h4','s7w1z1h6']
['s7w1z0gs3','s7w1z0gs6','s7w1z0gs7','s7w1z0gs9','s7w1z0gsc','s7w1z0gsd','s7w1z0gse','s7w1z0gsf','s7w1z0gsg','s7w1z0gsk','s7w1z0gss','s7w1z0gsu','s7w1z0gt1','s7w1z0gt4','s7w1z0gt5','s7w1z0gth']
['s7w1z0gs3y','s7w1z0gs3z','s7w1z0gs6n','s7w1z0gs6p','s7w1z0gs9b','s7w1z0gsd0']
['s7w1z0gs3y0','s7w1z0gs3y1','s7w1z0gs3y2','s7w1z0gs3y3']
['s7w1z0gs3y0z','s7w1z0gs3y1p','s7w1z0gs3y1r','s7w1z0gs3y1x','s7w1z0gs3y2b','s7w1z0gs3y2c','s7w1z0gs3y2f','s7w1z0gs3y2g','s7w1z0gs3y2u','s7w1z0gs3y2v','s7w1z0gs3y30','s7w1z0gs3y31','s7w1z0gs3y32','s7w1z0gs3y33','s7w1z0gs3y34','s7w1z0gs3y35','s7w1z0gs3y36','s7w1z0gs3y37','s7w1z0gs3y38','s7w1z0gs3y39','s7w1z0gs3y3d','s7w1z0gs3y3e','s7w1z0gs3y3h','s7w1z0gs3y3j','s7w1z0gs3y3k','s7w1z0gs3y3m','s7w1z0gs3y3s','s7w1z0gs3y3t']
['s7w1z0gs3y0z','s7w1z0gs3y1p','s7w1z0gs3y1r','s7w1z0gs3y1x','s7w1z0gs3y2b','s7w1z0gs3y2c','s7w1z0gs3y2f','s7w1z0gs3y2g','s7w1z0gs3y2u','s7w1z0gs3y2v','s7w1z0gs3y30','s7w1z0gs3y31','s7w1z0gs3y32','s7w1z0gs3y33','s7w1z0gs3y34','s7w1z0gs3y35','s7w1z0gs3y36','s7w1z0gs3y37','s7w1z0gs3y38','s7w1z0gs3y39','s7w1z0gs3y3d','s7w1z0gs3y3e','s7w1z0gs3y3h','s7w1z0gs3y3j','s7w1z0gs3y3k','s7w1z0gs3y3m','s7w1z0gs3y3s','s7w1z0gs3y3t']
['s7w1z0gs3y0z','s7w1z0gs3y1p','s7w1z0gs3y1r','s7w1z0gs3y1x','s7w1z0gs3y2b','s7w1z0gs3y2c','s7w1z0gs3y2f','s7w1z0gs3y2g','s7w1z0gs3y2u','s7w1z0gs3y2v','s7w1z0gs3y30','s7w1z0gs3y31','s7w1z0gs3y32','s7w1z0gs3y33','s7w1z0gs3y34','s7w1z0gs3y35','s7w1z0gs3y36','s7w1z0gs3y37','s7w1z0gs3y38','s7w1z0gs3y39','s7w1z0gs3y3d','s7w1z0gs3y3e','s7w1z0gs3y3h','s7w1z0gs3y3j','s7w1z0gs3y3k','s7w1z0gs3y3m','s7w1z0gs3y3s','s7w1z0gs3y3t']
errors

View File

@ -0,0 +1,63 @@
-- test data acquired with: https://github.com/sunng87/node-geohash
-- geohash.bboxes(minlat, minlon, maxlat, maxlon, precision)
-- as
-- geohashesInBox(minlon, minlat, maxlon, maxlat, precision)
-- except for the cases when the JS version produces results outside of the given region,
-- typically at wrap points: poles, 0-latitude and 0-longitude.
select 'center';
SELECT arraySort(geohashesInBox(-1.0, -1.0, 1.0, 1.0, 3));
SELECT arraySort(geohashesInBox(-0.1, -0.1, 0.1, 0.1, 5));
SELECT arraySort(geohashesInBox(-0.01, -0.01, 0.01, 0.01, 5));
select 'north pole';
SELECT arraySort(geohashesInBox(-180.0, 89.0, -179.0, 90.0, 3));
SELECT arraySort(geohashesInBox(-1.0, 89.0, 0.0, 90.0, 3));
SELECT arraySort(geohashesInBox(0.0, 89.0, 1.0, 90.0, 3));
SELECT arraySort(geohashesInBox(179.0, 89.0, 180.0, 90.0, 3));
select 'south pole';
SELECT arraySort(geohashesInBox(-180.0, -90.0, -179.0, -89.0, 3));
SELECT arraySort(geohashesInBox(-1.0, -90.0, 0.0, -89.0, 3));
SELECT arraySort(geohashesInBox(0.0, -90.0, 1.0, -89.0, 3));
SELECT arraySort(geohashesInBox(179.0, -90.0, 180.0, -89.0, 3));
select 'wrap point around equator';
SELECT arraySort(geohashesInBox(179.0, -1.0, 180.0, 0.0, 3));
SELECT arraySort(geohashesInBox(179.0, 0.0, 180.0, 1.0, 3));
SELECT arraySort(geohashesInBox(-180.0, -1.0, -179.0, 0.0, 3));
SELECT arraySort(geohashesInBox(-180.0, 0.0, -179.0, 1.0, 3));
select 'arbitrary values in all 4 quarters';
SELECT arraySort(geohashesInBox(98.36, 7.88, 98.37, 7.89, 6));
SELECT arraySort(geohashesInBox(53.8, 27.6, 53.9, 27.7, 5));
SELECT arraySort(geohashesInBox(-49.26, -25.38, -49.25, -25.37, 6));
SELECT arraySort(geohashesInBox(23.11, -82.37, 23.12, -82.36, 6));
select 'small range always produces array of length 1';
SELECT lon/5 - 180 as lon1, lat/5 - 90 as lat1, lon1 as lon2, lat1 as lat2, geohashesInBox(lon1, lat1, lon2, lat2, 1) as g FROM (SELECT arrayJoin(range(360*5)) as lon, arrayJoin(range(180*5)) as lat) WHERE length(g) != 1;
SELECT lon/5 - 40 as lon1, lat/5 - 20 as lat1, lon1 as lon2, lat1 as lat2, geohashesInBox(lon1, lat1, lon2, lat2, 12) as g FROM (SELECT arrayJoin(range(80*5)) as lon, arrayJoin(range(10*5)) as lat) WHERE length(g) != 1;
SELECT lon/5 - 40 as lon1, lat/5 - 20 as lat1, lon1 + 0.0000000001 as lon2, lat1 + 0.0000000001 as lat2, geohashesInBox(lon1, lat1, lon2, lat2, 1) as g FROM (SELECT arrayJoin(range(80*5)) as lon, arrayJoin(range(10*5)) as lat) WHERE length(g) != 1;
select 'zooming';
SELECT arraySort(geohashesInBox(20.0, 20.0, 21.0, 21.0, 2));
SELECT arraySort(geohashesInBox(20.0, 20.0, 21.0, 21.0, 3));
SELECT arraySort(geohashesInBox(20.0, 20.0, 21.0, 21.0, 4));
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.25, 20.25, 5));
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.0625, 20.0625, 6));
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.01, 20.01, 7));
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.001, 20.001, 8));
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.0001, 20.0001, 9));
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.00001, 20.00001, 10));
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.000001, 20.000001, 11));
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.000001, 20.000001, 12));
-- precision greater than 12 is truncated to 12, so these two calls produce the same result as above
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.000001, 20.000001, 13));
SELECT arraySort(geohashesInBox(20.0, 20.0, 20.000001, 20.000001, 14));
select 'errors';
SELECT geohashesInBox(); -- { serverError 42 } -- not enough arguments
SELECT geohashesInBox(1, 2, 3, 4, 5); -- { serverError 43 } -- wrong types of arguments
SELECT geohashesInBox(toFloat32(1.0), 2.0, 3.0, 4.0, 5); -- { serverError 43 } -- all lats and longs should be of the same type
SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 12); -- { serverError 128 } -- too many elements in the array

View File

@ -0,0 +1,5 @@
1 2019-01-01 a
1 2019-01-01 \N
1 2019-01-01 \N
2 2019-01-01 b
3 2019-01-01 c

View File

@ -0,0 +1,15 @@
drop table if exists test_join;
create table test_join (date Date, id Int32, name Nullable(String)) engine = MergeTree partition by date order by id;
insert into test_join values ('2019-01-01', 1, 'a');
insert into test_join values ('2019-01-01', 2, 'b');
insert into test_join values ('2019-01-01', 3, 'c');
insert into test_join values ('2019-01-01', 1, null);
SELECT id, date, name FROM (SELECT id, date, name FROM test_join GROUP BY id, name, date)
FULL OUTER JOIN (SELECT id, date, name FROM test_join GROUP BY id, name, date)
USING (id, name, date)
ORDER BY id, name;
drop table test_join;

View File

@ -0,0 +1,12 @@
DROP TABLE IF EXISTS t_local;
DROP TABLE IF EXISTS t_distr;
CREATE TABLE t_local (a Int) ENGINE = Memory;
CREATE TABLE t_distr (a Int) ENGINE = Distributed(test_shard_localhost, currentDatabase(), 't_local');
INSERT INTO t_local VALUES (1), (2);
SET optimize_skip_unused_shards = 1;
SELECT * FROM t_distr WHERE a = 1;
DROP table t_local;
DROP table t_distr;

View File

@ -1,5 +1,5 @@
# Anonymized Yandex.Metrica Data
Dataset consists of two tables containing anonymized data about hits (`hits_v1`) and visits (`visits_v1`) of Yandex.Metrica. Each of the tables can be downloaded as a compressed `tsv.xz` file or as prepared partitions.
Dataset consists of two tables containing anonymized data about hits (`hits_v1`) and visits (`visits_v1`) of Yandex.Metrica. Each of the tables can be downloaded as a compressed `tsv.xz` file or as prepared partitions. In addition to that, an extended version of the `hits` table containing 100 million rows is available as TSV at `https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_100m_obfuscated_v1.tsv.xz` and as prepared partitions at `https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_100m_obfuscated_v1.tar.xz`.
## Obtaining Tables from Prepared Partitions
**Download and import hits:**

View File

@ -21,7 +21,7 @@ It is highly recommended to set up monitoring for:
ClickHouse server has embedded instruments for self-state monitoring.
To track server events use server logs. See the [logger](#server_settings-logger) section of the configuration file.
To track server events use server logs. See the [logger](server_settings/settings.md#server_settings-logger) section of the configuration file.
ClickHouse collects:
@ -30,7 +30,7 @@ ClickHouse collects:
You can find metrics in the [system.metrics](#system_tables-metrics), [system.events](#system_tables-events), and [system.asynchronous_metrics](#system_tables-asynchronous_metrics) tables.
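These system tables can be queried directly. The queries below are an illustrative sketch (not part of the original page) for taking a quick snapshot of server state:

```sql
-- Current metric values (e.g. open connections, running queries)
SELECT * FROM system.metrics LIMIT 5;
-- Cumulative event counters since server start
SELECT * FROM system.events LIMIT 5;
-- Metrics calculated periodically in the background (e.g. memory usage)
SELECT * FROM system.asynchronous_metrics LIMIT 5;
```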
You can configure ClickHouse to export metrics to [Graphite](https://github.com/graphite-project). See the [Graphite section](server_settings/settings.md#server_settings-graphite) in the ClickHouse server configuration file. Before configuring export of metrics, you should set up Graphite by following their official guide https://graphite.readthedocs.io/en/latest/install.html.
You can configure ClickHouse to export metrics to [Graphite](https://github.com/graphite-project). See the [Graphite section](server_settings/settings.md#server_settings-graphite) in the ClickHouse server configuration file. Before configuring export of metrics, you should set up Graphite by following their official [guide](https://graphite.readthedocs.io/en/latest/install.html).
Additionally, you can monitor server availability through the HTTP API. Send the `HTTP GET` request to `/`. If the server is available, it responds with `200 OK`.

View File

@ -81,9 +81,9 @@ For descriptions of request parameters, see the [request description](../../quer
<a name="mergetree_setting-merge_with_ttl_timeout"></a>
- `merge_with_ttl_timeout` — Minimum delay in seconds before repeating a merge with TTL. Default value: 86400 (1 day).
**Example of setting the sections **
**Example of setting the sections**
```
```sql
ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity=8192
```
@ -125,7 +125,7 @@ The `MergeTree` engine is configured in the same way as in the example above for
## Data Storage
A table consists of data *parts* sorted by primary key.
A table consists of data parts sorted by primary key.
When data is inserted in a table, separate data parts are created and each of them is lexicographically sorted by primary key. For example, if the primary key is `(CounterID, Date)`, the data in the part is sorted by `CounterID`, and within each `CounterID`, it is ordered by `Date`.
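The sketch below is illustrative only (the table and column names are hypothetical); with this sorting key, rows in each part are ordered by `CounterID` first and by `Date` within each `CounterID`:

```sql
CREATE TABLE visits_sketch
(
    CounterID UInt32,
    Date Date,
    UserID UInt64
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(Date)
ORDER BY (CounterID, Date);
```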

View File

@ -1009,7 +1009,7 @@ SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])
## stochasticLinearRegression {#agg_functions-stochasticlinearregression}
This function implements stochastic linear regression. It supports custom parameters for learning rate, L2 regularization coefficient, mini-batch size and has few methods for updating weights ([simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)).
This function implements stochastic linear regression. It supports custom parameters for learning rate, L2 regularization coefficient, mini-batch size, and has a few methods for updating weights ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (used by default), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)).
### Parameters {#agg_functions-stochasticlinearregression-parameters}
@ -1022,7 +1022,7 @@ stochasticLinearRegression(1.0, 1.0, 10, 'SGD')
1. `learning rate` is the coefficient on the step length when a gradient descent step is performed. A learning rate that is too big may cause infinite weights of the model. Default is `0.00001`.
2. `l2 regularization coefficient`, which may help to prevent overfitting. Default is `0.1`.
3. `mini-batch size` sets the number of elements whose gradients will be computed and summed to perform one step of gradient descent. Pure stochastic descent uses one element, however, having small batches (about 10 elements) makes gradient steps more stable. Default is `15`.
4. `method for updating weights`, there are 3 of them: `SGD`, `Momentum`, `Nesterov`. `Momentum` and `Nesterov` require little bit more computations and memory, however they happen to be useful in terms of speed of convergance and stability of stochastic gradient methods. Default is `'SGD'`.
4. `method for updating weights`: `Adam` (used by default), `SGD`, `Momentum`, or `Nesterov`. `Momentum` and `Nesterov` require a little more computation and memory, however they happen to be useful in terms of speed of convergence and stability of stochastic gradient methods. A minimal sketch of passing these parameters follows this list.
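The sketch below is illustrative only; the table and column names (`training_data`, `target`, `feature_1`, `feature_2`, `linreg_model`) are hypothetical and not part of the original documentation:

```sql
-- Fit a model state with explicit learning rate, L2 coefficient, batch size and update method
CREATE TABLE linreg_model ENGINE = Memory AS
SELECT stochasticLinearRegressionState(0.001, 0.1, 15, 'Adam')(target, feature_1, feature_2) AS state
FROM training_data;
```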
### Usage {#agg_functions-stochasticlinearregression-usage}

View File

@ -109,25 +109,7 @@ Defines storage time for values. Can be specified only for MergeTree-family tabl
## Column Compression Codecs
Besides default data compression, defined in [server settings](../operations/server_settings/settings.md#compression), per-column specification is also available.
Supported compression algorithms:
- `NONE` — No compression.
- `LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression.
- `LZ4HC[(level)]` — LZ4 CH (high compression) algorithm with configurable level. Default level: 9. If you set `level <= 0`, the default level is applied. Possible levels: [1, 12]. Recommended levels are in range: [4, 9].
- `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: [1, 22]. Default value: 1.
- `Delta(delta_bytes)` — compression approach, when raw values are replaced with the difference of two neighbour values. Up to `delta_bytes` are used for storing delta value, so `delta_bytes` is a maximum size of raw values.
Possible `delta_bytes` values: 1, 2, 4, 8. Default value for `delta_bytes` is `sizeof(type)`, if it is equals to 1, 2, 4, 8. Otherwise it equals 1.
- `DoubleDelta` — Compresses values down to 1 bit (in the best case), using deltas calculation. Best compression rates are achieved on monotonic sequences with constant stride, for example, time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64 bit types. Uses 1 extra bit for 32 byte deltas: 5 bit prefix instead of 4 bit prefix. For additional information, see the "Compressing time stamps" section of the [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf) document.
- `Gorilla` — Compresses values down to 1 bit (in the best case). The codec is efficient when storing series of floating point values that change slowly, because the best compression rate is achieved when neighbouring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64 bit types. For additional information, see the "Compressing values" section of the [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf) document.
High compression levels useful for asymmetric scenarios, like compress once, decompress a lot of times. Greater levels stands for better compression and higher CPU usage.
!!!warning
You cannot decompress ClickHouse database files with external utilities, for example, `lz4`. Use the special utility [clickhouse-compressor](https://github.com/yandex/ClickHouse/tree/master/dbms/programs/compressor).
Syntax example:
By default, ClickHouse applies the compression method defined in the [server settings](../operations/server_settings/settings.md#compression) to columns. You can also define the compression method for each individual column in the `CREATE TABLE` query.
```
CREATE TABLE codec_example
@ -136,28 +118,48 @@ CREATE TABLE codec_example
ts DateTime CODEC(LZ4HC),
float_value Float32 CODEC(NONE),
double_value Float64 CODEC(LZ4HC(9))
)
ENGINE = MergeTree
PARTITION BY tuple()
ORDER BY dt
```
Codecs can be combined in a pipeline. Default table codec is not included into pipeline (if it should be applied to a column, you have to specify it explicitly in pipeline). Example below shows an optimization approach for storing timeseries metrics.
Usually, values for particular metric, stored in `path` does not differ significantly from point to point. Using delta-encoding allows to reduce disk space usage significantly.
```
CREATE TABLE timeseries_example
(
dt Date,
ts DateTime,
path String,
value Float32 CODEC(Delta, ZSTD)
)
ENGINE = MergeTree
PARTITION BY dt
ORDER BY (path, ts)
ENGINE = <Engine>
...
```
If a codec is specified, the default codec doesn't apply. Codecs can be combined in a pipeline, for example, `CODEC(Delta, ZSTD)`. To select the best codec combination for your project, run benchmarks similar to those described in the Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article.
!!!warning
You cannot decompress ClickHouse database files with external utilities, for example, `lz4`. Use the special utility, [clickhouse-compressor](https://github.com/yandex/ClickHouse/tree/master/dbms/programs/compressor).
Compression is supported for the following table engines:
- [*MergeTree](../operations/table_engines/mergetree.md) family
- [*Log](../operations/table_engines/log_family.md) family
- [Set](../operations/table_engines/set.md)
- [Join](../operations/table_engines/join.md)
ClickHouse supports common purpose codecs and specialized codecs.
### Specialized codecs {#create-query-specialized-codecs}
These codecs are designed to make compression more effective by using specific features of the data. Some of these codecs don't compress data themselves; instead, they prepare the data so that it can be compressed better by common purpose codecs.
Specialized codecs:
- `Delta(delta_bytes)` — Compression approach in which raw values are replaced with the difference of two neighboring values. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if it equals 1, 2, 4, or 8. Otherwise it equals 1.
- `DoubleDelta` — Compresses values down to 1 bit (in the best case), using delta calculation. The best compression rates are achieved on monotonic sequences with a constant stride, for example, time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: a 5-bit prefix instead of a 4-bit prefix. For additional information, see the "Compressing time stamps" section of the [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf) document.
- `Gorilla` — Compresses values down to 1 bit (in the best case). The codec is efficient when storing series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see the "Compressing values" section of the [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf) document.
- `T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` and `DateTime`). At each step of its algorithm, the codec takes a block of 64 values, puts them into a 64x64-bit matrix, transposes it, crops the unused bits of the values and returns the rest as a sequence. Unused bits are the bits that don't differ between the maximum and minimum values in the whole data part for which the compression is used. A declaration sketch follows this list.
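The sketch below is illustrative; the table and column names are hypothetical, and a general purpose codec is chained after each specialized codec, which is a common pattern since some specialized codecs only prepare the data:

```sql
CREATE TABLE codec_sketch
(
    dt Date CODEC(DoubleDelta, LZ4),
    reading Float32 CODEC(Gorilla, ZSTD),
    hits UInt64 CODEC(T64, LZ4)
)
ENGINE = MergeTree()
ORDER BY dt;
```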
### Common purpose codecs {#create-query-common-purpose-codecs}
Codecs:
- `NONE` — No compression.
- `LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression.
- `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. If you set `level <= 0`, the default level is applied. Possible levels: [1, 12]. Recommended levels are in the range [4, 9].
- `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: [1, 22]. Default level: 1.
High compression levels are useful for asymmetric scenarios, such as compress once, decompress many times. Higher levels mean better compression and higher CPU usage.
## Temporary Tables
ClickHouse supports temporary tables which have the following characteristics:

View File

@ -56,6 +56,32 @@ SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res
└─────────────┘
```
## bitmapSubsetInRange {#bitmap_functions-bitmapsubsetinrange}
Returns a subset of the bitmap with values within the specified range (`range_end` is not included).
```
bitmapSubsetInRange(bitmap, range_start, range_end)
```
**Parameters**
- `bitmap` [Bitmap object](#bitmap_functions-bitmapbuild).
- `range_start` – Range start point. Type: [UInt32](../../data_types/int_uint.md).
- `range_end` – Range end point (excluded). Type: [UInt32](../../data_types/int_uint.md).
**Example**
``` sql
SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res
```
```
┌─res───────────────┐
│ [30,31,32,33,100] │
└───────────────────┘
```
## bitmapContains {#bitmap_functions-bitmapcontains}
Checks whether the bitmap contains an element.

View File

@ -183,4 +183,36 @@ SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index
└────────────────────┘
```
## geohashesInBox
Returns an array of geohash-encoded strings of the given precision that fall inside and intersect the boundaries of the given box; basically, a 2D grid flattened into an array.
**Input values**
- longitude_min - min longitude, floating value in range `[-180°, 180°]`
- latitude_min - min latitude, floating value in range `[-90°, 90°]`
- longitude_max - max longitude, floating value in range `[-180°, 180°]`
- latitude_max - max latitude, floating value in range `[-90°, 90°]`
- precision - geohash precision, `UInt8` in range `[1, 12]`
Please note that all coordinate parameters should be of the same type: either `Float32` or `Float64`.
**Returned values**
- array of precision-long strings of geohash boxes covering the provided area; you should not rely on the order of items.
- [] - an empty array if the *min* values of *latitude* and *longitude* aren't less than the corresponding *max* values.
Please note that the function will throw an exception if the resulting array is over 10'000'000 items long.
**Example**
```
SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos
```
```
┌─thasos──────────────────────────────────────┐
│ ['sx1q','sx1r','sx32','sx1w','sx1x','sx38'] │
└─────────────────────────────────────────────┘
```
[Original article](https://clickhouse.yandex/docs/en/query_language/functions/geo/) <!--hide-->

View File

@ -195,18 +195,21 @@ RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ...
All tables are renamed under global locking. Renaming tables is a light operation. If you indicated another database after TO, the table will be moved to this database. However, the directories with databases must reside in the same file system (otherwise, an error is returned).
## SET
## SET {#query-set}
``` sql
```sql
SET param = value
```
Allows you to set `param` to `value`. You can also make all the settings from the specified settings profile in a single query. To do this, specify 'profile' as the setting name. For more information, see the section "Settings".
The setting is made for the session, or for the server (globally) if `GLOBAL` is specified.
When making a global setting, the setting is not applied to sessions already running, including the current session. It will only be used for new sessions.
Assigns `value` to the `param` setting for the current session. You cannot change [server settings](../operations/server_settings/index.md) this way.
When the server is restarted, global settings made using `SET` are lost.
To make settings that persist after a server restart, you can only use the server's config file.
You can also set all the values from the specified settings profile in a single query.
```sql
SET profile = 'profile-name-from-the-settings-file'
```
For more information, see [Settings](../operations/settings/settings.md).
## SHOW CREATE TABLE

View File

@ -1 +1 @@
../../CHANGELOG_RU.md
../../CHANGELOG.md

View File

@ -1 +1 @@
../../CHANGELOG_RU.md
../../CHANGELOG.md

View File

@ -878,7 +878,7 @@ SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])
## stochasticLinearRegression {#agg_functions-stochasticlinearregression}
This function implements stochastic linear regression. It supports custom parameters for the learning rate, L2 regularization coefficient, mini-batch size, and has several methods for updating the weights ([simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)).
This function implements stochastic linear regression. It supports custom parameters for the learning rate, L2 regularization coefficient, mini-batch size, and has several methods for updating the weights ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (used by default), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)).
### Parameters {#agg_functions-stochasticlinearregression-parameters}
@ -891,7 +891,8 @@ stochasticLinearRegression(1.0, 1.0, 10, 'SGD')
1. Learning rate — the coefficient on the step length when a gradient descent step is performed. A learning rate that is too big may make the model weights infinite. Default: `0.00001`.
2. L2 regularization coefficient, which helps to prevent overfitting. Default: `0.1`.
3. Mini-batch size — the number of elements whose gradients are computed and summed to perform one step of gradient descent. Pure stochastic descent uses a single element, however using mini-batches (about 10 elements) makes gradient steps more stable. Default: `15`.
4. Method for updating weights, one of: `SGD`, `Momentum`, `Nesterov`. `Momentum` and `Nesterov` are more demanding of computational resources and memory, however they offer a high speed of convergence and stability of stochastic gradient methods. Default: `SGD`.
4. Method for updating weights, one of: `Adam` (by default), `SGD`, `Momentum`, `Nesterov`. `Momentum` and `Nesterov` are more demanding of computational resources and memory, however they offer a high speed of convergence and stability of stochastic gradient methods.
### Usage {#agg_functions-stochasticlinearregression-usage}
@ -1005,4 +1006,3 @@ stochasticLogisticRegression(1.0, 1.0, 10, 'SGD')
- [Difference between linear and logistic regression](https://moredez.ru/q/51225972/)
[Original article](https://clickhouse.yandex/docs/ru/query_language/agg_functions/reference/) <!--hide-->

View File

@ -0,0 +1,78 @@
## IPv4
`IPv4` is a domain type that is binary-compatible with `UInt32` and is used for storing IPv4 address values. It provides more compact binary storage while supporting a more human-readable input/output format.
### Basic Usage
``` sql
CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY url;
DESCRIBE TABLE hits;
```
```
┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐
│ url │ String │ │ │ │ │
│ from │ IPv4 │ │ │ │ │
└──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘
```
You can also use a column of the `IPv4` type as a primary key:
``` sql
CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY from;
```
On insert and query, the `IPv4` type recognizes the more human-readable input/output format:
``` sql
INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '116.253.40.133')('https://clickhouse.yandex', '183.247.232.58')('https://clickhouse.yandex/docs/en/', '116.106.34.242');
SELECT * FROM hits;
```
```
┌─url────────────────────────────────┬───────────from─┐
│ https://clickhouse.yandex/docs/en/ │ 116.106.34.242 │
│ https://wikipedia.org │ 116.253.40.133 │
│ https://clickhouse.yandex │ 183.247.232.58 │
└────────────────────────────────────┴────────────────┘
```
It also provides a more compact binary storage format:
``` sql
SELECT toTypeName(from), hex(from) FROM hits LIMIT 1;
```
```
┌─toTypeName(from)─┬─hex(from)─┐
│ IPv4 │ B7F7E83A │
└──────────────────┴───────────┘
```
Values cannot be implicitly converted to types other than `UInt32`. To convert an `IPv4` value to a string, you have to do so explicitly with the `IPv4NumToString()` function:
``` sql
SELECT toTypeName(s), IPv4NumToString(from) as s FROM hits LIMIT 1;
```
```
┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐
│ String │ 183.247.232.58 │
└───────────────────────────────────┴────────────────┘
```
Or you can use `CAST` to convert it to `UInt32`:
``` sql
SELECT toTypeName(i), CAST(from as UInt32) as i FROM hits LIMIT 1;
```
```
┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐
│ UInt32 │ 3086477370 │
└──────────────────────────────────┴────────────┘
```
[Original article](https://clickhouse.yandex/docs/en/data_types/domains/ipv4) <!--hide-->

View File

@ -0,0 +1,78 @@
## IPv6
`IPv6` is a domain type that is binary-compatible with `FixedString(16)` and is used for storing IPv6 address values. It provides more compact binary storage while supporting a more human-readable input/output format.
### Basic Usage
``` sql
CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url;
DESCRIBE TABLE hits;
```
```
┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐
│ url │ String │ │ │ │ │
│ from │ IPv6 │ │ │ │ │
└──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘
```
You can also use a column of the `IPv6` type as a primary key:
``` sql
CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from;
```
On insert and query, the `IPv6` type recognizes the more human-readable input/output format:
``` sql
INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '2a02:aa08:e000:3100::2')('https://clickhouse.yandex', '2001:44c8:129:2632:33:0:252:2')('https://clickhouse.yandex/docs/en/', '2a02:e980:1e::1');
SELECT * FROM hits;
```
```
┌─url────────────────────────────────┬─from──────────────────────────┐
│ https://clickhouse.yandex │ 2001:44c8:129:2632:33:0:252:2 │
│ https://clickhouse.yandex/docs/en/ │ 2a02:e980:1e::1 │
│ https://wikipedia.org │ 2a02:aa08:e000:3100::2 │
└────────────────────────────────────┴───────────────────────────────┘
```
It also provides a more compact binary storage format:
``` sql
SELECT toTypeName(from), hex(from) FROM hits LIMIT 1;
```
```
┌─toTypeName(from)─┬─hex(from)────────────────────────┐
│ IPv6 │ 200144C8012926320033000002520002 │
└──────────────────┴──────────────────────────────────┘
```
Values cannot be implicitly converted to types other than `FixedString(16)`. To convert an `IPv6` value to a string, you have to do so explicitly with the `IPv6NumToString()` function:
``` sql
SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1;
```
```
┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐
│ String │ 2001:44c8:129:2632:33:0:252:2 │
└───────────────────────────────────┴───────────────────────────────┘
```
Or use `CAST` to convert it to `FixedString(16)`:
``` sql
SELECT toTypeName(i), CAST(from as FixedString(16)) as i FROM hits LIMIT 1;
```
```
┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐
│ FixedString(16)                           │ (binary, not printable) │
└───────────────────────────────────────────┴─────────┘
```
[Original article](https://clickhouse.yandex/docs/en/data_types/domains/ipv6) <!--hide-->

View File

@ -0,0 +1,26 @@
# Domains
A domain is a special-purpose type that is binary-compatible with some existing base type while adding extra features, so these features can be used without changing the on-disk data format. Currently, ClickHouse does not support user-defined domain types.
You can use a domain type anywhere the binary-compatible base type can be used, for example:
* As the type of a table column
* For reading/writing data to/from a domain-typed column
* As an index, if the binary-compatible base type can be used as an index
* As an argument passed to a function
* Other uses
### Extra Features of Domains
* In SHOW CREATE TABLE or DESCRIBE TABLE, the corresponding column is always shown with the domain type name
* In INSERT INTO domain_table(domain_column) VALUES(...), data is always input in the more human-readable format
* In SELECT domain_column FROM domain_table, data is always output in the more human-readable format
* In INSERT INTO domain_table FORMAT CSV ..., data from an external source is loaded in the more human-readable format (a small sketch follows this list)
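The sketch below is illustrative only (the table name and the inserted value are hypothetical); it shows the human-readable input/output described above:

```sql
CREATE TABLE domain_sketch (addr IPv4) ENGINE = Memory;
INSERT INTO domain_sketch VALUES ('116.253.40.133');
SELECT addr, toTypeName(addr) FROM domain_sketch;
```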
### Limitations of Domains
* An index of the base type cannot be converted into a domain-type index with `ALTER TABLE`.
* String values cannot be implicitly converted into domain values when inserting data from another column or table.
* No constraints can be added to values stored as a domain type.
[Original article](https://clickhouse.yandex/docs/en/data_types/domains/overview) <!--hide-->

View File

@ -1 +0,0 @@
../../../en/data_types/domains/ipv4.md

View File

@ -1 +0,0 @@
../../../en/data_types/domains/ipv6.md

View File

@ -1 +0,0 @@
../../../en/data_types/domains/overview.md

View File

@ -51,6 +51,56 @@ SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res
└─────────────┘
```
## bitmapSubsetInRange
Converts the specified range of a bitmap (range_end is not included) into another bitmap.
```
bitmapSubsetInRange(bitmap, range_start, range_end)
```
**Parameters**
- `bitmap` – Bitmap object.
- `range_start` – Range start point (included).
- `range_end` – Range end point (excluded).
**Example**
``` sql
SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res
```
```
┌─res───────────────┐
│ [30,31,32,33,100] │
└───────────────────┘
```
## bitmapContains
Checks whether the bitmap contains the specified element.
```
bitmapContains(haystack, needle)
```
**Parameters**
- `haystack` – Bitmap object.
- `needle` – Element, of type UInt32.
**Example**
``` sql
SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res
```
```text
┌─res─┐
│ 1 │
└─────┘
```
## bitmapHasAny
Similar to `hasAny(array, array)`: returns 1 if the two bitmaps have any elements in common, otherwise returns 0.

View File

@ -1,6 +1,6 @@
#include <common/DateLUT.h>
#include <boost/filesystem.hpp>
#include <filesystem>
#include <Poco/Exception.h>
#include <Poco/SHA1Engine.h>
#include <Poco/DigestStream.h>
@ -26,7 +26,7 @@ Poco::DigestEngine::Digest calcSHA1(const std::string & path)
std::string determineDefaultTimeZone()
{
namespace fs = boost::filesystem;
namespace fs = std::filesystem;
const char * tzdir_env_var = std::getenv("TZDIR");
fs::path tz_database_path = tzdir_env_var ? tzdir_env_var : "/usr/share/zoneinfo/";
@ -87,7 +87,10 @@ std::string determineDefaultTimeZone()
/// Try the same with full symlinks resolution
{
tz_file_path = fs::canonical(tz_file_path, tz_database_path);
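/// std::filesystem::canonical(), unlike the boost::filesystem overload, does not take a base path,
/// so resolve relative paths against the tz database directory before canonicalizing.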
if (!tz_file_path.is_absolute())
tz_file_path = tz_database_path / tz_file_path;
tz_file_path = fs::canonical(tz_file_path);
fs::path relative_path = tz_file_path.lexically_relative(tz_database_path);
if (!relative_path.empty() && *relative_path.begin() != ".." && *relative_path.begin() != ".")
@ -109,11 +112,11 @@ std::string determineDefaultTimeZone()
{
/// Some timezone databases contain copies of toplevel tzdata files in the posix/ directory
/// and tzdata files with leap seconds in the right/ directory. Skip them.
candidate_it.no_push();
candidate_it.disable_recursion_pending();
continue;
}
if (candidate_it->status().type() != fs::regular_file || path.filename() == "localtime")
if (!fs::is_regular_file(*candidate_it) || path.filename() == "localtime")
continue;
if (fs::file_size(path) == tzfile_size && calcSHA1(path.string()) == tzfile_sha1)