Merge remote-tracking branch 'upstream/master' into nikvas0/index_mutate

This commit is contained in:
Nikita Vasilev 2019-05-09 21:59:22 +03:00
commit 05f9373af0
68 changed files with 2313 additions and 367 deletions

3
.gitmodules vendored
View File

@ -79,3 +79,6 @@
[submodule "contrib/hyperscan"]
path = contrib/hyperscan
url = https://github.com/ClickHouse-Extras/hyperscan.git
[submodule "contrib/simdjson"]
path = contrib/simdjson
url = https://github.com/lemire/simdjson.git

View File

@ -1,6 +1,15 @@
project(ClickHouse)
cmake_minimum_required(VERSION 3.3)
cmake_policy(SET CMP0023 NEW)
foreach(policy
CMP0023
CMP0074 # CMake 3.12
)
if(POLICY ${policy})
cmake_policy(SET ${policy} NEW)
endif()
endforeach()
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/")
set(CMAKE_EXPORT_COMPILE_COMMANDS 1) # Write compile_commands.json
set(CMAKE_LINK_DEPENDS_NO_SHARED 1) # Do not relink all depended targets on .so
@ -318,6 +327,7 @@ include (cmake/find_consistent-hashing.cmake)
include (cmake/find_base64.cmake)
include (cmake/find_hyperscan.cmake)
include (cmake/find_lfalloc.cmake)
include (cmake/find_simdjson.cmake)
find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
find_contrib_lib(metrohash)

View File

@ -12,7 +12,6 @@ ClickHouse is an open-source column-oriented database management system that all
* You can also [fill this form](https://forms.yandex.com/surveys/meet-yandex-clickhouse-team/) to meet Yandex ClickHouse team in person.
## Upcoming Events
* [ClickHouse Community Meetup in Limassol](https://www.facebook.com/events/386638262181785/) on May 7.
* ClickHouse at [Percona Live 2019](https://www.percona.com/live/19/other-open-source-databases-track) in Austin on May 28-30.
* [ClickHouse Community Meetup in Beijing](https://www.huodongxing.com/event/2483759276200) on June 8.
* [ClickHouse Community Meetup in Shenzhen](https://www.huodongxing.com/event/3483759917300) on October 20.

View File

@ -1,6 +1,9 @@
option(ENABLE_ICU "Enable ICU" ON)
if(ENABLE_ICU)
if (APPLE)
set(ICU_ROOT "/usr/local/opt/icu4c" CACHE STRING "")
endif()
find_package(ICU COMPONENTS i18n uc data) # TODO: remove Modules/FindICU.cmake after cmake 3.7
#set (ICU_LIBRARIES ${ICU_I18N_LIBRARY} ${ICU_UC_LIBRARY} ${ICU_DATA_LIBRARY} CACHE STRING "")
if(ICU_FOUND)

View File

@ -1,4 +1,4 @@
if (NOT SANITIZE AND NOT ARCH_ARM AND NOT ARCH_32 AND NOT ARCH_PPC64LE AND NOT OS_FREEBSD)
if (NOT SANITIZE AND NOT ARCH_ARM AND NOT ARCH_32 AND NOT ARCH_PPC64LE AND NOT OS_FREEBSD AND NOT APPLE)
option (ENABLE_LFALLOC "Set to FALSE to use system libgsasl library instead of bundled" ${NOT_UNBUNDLED})
endif ()

14
cmake/find_simdjson.cmake Normal file
View File

@ -0,0 +1,14 @@
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/simdjson/include/simdjson/jsonparser.h")
message (WARNING "submodule contrib/simdjson is missing. to fix try run: \n git submodule update --init --recursive")
return()
endif ()
if (NOT HAVE_AVX2)
message (WARNING "submodule contrib/simdjson requires AVX2 support")
return()
endif ()
option (USE_SIMDJSON "Use simdjson" ON)
set (SIMDJSON_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/simdjson/include")
set (SIMDJSON_LIBRARY "simdjson")

View File

@ -227,7 +227,7 @@ if (USE_INTERNAL_POCO_LIBRARY)
set (ENABLE_TESTS 0)
set (POCO_ENABLE_TESTS 0)
set (CMAKE_DISABLE_FIND_PACKAGE_ZLIB 1)
if (MSVC)
if (MSVC OR NOT USE_POCO_DATAODBC)
set (ENABLE_DATA_ODBC 0 CACHE INTERNAL "") # TODO (build fail)
endif ()
add_subdirectory (poco)
@ -313,3 +313,7 @@ endif()
if (USE_INTERNAL_HYPERSCAN_LIBRARY)
add_subdirectory (hyperscan)
endif()
if (USE_SIMDJSON)
add_subdirectory (simdjson-cmake)
endif()

1
contrib/simdjson vendored Submodule

@ -0,0 +1 @@
Subproject commit 681cd3369860f4eada49a387cbff93030f759c95

View File

@ -0,0 +1,26 @@
if (NOT HAVE_AVX2)
message (FATAL_ERROR "No AVX2 support")
endif ()
if(MAKE_STATIC_LIBRARIES)
set(SIMDJSON_LIB_TYPE STATIC)
MESSAGE(STATUS "Building static library ${SIMDJSON_LIBRARY}")
else()
set(SIMDJSON_LIB_TYPE SHARED)
MESSAGE(STATUS "Building dynamic library ${SIMDJSON_LIBRARY}")
endif()
set(SIMDJSON_SRC_DIR "${SIMDJSON_INCLUDE_DIR}/../src")
set(SIMDJSON_SRC
${SIMDJSON_SRC_DIR}/jsonioutil.cpp
${SIMDJSON_SRC_DIR}/jsonminifier.cpp
${SIMDJSON_SRC_DIR}/jsonparser.cpp
${SIMDJSON_SRC_DIR}/stage1_find_marks.cpp
${SIMDJSON_SRC_DIR}/stage2_build_tape.cpp
${SIMDJSON_SRC_DIR}/parsedjson.cpp
${SIMDJSON_SRC_DIR}/parsedjsoniterator.cpp
)
add_library(${SIMDJSON_LIBRARY} ${SIMDJSON_LIB_TYPE} ${SIMDJSON_SRC})
target_include_directories(${SIMDJSON_LIBRARY} PRIVATE "${SIMDJSON_INCLUDE_DIR}")
target_compile_options(${SIMDJSON_LIBRARY} PRIVATE -mavx2 -mbmi -mbmi2 -mpclmul)

View File

@ -1,7 +1,6 @@
#include <Common/Exception.h>
#include <Common/OptimizedRegularExpression.h>
#define MIN_LENGTH_FOR_STRSTR 3
#define MAX_SUBPATTERNS 5
@ -211,20 +210,18 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
{
if (!has_alternative_on_depth_0)
{
/** We choose the non-alternative substring of the maximum length, among the prefixes,
* or a non-alternative substring of maximum length.
*/
/// We choose the non-alternative substring of the maximum length for first search.
/// Tuning for typical usage domain
auto tuning_strings_condition = [](const std::string & str)
{
return str != "://" && str != "http://" && str != "www" && str != "Windows ";
};
size_t max_length = 0;
Substrings::const_iterator candidate_it = trivial_substrings.begin();
for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
{
if (((it->second == 0 && candidate_it->second != 0)
|| ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
/// Tuning for typical usage domain
&& (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://")))
&& (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http")))
&& (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www")))
&& (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows "))))
if (it->first.size() > max_length && tuning_strings_condition(it->first))
{
max_length = it->first.size();
candidate_it = it;

View File

@ -25,6 +25,7 @@
#cmakedefine01 USE_BROTLI
#cmakedefine01 USE_SSL
#cmakedefine01 USE_HYPERSCAN
#cmakedefine01 USE_SIMDJSON
#cmakedefine01 USE_LFALLOC
#cmakedefine01 USE_LFALLOC_RANDOM_HINT

View File

@ -1,6 +1,8 @@
#include <DataStreams/AggregatingSortedBlockInputStream.h>
#include <Common/typeid_cast.h>
#include <Common/StringUtils/StringUtils.h>
#include <DataTypes/DataTypeAggregateFunction.h>
#include <DataTypes/DataTypeCustomSimpleAggregateFunction.h>
namespace DB
@ -22,7 +24,7 @@ AggregatingSortedBlockInputStream::AggregatingSortedBlockInputStream(
ColumnWithTypeAndName & column = header.safeGetByPosition(i);
/// We leave only states of aggregate functions.
if (!startsWith(column.type->getName(), "AggregateFunction"))
if (!dynamic_cast<const DataTypeAggregateFunction *>(column.type.get()) && !dynamic_cast<const DataTypeCustomSimpleAggregateFunction *>(column.type->getCustomName()))
{
column_numbers_not_to_aggregate.push_back(i);
continue;
@ -40,7 +42,17 @@ AggregatingSortedBlockInputStream::AggregatingSortedBlockInputStream(
continue;
}
column_numbers_to_aggregate.push_back(i);
if (auto simple_aggr = dynamic_cast<const DataTypeCustomSimpleAggregateFunction *>(column.type->getCustomName()))
{
// simple aggregate function
SimpleAggregateDescription desc{simple_aggr->getFunction(), i};
columns_to_simple_aggregate.emplace_back(std::move(desc));
}
else
{
// standard aggregate function
column_numbers_to_aggregate.push_back(i);
}
}
}
@ -91,7 +103,11 @@ void AggregatingSortedBlockInputStream::merge(MutableColumns & merged_columns, s
/// if there are enough rows accumulated and the last one is calculated completely
if (key_differs && merged_rows >= max_block_size)
{
/// Write the simple aggregation result for the previous group.
insertSimpleAggregationResult(merged_columns);
return;
}
queue.pop();
@ -110,6 +126,14 @@ void AggregatingSortedBlockInputStream::merge(MutableColumns & merged_columns, s
for (auto & column_to_aggregate : columns_to_aggregate)
column_to_aggregate->insertDefault();
/// Write the simple aggregation result for the previous group.
if (merged_rows > 0)
insertSimpleAggregationResult(merged_columns);
/// Reset simple aggregation states for next row
for (auto & desc : columns_to_simple_aggregate)
desc.createState();
++merged_rows;
}
@ -127,6 +151,9 @@ void AggregatingSortedBlockInputStream::merge(MutableColumns & merged_columns, s
}
}
/// Write the simple aggregation result for the previous group.
insertSimpleAggregationResult(merged_columns);
finished = true;
}
@ -138,6 +165,21 @@ void AggregatingSortedBlockInputStream::addRow(SortCursor & cursor)
size_t j = column_numbers_to_aggregate[i];
columns_to_aggregate[i]->insertMergeFrom(*cursor->all_columns[j], cursor->pos);
}
for (auto & desc : columns_to_simple_aggregate)
{
auto & col = cursor->all_columns[desc.column_number];
desc.add_function(desc.function.get(), desc.state.data(), &col, cursor->pos, nullptr);
}
}
void AggregatingSortedBlockInputStream::insertSimpleAggregationResult(MutableColumns & merged_columns)
{
for (auto & desc : columns_to_simple_aggregate)
{
desc.function->insertResultInto(desc.state.data(), *merged_columns[desc.column_number]);
desc.destroyState();
}
}
}

View File

@ -7,6 +7,7 @@
#include <DataStreams/MergingSortedBlockInputStream.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Columns/ColumnAggregateFunction.h>
#include <Common/AlignedBuffer.h>
namespace DB
@ -38,10 +39,13 @@ private:
/// Read finished.
bool finished = false;
struct SimpleAggregateDescription;
/// Columns with which numbers should be aggregated.
ColumnNumbers column_numbers_to_aggregate;
ColumnNumbers column_numbers_not_to_aggregate;
std::vector<ColumnAggregateFunction *> columns_to_aggregate;
std::vector<SimpleAggregateDescription> columns_to_simple_aggregate;
RowRef current_key; /// The current primary key.
RowRef next_key; /// The primary key of the next row.
@ -54,6 +58,53 @@ private:
/** Extract all states of aggregate functions and merge them with the current group.
*/
void addRow(SortCursor & cursor);
/** Insert all values of current row for simple aggregate functions
*/
void insertSimpleAggregationResult(MutableColumns & merged_columns);
/// Stores information for aggregation of SimpleAggregateFunction columns
struct SimpleAggregateDescription
{
/// An aggregate function 'anyLast', 'sum'...
AggregateFunctionPtr function;
IAggregateFunction::AddFunc add_function;
size_t column_number;
AlignedBuffer state;
bool created = false;
SimpleAggregateDescription(const AggregateFunctionPtr & function_, const size_t column_number_) : function(function_), column_number(column_number_)
{
add_function = function->getAddressOfAddFunction();
state.reset(function->sizeOfData(), function->alignOfData());
}
void createState()
{
if (created)
return;
function->create(state.data());
created = true;
}
void destroyState()
{
if (!created)
return;
function->destroy(state.data());
created = false;
}
/// Explicitly destroy aggregation state if the stream is terminated
~SimpleAggregateDescription()
{
destroyState();
}
SimpleAggregateDescription() = default;
SimpleAggregateDescription(SimpleAggregateDescription &&) = default;
SimpleAggregateDescription(const SimpleAggregateDescription &) = delete;
};
};
}

View File

@ -1,6 +1,8 @@
#pragma once
#include <memory>
#include <cstddef>
#include <Core/Types.h>
namespace DB
{
@ -10,21 +12,21 @@ class WriteBuffer;
struct FormatSettings;
class IColumn;
/** Further refinment of the properties of data type.
*
* Contains methods for serialization/deserialization.
* Implementations of this interface represent a data type domain (example: IPv4)
* which is a refinement of the exsitgin type with a name and specific text
* representation.
*
* IDataTypeDomain is totally immutable object. You can always share them.
/** Allow to customize an existing data type and set a different name and/or text serialization/deserialization methods.
* See use in IPv4 and IPv6 data types, and also in SimpleAggregateFunction.
*/
class IDataTypeDomain
class IDataTypeCustomName
{
public:
virtual ~IDataTypeDomain() {}
virtual ~IDataTypeCustomName() {}
virtual const char* getName() const = 0;
virtual String getName() const = 0;
};
class IDataTypeCustomTextSerialization
{
public:
virtual ~IDataTypeCustomTextSerialization() {}
/** Text serialization for displaying on a terminal or saving into a text file, and the like.
* Without escaping or quoting.
@ -56,4 +58,31 @@ public:
virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const = 0;
};
using DataTypeCustomNamePtr = std::unique_ptr<const IDataTypeCustomName>;
using DataTypeCustomTextSerializationPtr = std::unique_ptr<const IDataTypeCustomTextSerialization>;
/** Describe a data type customization
*/
struct DataTypeCustomDesc
{
DataTypeCustomNamePtr name;
DataTypeCustomTextSerializationPtr text_serialization;
DataTypeCustomDesc(DataTypeCustomNamePtr name_, DataTypeCustomTextSerializationPtr text_serialization_)
: name(std::move(name_)), text_serialization(std::move(text_serialization_)) {}
};
using DataTypeCustomDescPtr = std::unique_ptr<DataTypeCustomDesc>;
/** A simple implementation of IDataTypeCustomName
*/
class DataTypeCustomFixedName : public IDataTypeCustomName
{
private:
String name;
public:
DataTypeCustomFixedName(String name_) : name(name_) {}
String getName() const override { return name; }
};
} // namespace DB

View File

@ -1,9 +1,9 @@
#include <Columns/ColumnsNumber.h>
#include <Common/Exception.h>
#include <Common/formatIPv6.h>
#include <DataTypes/DataTypeDomainWithSimpleSerialization.h>
#include <DataTypes/DataTypeCustomSimpleTextSerialization.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/IDataTypeDomain.h>
#include <DataTypes/DataTypeCustom.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/FunctionsCoding.h>
@ -20,20 +20,15 @@ namespace ErrorCodes
namespace
{
class DataTypeDomainIPv4 : public DataTypeDomainWithSimpleSerialization
class DataTypeCustomIPv4Serialization : public DataTypeCustomSimpleTextSerialization
{
public:
const char * getName() const override
{
return "IPv4";
}
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override
{
const auto col = checkAndGetColumn<ColumnUInt32>(&column);
if (!col)
{
throw Exception(String(getName()) + " domain can only serialize columns of type UInt32." + column.getName(), ErrorCodes::ILLEGAL_COLUMN);
throw Exception("IPv4 type can only serialize columns of type UInt32." + column.getName(), ErrorCodes::ILLEGAL_COLUMN);
}
char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'};
@ -48,7 +43,7 @@ public:
ColumnUInt32 * col = typeid_cast<ColumnUInt32 *>(&column);
if (!col)
{
throw Exception(String(getName()) + " domain can only deserialize columns of type UInt32." + column.getName(), ErrorCodes::ILLEGAL_COLUMN);
throw Exception("IPv4 type can only deserialize columns of type UInt32." + column.getName(), ErrorCodes::ILLEGAL_COLUMN);
}
char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'};
@ -63,20 +58,16 @@ public:
}
};
class DataTypeDomainIPv6 : public DataTypeDomainWithSimpleSerialization
class DataTypeCustomIPv6Serialization : public DataTypeCustomSimpleTextSerialization
{
public:
const char * getName() const override
{
return "IPv6";
}
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override
{
const auto col = checkAndGetColumn<ColumnFixedString>(&column);
if (!col)
{
throw Exception(String(getName()) + " domain can only serialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN);
throw Exception("IPv6 type domain can only serialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN);
}
char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'};
@ -91,7 +82,7 @@ public:
ColumnFixedString * col = typeid_cast<ColumnFixedString *>(&column);
if (!col)
{
throw Exception(String(getName()) + " domain can only deserialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN);
throw Exception("IPv6 type domain can only deserialize columns of type FixedString(16)." + column.getName(), ErrorCodes::ILLEGAL_COLUMN);
}
char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'};
@ -100,7 +91,7 @@ public:
std::string ipv6_value(IPV6_BINARY_LENGTH, '\0');
if (!parseIPv6(buffer, reinterpret_cast<unsigned char *>(ipv6_value.data())))
{
throw Exception(String("Invalid ") + getName() + " value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING);
throw Exception("Invalid IPv6 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING);
}
col->insertString(ipv6_value);
@ -111,8 +102,17 @@ public:
void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory)
{
factory.registerDataTypeDomain("UInt32", std::make_unique<DataTypeDomainIPv4>());
factory.registerDataTypeDomain("FixedString(16)", std::make_unique<DataTypeDomainIPv6>());
factory.registerSimpleDataTypeCustom("IPv4", []
{
return std::make_pair(DataTypeFactory::instance().get("UInt32"),
std::make_unique<DataTypeCustomDesc>(std::make_unique<DataTypeCustomFixedName>("IPv4"), std::make_unique<DataTypeCustomIPv4Serialization>()));
});
factory.registerSimpleDataTypeCustom("IPv6", []
{
return std::make_pair(DataTypeFactory::instance().get("FixedString(16)"),
std::make_unique<DataTypeCustomDesc>(std::make_unique<DataTypeCustomFixedName>("IPv6"), std::make_unique<DataTypeCustomIPv6Serialization>()));
});
}
} // namespace DB

View File

@ -0,0 +1,137 @@
#include <Common/FieldVisitors.h>
#include <Common/typeid_cast.h>
#include <IO/ReadHelpers.h>
#include <Columns/ColumnAggregateFunction.h>
#include <DataTypes/DataTypeCustomSimpleAggregateFunction.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTIdentifier.h>
#include <boost/algorithm/string/join.hpp>
namespace DB
{
namespace ErrorCodes
{
extern const int SYNTAX_ERROR;
extern const int BAD_ARGUMENTS;
extern const int PARAMETERS_TO_AGGREGATE_FUNCTIONS_MUST_BE_LITERALS;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int LOGICAL_ERROR;
}
static const std::vector<String> supported_functions{"any", "anyLast", "min", "max", "sum"};
String DataTypeCustomSimpleAggregateFunction::getName() const
{
std::stringstream stream;
stream << "SimpleAggregateFunction(" << function->getName();
if (!parameters.empty())
{
stream << "(";
for (size_t i = 0; i < parameters.size(); ++i)
{
if (i)
stream << ", ";
stream << applyVisitor(DB::FieldVisitorToString(), parameters[i]);
}
stream << ")";
}
for (const auto & argument_type : argument_types)
stream << ", " << argument_type->getName();
stream << ")";
return stream.str();
}
static std::pair<DataTypePtr, DataTypeCustomDescPtr> create(const ASTPtr & arguments)
{
String function_name;
AggregateFunctionPtr function;
DataTypes argument_types;
Array params_row;
if (!arguments || arguments->children.empty())
throw Exception("Data type SimpleAggregateFunction requires parameters: "
"name of aggregate function and list of data types for arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (const ASTFunction * parametric = arguments->children[0]->as<ASTFunction>())
{
if (parametric->parameters)
throw Exception("Unexpected level of parameters to aggregate function", ErrorCodes::SYNTAX_ERROR);
function_name = parametric->name;
const ASTs & parameters = parametric->arguments->as<ASTExpressionList &>().children;
params_row.resize(parameters.size());
for (size_t i = 0; i < parameters.size(); ++i)
{
const ASTLiteral * lit = parameters[i]->as<ASTLiteral>();
if (!lit)
throw Exception("Parameters to aggregate functions must be literals",
ErrorCodes::PARAMETERS_TO_AGGREGATE_FUNCTIONS_MUST_BE_LITERALS);
params_row[i] = lit->value;
}
}
else if (auto opt_name = getIdentifierName(arguments->children[0]))
{
function_name = *opt_name;
}
else if (arguments->children[0]->as<ASTLiteral>())
{
throw Exception("Aggregate function name for data type SimpleAggregateFunction must be passed as identifier (without quotes) or function",
ErrorCodes::BAD_ARGUMENTS);
}
else
throw Exception("Unexpected AST element passed as aggregate function name for data type SimpleAggregateFunction. Must be identifier or function.",
ErrorCodes::BAD_ARGUMENTS);
for (size_t i = 1; i < arguments->children.size(); ++i)
argument_types.push_back(DataTypeFactory::instance().get(arguments->children[i]));
if (function_name.empty())
throw Exception("Logical error: empty name of aggregate function passed", ErrorCodes::LOGICAL_ERROR);
function = AggregateFunctionFactory::instance().get(function_name, argument_types, params_row);
// check function
if (std::find(std::begin(supported_functions), std::end(supported_functions), function->getName()) == std::end(supported_functions))
{
throw Exception("Unsupported aggregate function " + function->getName() + ", supported functions are " + boost::algorithm::join(supported_functions, ","),
ErrorCodes::BAD_ARGUMENTS);
}
DataTypePtr storage_type = DataTypeFactory::instance().get(argument_types[0]->getName());
if (!function->getReturnType()->equals(*removeLowCardinality(storage_type)))
{
throw Exception("Incompatible data types between aggregate function '" + function->getName() + "' which returns " + function->getReturnType()->getName() + " and column storage type " + storage_type->getName(),
ErrorCodes::BAD_ARGUMENTS);
}
DataTypeCustomNamePtr custom_name = std::make_unique<DataTypeCustomSimpleAggregateFunction>(function, argument_types, params_row);
return std::make_pair(storage_type, std::make_unique<DataTypeCustomDesc>(std::move(custom_name), nullptr));
}
void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory)
{
factory.registerDataTypeCustom("SimpleAggregateFunction", create);
}
}

View File

@ -0,0 +1,42 @@
#pragma once
#include <DataTypes/DataTypeCustom.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Common/FieldVisitors.h>
#include <IO/ReadHelpers.h>
namespace DB
{
/** The type SimpleAggregateFunction(fct, type) is meant to be used in an AggregatingMergeTree. It behaves like a standard
* data type but when rows are merged, an aggregation function is applied.
*
* The aggregation function is limited to simple functions whose merge state is the final result:
* any, anyLast, min, max, sum
*
* Examples:
*
* SimpleAggregateFunction(sum, Nullable(Float64))
* SimpleAggregateFunction(anyLast, LowCardinality(Nullable(String)))
* SimpleAggregateFunction(anyLast, IPv4)
*
* Technically, a standard IDataType is instanciated and customized with IDataTypeCustomName and DataTypeCustomDesc.
*/
class DataTypeCustomSimpleAggregateFunction : public IDataTypeCustomName
{
private:
const AggregateFunctionPtr function;
const DataTypes argument_types;
const Array parameters;
public:
DataTypeCustomSimpleAggregateFunction(const AggregateFunctionPtr & function_, const DataTypes & argument_types_, const Array & parameters_)
: function(function_), argument_types(argument_types_), parameters(parameters_) {}
const AggregateFunctionPtr getFunction() const { return function; }
String getName() const override;
};
}

View File

@ -1,4 +1,4 @@
#include <DataTypes/DataTypeDomainWithSimpleSerialization.h>
#include <DataTypes/DataTypeCustomSimpleTextSerialization.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
@ -9,7 +9,7 @@ namespace
{
using namespace DB;
static String serializeToString(const DataTypeDomainWithSimpleSerialization & domain, const IColumn & column, size_t row_num, const FormatSettings & settings)
static String serializeToString(const DataTypeCustomSimpleTextSerialization & domain, const IColumn & column, size_t row_num, const FormatSettings & settings)
{
WriteBufferFromOwnString buffer;
domain.serializeText(column, row_num, buffer, settings);
@ -17,7 +17,7 @@ static String serializeToString(const DataTypeDomainWithSimpleSerialization & do
return buffer.str();
}
static void deserializeFromString(const DataTypeDomainWithSimpleSerialization & domain, IColumn & column, const String & s, const FormatSettings & settings)
static void deserializeFromString(const DataTypeCustomSimpleTextSerialization & domain, IColumn & column, const String & s, const FormatSettings & settings)
{
ReadBufferFromString istr(s);
domain.deserializeText(column, istr, settings);
@ -28,59 +28,59 @@ static void deserializeFromString(const DataTypeDomainWithSimpleSerialization &
namespace DB
{
DataTypeDomainWithSimpleSerialization::~DataTypeDomainWithSimpleSerialization()
DataTypeCustomSimpleTextSerialization::~DataTypeCustomSimpleTextSerialization()
{
}
void DataTypeDomainWithSimpleSerialization::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
void DataTypeCustomSimpleTextSerialization::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
writeEscapedString(serializeToString(*this, column, row_num, settings), ostr);
}
void DataTypeDomainWithSimpleSerialization::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
void DataTypeCustomSimpleTextSerialization::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String str;
readEscapedString(str, istr);
deserializeFromString(*this, column, str, settings);
}
void DataTypeDomainWithSimpleSerialization::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
void DataTypeCustomSimpleTextSerialization::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
writeQuotedString(serializeToString(*this, column, row_num, settings), ostr);
}
void DataTypeDomainWithSimpleSerialization::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
void DataTypeCustomSimpleTextSerialization::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String str;
readQuotedString(str, istr);
deserializeFromString(*this, column, str, settings);
}
void DataTypeDomainWithSimpleSerialization::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
void DataTypeCustomSimpleTextSerialization::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
writeCSVString(serializeToString(*this, column, row_num, settings), ostr);
}
void DataTypeDomainWithSimpleSerialization::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
void DataTypeCustomSimpleTextSerialization::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String str;
readCSVString(str, istr, settings.csv);
deserializeFromString(*this, column, str, settings);
}
void DataTypeDomainWithSimpleSerialization::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
void DataTypeCustomSimpleTextSerialization::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
writeJSONString(serializeToString(*this, column, row_num, settings), ostr, settings);
}
void DataTypeDomainWithSimpleSerialization::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
void DataTypeCustomSimpleTextSerialization::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String str;
readJSONString(str, istr);
deserializeFromString(*this, column, str, settings);
}
void DataTypeDomainWithSimpleSerialization::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
void DataTypeCustomSimpleTextSerialization::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
writeXMLString(serializeToString(*this, column, row_num, settings), ostr);
}

View File

@ -1,6 +1,6 @@
#pragma once
#include <DataTypes/IDataTypeDomain.h>
#include <DataTypes/DataTypeCustom.h>
namespace DB
{
@ -10,12 +10,12 @@ class WriteBuffer;
struct FormatSettings;
class IColumn;
/** Simple DataTypeDomain that uses serializeText/deserializeText
/** Simple IDataTypeCustomTextSerialization that uses serializeText/deserializeText
* for all serialization and deserialization. */
class DataTypeDomainWithSimpleSerialization : public IDataTypeDomain
class DataTypeCustomSimpleTextSerialization : public IDataTypeCustomTextSerialization
{
public:
virtual ~DataTypeDomainWithSimpleSerialization() override;
virtual ~DataTypeCustomSimpleTextSerialization() override;
// Methods that subclasses must override in order to get full serialization/deserialization support.
virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override = 0;

View File

@ -1,5 +1,5 @@
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/IDataTypeDomain.h>
#include <DataTypes/DataTypeCustom.h>
#include <Parsers/parseQuery.h>
#include <Parsers/ParserCreateQuery.h>
#include <Parsers/ASTFunction.h>
@ -115,19 +115,23 @@ void DataTypeFactory::registerSimpleDataType(const String & name, SimpleCreator
}, case_sensitiveness);
}
void DataTypeFactory::registerDataTypeDomain(const String & type_name, DataTypeDomainPtr domain, CaseSensitiveness case_sensitiveness)
void DataTypeFactory::registerDataTypeCustom(const String & family_name, CreatorWithCustom creator, CaseSensitiveness case_sensitiveness)
{
all_domains.reserve(all_domains.size() + 1);
auto data_type = get(type_name);
setDataTypeDomain(*data_type, *domain);
registerDataType(domain->getName(), [data_type](const ASTPtr & /*ast*/)
registerDataType(family_name, [creator](const ASTPtr & ast)
{
return data_type;
}, case_sensitiveness);
auto res = creator(ast);
res.first->setCustomization(std::move(res.second));
all_domains.emplace_back(std::move(domain));
return res.first;
}, case_sensitiveness);
}
void DataTypeFactory::registerSimpleDataTypeCustom(const String &name, SimpleCreatorWithCustom creator, CaseSensitiveness case_sensitiveness)
{
registerDataTypeCustom(name, [creator](const ASTPtr & /*ast*/)
{
return creator();
}, case_sensitiveness);
}
const DataTypeFactory::Creator& DataTypeFactory::findCreatorByName(const String & family_name) const
@ -153,11 +157,6 @@ const DataTypeFactory::Creator& DataTypeFactory::findCreatorByName(const String
throw Exception("Unknown data type family: " + family_name, ErrorCodes::UNKNOWN_TYPE);
}
void DataTypeFactory::setDataTypeDomain(const IDataType & data_type, const IDataTypeDomain & domain)
{
data_type.setDomain(&domain);
}
void registerDataTypeNumbers(DataTypeFactory & factory);
void registerDataTypeDecimal(DataTypeFactory & factory);
void registerDataTypeDate(DataTypeFactory & factory);
@ -175,6 +174,7 @@ void registerDataTypeNested(DataTypeFactory & factory);
void registerDataTypeInterval(DataTypeFactory & factory);
void registerDataTypeLowCardinality(DataTypeFactory & factory);
void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory);
void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory);
DataTypeFactory::DataTypeFactory()
@ -196,6 +196,7 @@ DataTypeFactory::DataTypeFactory()
registerDataTypeInterval(*this);
registerDataTypeLowCardinality(*this);
registerDataTypeDomainIPv4AndIPv6(*this);
registerDataTypeDomainSimpleAggregateFunction(*this);
}
DataTypeFactory::~DataTypeFactory()

View File

@ -17,9 +17,6 @@ namespace DB
class IDataType;
using DataTypePtr = std::shared_ptr<const IDataType>;
class IDataTypeDomain;
using DataTypeDomainPtr = std::unique_ptr<const IDataTypeDomain>;
/** Creates a data type by name of data type family and parameters.
*/
@ -28,6 +25,8 @@ class DataTypeFactory final : public ext::singleton<DataTypeFactory>, public IFa
private:
using SimpleCreator = std::function<DataTypePtr()>;
using DataTypesDictionary = std::unordered_map<String, Creator>;
using CreatorWithCustom = std::function<std::pair<DataTypePtr,DataTypeCustomDescPtr>(const ASTPtr & parameters)>;
using SimpleCreatorWithCustom = std::function<std::pair<DataTypePtr,DataTypeCustomDescPtr>()>;
public:
DataTypePtr get(const String & full_name) const;
@ -40,11 +39,13 @@ public:
/// Register a simple data type, that have no parameters.
void registerSimpleDataType(const String & name, SimpleCreator creator, CaseSensitiveness case_sensitiveness = CaseSensitive);
// Register a domain - a refinement of existing type.
void registerDataTypeDomain(const String & type_name, DataTypeDomainPtr domain, CaseSensitiveness case_sensitiveness = CaseSensitive);
/// Register a customized type family
void registerDataTypeCustom(const String & family_name, CreatorWithCustom creator, CaseSensitiveness case_sensitiveness = CaseSensitive);
/// Register a simple customized data type
void registerSimpleDataTypeCustom(const String & name, SimpleCreatorWithCustom creator, CaseSensitiveness case_sensitiveness = CaseSensitive);
private:
static void setDataTypeDomain(const IDataType & data_type, const IDataTypeDomain & domain);
const Creator& findCreatorByName(const String & family_name) const;
private:
@ -53,9 +54,6 @@ private:
/// Case insensitive data types will be additionally added here with lowercased name.
DataTypesDictionary case_insensitive_data_types;
// All domains are owned by factory and shared amongst DataType instances.
std::vector<DataTypeDomainPtr> all_domains;
DataTypeFactory();
~DataTypeFactory() override;

View File

@ -9,7 +9,7 @@
#include <IO/WriteHelpers.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/IDataTypeDomain.h>
#include <DataTypes/DataTypeCustom.h>
#include <DataTypes/NestedUtils.h>
@ -23,8 +23,7 @@ namespace ErrorCodes
extern const int DATA_TYPE_CANNOT_BE_PROMOTED;
}
IDataType::IDataType()
: domain(nullptr)
IDataType::IDataType() : custom_name(nullptr), custom_text_serialization(nullptr)
{
}
@ -34,9 +33,9 @@ IDataType::~IDataType()
String IDataType::getName() const
{
if (domain)
if (custom_name)
{
return domain->getName();
return custom_name->getName();
}
else
{
@ -142,9 +141,9 @@ void IDataType::insertDefaultInto(IColumn & column) const
void IDataType::serializeAsTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->serializeTextEscaped(column, row_num, ostr, settings);
custom_text_serialization->serializeTextEscaped(column, row_num, ostr, settings);
}
else
{
@ -154,9 +153,9 @@ void IDataType::serializeAsTextEscaped(const IColumn & column, size_t row_num, W
void IDataType::deserializeAsTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->deserializeTextEscaped(column, istr, settings);
custom_text_serialization->deserializeTextEscaped(column, istr, settings);
}
else
{
@ -166,9 +165,9 @@ void IDataType::deserializeAsTextEscaped(IColumn & column, ReadBuffer & istr, co
void IDataType::serializeAsTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->serializeTextQuoted(column, row_num, ostr, settings);
custom_text_serialization->serializeTextQuoted(column, row_num, ostr, settings);
}
else
{
@ -178,9 +177,9 @@ void IDataType::serializeAsTextQuoted(const IColumn & column, size_t row_num, Wr
void IDataType::deserializeAsTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->deserializeTextQuoted(column, istr, settings);
custom_text_serialization->deserializeTextQuoted(column, istr, settings);
}
else
{
@ -190,9 +189,9 @@ void IDataType::deserializeAsTextQuoted(IColumn & column, ReadBuffer & istr, con
void IDataType::serializeAsTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->serializeTextCSV(column, row_num, ostr, settings);
custom_text_serialization->serializeTextCSV(column, row_num, ostr, settings);
}
else
{
@ -202,9 +201,9 @@ void IDataType::serializeAsTextCSV(const IColumn & column, size_t row_num, Write
void IDataType::deserializeAsTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->deserializeTextCSV(column, istr, settings);
custom_text_serialization->deserializeTextCSV(column, istr, settings);
}
else
{
@ -214,9 +213,9 @@ void IDataType::deserializeAsTextCSV(IColumn & column, ReadBuffer & istr, const
void IDataType::serializeAsText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->serializeText(column, row_num, ostr, settings);
custom_text_serialization->serializeText(column, row_num, ostr, settings);
}
else
{
@ -226,9 +225,9 @@ void IDataType::serializeAsText(const IColumn & column, size_t row_num, WriteBuf
void IDataType::serializeAsTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->serializeTextJSON(column, row_num, ostr, settings);
custom_text_serialization->serializeTextJSON(column, row_num, ostr, settings);
}
else
{
@ -238,9 +237,9 @@ void IDataType::serializeAsTextJSON(const IColumn & column, size_t row_num, Writ
void IDataType::deserializeAsTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->deserializeTextJSON(column, istr, settings);
custom_text_serialization->deserializeTextJSON(column, istr, settings);
}
else
{
@ -250,9 +249,9 @@ void IDataType::deserializeAsTextJSON(IColumn & column, ReadBuffer & istr, const
void IDataType::serializeAsTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (domain)
if (custom_text_serialization)
{
domain->serializeTextXML(column, row_num, ostr, settings);
custom_text_serialization->serializeTextXML(column, row_num, ostr, settings);
}
else
{
@ -260,13 +259,14 @@ void IDataType::serializeAsTextXML(const IColumn & column, size_t row_num, Write
}
}
void IDataType::setDomain(const IDataTypeDomain* const new_domain) const
void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const
{
if (domain != nullptr)
{
throw Exception("Type " + getName() + " already has a domain.", ErrorCodes::LOGICAL_ERROR);
}
domain = new_domain;
/// replace only if not null
if (custom_desc_->name)
custom_name = std::move(custom_desc_->name);
if (custom_desc_->text_serialization)
custom_text_serialization = std::move(custom_desc_->text_serialization);
}
}

View File

@ -4,6 +4,7 @@
#include <Common/COW.h>
#include <boost/noncopyable.hpp>
#include <Core/Field.h>
#include <DataTypes/DataTypeCustom.h>
namespace DB
@ -12,7 +13,6 @@ namespace DB
class ReadBuffer;
class WriteBuffer;
class IDataTypeDomain;
class IDataType;
struct FormatSettings;
@ -459,18 +459,19 @@ public:
private:
friend class DataTypeFactory;
/** Sets domain on existing DataType, can be considered as second phase
* of construction explicitly done by DataTypeFactory.
* Will throw an exception if domain is already set.
/** Customize this DataType
*/
void setDomain(const IDataTypeDomain* newDomain) const;
void setCustomization(DataTypeCustomDescPtr custom_desc_) const;
private:
/** This is mutable to allow setting domain on `const IDataType` post construction,
* simplifying creation of domains for all types, without them even knowing
* of domain existence.
/** This is mutable to allow setting custom name and serialization on `const IDataType` post construction.
*/
mutable IDataTypeDomain const* domain;
mutable DataTypeCustomNamePtr custom_name;
mutable DataTypeCustomTextSerializationPtr custom_text_serialization;
public:
const IDataTypeCustomName * getCustomName() const { return custom_name.get(); }
const IDataTypeCustomTextSerialization * getCustomTextSerialization() const { return custom_text_serialization.get(); }
};

View File

@ -69,3 +69,8 @@ if (USE_HYPERSCAN)
target_link_libraries (clickhouse_functions PRIVATE ${HYPERSCAN_LIBRARY})
target_include_directories (clickhouse_functions SYSTEM PRIVATE ${HYPERSCAN_INCLUDE_DIR})
endif ()
if (USE_SIMDJSON)
target_link_libraries(clickhouse_functions PRIVATE ${SIMDJSON_LIBRARY})
target_include_directories(clickhouse_functions PRIVATE ${SIMDJSON_INCLUDE_DIR})
endif ()

View File

@ -0,0 +1,378 @@
#include <Functions/FunctionsJSON.h>
#include <Functions/FunctionFactory.h>
#include <Common/config.h>
#if USE_SIMDJSON
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
namespace DB
{
template <typename T>
class JSONNullableImplBase
{
public:
static DataTypePtr getType() { return std::make_shared<DataTypeNullable>(std::make_shared<T>()); }
static Field getDefault() { return {}; }
};
class JSONHasImpl : public JSONNullableImplBase<DataTypeUInt8>
{
public:
static constexpr auto name{"jsonHas"};
static Field getValue(ParsedJson::iterator &) { return {1}; }
};
class JSONLengthImpl : public JSONNullableImplBase<DataTypeUInt64>
{
public:
static constexpr auto name{"jsonLength"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (!pjh.is_object_or_array())
return getDefault();
size_t size = 0;
if (pjh.down())
{
size += 1;
while (pjh.next())
size += 1;
}
return {size};
}
};
class JSONTypeImpl : public JSONNullableImplBase<DataTypeString>
{
public:
static constexpr auto name{"jsonType"};
static Field getValue(ParsedJson::iterator & pjh)
{
switch (pjh.get_type())
{
case '[':
return "Array";
case '{':
return "Object";
case '"':
return "String";
case 'l':
return "Int64";
case 'd':
return "Float64";
case 't':
return "Bool";
case 'f':
return "Bool";
case 'n':
return "Null";
default:
return "Unknown";
}
}
};
class JSONExtractImpl
{
public:
static constexpr auto name{"jsonExtract"};
static DataTypePtr getType(const DataTypePtr & type)
{
WhichDataType which{type};
if (which.isNativeUInt() || which.isNativeInt() || which.isFloat() || which.isEnum() || which.isDateOrDateTime()
|| which.isStringOrFixedString() || which.isInterval())
return std::make_shared<DataTypeNullable>(type);
if (which.isArray())
{
auto array_type = static_cast<const DataTypeArray *>(type.get());
return std::make_shared<DataTypeArray>(getType(array_type->getNestedType()));
}
if (which.isTuple())
{
auto tuple_type = static_cast<const DataTypeTuple *>(type.get());
DataTypes types;
types.reserve(tuple_type->getElements().size());
for (const DataTypePtr & element : tuple_type->getElements())
{
types.push_back(getType(element));
}
return std::make_shared<DataTypeTuple>(std::move(types));
}
throw Exception{"Unsupported return type schema: " + type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
}
static Field getDefault(const DataTypePtr & type)
{
WhichDataType which{type};
if (which.isNativeUInt() || which.isNativeInt() || which.isFloat() || which.isEnum() || which.isDateOrDateTime()
|| which.isStringOrFixedString() || which.isInterval())
return {};
if (which.isArray())
return {Array{}};
if (which.isTuple())
{
auto tuple_type = static_cast<const DataTypeTuple *>(type.get());
Tuple tuple;
tuple.toUnderType().reserve(tuple_type->getElements().size());
for (const DataTypePtr & element : tuple_type->getElements())
tuple.toUnderType().push_back(getDefault(element));
return {tuple};
}
// should not reach
throw Exception{"Unsupported return type schema: " + type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
}
static Field getValue(ParsedJson::iterator & pjh, const DataTypePtr & type)
{
WhichDataType which{type};
if (which.isNativeUInt() || which.isNativeInt() || which.isEnum() || which.isDateOrDateTime() || which.isInterval())
{
if (pjh.is_integer())
return {pjh.get_integer()};
else
return getDefault(type);
}
if (which.isFloat())
{
if (pjh.is_integer())
return {static_cast<double>(pjh.get_integer())};
else if (pjh.is_double())
return {pjh.get_double()};
else
return getDefault(type);
}
if (which.isStringOrFixedString())
{
if (pjh.is_string())
return {String{pjh.get_string()}};
else
return getDefault(type);
}
if (which.isArray())
{
if (!pjh.is_object_or_array())
return getDefault(type);
auto array_type = static_cast<const DataTypeArray *>(type.get());
Array array;
bool first = true;
while (first ? pjh.down() : pjh.next())
{
first = false;
ParsedJson::iterator pjh1{pjh};
array.push_back(getValue(pjh1, array_type->getNestedType()));
}
return {array};
}
if (which.isTuple())
{
if (!pjh.is_object_or_array())
return getDefault(type);
auto tuple_type = static_cast<const DataTypeTuple *>(type.get());
Tuple tuple;
tuple.toUnderType().reserve(tuple_type->getElements().size());
bool valid = true;
bool first = true;
for (const DataTypePtr & element : tuple_type->getElements())
{
if (valid)
{
valid &= first ? pjh.down() : pjh.next();
first = false;
ParsedJson::iterator pjh1{pjh};
tuple.toUnderType().push_back(getValue(pjh1, element));
}
else
tuple.toUnderType().push_back(getDefault(element));
}
return {tuple};
}
// should not reach
throw Exception{"Unsupported return type schema: " + type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
}
};
class JSONExtractUIntImpl : public JSONNullableImplBase<DataTypeUInt64>
{
public:
static constexpr auto name{"jsonExtractUInt"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_integer())
return {pjh.get_integer()};
else
return getDefault();
}
};
class JSONExtractIntImpl : public JSONNullableImplBase<DataTypeInt64>
{
public:
static constexpr auto name{"jsonExtractInt"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_integer())
return {pjh.get_integer()};
else
return getDefault();
}
};
class JSONExtractFloatImpl : public JSONNullableImplBase<DataTypeFloat64>
{
public:
static constexpr auto name{"jsonExtractFloat"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_double())
return {pjh.get_double()};
else
return getDefault();
}
};
class JSONExtractBoolImpl : public JSONNullableImplBase<DataTypeUInt8>
{
public:
static constexpr auto name{"jsonExtractBool"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.get_type() == 't')
return {1};
else if (pjh.get_type() == 'f')
return {0};
else
return getDefault();
}
};
// class JSONExtractRawImpl: public JSONNullableImplBase<DataTypeString>
// {
// public:
// static constexpr auto name {"jsonExtractRaw"};
// static Field getValue(ParsedJson::iterator & pjh)
// {
// //
// }
// };
class JSONExtractStringImpl : public JSONNullableImplBase<DataTypeString>
{
public:
static constexpr auto name{"jsonExtractString"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_string())
return {String{pjh.get_string()}};
else
return getDefault();
}
};
}
#else
namespace DB
{
struct JSONHasImpl { static constexpr auto name{"jsonHas"}; };
struct JSONLengthImpl { static constexpr auto name{"jsonLength"}; };
struct JSONTypeImpl { static constexpr auto name{"jsonType"}; };
struct JSONExtractImpl { static constexpr auto name{"jsonExtract"}; };
struct JSONExtractUIntImpl { static constexpr auto name{"jsonExtractUInt"}; };
struct JSONExtractIntImpl { static constexpr auto name{"jsonExtractInt"}; };
struct JSONExtractFloatImpl { static constexpr auto name{"jsonExtractFloat"}; };
struct JSONExtractBoolImpl { static constexpr auto name{"jsonExtractBool"}; };
//struct JSONExtractRawImpl { static constexpr auto name {"jsonExtractRaw"}; };
struct JSONExtractStringImpl { static constexpr auto name{"jsonExtractString"}; };
}
#endif
namespace DB
{
void registerFunctionsJSON(FunctionFactory & factory)
{
#if USE_SIMDJSON
if (__builtin_cpu_supports("avx2"))
{
factory.registerFunction<FunctionJSONBase<JSONHasImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONLengthImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONTypeImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONExtractImpl, true>>();
factory.registerFunction<FunctionJSONBase<JSONExtractUIntImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONExtractIntImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONExtractFloatImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONExtractBoolImpl, false>>();
// factory.registerFunction<FunctionJSONBase<
// JSONExtractRawImpl,
// false
// >>();
factory.registerFunction<FunctionJSONBase<JSONExtractStringImpl, false>>();
return;
}
#endif
factory.registerFunction<FunctionJSONDummy<JSONHasImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONLengthImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONTypeImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractUIntImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractIntImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractFloatImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractBoolImpl>>();
//factory.registerFunction<FunctionJSONDummy<JSONExtractRawImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractStringImpl>>();
}
}

View File

@ -0,0 +1,243 @@
#pragma once
#include <Functions/IFunction.h>
#include <Common/config.h>
#if USE_SIMDJSON
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeFactory.h>
#include <Common/typeid_cast.h>
#include <ext/range.h>
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
#pragma clang diagnostic ignored "-Wnewline-eof"
#endif
#include <simdjson/jsonparser.h>
#ifdef __clang__
#pragma clang diagnostic pop
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
template <typename Impl, bool ExtraArg>
class FunctionJSONBase : public IFunction
{
private:
enum class Action
{
key = 1,
index = 2,
};
mutable std::vector<Action> actions;
mutable DataTypePtr virtual_type;
bool tryMove(ParsedJson::iterator & pjh, Action action, const Field & accessor)
{
switch (action)
{
case Action::key:
if (!pjh.is_object() || !pjh.move_to_key(accessor.get<String>().data()))
return false;
break;
case Action::index:
if (!pjh.is_object_or_array() || !pjh.down())
return false;
int steps = accessor.get<Int64>();
if (steps > 0)
steps -= 1;
else if (steps < 0)
{
steps += 1;
ParsedJson::iterator pjh1{pjh};
while (pjh1.next())
steps += 1;
}
else
return false;
for (const auto i : ext::range(0, steps))
{
(void)i;
if (!pjh.next())
return false;
}
break;
}
return true;
}
public:
static constexpr auto name = Impl::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionJSONBase>(); }
String getName() const override { return Impl::name; }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
bool useDefaultImplementationForConstants() const override { return true; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if constexpr (ExtraArg)
{
if (arguments.size() < 2)
throw Exception{"Function " + getName() + " requires at least two arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH};
auto col_type_const = typeid_cast<const ColumnConst *>(arguments[1].column.get());
if (!col_type_const)
throw Exception{"Illegal non-const column " + arguments[1].column->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN};
virtual_type = DataTypeFactory::instance().get(col_type_const->getValue<String>());
}
else
{
if (arguments.size() < 1)
throw Exception{"Function " + getName() + " requires at least one arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH};
}
if (!isString(arguments[0].type))
throw Exception{"Illegal type " + arguments[0].type->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
actions.reserve(arguments.size() - 1 - ExtraArg);
for (const auto i : ext::range(1 + ExtraArg, arguments.size()))
{
if (isString(arguments[i].type))
actions.push_back(Action::key);
else if (isInteger(arguments[i].type))
actions.push_back(Action::index);
else
throw Exception{"Illegal type " + arguments[i].type->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
}
if constexpr (ExtraArg)
return Impl::getType(virtual_type);
else
return Impl::getType();
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result_pos, size_t input_rows_count) override
{
MutableColumnPtr to{block.getByPosition(result_pos).type->createColumn()};
to->reserve(input_rows_count);
const ColumnPtr & arg_json = block.getByPosition(arguments[0]).column;
auto col_json_const = typeid_cast<const ColumnConst *>(arg_json.get());
auto col_json_string
= typeid_cast<const ColumnString *>(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get());
if (!col_json_string)
throw Exception{"Illegal column " + arg_json->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
const ColumnString::Chars & chars = col_json_string->getChars();
const ColumnString::Offsets & offsets = col_json_string->getOffsets();
size_t max_size = 1;
for (const auto i : ext::range(0, input_rows_count))
if (max_size < offsets[i] - offsets[i - 1] - 1)
max_size = offsets[i] - offsets[i - 1] - 1;
ParsedJson pj;
if (!pj.allocateCapacity(max_size))
throw Exception{"Can not allocate memory for " + std::to_string(max_size) + " units when parsing JSON",
ErrorCodes::CANNOT_ALLOCATE_MEMORY};
for (const auto i : ext::range(0, input_rows_count))
{
bool ok = json_parse(&chars[offsets[i - 1]], offsets[i] - offsets[i - 1] - 1, pj) == 0;
ParsedJson::iterator pjh{pj};
for (const auto j : ext::range(0, actions.size()))
{
if (!ok)
break;
ok = tryMove(pjh, actions[j], (*block.getByPosition(arguments[j + 1 + ExtraArg]).column)[i]);
}
if (ok)
{
if constexpr (ExtraArg)
to->insert(Impl::getValue(pjh, virtual_type));
else
to->insert(Impl::getValue(pjh));
}
else
{
if constexpr (ExtraArg)
to->insert(Impl::getDefault(virtual_type));
else
to->insert(Impl::getDefault());
}
}
block.getByPosition(result_pos).column = std::move(to);
}
};
}
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
template <typename Impl>
class FunctionJSONDummy : public IFunction
{
public:
static constexpr auto name = Impl::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionJSONDummy>(); }
String getName() const override { return Impl::name; }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName &) const override
{
throw Exception{"Function " + getName() + " is not supported without AVX2", ErrorCodes::NOT_IMPLEMENTED};
}
void executeImpl(Block &, const ColumnNumbers &, size_t, size_t) override
{
throw Exception{"Function " + getName() + " is not supported without AVX2", ErrorCodes::NOT_IMPLEMENTED};
}
};
}

View File

@ -164,43 +164,46 @@ struct NgramDistanceImpl
return num;
}
template <bool SaveNgrams>
static ALWAYS_INLINE inline size_t calculateNeedleStats(
const char * data,
const size_t size,
NgramStats & ngram_stats,
[[maybe_unused]] UInt16 * ngram_storage,
size_t (*read_code_points)(CodePoint *, const char *&, const char *),
UInt16 (*hash_functor)(const CodePoint *))
{
// To prevent size_t overflow below.
if (size < N)
return 0;
const char * start = data;
const char * end = data + size;
CodePoint cp[simultaneously_codepoints_num] = {};
/// read_code_points returns the position of cp where it stopped reading codepoints.
size_t found = read_code_points(cp, start, end);
/// We need to start for the first time here, because first N - 1 codepoints mean nothing.
size_t i = N - 1;
/// Initialize with this value because for the first time `found` does not initialize first N - 1 codepoints.
size_t len = -N + 1;
size_t len = 0;
do
{
len += found - N + 1;
for (; i + N <= found; ++i)
++ngram_stats[hash_functor(cp + i)];
{
++len;
UInt16 hash = hash_functor(cp + i);
if constexpr (SaveNgrams)
*ngram_storage++ = hash;
++ngram_stats[hash];
}
i = 0;
} while (start < end && (found = read_code_points(cp, start, end)));
return len;
}
template <bool ReuseStats>
static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric(
const char * data,
const size_t size,
NgramStats & ngram_stats,
size_t & distance,
[[maybe_unused]] UInt16 * ngram_storage,
size_t (*read_code_points)(CodePoint *, const char *&, const char *),
UInt16 (*hash_functor)(const CodePoint *))
{
@ -209,18 +212,6 @@ struct NgramDistanceImpl
const char * end = data + size;
CodePoint cp[simultaneously_codepoints_num] = {};
/// allocation tricks, most strings are relatively small
static constexpr size_t small_buffer_size = 256;
std::unique_ptr<UInt16[]> big_buffer;
UInt16 small_buffer[small_buffer_size];
UInt16 * ngram_storage = small_buffer;
if (size > small_buffer_size)
{
ngram_storage = new UInt16[size];
big_buffer.reset(ngram_storage);
}
/// read_code_points returns the position of cp where it stopped reading codepoints.
size_t found = read_code_points(cp, start, end);
/// We need to start for the first time here, because first N - 1 codepoints mean nothing.
@ -235,21 +226,25 @@ struct NgramDistanceImpl
--distance;
else
++distance;
ngram_storage[ngram_cnt++] = hash;
if constexpr (ReuseStats)
ngram_storage[ngram_cnt] = hash;
++ngram_cnt;
--ngram_stats[hash];
}
iter = 0;
} while (start < end && (found = read_code_points(cp, start, end)));
/// Return the state of hash map to its initial.
for (size_t i = 0; i < ngram_cnt; ++i)
++ngram_stats[ngram_storage[i]];
if constexpr (ReuseStats)
{
for (size_t i = 0; i < ngram_cnt; ++i)
++ngram_stats[ngram_storage[i]];
}
return ngram_cnt;
}
template <class Callback, class... Args>
static inline size_t dispatchSearcher(Callback callback, Args &&... args)
static inline auto dispatchSearcher(Callback callback, Args &&... args)
{
if constexpr (!UTF8)
return callback(std::forward<Args>(args)..., readASCIICodePoints, ASCIIHash);
@ -259,8 +254,7 @@ struct NgramDistanceImpl
static void constant_constant(std::string data, std::string needle, Float32 & res)
{
NgramStats common_stats;
memset(common_stats, 0, sizeof(common_stats));
NgramStats common_stats = {};
/// We use unsafe versions of getting ngrams, so I decided to use padded strings.
const size_t needle_size = needle.size();
@ -268,11 +262,11 @@ struct NgramDistanceImpl
needle.resize(needle_size + default_padding);
data.resize(data_size + default_padding);
size_t second_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats);
size_t second_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr);
size_t distance = second_size;
if (data_size <= max_string_size)
{
size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance);
size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);
res = distance * 1.f / std::max(first_size + second_size, size_t(1));
}
else
@ -281,18 +275,89 @@ struct NgramDistanceImpl
}
}
static void vector_vector(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const ColumnString::Chars & needle_data,
const ColumnString::Offsets & needle_offsets,
PaddedPODArray<Float32> & res)
{
const size_t haystack_offsets_size = haystack_offsets.size();
size_t prev_haystack_offset = 0;
size_t prev_needle_offset = 0;
NgramStats common_stats = {};
/// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
/// And we can reuse these storages in one thread because we care only about what was written to first places.
std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]);
std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]);
for (size_t i = 0; i < haystack_offsets_size; ++i)
{
const char * haystack = reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]);
const size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1;
const char * needle = reinterpret_cast<const char *>(&needle_data[prev_needle_offset]);
const size_t needle_size = needle_offsets[i] - prev_needle_offset - 1;
if (needle_size <= max_string_size && haystack_size <= max_string_size)
{
/// Get needle stats.
const size_t needle_stats_size = dispatchSearcher(
calculateNeedleStats<true>,
needle,
needle_size,
common_stats,
needle_ngram_storage.get());
size_t distance = needle_stats_size;
/// Combine with haystack stats, return to initial needle stats.
const size_t haystack_stats_size = dispatchSearcher(
calculateHaystackStatsAndMetric<true>,
haystack,
haystack_size,
common_stats,
distance,
haystack_ngram_storage.get());
/// Return to zero array stats.
for (size_t j = 0; j < needle_stats_size; ++j)
--common_stats[needle_ngram_storage[j]];
/// For now, common stats is a zero array.
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
}
else
{
/// Strings are too big, we are assuming they are not the same. This is done because of limiting number
/// of bigrams added and not allocating too much memory.
res[i] = 1.f;
}
prev_needle_offset = needle_offsets[i];
prev_haystack_offset = haystack_offsets[i];
}
}
static void vector_constant(
const ColumnString::Chars & data, const ColumnString::Offsets & offsets, std::string needle, PaddedPODArray<Float32> & res)
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
std::string needle,
PaddedPODArray<Float32> & res)
{
/// zeroing our map
NgramStats common_stats;
memset(common_stats, 0, sizeof(common_stats));
NgramStats common_stats = {};
/// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
/// And we can reuse these storages in one thread because we care only about what was written to first places.
std::unique_ptr<UInt16[]> ngram_storage(new UInt16[max_string_size]);
/// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case.
const size_t needle_size = needle.size();
needle.resize(needle_size + default_padding);
const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats);
const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr);
size_t distance = needle_stats_size;
size_t prev_offset = 0;
@ -303,7 +368,11 @@ struct NgramDistanceImpl
if (haystack_size <= max_string_size)
{
size_t haystack_stats_size = dispatchSearcher(
calculateHaystackStatsAndMetric, reinterpret_cast<const char *>(haystack), haystack_size, common_stats, distance);
calculateHaystackStatsAndMetric<true>,
reinterpret_cast<const char *>(haystack),
haystack_size, common_stats,
distance,
ngram_storage.get());
res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
}
else

View File

@ -62,10 +62,7 @@ public:
const ColumnConst * col_haystack_const = typeid_cast<const ColumnConst *>(&*column_haystack);
const ColumnConst * col_needle_const = typeid_cast<const ColumnConst *>(&*column_needle);
if (!col_needle_const)
throw Exception("Second argument of function " + getName() + " must be constant string.", ErrorCodes::ILLEGAL_COLUMN);
if (col_haystack_const)
if (col_haystack_const && col_needle_const)
{
ResultType res{};
const String & needle = col_needle_const->getValue<String>();
@ -88,8 +85,9 @@ public:
vec_res.resize(column_haystack->size());
const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle);
if (col_haystack_vector)
if (col_haystack_vector && col_needle_const)
{
const String & needle = col_needle_const->getValue<String>();
if (needle.size() > Impl::max_string_size)
@ -101,6 +99,27 @@ public:
}
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needle, vec_res);
}
else if (col_haystack_vector && col_needle_vector)
{
Impl::vector_vector(
col_haystack_vector->getChars(),
col_haystack_vector->getOffsets(),
col_needle_vector->getChars(),
col_needle_vector->getOffsets(),
vec_res);
}
else if (col_haystack_const && col_needle_vector)
{
const String & needle = col_haystack_const->getValue<String>();
if (needle.size() > Impl::max_string_size)
{
throw Exception(
"String size of needle is too big for function " + getName() + ". Should be at most "
+ std::to_string(Impl::max_string_size),
ErrorCodes::TOO_LARGE_STRING_SIZE);
}
Impl::vector_constant(col_needle_vector->getChars(), col_needle_vector->getOffsets(), needle, vec_res);
}
else
{
throw Exception(

View File

@ -0,0 +1,51 @@
#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <DataTypes/DataTypesNumber.h>
namespace DB
{
/** ignoreExceptNull(...) is a function that takes any arguments, and always returns 0 except Null.
*/
class FunctionIgnoreExceptNull : public IFunction
{
public:
static constexpr auto name = "ignoreExceptNull";
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionIgnoreExceptNull>();
}
bool isVariadic() const override
{
return true;
}
size_t getNumberOfArguments() const override
{
return 0;
}
String getName() const override
{
return name;
}
DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override
{
return std::make_shared<DataTypeUInt8>();
}
void executeImpl(Block & block, const ColumnNumbers &, size_t result, size_t input_rows_count) override
{
block.getByPosition(result).column = DataTypeUInt8().createColumnConst(input_rows_count, UInt64(0));
}
};
void registerFunctionIgnoreExceptNull(FunctionFactory & factory)
{
factory.registerFunction<FunctionIgnoreExceptNull>();
}
}

View File

@ -73,11 +73,6 @@ public:
return std::make_shared<DataTypeUInt8>();
}
bool useDefaultImplementationForNulls() const override
{
return false;
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
/// Second argument must be ColumnSet.
@ -89,7 +84,7 @@ public:
Block block_of_key_columns;
/// First argument may be tuple or single column.
/// First argument may be a tuple or a single column.
const ColumnWithTypeAndName & left_arg = block.getByPosition(arguments[0]);
const ColumnTuple * tuple = typeid_cast<const ColumnTuple *>(left_arg.column.get());
const ColumnConst * const_tuple = checkAndGetColumnConst<ColumnTuple>(left_arg.column.get());

View File

@ -40,6 +40,7 @@ void registerFunctionsMath(FunctionFactory &);
void registerFunctionsGeo(FunctionFactory &);
void registerFunctionsNull(FunctionFactory &);
void registerFunctionsFindCluster(FunctionFactory &);
void registerFunctionsJSON(FunctionFactory &);
void registerFunctionTransform(FunctionFactory &);
#if USE_ICU
@ -82,6 +83,7 @@ void registerFunctions()
registerFunctionsGeo(factory);
registerFunctionsNull(factory);
registerFunctionsFindCluster(factory);
registerFunctionsJSON(factory);
registerFunctionTransform(factory);
#if USE_ICU

View File

@ -19,6 +19,7 @@ void registerFunctionSleep(FunctionFactory &);
void registerFunctionSleepEachRow(FunctionFactory &);
void registerFunctionMaterialize(FunctionFactory &);
void registerFunctionIgnore(FunctionFactory &);
void registerFunctionIgnoreExceptNull(FunctionFactory &);
void registerFunctionIndexHint(FunctionFactory &);
void registerFunctionIdentity(FunctionFactory &);
void registerFunctionArrayJoin(FunctionFactory &);
@ -62,6 +63,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
registerFunctionSleepEachRow(factory);
registerFunctionMaterialize(factory);
registerFunctionIgnore(factory);
registerFunctionIgnoreExceptNull(factory);
registerFunctionIndexHint(factory);
registerFunctionIdentity(factory);
registerFunctionArrayJoin(factory);

View File

@ -1,3 +1,5 @@
#if defined(__linux__) || defined(__FreeBSD__)
#pragma GCC diagnostic ignored "-Wsign-compare"
#ifdef __clang__
#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
@ -69,3 +71,5 @@ TEST(ReadBufferAIOTest, TestReadAfterAIO)
EXPECT_EQ(read_after_eof_big, data.length());
EXPECT_TRUE(testbuf.eof());
}
#endif

View File

@ -328,10 +328,10 @@ void ActionsVisitor::visit(const ASTPtr & ast)
if (!only_consts)
{
/// We are in the part of the tree that we are not going to compute. You just need to define types.
/// Do not subquery and create sets. We treat "IN" as "ignore" function.
/// Do not subquery and create sets. We treat "IN" as "ignoreExceptNull" function.
actions_stack.addAction(ExpressionAction::applyFunction(
FunctionFactory::instance().get("ignore", context),
FunctionFactory::instance().get("ignoreExceptNull", context),
{ node->arguments->children.at(0)->getColumnName() },
getColumnName()));
}

View File

@ -513,13 +513,14 @@ void ExpressionAnalyzer::addJoinAction(ExpressionActionsPtr & actions, bool only
columns_added_by_join_list));
}
static void appendRequiredColumns(NameSet & required_columns, const Block & sample, const AnalyzedJoin & analyzed_join)
static void appendRequiredColumns(
NameSet & required_columns, const Block & sample, const Names & key_names_right, const JoinedColumnsList & columns_added_by_join)
{
for (auto & column : analyzed_join.key_names_right)
for (auto & column : key_names_right)
if (!sample.has(column))
required_columns.insert(column);
for (auto & column : analyzed_join.columns_from_joined_table)
for (auto & column : columns_added_by_join)
if (!sample.has(column.name_and_type.name))
required_columns.insert(column.name_and_type.name);
}
@ -606,7 +607,8 @@ bool ExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, bool only_ty
Names action_columns = joined_block_actions->getRequiredColumns();
NameSet required_columns(action_columns.begin(), action_columns.end());
appendRequiredColumns(required_columns, joined_block_actions->getSampleBlock(), analyzed_join);
appendRequiredColumns(
required_columns, joined_block_actions->getSampleBlock(), analyzed_join.key_names_right, columns_added_by_join);
Names original_columns = analyzed_join.getOriginalColumnNames(required_columns);

View File

@ -377,11 +377,11 @@ namespace
template <typename Map, typename KeyGetter>
struct Inserter<ASTTableJoin::Strictness::Any, Map, KeyGetter>
{
static ALWAYS_INLINE void insert(const Join &, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool)
static ALWAYS_INLINE void insert(const Join & join, Map & map, KeyGetter & key_getter, Block * stored_block, size_t i, Arena & pool)
{
auto emplace_result = key_getter.emplaceKey(map, i, pool);
if (emplace_result.isInserted() || emplace_result.getMapped().overwrite)
if (emplace_result.isInserted() || join.anyTakeLastRow())
new (&emplace_result.getMapped()) typename Map::mapped_type(stored_block, i);
}
};
@ -659,7 +659,7 @@ void addFoundRow(const typename Map::mapped_type & mapped, AddedColumns & added,
if constexpr (STRICTNESS == ASTTableJoin::Strictness::All)
{
for (auto current = &static_cast<const typename Map::mapped_type::Base_t &>(mapped); current != nullptr; current = current->next)
for (auto current = &static_cast<const typename Map::mapped_type::Base &>(mapped); current != nullptr; current = current->next)
{
added.appendFromBlock(*current->block, current->row_num);
++current_offset;
@ -1078,10 +1078,7 @@ void Join::joinGet(Block & block, const String & column_name) const
if (kind == ASTTableJoin::Kind::Left && strictness == ASTTableJoin::Strictness::Any)
{
if (any_take_last_row)
joinGetImpl(block, column_name, std::get<MapsAnyOverwrite>(maps));
else
joinGetImpl(block, column_name, std::get<MapsAny>(maps));
joinGetImpl(block, column_name, std::get<MapsAny>(maps));
}
else
throw Exception("joinGet only supports StorageJoin of type Left Any", ErrorCodes::LOGICAL_ERROR);
@ -1156,7 +1153,7 @@ struct AdderNonJoined<ASTTableJoin::Strictness::All, Mapped>
{
static void add(const Mapped & mapped, size_t & rows_added, MutableColumns & columns_right)
{
for (auto current = &static_cast<const typename Mapped::Base_t &>(mapped); current != nullptr; current = current->next)
for (auto current = &static_cast<const typename Mapped::Base &>(mapped); current != nullptr; current = current->next)
{
for (size_t j = 0; j < columns_right.size(); ++j)
columns_right[j]->insertFrom(*current->block->getByPosition(j).column.get(), current->row_num);

View File

@ -25,6 +25,43 @@
namespace DB
{
namespace JoinStuff
{
/// Base class with optional flag attached that's needed to implement RIGHT and FULL JOINs.
template <typename T, bool with_used>
struct WithFlags;
template <typename T>
struct WithFlags<T, true> : T
{
using Base = T;
using T::T;
mutable std::atomic<bool> used {};
void setUsed() const { used.store(true, std::memory_order_relaxed); } /// Could be set simultaneously from different threads.
bool getUsed() const { return used; }
};
template <typename T>
struct WithFlags<T, false> : T
{
using Base = T;
using T::T;
void setUsed() const {}
bool getUsed() const { return true; }
};
using MappedAny = WithFlags<RowRef, false>;
using MappedAll = WithFlags<RowRefList, false>;
using MappedAnyFull = WithFlags<RowRef, true>;
using MappedAllFull = WithFlags<RowRefList, true>;
using MappedAsof = WithFlags<AsofRowRefs, false>;
}
/** Data structure for implementation of JOIN.
* It is just a hash table: keys -> rows of joined ("right") table.
* Additionally, CROSS JOIN is supported: instead of hash table, it use just set of blocks without keys.
@ -132,36 +169,7 @@ public:
ASTTableJoin::Kind getKind() const { return kind; }
AsofRowRefs::Type getAsofType() const { return *asof_type; }
/** Depending on template parameter, adds or doesn't add a flag, that element was used (row was joined).
* Depending on template parameter, decide whether to overwrite existing values when encountering the same key again
* with_used is for implementation of RIGHT and FULL JOINs.
* overwrite is for implementation of StorageJoin with overwrite setting enabled
* NOTE: It is possible to store the flag in one bit of pointer to block or row_num. It seems not reasonable, because memory saving is minimal.
*/
template <bool with_used, bool overwrite_, typename Base>
struct WithFlags;
template <bool overwrite_, typename Base>
struct WithFlags<true, overwrite_, Base> : Base
{
static constexpr bool overwrite = overwrite_;
mutable std::atomic<bool> used {};
using Base::Base;
using Base_t = Base;
void setUsed() const { used.store(true, std::memory_order_relaxed); } /// Could be set simultaneously from different threads.
bool getUsed() const { return used; }
};
template <bool overwrite_, typename Base>
struct WithFlags<false, overwrite_, Base> : Base
{
static constexpr bool overwrite = overwrite_;
using Base::Base;
using Base_t = Base;
void setUsed() const {}
bool getUsed() const { return true; }
};
bool anyTakeLastRow() const { return any_take_last_row; }
/// Different types of keys for maps.
#define APPLY_FOR_JOIN_VARIANTS(M) \
@ -257,13 +265,11 @@ public:
}
};
using MapsAny = MapsTemplate<WithFlags<false, false, RowRef>>;
using MapsAnyOverwrite = MapsTemplate<WithFlags<false, true, RowRef>>;
using MapsAll = MapsTemplate<WithFlags<false, false, RowRefList>>;
using MapsAnyFull = MapsTemplate<WithFlags<true, false, RowRef>>;
using MapsAnyFullOverwrite = MapsTemplate<WithFlags<true, true, RowRef>>;
using MapsAllFull = MapsTemplate<WithFlags<true, false, RowRefList>>;
using MapsAsof = MapsTemplate<WithFlags<false, false, AsofRowRefs>>;
using MapsAny = MapsTemplate<JoinStuff::MappedAny>;
using MapsAll = MapsTemplate<JoinStuff::MappedAll>;
using MapsAnyFull = MapsTemplate<JoinStuff::MappedAnyFull>;
using MapsAllFull = MapsTemplate<JoinStuff::MappedAllFull>;
using MapsAsof = MapsTemplate<JoinStuff::MappedAsof>;
template <ASTTableJoin::Kind KIND>
struct KindTrait
@ -276,13 +282,14 @@ public:
static constexpr bool fill_right = static_in_v<KIND, ASTTableJoin::Kind::Right, ASTTableJoin::Kind::Full>;
};
template <bool fill_right, typename ASTTableJoin::Strictness, bool overwrite>
template <bool fill_right, typename ASTTableJoin::Strictness>
struct MapGetterImpl;
template <ASTTableJoin::Kind kind, ASTTableJoin::Strictness strictness, bool overwrite>
using Map = typename MapGetterImpl<KindTrait<kind>::fill_right, strictness, overwrite>::Map;
template <ASTTableJoin::Kind kind, ASTTableJoin::Strictness strictness>
using Map = typename MapGetterImpl<KindTrait<kind>::fill_right, strictness>::Map;
static constexpr std::array<ASTTableJoin::Strictness, 3> STRICTNESSES = {ASTTableJoin::Strictness::Any, ASTTableJoin::Strictness::All, ASTTableJoin::Strictness::Asof};
static constexpr std::array<ASTTableJoin::Strictness, 3> STRICTNESSES
= {ASTTableJoin::Strictness::Any, ASTTableJoin::Strictness::All, ASTTableJoin::Strictness::Asof};
static constexpr std::array<ASTTableJoin::Kind, 4> KINDS
= {ASTTableJoin::Kind::Left, ASTTableJoin::Kind::Inner, ASTTableJoin::Kind::Full, ASTTableJoin::Kind::Right};
@ -298,12 +305,12 @@ public:
if (kind == KINDS[i] && strictness == ASTTableJoin::Strictness::Any)
{
if constexpr (std::is_same_v<Func, MapInitTag>)
maps = Map<KINDS[i], ASTTableJoin::Strictness::Any, true>();
maps = Map<KINDS[i], ASTTableJoin::Strictness::Any>();
else
func(
std::integral_constant<ASTTableJoin::Kind, KINDS[i]>(),
std::integral_constant<ASTTableJoin::Strictness, ASTTableJoin::Strictness::Any>(),
std::get<Map<KINDS[i], ASTTableJoin::Strictness::Any, true>>(maps));
std::get<Map<KINDS[i], ASTTableJoin::Strictness::Any>>(maps));
return true;
}
return false;
@ -320,12 +327,12 @@ public:
if (kind == KINDS[i] && strictness == STRICTNESSES[j])
{
if constexpr (std::is_same_v<Func, MapInitTag>)
maps = Map<KINDS[i], STRICTNESSES[j], false>();
maps = Map<KINDS[i], STRICTNESSES[j]>();
else
func(
std::integral_constant<ASTTableJoin::Kind, KINDS[i]>(),
std::integral_constant<ASTTableJoin::Strictness, STRICTNESSES[j]>(),
std::get<Map<KINDS[i], STRICTNESSES[j], false>>(maps));
std::get<Map<KINDS[i], STRICTNESSES[j]>>(maps));
return true;
}
return false;
@ -359,7 +366,7 @@ private:
*/
BlocksList blocks;
std::variant<MapsAny, MapsAnyOverwrite, MapsAll, MapsAnyFull, MapsAnyFullOverwrite, MapsAllFull, MapsAsof> maps;
std::variant<MapsAny, MapsAll, MapsAnyFull, MapsAllFull, MapsAsof> maps;
/// Additional data - strings for string keys and continuation elements of single-linked lists of references to rows.
Arena pool;
@ -421,32 +428,32 @@ private:
using JoinPtr = std::shared_ptr<Join>;
using Joins = std::vector<JoinPtr>;
template <bool overwrite_>
struct Join::MapGetterImpl<false, ASTTableJoin::Strictness::Any, overwrite_>
template <>
struct Join::MapGetterImpl<false, ASTTableJoin::Strictness::Any>
{
using Map = std::conditional_t<overwrite_, MapsAnyOverwrite, MapsAny>;
};
template <bool overwrite_>
struct Join::MapGetterImpl<true, ASTTableJoin::Strictness::Any, overwrite_>
{
using Map = std::conditional_t<overwrite_, MapsAnyFullOverwrite, MapsAnyFull>;
using Map = MapsAny;
};
template <>
struct Join::MapGetterImpl<false, ASTTableJoin::Strictness::All, false>
struct Join::MapGetterImpl<true, ASTTableJoin::Strictness::Any>
{
using Map = MapsAnyFull;
};
template <>
struct Join::MapGetterImpl<false, ASTTableJoin::Strictness::All>
{
using Map = MapsAll;
};
template <>
struct Join::MapGetterImpl<true, ASTTableJoin::Strictness::All, false>
struct Join::MapGetterImpl<true, ASTTableJoin::Strictness::All>
{
using Map = MapsAllFull;
};
template <bool fill_right>
struct Join::MapGetterImpl<fill_right, ASTTableJoin::Strictness::Asof, false>
struct Join::MapGetterImpl<fill_right, ASTTableJoin::Strictness::Asof>
{
using Map = MapsAsof;
};

View File

@ -84,30 +84,30 @@ Block MergeTreeBaseSelectBlockInputStream::readFromPart()
MergeTreeReadTask & current_task, MergeTreeRangeReader & current_reader)
{
if (!current_task.size_predictor)
return current_max_block_size_rows;
return static_cast<size_t>(current_max_block_size_rows);
/// Calculates number of rows will be read using preferred_block_size_bytes.
/// Can't be less than avg_index_granularity.
UInt64 rows_to_read = current_task.size_predictor->estimateNumRows(current_preferred_block_size_bytes);
size_t rows_to_read = current_task.size_predictor->estimateNumRows(current_preferred_block_size_bytes);
if (!rows_to_read)
return rows_to_read;
UInt64 total_row_in_current_granule = current_reader.numRowsInCurrentGranule();
rows_to_read = std::max<UInt64>(total_row_in_current_granule, rows_to_read);
auto total_row_in_current_granule = current_reader.numRowsInCurrentGranule();
rows_to_read = std::max(total_row_in_current_granule, rows_to_read);
if (current_preferred_max_column_in_block_size_bytes)
{
/// Calculates number of rows will be read using preferred_max_column_in_block_size_bytes.
UInt64 rows_to_read_for_max_size_column
auto rows_to_read_for_max_size_column
= current_task.size_predictor->estimateNumRowsForMaxSizeColumn(current_preferred_max_column_in_block_size_bytes);
double filtration_ratio = std::max(min_filtration_ratio, 1.0 - current_task.size_predictor->filtered_rows_ratio);
auto rows_to_read_for_max_size_column_with_filtration
= static_cast<UInt64>(rows_to_read_for_max_size_column / filtration_ratio);
= static_cast<size_t>(rows_to_read_for_max_size_column / filtration_ratio);
/// If preferred_max_column_in_block_size_bytes is used, number of rows to read can be less than current_index_granularity.
rows_to_read = std::min(rows_to_read, rows_to_read_for_max_size_column_with_filtration);
}
UInt64 unread_rows_in_current_granule = current_reader.numPendingRowsInCurrentGranule();
auto unread_rows_in_current_granule = current_reader.numPendingRowsInCurrentGranule();
if (unread_rows_in_current_granule >= rows_to_read)
return rows_to_read;

View File

@ -338,7 +338,7 @@ private:
throw Exception("ASOF join storage is not implemented yet", ErrorCodes::NOT_IMPLEMENTED);
}
else
for (auto current = &static_cast<const typename Map::mapped_type::Base_t &>(it->getSecond()); current != nullptr;
for (auto current = &static_cast<const typename Map::mapped_type::Base &>(it->getSecond()); current != nullptr;
current = current->next)
{
for (size_t j = 0; j < columns.size(); ++j)

View File

@ -56,7 +56,7 @@ if [ "$DATA_DIR_PATTERN" != "$DATA_DIR" ]; then
cat $CLICKHOUSE_CONFIG | sed -e s!$DATA_DIR_PATTERN!$DATA_DIR! > $DATA_DIR/etc/server-config.xml
export CLICKHOUSE_CONFIG=$DATA_DIR/etc/server-config.xml
cp $CLICKHOUSE_CONFIG_USERS $DATA_DIR/etc
cp -r -L $CLICKHOUSE_CONFIG_USERS_D $DATA_DIR/etc
cp -R -L $CLICKHOUSE_CONFIG_USERS_D $DATA_DIR/etc
fi
CLICKHOUSE_EXTRACT_CONFIG=${CLICKHOUSE_EXTRACT_CONFIG:="${CLICKHOUSE_EXTRACT} --config=$CLICKHOUSE_CONFIG"}

View File

@ -22,6 +22,8 @@
</any_of>
</stop_conditions>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE match(URL, ' *tranio\\.ru/spain/*/commercial/*') settings max_threads=5]]></query>
<query><![CDATA[select count(position(URL, 'yandex')), count(position(URL, 'google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiSearchAllPositions(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select count(match(URL, 'yandex|google')) FROM hits_100m_single]]></query>

View File

@ -21,13 +21,16 @@
<total_time_ms>60000</total_time_ms>
</any_of>
</stop_conditions>
<query>SELECT DISTINCT URL,Title, ngramDistance(Title, URL) AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT SearchPhrase,Title, ngramDistance(Title, SearchPhrase) AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT Title, ngramDistance(Title, 'what is love') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT Title, ngramDistance(Title, 'baby dont hurt me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT Title, ngramDistance(Title, 'no more') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'wHAt Is lovE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'BABY DonT hUrT me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'nO MOrE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT URL,Title, ngramDistanceUTF8(Title, URL) AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT SearchPhrase,Title, ngramDistanceUTF8(Title, SearchPhrase) AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
<query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'чем занимаешься') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>

View File

@ -0,0 +1,30 @@
<test>
<name>Simple Join Query</name>
<type>once</type>
<stop_conditions>
<all_of>
<total_time_ms>30000</total_time_ms>
</all_of>
<any_of>
<min_time_not_changing_for_ms>5000</min_time_not_changing_for_ms>
<total_time_ms>60000</total_time_ms>
</any_of>
</stop_conditions>
<main_metric>
<total_time />
</main_metric>
<create_query>CREATE TABLE join_table(A Int64, S0 String, S1 String, S2 String, S3 String)ENGINE = MergeTree ORDER BY A</create_query>
<fill_query>INSERT INTO join_table SELECT number AS A, toString(arrayMap(x->x, range(100))) S0, S0 AS S1, S0 AS S2, S0 AS S3 from numbers(500000)</fill_query>
<query tag='UsingJoinWithoutSubquery'>SELECT COUNT() FROM join_table LEFT JOIN join_table USING A</query>
<query tag='UsingJoinWithSubquery'>SELECT COUNT() FROM join_table LEFT JOIN (SELECT A FROM join_table) USING A</query>
<query tag='OnExpressionJoinWithoutSubquery'>SELECT COUNT() FROM join_table AS left LEFT JOIN join_table AS right ON left.A = right.A</query>
<query tag='OnExpressionJoinWithoutSubquery'>SELECT COUNT() FROM join_table AS left LEFT JOIN (SELECT A FROM join_table) AS right ON left.A = right.A</query>
<drop_query>DROP TABLE IF EXISTS join_table</drop_query>
</test>

View File

@ -5,12 +5,12 @@
0
0
1
0
\N
1
0
0
1
0
\N
1
0
0
@ -27,7 +27,7 @@
1
0
1
0
\N
0
1
0
@ -35,12 +35,12 @@
0
0
1
0
\N
1
0
0
1
0
\N
1
0
0
@ -57,7 +57,7 @@
1
0
1
0
\N
0
1
0

View File

@ -33,6 +33,76 @@
1000
1000
1000
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
1000
1000
@ -40,6 +110,39 @@
77
636
1000
привет как дела?... Херсон 0
привет как дела клип - Яндекс.Видео 0
привет 0
пап привет как дела - Яндекс.Видео 0
привет братан как дела - Яндекс.Видео 0
http://metric.ru/ 0
http://autometric.ru/ 0
http://metrica.yandex.com/ 0
http://metris.ru/ 0
http://metrika.ru/ 0
0
0
привет как дела?... Херсон 600
пап привет как дела - Яндекс.Видео 684
привет как дела клип - Яндекс.Видео 692
привет братан как дела - Яндекс.Видео 707
привет 1000
http://metric.ru/ 1000
http://autometric.ru/ 1000
http://metrica.yandex.com/ 1000
http://metris.ru/ 1000
http://metrika.ru/ 1000
0
http://metric.ru/ 765
http://metris.ru/ 765
http://metrika.ru/ 778
http://autometric.ru/ 810
http://metrica.yandex.com/ 846
привет как дела?... Херсон 1000
привет как дела клип - Яндекс.Видео 1000
привет 1000
пап привет как дела - Яндекс.Видео 1000
привет братан как дела - Яндекс.Видео 1000
привет как дела?... Херсон 297
пап привет как дела - Яндекс.Видео 422
привет как дела клип - Яндекс.Видео 435
@ -152,6 +255,76 @@ http://metrika.ru/ 1000
1000
1000
1000
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
1000
1000
@ -159,6 +332,39 @@ http://metrika.ru/ 1000
77
636
1000
привет как дела?... Херсон 0
привет как дела клип - Яндекс.Видео 0
привет 0
пап привет как дела - Яндекс.Видео 0
привет братан как дела - Яндекс.Видео 0
http://metric.ru/ 0
http://autometric.ru/ 0
http://metrica.yandex.com/ 0
http://metris.ru/ 0
http://metrika.ru/ 0
0
0
привет как дела?... Херсон 600
пап привет как дела - Яндекс.Видео 684
привет как дела клип - Яндекс.Видео 692
привет братан как дела - Яндекс.Видео 707
привет 1000
http://metric.ru/ 1000
http://autometric.ru/ 1000
http://metrica.yandex.com/ 1000
http://metris.ru/ 1000
http://metrika.ru/ 1000
0
http://metric.ru/ 765
http://metris.ru/ 765
http://metrika.ru/ 778
http://autometric.ru/ 810
http://metrica.yandex.com/ 846
привет как дела?... Херсон 1000
привет как дела клип - Яндекс.Видео 1000
привет 1000
пап привет как дела - Яндекс.Видео 1000
привет братан как дела - Яндекс.Видео 1000
привет как дела?... Херсон 297
пап привет как дела - Яндекс.Видео 422
привет как дела клип - Яндекс.Видео 435
@ -293,6 +499,76 @@ http://metrika.ru/ 1000
1000
1000
1000
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
@ -412,6 +688,76 @@ http://metrika.ru/ 1000
1000
1000
1000
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0

View File

@ -6,6 +6,22 @@ select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абв
select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8(materialize(''), materialize('')))=round(1000 * ngramDistanceUTF8(materialize(''), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8(materialize('абв'), materialize('')))=round(1000 * ngramDistanceUTF8(materialize('абв'), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8(materialize(''), materialize('абв')))=round(1000 * ngramDistanceUTF8(materialize(''), 'абв')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), materialize('абвгдеёжз')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), materialize('абвгдеёж')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), materialize('гдеёзд')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), materialize('ёёёёёёёё')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8('', materialize('')))=round(1000 * ngramDistanceUTF8(materialize(''), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8('абв', materialize('')))=round(1000 * ngramDistanceUTF8(materialize('абв'), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8('', materialize('абв')))=round(1000 * ngramDistanceUTF8(materialize(''), 'абв')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8('абвгдеёжз', materialize('абвгдеёжз')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8('абвгдеёжз', materialize('абвгдеёж')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8('абвгдеёжз', materialize('гдеёзд')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8('абвгдеёжз', materialize('ёёёёёёёё')))=round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5;
select round(1000 * ngramDistanceUTF8('', ''));
select round(1000 * ngramDistanceUTF8('абв', ''));
select round(1000 * ngramDistanceUTF8('', 'абв'));
@ -18,6 +34,10 @@ drop table if exists test_distance;
create table test_distance (Title String) engine = Memory;
insert into test_distance values ('привет как дела?... Херсон'), ('привет как дела клип - Яндекс.Видео'), ('привет'), ('пап привет как дела - Яндекс.Видео'), ('привет братан как дела - Яндекс.Видео'), ('http://metric.ru/'), ('http://autometric.ru/'), ('http://metrica.yandex.com/'), ('http://metris.ru/'), ('http://metrika.ru/'), ('');
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, Title) as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, extract(Title, 'как дела')) as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, extract(Title, 'metr')) as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, 'привет как дела') as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, 'как привет дела') as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceUTF8(Title, 'metrika') as distance;
@ -35,6 +55,23 @@ select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеё
select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'гдеёЗД')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'ЁЁЁЁЁЁЁЁ')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''),materialize(''))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абв'),materialize(''))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абв'), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), materialize('абв'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), 'абв')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвГДЕёжз'), materialize('АбвгдЕёжз'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвГДЕёжз'), 'АбвгдЕёжз')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеёЖз'), materialize('АбвГдеёж'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеёЖз'), 'АбвГдеёж')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), materialize('гдеёЗД'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'гдеёЗД')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), materialize('ЁЁЁЁЁЁЁЁ'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'ЁЁЁЁЁЁЁЁ')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8('', materialize(''))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8('абв',materialize(''))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абв'), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8('', materialize('абв'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), 'абв')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвГДЕёжз', materialize('АбвгдЕёжз'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвГДЕёжз'), 'АбвгдЕёжз')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8('аБВГдеёЖз', materialize('АбвГдеёж'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеёЖз'), 'АбвГдеёж')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвгдеёжз', materialize('гдеёЗД'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'гдеёЗД')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвгдеёжз', materialize('ЁЁЁЁЁЁЁЁ'))) = round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'ЁЁЁЁЁЁЁЁ')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitiveUTF8('', ''));
select round(1000 * ngramDistanceCaseInsensitiveUTF8('абв', ''));
select round(1000 * ngramDistanceCaseInsensitiveUTF8('', 'абв'));
@ -43,6 +80,10 @@ select round(1000 * ngramDistanceCaseInsensitiveUTF8('аБВГдеёЖз', 'Аб
select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвгдеёжз', 'гдеёЗД'));
select round(1000 * ngramDistanceCaseInsensitiveUTF8('АБВГДеёжз', 'ЁЁЁЁЁЁЁЁ'));
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, Title) as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, extract(Title, 'как дела')) as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, extract(Title, 'metr')) as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'ПрИвЕт кАК ДЕЛа') as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'как ПРИВЕТ дела') as distance;
SELECT Title, round(1000 * distance) FROM test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metrika') as distance;
@ -62,6 +103,23 @@ select round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefg')) from syst
select round(1000 * ngramDistance(materialize('abcdefgh'), 'defgh')) from system.numbers limit 5;
select round(1000 * ngramDistance(materialize('abcdefgh'), 'aaaaaaaa')) from system.numbers limit 5;
select round(1000 * ngramDistance(materialize(''),materialize('')))=round(1000 * ngramDistance(materialize(''), '')) from system.numbers limit 5;
select round(1000 * ngramDistance(materialize('abc'),materialize('')))=round(1000 * ngramDistance(materialize('abc'), '')) from system.numbers limit 5;
select round(1000 * ngramDistance(materialize(''), materialize('abc')))=round(1000 * ngramDistance(materialize(''), 'abc')) from system.numbers limit 5;
select round(1000 * ngramDistance(materialize('abcdefgh'), materialize('abcdefgh')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefgh')) from system.numbers limit 5;
select round(1000 * ngramDistance(materialize('abcdefgh'), materialize('abcdefg')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefg')) from system.numbers limit 5;
select round(1000 * ngramDistance(materialize('abcdefgh'), materialize('defgh')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'defgh')) from system.numbers limit 5;
select round(1000 * ngramDistance(materialize('abcdefgh'), materialize('aaaaaaaa')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'aaaaaaaa')) from system.numbers limit 5;
select round(1000 * ngramDistance('',materialize('')))=round(1000 * ngramDistance(materialize(''), '')) from system.numbers limit 5;
select round(1000 * ngramDistance('abc', materialize('')))=round(1000 * ngramDistance(materialize('abc'), '')) from system.numbers limit 5;
select round(1000 * ngramDistance('', materialize('abc')))=round(1000 * ngramDistance(materialize(''), 'abc')) from system.numbers limit 5;
select round(1000 * ngramDistance('abcdefgh', materialize('abcdefgh')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefgh')) from system.numbers limit 5;
select round(1000 * ngramDistance('abcdefgh', materialize('abcdefg')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefg')) from system.numbers limit 5;
select round(1000 * ngramDistance('abcdefgh', materialize('defgh')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'defgh')) from system.numbers limit 5;
select round(1000 * ngramDistance('abcdefgh', materialize('aaaaaaaa')))=round(1000 * ngramDistance(materialize('abcdefgh'), 'aaaaaaaa')) from system.numbers limit 5;
select round(1000 * ngramDistance('', ''));
select round(1000 * ngramDistance('abc', ''));
select round(1000 * ngramDistance('', 'abc'));
@ -86,6 +144,22 @@ select round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), 'abcde
select round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), 'defgh')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), 'aaaaaaaa')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive(materialize(''), materialize('')))=round(1000 * ngramDistanceCaseInsensitive(materialize(''), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive(materialize('abc'), materialize('')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abc'), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive(materialize(''), materialize('abc')))=round(1000 * ngramDistanceCaseInsensitive(materialize(''), 'abc')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive(materialize('abCdefgH'), materialize('Abcdefgh')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abCdefgH'), 'Abcdefgh')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), materialize('abcdeFG')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), 'abcdeFG')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), materialize('defgh')))=round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), 'defgh')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), materialize('aaaaaaaa')))=round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), 'aaaaaaaa')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive('', materialize('')))=round(1000 * ngramDistanceCaseInsensitive(materialize(''), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive('abc', materialize('')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abc'), '')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive('', materialize('abc')))=round(1000 * ngramDistanceCaseInsensitive(materialize(''), 'abc')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive('abCdefgH', materialize('Abcdefgh')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abCdefgH'), 'Abcdefgh')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive('abcdefgh', materialize('abcdeFG')))=round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), 'abcdeFG')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive('AAAAbcdefgh', materialize('defgh')))=round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), 'defgh')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive('ABCdefgH', materialize('aaaaaaaa')))=round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), 'aaaaaaaa')) from system.numbers limit 5;
select round(1000 * ngramDistanceCaseInsensitive('', ''));
select round(1000 * ngramDistanceCaseInsensitive('abc', ''));
select round(1000 * ngramDistanceCaseInsensitive('', 'abc'));

View File

@ -0,0 +1,43 @@
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
SimpleAggregateFunction(sum, Float64)
0 0
1 2
2 4
3 6
4 8
5 10
6 12
7 14
8 16
9 18
0 0
1 2
2 4
3 6
4 8
5 10
6 12
7 14
8 16
9 18
1 1 2 2.2.2.2
SimpleAggregateFunction(anyLast, Nullable(String)) SimpleAggregateFunction(anyLast, LowCardinality(Nullable(String))) SimpleAggregateFunction(anyLast, IPv4)

View File

@ -0,0 +1,27 @@
-- basic test
drop table if exists test.simple;
create table test.simple (id UInt64,val SimpleAggregateFunction(sum,Double)) engine=AggregatingMergeTree order by id;
insert into test.simple select number,number from system.numbers limit 10;
select * from test.simple;
select * from test.simple final;
select toTypeName(val) from test.simple limit 1;
-- merge
insert into test.simple select number,number from system.numbers limit 10;
select * from test.simple final;
optimize table test.simple final;
select * from test.simple;
-- complex types
drop table if exists test.simple;
create table test.simple (id UInt64,nullable_str SimpleAggregateFunction(anyLast,Nullable(String)),low_str SimpleAggregateFunction(anyLast,LowCardinality(Nullable(String))),ip SimpleAggregateFunction(anyLast,IPv4)) engine=AggregatingMergeTree order by id;
insert into test.simple values(1,'1','1','1.1.1.1');
insert into test.simple values(1,null,'2','2.2.2.2');
select * from test.simple final;
select toTypeName(nullable_str),toTypeName(low_str),toTypeName(ip) from test.simple limit 1;

View File

@ -0,0 +1,16 @@
4
Object
1
1
a
hello
hello
3
Array
-100
200
300
('a','hello','b',[-100,200,300])
[-100,NULL,300]
['a','hello','b',NULL]
[(NULL,NULL,NULL),(NULL,NULL,NULL),(NULL,NULL,NULL),(-100,200,44)]

View File

@ -0,0 +1,16 @@
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}');
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}');
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'a');
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1);
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 2);
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a');
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
select jsonExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1);
select jsonExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2);
select jsonExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1);
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, String, String, Array(Float64))');
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Array(Int32)', 'b');
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Array(String)');
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Array(Tuple(Int16, Float32, UInt8))');

View File

@ -0,0 +1,9 @@
1
\N
\N
1
\N
\N
1
\N
\N

View File

@ -0,0 +1,11 @@
DROP TABLE IF EXISTS test.nullt;
CREATE TABLE test.nullt (c1 Nullable(UInt32), c2 Nullable(String))ENGINE = Log;
INSERT INTO test.nullt VALUES (1, 'abc'), (2, NULL), (NULL, NULL);
SELECT c2 = ('abc') FROM test.nullt;
SELECT c2 IN ('abc') FROM test.nullt;
SELECT c2 IN ('abc', NULL) FROM test.nullt;
DROP TABLE IF EXISTS test.nullt;

View File

@ -235,10 +235,20 @@ forcestop()
}
service_or_func()
{
if [ -x "/bin/systemctl" ] && [ -f /etc/systemd/system/clickhouse-server.service ] && [ -d /run/systemd/system ]; then
service $PROGRAM $1
else
$1
fi
}
forcerestart()
{
forcestop
start
# Should not use 'start' function if systemd active
service_or_func start
}
use_cron()
@ -291,7 +301,9 @@ main()
restart && enable_cron
;;
forcestop)
disable_cron && forcestop
# disable_cron returns false if cron disabled (with systemd) - not checking return status
disable_cron
forcestop
;;
forcerestart)
forcerestart && enable_cron
@ -300,16 +312,16 @@ main()
restart
;;
condstart)
is_running || start
is_running || service_or_func start
;;
condstop)
is_running && stop
is_running && service_or_func stop
;;
condrestart)
is_running && restart
is_running && service_or_func restart
;;
condreload)
is_running && restart
is_running && service_or_func restart
;;
initdb)
initdb

View File

@ -7,7 +7,8 @@ User=clickhouse
Group=clickhouse
Restart=always
RestartSec=30
ExecStart=/usr/bin/clickhouse-server --config=/etc/clickhouse-server/config.xml
RuntimeDirectory=clickhouse-server
ExecStart=/usr/bin/clickhouse-server --config=/etc/clickhouse-server/config.xml --pid-file=/run/clickhouse-server/clickhouse-server.pid
LimitCORE=infinity
LimitNOFILE=500000
CapabilityBoundingSet=CAP_NET_ADMIN CAP_IPC_LOCK

View File

@ -351,7 +351,8 @@ CREATE TABLE IF NOT EXISTS example_table
- If `input_format_defaults_for_omitted_fields = 0`, then the default value for `x` and `a` equals `0` (as the default value for the `UInt32` data type).
- If `input_format_defaults_for_omitted_fields = 1`, then the default value for `x` equals `0`, but the default value of `a` equals `x * 2`.
Enabling the option can affect the performance of inserts.
!!! note "Warning"
When inserting data with `insert_sample_with_metadata = 1`, ClickHouse consumes more computational resources, compared to insertion with `insert_sample_with_metadata = 0`.
### Selecting Data

View File

@ -110,9 +110,9 @@ When ClickHouse merges data parts, each group of consecutive rows with the same
For each resulting data part ClickHouse saves:
1. The first "cancel" and the last "state" rows, if the number of "state" and "cancel" rows matches.
1. The last "state" row, if there is one more "state" row than "cancel" rows.
1. The first "cancel" row, if there is one more "cancel" row than "state" rows.
1. None of the rows, in all other cases.
2. The last "state" row, if there is one more "state" row than "cancel" rows.
3. The first "cancel" row, if there is one more "cancel" row than "state" rows.
4. None of the rows, in all other cases.
The merge continues, but ClickHouse treats this situation as a logical error and records it in the server log. This error can occur if the same data were inserted more than once.

View File

@ -0,0 +1,80 @@
# JDBC
Allows ClickHouse to connect to external databases via [JDBC](https://en.wikipedia.org/wiki/Java_Database_Connectivity).
To implement JDBC connection, ClickHouse uses the separate program [clickhouse-jdbc-bridge](https://github.com/alex-krash/clickhouse-jdbc-bridge). You should run it as a daemon.
This engine supports the [Nullable](../../data_types/nullable.md) data type.
## Creating a Table
```
CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = JDBC(dbms_uri, external_database, external_table)
```
**Engine Parameters**
- `dbms_uri` — URI of an external DBMS.
Format: `jdbc:<driver_name>://<host_name>:<port>/?user=<username>&password=<password>`.
Example for MySQL: `jdbc:mysql://localhost:3306/?user=root&password=root`.
- `external_database` — Database in an external DBMS.
- `external_table` — A name of the table in `external_database`.
## Usage Example
Creating a table in MySQL (using native MySQL engine):
```
mysql> CREATE TABLE `test`.`test` (
-> `int_id` INT NOT NULL AUTO_INCREMENT,
-> `int_nullable` INT NULL DEFAULT NULL,
-> `float` FLOAT NOT NULL,
-> `float_nullable` FLOAT NULL DEFAULT NULL,
-> PRIMARY KEY (`int_id`));
Query OK, 0 rows affected (0,09 sec)
mysql> insert into test (`int_id`, `float`) VALUES (1,2);
Query OK, 1 row affected (0,00 sec)
mysql> select * from test;
+--------+--------------+-------+----------------+
| int_id | int_nullable | float | float_nullable |
+--------+--------------+-------+----------------+
| 1 | NULL | 2 | NULL |
+--------+--------------+-------+----------------+
1 row in set (0,00 sec)
```
Selecting data from the table in ClickHouse:
```
CREATE TABLE jdbc_table ENGINE JDBC('jdbc:mysql://localhost:3306/?user=root&password=root', 'test', 'test')
Ok.
DESCRIBE TABLE jdbc_table
┌─name───────────────┬─type───────────────┬─default_type─┬─default_expression─┐
│ int_id │ Int32 │ │ │
│ int_nullable │ Nullable(Int32) │ │ │
│ float │ Float32 │ │ │
│ float_nullable │ Nullable(Float32) │ │ │
└────────────────────┴────────────────────┴──────────────┴────────────────────┘
10 rows in set. Elapsed: 0.031 sec.
SELECT *
FROM jdbc_table
┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐
│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │
└────────┴──────────────┴───────┴────────────────┘
1 rows in set. Elapsed: 0.055 sec.
```
## See Also
- [JDBC table function](../../query_language/table_functions/jdbc.md).

View File

@ -49,7 +49,7 @@ For a description of request parameters, see [request description](../../query_l
**Query clauses**
- `ENGINE` - Name and parameters of the engine. `ENGINE = MergeTree()`. `MergeTree` engine does not have parameters.
- `ENGINE` Name and parameters of the engine. `ENGINE = MergeTree()`. `MergeTree` engine does not have parameters.
- `PARTITION BY` — The [partitioning key](custom_partitioning_key.md).
@ -59,7 +59,7 @@ For a description of request parameters, see [request description](../../query_l
A tuple of columns or arbitrary expressions. Example: `ORDER BY (CounterID, EventDate)`.
- `PRIMARY KEY` - The primary key if it [differs from the sorting key](mergetree.md).
- `PRIMARY KEY` The primary key if it [differs from the sorting key](mergetree.md).
By default the primary key is the same as the sorting key (which is specified by the `ORDER BY` clause). Thus in most cases it is unnecessary to specify a separate `PRIMARY KEY` clause.
@ -67,7 +67,7 @@ For a description of request parameters, see [request description](../../query_l
If a sampling expression is used, the primary key must contain it. Example: `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))`.
- `TTL` - An expression for setting storage time for rows.
- `TTL` — An expression for setting storage time for rows.
It must depends on `Date` or `DateTime` column and has one `Date` or `DateTime` column as a result. Example:
`TTL date + INTERVAL 1 DAY`
@ -78,7 +78,7 @@ For a description of request parameters, see [request description](../../query_l
- `index_granularity` — The granularity of an index. The number of data rows between the "marks" of an index. By default, 8192. The list of all available parameters you can see in [MergeTreeSettings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h).
- `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information refer the [setting description](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in the "Server configuration parameters" chapter.
- `min_merge_bytes_to_use_direct_io` — The minimum data volume for merge operation required for using of the direct I/O access to the storage disk. During the merging of the data parts, ClickHouse calculates summary storage volume of all the data to be merged. If the volume exceeds `min_merge_bytes_to_use_direct_io` bytes, then ClickHouse reads and writes the data using direct I/O interface (`O_DIRECT` option) to the storage disk. If `min_merge_bytes_to_use_direct_io = 0`, then the direct I/O is disabled. Default value: `10 * 1024 * 1024 * 1024` bytes.
- `merge_with_ttl_timeout` - Minimal time in seconds, when merge with TTL can be repeated. Default value: 86400 (1 day).
- `merge_with_ttl_timeout` Minimal time in seconds, when merge with TTL can be repeated. Default value: 86400 (1 day).
**Example of sections setting**
@ -310,7 +310,7 @@ Reading from a table is automatically parallelized.
## TTL for columns and tables
Data with expired TTL is removed while executing merges.
Data with expired TTL is removed while executing merges.
If TTL is set for column, when it expires, value will be replaced by default. If all values in columns were zeroed in part, data for this column will be deleted from disk for part. You are not allowed to set TTL for all key columns. If TTL is set for table, when it expires, row will be deleted.

View File

@ -57,3 +57,101 @@ There is currently no support for code points in the format `\uXXXX\uYYYY` that
[Original article](https://clickhouse.yandex/docs/en/query_language/functions/json_functions/) <!--hide-->
The following functions are based on [simdjson](https://github.com/lemire/simdjson) designed for more complex JSON parsing requirements. The assumption 2 mentioned above still applies.
## jsonHas(params[, accessors]...)
If the value exists in the JSON document, `1` will be returned.
If the value does not exist, `null` will be returned.
Examples:
```
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = null
```
An accessor can be either a string, a positive integer or a negative integer.
* String = access object member by key.
* Positive integer = access the n-th member/key from the beginning.
* Negative integer = access the n-th member/key from the end.
You may use integers to access both JSON arrays and JSON objects. JSON objects are accessed as an array with the `[key, value, key, value, ...]` layout.
So, for example:
```
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a'
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'hello'
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'b'
```
## jsonLength(params[, accessors]...)
Return the length of a JSON array or a JSON object. For JSON objects, both keys and values are included.
If the value does not exist or has a wrong type, `null` will be returned.
Examples:
```
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 4
```
The usage of accessors is the same as above.
## jsonType(params[, accessors]...)
Return the type of a JSON value.
If the value does not exist, `null` will be returned.
Examples:
```
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object'
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String'
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array'
```
The usage of accessors is the same as above.
## jsonExtractUInt(params[, accessors]...)
## jsonExtractInt(params[, accessors]...)
## jsonExtractFloat(params[, accessors]...)
## jsonExtractBool(params[, accessors]...)
## jsonExtractString(params[, accessors]...)
Parse data from JSON values which is similar to `visitParam` functions.
If the value does not exist or has a wrong type, `null` will be returned.
Examples:
```
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello'
select jsonExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100
select jsonExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0
select jsonExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300
```
The usage of accessors is the same as above.
## jsonExtract(params, type[, accessors]...)
Parse data from JSON values with a given ClickHouse data type.
If the value does not exist or has a wrong type, `null` will be returned.
Examples:
```
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Int8', 'b', 1) = -100
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, String, String, Array(Float64))') = ('a', 'hello', 'b', [-100.0, 200.0, 300.0])
```
The usage of accessors is the same as above.

View File

@ -102,7 +102,7 @@ The same thing as 'like', but negative.
## ngramDistance(haystack, needle)
Calculates the 4-gram distance between `haystack` and `needle`: counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns float number from 0 to 1 -- the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throws an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one.
Calculates the 4-gram distance between `haystack` and `needle`: counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns float number from 0 to 1 -- the closer to zero, the more strings are similar to each other. If the constant `needle` or `haystack` is more than 32Kb, throws an exception. If some of the non-constant `haystack` or `needle` strings are more than 32Kb, the distance is always one.
For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.

View File

@ -17,7 +17,7 @@ SELECT [DISTINCT] expr_list
[UNION ALL ...]
[INTO OUTFILE filename]
[FORMAT format]
[LIMIT n BY columns]
[LIMIT [offset_value, ]n BY columns]
```
All the clauses are optional, except for the required list of expressions immediately after SELECT.
@ -46,7 +46,7 @@ The FINAL modifier can be used only for a SELECT from a CollapsingMergeTree tabl
### SAMPLE Clause {#select-sample-clause}
The `SAMPLE` clause allows for approximated query processing.
The `SAMPLE` clause allows for approximated query processing.
When data sampling is enabled, the query is not performed on all the data, but only on a certain fraction of data (sample). For example, if you need to calculate statistics for all the visits, it is enough to execute the query on the 1/10 fraction of all the visits and then multiply the result by 10.
@ -67,11 +67,11 @@ The features of data sampling are listed below:
For the `SAMPLE` clause the following syntax is supported:
| SAMPLE&#160;Clause&#160;Syntax | Description |
| ---------------- | --------- |
| SAMPLE&#160;Clause&#160;Syntax | Description |
| ---------------- | --------- |
| `SAMPLE k` | Here `k` is the number from 0 to 1.</br>The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k)|
| `SAMPLE n` | Here `n` is a sufficiently large integer.</br>The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) |
| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1.</br>The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) |
| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1.</br>The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) |
#### SAMPLE k {#select-sample-k}
@ -129,7 +129,7 @@ SELECT avg(Duration)
FROM visits
SAMPLE 10000000
```
#### SAMPLE k OFFSET m {#select-sample-offset}
Here `k` and `m` are numbers from 0 to 1. Examples are shown below.
@ -216,10 +216,10 @@ The next example uses the `LEFT ARRAY JOIN` clause:
``` sql
SELECT s, arr
FROM arrays_test
FROM arrays_test
LEFT ARRAY JOIN arr;
```
```
```
┌─s───────────┬─arr─┐
│ Hello │ 1 │
│ Hello │ 2 │
@ -228,7 +228,7 @@ LEFT ARRAY JOIN arr;
│ World │ 5 │
│ Goodbye │ 0 │
└─────────────┴─────┘
```
```
#### Using Aliases
@ -240,7 +240,7 @@ FROM arrays_test
ARRAY JOIN arr AS a;
```
```
```
┌─s─────┬─arr─────┬─a─┐
│ Hello │ [1,2] │ 1 │
│ Hello │ [1,2] │ 2 │
@ -254,7 +254,7 @@ Using aliases, you can perform `ARRAY JOIN` with an external array. For example:
``` sql
SELECT s, arr_external
FROM arrays_test
FROM arrays_test
ARRAY JOIN [1, 2, 3] AS arr_external;
```
@ -325,7 +325,7 @@ INSERT INTO nested_test
VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []);
```
```
```
┌─s───────┬─nest.x──┬─nest.y─────┐
│ Hello │ [1,2] │ [10,20] │
│ World │ [3,4,5] │ [30,40,50] │
@ -339,7 +339,7 @@ FROM nested_test
ARRAY JOIN nest;
```
```
```
┌─s─────┬─nest.x─┬─nest.y─┐
│ Hello │ 1 │ 10 │
│ Hello │ 2 │ 20 │
@ -357,7 +357,7 @@ FROM nested_test
ARRAY JOIN `nest.x`, `nest.y`;
```
```
```
┌─s─────┬─nest.x─┬─nest.y─┐
│ Hello │ 1 │ 10 │
│ Hello │ 2 │ 20 │
@ -375,7 +375,7 @@ FROM nested_test
ARRAY JOIN `nest.x`;
```
```
```
┌─s─────┬─nest.x─┬─nest.y─────┐
│ Hello │ 1 │ [10,20] │
│ Hello │ 2 │ [10,20] │
@ -393,7 +393,7 @@ FROM nested_test
ARRAY JOIN nest AS n;
```
```
```
┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐
│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │
│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │
@ -411,7 +411,7 @@ FROM nested_test
ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num;
```
```
```
┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐
│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │
│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │
@ -669,11 +669,55 @@ When external aggregation is enabled, if there was less than ` max_bytes_before_
If you have an ORDER BY with a small LIMIT after GROUP BY, then the ORDER BY CLAUSE will not use significant amounts of RAM.
But if the ORDER BY doesn't have LIMIT, don't forget to enable external sorting (`max_bytes_before_external_sort`).
### LIMIT N BY Clause
### LIMIT BY Clause
LIMIT N BY COLUMNS selects the top N rows for each group of COLUMNS. LIMIT N BY is not related to LIMIT; they can both be used in the same query. The key for LIMIT N BY can contain any number of columns or expressions.
The query with the `LIMIT n BY expressions` clause selects the first `n` rows for each distinct value of `expressions`. The key for `LIMIT BY` can contain any number of [expressions](syntax.md#syntax-expressions).
Example:
ClickHouse supports the following syntax:
- `LIMIT [offset_value, ]n BY expressions`
- `LIMIT n OFFSET offset_value BY expressions`
During the query processing, ClickHouse selects data ordered by sorting key. Sorting key is set explicitly by [ORDER BY](#select-order-by) clause or implicitly as a property of table engine. Then ClickHouse applies `LIMIT n BY expressions` and returns the first `n` rows for each distinct combination of `expressions`. If `OFFSET` is specified, then for each data block, belonging to a distinct combination of `expressions`, ClickHouse skips `offset_value` rows from the beginning of the block, and returns not more than `n` rows as a result. If `offset_value` is bigger than the number of rows in the data block, then ClickHouse returns no rows from the block.
`LIMIT BY` is not related to `LIMIT`, they can both be used in the same query.
**Examples**
Sample table:
```sql
CREATE TABLE limit_by(id Int, val Int) ENGINE = Memory;
INSERT INTO limit_by values(1, 10), (1, 11), (1, 12), (2, 20), (2, 21);
```
Queries:
```sql
SELECT * FROM limit_by ORDER BY id, val LIMIT 2 BY id
```
```text
┌─id─┬─val─┐
│ 1 │ 10 │
│ 1 │ 11 │
│ 2 │ 20 │
│ 2 │ 21 │
└────┴─────┘
```
```sql
SELECT * FROM limit_by ORDER BY id, val LIMIT 1, 2 BY id
```
```text
┌─id─┬─val─┐
│ 1 │ 11 │
│ 1 │ 12 │
│ 2 │ 21 │
└────┴─────┘
```
The `SELECT * FROM limit_by ORDER BY id, val LIMIT 2 OFFSET 1 BY id` query returns the same result.
The following query returns the top 5 referrers for each `domain, device_type` pair, but not more than 100 rows (`LIMIT n BY + LIMIT`).
``` sql
SELECT
@ -688,8 +732,6 @@ LIMIT 5 BY domain, device_type
LIMIT 100
```
The query will select the top 5 referrers for each `domain, device_type` pair, but not more than 100 rows (`LIMIT n BY + LIMIT`).
### HAVING Clause
Allows filtering the result received after GROUP BY, similar to the WHERE clause.
@ -697,7 +739,7 @@ WHERE and HAVING differ in that WHERE is performed before aggregation (GROUP BY)
If aggregation is not performed, HAVING can't be used.
### ORDER BY Clause
### ORDER BY Clause {#select-order-by}
The ORDER BY clause contains a list of expressions, which can each be assigned DESC or ASC (the sorting direction). If the direction is not specified, ASC is assumed. ASC is sorted in ascending order, and DESC in descending order. The sorting direction applies to a single expression, not to the entire list. Example: `ORDER BY Visits DESC, SearchPhrase`

View File

@ -349,7 +349,8 @@ CREATE TABLE IF NOT EXISTS example_table
- Если `input_format_defaults_for_omitted_fields = 0`, то значение по умолчанию для `x` и `a` равняется `0` (поскольку это значение по умолчанию для типа данных `UInt32`.)
- Если `input_format_defaults_for_omitted_fields = 1`, то значение по умолчанию для `x` равно `0`, а значение по умолчанию `a` равно `x * 2`.
Включение этой опции может негативно влиять на производительность вставок.
!!! note "Предупреждение"
Если `insert_sample_with_metadata = 1`, то при обработке запросов ClickHouse потребляет больше вычислительных ресурсов, чем если `insert_sample_with_metadata = 0`.
### Выборка данных

View File

@ -91,7 +91,7 @@
## ngramDistance(haystack, needle)
Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 -- чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строки из `haystack` больше 32КБ, расстояние всегда равно единице.
Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 -- чем ближе к нулю, тем больше строки похожи друг на друга. Если константный `needle` или `haystack` больше чем 32КБ, кидается исключение. Если некоторые строки из неконстантного `haystack` или `needle` больше 32КБ, расстояние всегда равно единице.
Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.

View File

@ -91,6 +91,7 @@ nav:
- 'MaterializedView': 'operations/table_engines/materializedview.md'
- 'Memory': 'operations/table_engines/memory.md'
- 'Buffer': 'operations/table_engines/buffer.md'
- 'JDBC': 'operations/table_engines/jdbc.md'
- 'SQL Reference':
- 'hidden': 'query_language/index.md'

View File

@ -1,12 +1,12 @@
# CollapsingMergeTree {#table_engine-collapsingmergetree}
The engine inherits from [MergeTree](mergetree.md) and adds the logic of rows collapsing to data parts merge algorithm.
该引擎继承于 [MergeTree](mergetree.md),并在数据块合并算法中添加了折叠行的逻辑。
`CollapsingMergeTree` asynchronously deletes (collapses) pairs of rows if all of the fields in a row are equivalent excepting the particular field `Sign` which can have `1` and `-1` values. Rows without a pair are kept. For more details see the [Collapsing](#collapsing) section of the document.
`CollapsingMergeTree` 会异步的删除(折叠)这些除了特定列 `Sign``1``-1` 的值以外,其余所有字段的值都相等的成对的行。没有成对的行会被保留。更多的细节请看本文的[折叠](#table_engine-collapsingmergetree-collapsing)部分。
The engine may significantly reduce the volume of storage and increase efficiency of `SELECT` query as a consequence.
因此,该引擎可以显著的降低存储量并提高 `SELECT` 查询效率。
## Creating a Table
## 建表
```sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
@ -21,22 +21,22 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
[SETTINGS name=value, ...]
```
For a description of request parameters, see [request description](../../query_language/create.md).
请求参数的描述,参考[请求参数](../../query_language/create.md)。
**CollapsingMergeTree Parameters**
**CollapsingMergeTree 参数**
- `sign`Name of the column with the type of row: `1` is a "state" row, `-1` is a "cancel" row.
- `sign`类型列的名称: `1` 是“状态”行,`-1` 是“取消”行。
Column data type — `Int8`.
列数据类型 — `Int8`
**Query clauses**
**子句**
When creating a `CollapsingMergeTree` table, the same [clauses](mergetree.md) are required, as when creating a `MergeTree` table.
创建 `CollapsingMergeTree` 表时,需要与创建 `MergeTree` 表时相同的[子句](mergetree.md#table_engine-mergetree-creating-a-table)。
<details markdown="1"><summary>Deprecated Method for Creating a Table</summary>
<details markdown="1"><summary>已弃用的建表方法</summary>
!!! attention
Do not use this method in new projects and, if possible, switch the old projects to the method described above.
!!! attention "注意"
不要在新项目中使用该方法,可能的话,请将旧项目切换到上述方法。
```sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
@ -47,23 +47,23 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
) ENGINE [=] CollapsingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, sign)
```
All of the parameters excepting `sign` have the same meaning as in `MergeTree`.
除了 `sign` 的所有参数都与 `MergeTree` 中的含义相同。
- `sign`Name of the column with the type of row: `1` — "state" row, `-1` — "cancel" row.
- `sign`类型列的名称: `1` 是“状态”行,`-1` 是“取消”行。
Column Data Type — `Int8`.
列数据类型 — `Int8`
</details>
## Collapsing
## 折叠 {#table_engine-collapsingmergetree-collapsing}
### Data
### 数据
Consider the situation where you need to save continually changing data for some object. It sounds logical to have one row for an object and update it at any change, but update operation is expensive and slow for DBMS because it requires rewriting of the data in the storage. If you need to write data quickly, update not acceptable, but you can write the changes of an object sequentially as follows.
考虑你需要为某个对象保存不断变化的数据的情景。似乎为一个对象保存一行记录并在其发生任何变化时更新记录是合乎逻辑的,但是更新操作对 DBMS 来说是昂贵且缓慢的,因为它需要重写存储中的数据。如果你需要快速的写入数据,则更新操作是不可接受的,但是你可以按下面的描述顺序地更新一个对象的变化。
Use the particular column `Sign` when writing row. If `Sign = 1` it means that the row is a state of an object, let's call it "state" row. If `Sign = -1` it means the cancellation of the state of an object with the same attributes, let's call it "cancel" row.
在写入行的时候使用特定的列 `Sign`。如果 `Sign = 1` 则表示这一行是对象的状态,我们称之为“状态”行。如果 `Sign = -1` 则表示是对具有相同属性的状态行的取消,我们称之为“取消”行。
For example, we want to calculate how much pages users checked at some site and how long they were there. At some moment of time we write the following row with the state of user activity:
例如,我们想要计算用户在某个站点访问的页面页面数以及他们在那里停留的时间。在某个时候,我们将用户的活动状态写入下面这样的行。
```
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
@ -71,7 +71,7 @@ For example, we want to calculate how much pages users checked at some site and
└─────────────────────┴───────────┴──────────┴──────┘
```
At some moment later we register the change of user activity and write it with the following two rows.
一段时间后,我们写入下面的两行来记录用户活动的变化。
```
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
@ -80,11 +80,11 @@ At some moment later we register the change of user activity and write it with t
└─────────────────────┴───────────┴──────────┴──────┘
```
The first row cancels the previous state of the object (user). It should copy all of the fields of the canceled state excepting `Sign`.
第一行取消了这个对象(用户)的状态。它需要复制被取消的状态行的所有除了 `Sign` 的属性。
The second row contains the current state.
第二行包含了当前的状态。
As we need only the last state of user activity, the rows
因为我们只需要用户活动的最后状态,这些行
```
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
@ -93,43 +93,43 @@ As we need only the last state of user activity, the rows
└─────────────────────┴───────────┴──────────┴──────┘
```
can be deleted collapsing the invalid (old) state of an object. `CollapsingMergeTree` does this while merging of the data parts.
可以在折叠对象的失效(老的)状态的时候被删除。`CollapsingMergeTree` 会在合并数据片段的时候做这件事。
Why we need 2 rows for each change read in the "Algorithm" paragraph.
为什么我们每次改变需要 2 行可以阅读[算法](#table_engine-collapsingmergetree-collapsing-algorithm)段。
**Peculiar properties of such approach**
**这种方法的特殊属性**
1. The program that writes the data should remember the state of an object to be able to cancel it. "Cancel" string should be the copy of "state" string with the opposite `Sign`. It increases the initial size of storage but allows to write the data quickly.
2. Long growing arrays in columns reduce the efficiency of the engine due to load for writing. The more straightforward data, the higher efficiency.
3. `SELECT` results depend strongly on the consistency of object changes history. Be accurate when preparing data for inserting. You can get unpredictable results in inconsistent data, for example, negative values for non-negative metrics such as session depth.
1. 写入的程序应该记住对象的状态从而可以取消它。“取消”字符串应该是“状态”字符串的复制,除了相反的 `Sign`。它增加了存储的初始数据的大小,但使得写入数据更快速。
2. 由于写入的负载,列中长的增长阵列会降低引擎的效率。数据越简单,效率越高。
3. `SELECT` 的结果很大程度取决于对象变更历史的一致性。在准备插入数据时要准确。在不一致的数据中会得到不可预料的结果,例如,像会话深度这种非负指标的负值。
### Algorithm
### 算法 {#table_engine-collapsingmergetree-collapsing-algorithm}
When ClickHouse merges data parts, each group of consecutive rows with the same primary key is reduced to not more than two rows, one with `Sign = 1` ("state" row) and another with `Sign = -1` ("cancel" row). In other words, entries collapse.
当 ClickHouse 合并数据片段时,每组具有相同主键的连续行被减少到不超过两行,一行 `Sign = 1`(“状态”行),另一行 `Sign = -1` (“取消”行),换句话说,数据项被折叠了。
For each resulting data part ClickHouse saves:
对每个结果的数据部分 ClickHouse 保存:
1. The first "cancel" and the last "state" rows, if the number of "state" and "cancel" rows matches.
1. The last "state" row, if there is one more "state" row than "cancel" rows.
1. The first "cancel" row, if there is one more "cancel" row than "state" rows.
1. None of the rows, in all other cases.
1. 第一个“取消”和最后一个“状态”行,如果“状态”和“取消”行的数量匹配
2. 最后一个“状态”行,如果“状态”行比“取消”行多一个。
3. 第一个“取消”行,如果“取消”行比“状态”行多一个。
4. 没有行,在其他所有情况下。
The merge continues, but ClickHouse treats this situation as a logical error and records it in the server log. This error can occur if the same data were inserted more than once.
合并会继续,但是 ClickHouse 会把此情况视为逻辑错误并将其记录在服务日志中。这个错误会在相同的数据被插入超过一次时出现。
Thus, collapsing should not change the results of calculating statistics.
Changes gradually collapsed so that in the end only the last state of almost every object left.
因此,折叠不应该改变统计数据的结果。
变化逐渐地被折叠,因此最终几乎每个对象都只剩下了最后的状态。
The `Sign` is required because the merging algorithm doesn't guarantee that all of the rows with the same primary key will be in the same resulting data part and even on the same physical server. ClickHouse process `SELECT` queries with multiple threads, and it can not predict the order of rows in the result. The aggregation is required if there is a need to get completely "collapsed" data from `CollapsingMergeTree` table.
`Sign` 是必须的因为合并算法不保证所有有相同主键的行都会在同一个结果数据片段中甚至是在同一台物理服务器上。ClickHouse 用多线程来处理 `SELECT` 请求,所以它不能预测结果中行的顺序。如果要从 `CollapsingMergeTree` 表中获取完全“折叠”后的数据,则需要聚合。
To finalize collapsing write a query with `GROUP BY` clause and aggregate functions that account for the sign. For example, to calculate quantity, use `sum(Sign)` instead of `count()`. To calculate the sum of something, use `sum(Sign * x)` instead of `sum(x)`, and so on, and also add `HAVING sum(Sign) > 0`.
要完成折叠,请使用 `GROUP BY` 子句和用于处理符号的聚合函数编写请求。例如,要计算数量,使用 `sum(Sign)` 而不是 `count()`。要计算某物的总和,使用 `sum(Sign * x)` 而不是 `sum(x)`,并添加 `HAVING sum(Sign) > 0` 子句。
The aggregates `count`, `sum` and `avg` could be calculated this way. The aggregate `uniq` could be calculated if an object has at list one state not collapsed. The aggregates `min` and `max` could not be calculated because `CollapsingMergeTree` does not save values history of the collapsed states.
聚合体 `count`,`sum` 和 `avg` 可以用这种方式计算。如果一个对象至少有一个未被折叠的状态,则可以计算 `uniq` 聚合。`min` 和 `max` 聚合无法计算,因为 `CollaspingMergeTree` 不会保存折叠状态的值的历史记录。
If you need to extract data without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the `FINAL` modifier for the `FROM` clause. This approach is significantly less efficient.
如果你需要在不进行聚合的情况下获取数据(例如,要检查是否存在最新值与特定条件匹配的行),你可以在 `FROM` 从句中使用 `FINAL` 修饰符。这种方法显然是更低效的。
## Example of use
## 示例
Example data:
示例数据:
```
┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
@ -139,7 +139,7 @@ Example data:
└─────────────────────┴───────────┴──────────┴──────┘
```
Creation of the table:
建表:
```sql
CREATE TABLE UAct
@ -153,7 +153,7 @@ ENGINE = CollapsingMergeTree(Sign)
ORDER BY UserID
```
Insertion of the data:
插入数据:
```sql
INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1)
@ -162,9 +162,9 @@ INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1)
INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 6, 185, 1)
```
We use two `INSERT` queries to create two different data parts. If we insert the data with one query ClickHouse creates one data part and will not perform any merge ever.
我们使用两次 `INSERT` 请求来创建两个不同的数据片段。如果我们使用一个请求插入数据ClickHouse 只会创建一个数据片段且不会执行任何合并操作。
Getting the data:
获取数据:
```
SELECT * FROM UAct
@ -180,11 +180,11 @@ SELECT * FROM UAct
└─────────────────────┴───────────┴──────────┴──────┘
```
What do we see and where is collapsing?
With two `INSERT` queries, we created 2 data parts. The `SELECT` query was performed in 2 threads, and we got a random order of rows.
Collapsing not occurred because there was no merge of the data parts yet. ClickHouse merges data part in an unknown moment of time which we can not predict.
我们看到了什么,哪里有折叠?
Thus we need aggregation:
通过两个 `INSERT` 请求,我们创建了两个数据片段。`SELECT` 请求在两个线程中被执行我们得到了随机顺序的行。没有发生折叠是因为还没有合并数据片段。ClickHouse 在一个我们无法预料的未知时刻合并数据片段。
因此我们需要聚合:
```sql
SELECT
@ -201,7 +201,7 @@ HAVING sum(Sign) > 0
└─────────────────────┴───────────┴──────────┘
```
If we do not need aggregation and want to force collapsing, we can use `FINAL` modifier for `FROM` clause.
如果我们不需要聚合并想要强制进行折叠,我们可以在 `FROM` 从句中使用 `FINAL` 修饰语。
```sql
SELECT * FROM UAct FINAL
@ -212,6 +212,6 @@ SELECT * FROM UAct FINAL
└─────────────────────┴───────────┴──────────┴──────┘
```
This way of selecting the data is very inefficient. Don't use it for big tables.
这种查询数据的方法是非常低效的。不要在大表中使用它。
[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/collapsingmergetree/) <!--hide-->
[来源文章](https://clickhouse.yandex/docs/en/operations/table_engines/collapsingmergetree/) <!--hide-->

View File

@ -30,7 +30,7 @@ ORDER BY (CounterID, StartDate, intHash32(UserID));
新数据插入到表中时,这些数据会存储为按主键排序的新片段(块)。插入后 10-15 分钟,同一分区的各个片段会合并为一整个片段。
!!! 注意
!!! attention "注意"
那些有相同分区表达式值的数据片段才会合并。这意味着 **你不应该用太精细的分区方案**(超过一千个分区)。否则,会因为文件系统中的文件数量和需要找开的文件描述符过多,导致 `SELECT` 查询效率不佳。
可以通过 [system.parts](../system_tables.md#system_tables-parts) 表查看表片段和分区信息。例如,假设我们有一个 `visits` 表,按月分区。对 `system.parts` 表执行 `SELECT`
@ -67,7 +67,7 @@ WHERE table = 'visits'
- `3` 是数据块的最大编号。
- `1` 是块级别(即在由块组成的合并树中,该块在树中的深度)。
!!! 注意
!!! attention "注意"
旧类型表的片段名称为:`20190117_20190123_2_2_0`(最小日期 - 最大日期 - 最小块编号 - 最大块编号 - 块级别)。
`active` 列为片段状态。`1` 激活状态;`0` 非激活状态。非激活片段是那些在合并到较大片段之后剩余的源数据片段。损坏的数据片段也表示为非活动状态。

View File

@ -1083,7 +1083,7 @@ void BaseDaemon::initialize(Application & self)
}
/// Create pid file.
if (is_daemon && config().has("pid"))
if (config().has("pid"))
pid.seed(config().getString("pid"));
/// Change path for logging.