mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 00:52:02 +00:00
Merge remote-tracking branch 'upstream/master' into fix27
This commit is contained in:
commit
674d34e93e
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -134,3 +134,6 @@
|
||||
[submodule "contrib/libc-headers"]
|
||||
path = contrib/libc-headers
|
||||
url = https://github.com/ClickHouse-Extras/libc-headers.git
|
||||
[submodule "contrib/ryu"]
|
||||
path = contrib/ryu
|
||||
url = https://github.com/ClickHouse-Extras/ryu.git
|
||||
|
@ -210,7 +210,7 @@ set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -g3 -ggdb3
|
||||
|
||||
if (COMPILER_CLANG)
|
||||
# Exception unwinding doesn't work in clang release build without this option
|
||||
# TODO investigate if contrib/libcxxabi is out of date
|
||||
# TODO investigate that
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-omit-frame-pointer")
|
||||
endif ()
|
||||
|
@ -1,4 +1,8 @@
|
||||
option(ENABLE_ICU "Enable ICU" ${ENABLE_LIBRARIES})
|
||||
if (OS_LINUX)
|
||||
option(ENABLE_ICU "Enable ICU" ${ENABLE_LIBRARIES})
|
||||
else ()
|
||||
option(ENABLE_ICU "Enable ICU" 0)
|
||||
endif ()
|
||||
|
||||
if (ENABLE_ICU)
|
||||
|
||||
|
2
contrib/CMakeLists.txt
vendored
2
contrib/CMakeLists.txt
vendored
@ -32,6 +32,8 @@ if (USE_INTERNAL_DOUBLE_CONVERSION_LIBRARY)
|
||||
add_subdirectory (double-conversion-cmake)
|
||||
endif ()
|
||||
|
||||
add_subdirectory (ryu-cmake)
|
||||
|
||||
if (USE_INTERNAL_CITYHASH_LIBRARY)
|
||||
add_subdirectory (cityhash102)
|
||||
endif ()
|
||||
|
2
contrib/libc-headers
vendored
2
contrib/libc-headers
vendored
@ -1 +1 @@
|
||||
Subproject commit cd82fd9d8eefe50a47a0adf7c617c3ea7d558d11
|
||||
Subproject commit 9676d2645a713e679dc981ffd84dee99fcd68b8e
|
2
contrib/libcxx
vendored
2
contrib/libcxx
vendored
@ -1 +1 @@
|
||||
Subproject commit f7c63235238a71b7e0563fab8c7c5ec1b54831f6
|
||||
Subproject commit a8c453300879d0bf255f9d5959d42e2c8aac1bfb
|
@ -47,6 +47,11 @@ add_library(cxx ${SRCS})
|
||||
target_include_directories(cxx SYSTEM BEFORE PUBLIC $<BUILD_INTERFACE:${LIBCXX_SOURCE_DIR}/include>)
|
||||
target_compile_definitions(cxx PRIVATE -D_LIBCPP_BUILDING_LIBRARY -DLIBCXX_BUILDING_LIBCXXABI)
|
||||
|
||||
# Enable capturing stack traces for all exceptions.
|
||||
if (USE_UNWIND)
|
||||
target_compile_definitions(cxx PUBLIC -DSTD_EXCEPTION_HAS_STACK_TRACE=1)
|
||||
endif ()
|
||||
|
||||
target_compile_options(cxx PUBLIC $<$<COMPILE_LANGUAGE:CXX>:-nostdinc++>)
|
||||
|
||||
check_cxx_compiler_flag(-Wreserved-id-macro HAVE_WARNING_RESERVED_ID_MACRO)
|
||||
|
@ -32,6 +32,11 @@ target_compile_definitions(cxxabi PRIVATE -D_LIBCPP_BUILDING_LIBRARY)
|
||||
target_compile_options(cxxabi PRIVATE -nostdinc++ -fno-sanitize=undefined -Wno-macro-redefined) # If we don't disable UBSan, infinite recursion happens in dynamic_cast.
|
||||
target_link_libraries(cxxabi PUBLIC ${EXCEPTION_HANDLING_LIBRARY})
|
||||
|
||||
# Enable capturing stack traces for all exceptions.
|
||||
if (USE_UNWIND)
|
||||
target_compile_definitions(cxxabi PUBLIC -DSTD_EXCEPTION_HAS_STACK_TRACE=1)
|
||||
endif ()
|
||||
|
||||
install(
|
||||
TARGETS cxxabi
|
||||
EXPORT global
|
||||
|
@ -7,10 +7,14 @@ ELSE(CMAKE_SYSTEM_NAME STREQUAL "Linux")
|
||||
ENDIF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
|
||||
|
||||
IF(CMAKE_COMPILER_IS_GNUCXX)
|
||||
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_COMPILER_VERSION)
|
||||
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} -dumpfullversion OUTPUT_VARIABLE GCC_COMPILER_VERSION)
|
||||
|
||||
IF (NOT GCC_COMPILER_VERSION)
|
||||
MESSAGE(FATAL_ERROR "Cannot get gcc version")
|
||||
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_COMPILER_VERSION)
|
||||
|
||||
IF (NOT GCC_COMPILER_VERSION)
|
||||
MESSAGE(FATAL_ERROR "Cannot get gcc version")
|
||||
ENDIF (NOT GCC_COMPILER_VERSION)
|
||||
ENDIF (NOT GCC_COMPILER_VERSION)
|
||||
|
||||
STRING(REGEX MATCHALL "[0-9]+" GCC_COMPILER_VERSION ${GCC_COMPILER_VERSION})
|
||||
|
@ -28,6 +28,8 @@ if (ARCH_AMD64)
|
||||
endif ()
|
||||
|
||||
macro(perl_generate_asm FILE_IN FILE_OUT)
|
||||
get_filename_component(DIRNAME ${FILE_OUT} DIRECTORY)
|
||||
file(MAKE_DIRECTORY ${DIRNAME})
|
||||
add_custom_command(OUTPUT ${FILE_OUT}
|
||||
COMMAND /usr/bin/env perl ${FILE_IN} ${OPENSSL_SYSTEM} ${FILE_OUT}
|
||||
# ASM code has broken unwind tables (CFI), strip them.
|
||||
@ -70,6 +72,8 @@ if (ARCH_AMD64)
|
||||
elseif (ARCH_AARCH64)
|
||||
|
||||
macro(perl_generate_asm FILE_IN FILE_OUT)
|
||||
get_filename_component(DIRNAME ${FILE_OUT} DIRECTORY)
|
||||
file(MAKE_DIRECTORY ${DIRNAME})
|
||||
add_custom_command(OUTPUT ${FILE_OUT}
|
||||
COMMAND /usr/bin/env perl ${FILE_IN} "linux64" ${FILE_OUT})
|
||||
# Hope that the ASM code for AArch64 doesn't have broken CFI. Otherwise, add the same sed as for x86_64.
|
||||
|
1
contrib/ryu
vendored
Submodule
1
contrib/ryu
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 5b4a853534b47438b4d97935370f6b2397137c2b
|
10
contrib/ryu-cmake/CMakeLists.txt
Normal file
10
contrib/ryu-cmake/CMakeLists.txt
Normal file
@ -0,0 +1,10 @@
|
||||
# Builds the `ryu` library target from the sources of the contrib/ryu submodule.
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/ryu)

# Translation units that make up the library.
set(SRCS
    ${LIBRARY_DIR}/ryu/d2fixed.c
    ${LIBRARY_DIR}/ryu/d2s.c
    ${LIBRARY_DIR}/ryu/f2s.c
    ${LIBRARY_DIR}/ryu/generic_128.c
)

add_library(ryu ${SRCS})

# Headers are exposed to consumers relative to the submodule root; SYSTEM marks them
# as system headers and BEFORE prepends the directory to the include search path.
target_include_directories(ryu SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}")
|
2
contrib/zlib-ng
vendored
2
contrib/zlib-ng
vendored
@ -1 +1 @@
|
||||
Subproject commit 5673222fbd37ea89afb2ea73096f9bf5ec68ea31
|
||||
Subproject commit bba56a73be249514acfbc7d49aa2a68994dad8ab
|
@ -330,6 +330,7 @@ target_link_libraries (clickhouse_common_io
|
||||
${LINK_LIBRARIES_ONLY_ON_X86_64}
|
||||
PUBLIC
|
||||
${DOUBLE_CONVERSION_LIBRARIES}
|
||||
ryu
|
||||
PUBLIC
|
||||
${Poco_Net_LIBRARY}
|
||||
${Poco_Util_LIBRARY}
|
||||
|
@ -300,7 +300,7 @@ private:
|
||||
&& std::string::npos == embedded_stack_trace_pos)
|
||||
{
|
||||
std::cerr << "Stack trace:" << std::endl
|
||||
<< e.getStackTrace().toString();
|
||||
<< e.getStackTraceString();
|
||||
}
|
||||
|
||||
/// If exception code isn't zero, we should return non-zero return code anyway.
|
||||
@ -327,6 +327,78 @@ private:
|
||||
|| (now.month() == 1 && now.day() <= 5);
|
||||
}
|
||||
|
||||
bool isChineseNewYearMode(const String & local_tz)
|
||||
{
|
||||
/// Days of Dec. 20 in Chinese calendar starting from year 2019 to year 2105
|
||||
static constexpr UInt16 chineseNewYearIndicators[]
|
||||
= {18275, 18659, 19014, 19368, 19752, 20107, 20491, 20845, 21199, 21583, 21937, 22292, 22676, 23030, 23414, 23768, 24122, 24506,
|
||||
24860, 25215, 25599, 25954, 26308, 26692, 27046, 27430, 27784, 28138, 28522, 28877, 29232, 29616, 29970, 30354, 30708, 31062,
|
||||
31446, 31800, 32155, 32539, 32894, 33248, 33632, 33986, 34369, 34724, 35078, 35462, 35817, 36171, 36555, 36909, 37293, 37647,
|
||||
38002, 38386, 38740, 39095, 39479, 39833, 40187, 40571, 40925, 41309, 41664, 42018, 42402, 42757, 43111, 43495, 43849, 44233,
|
||||
44587, 44942, 45326, 45680, 46035, 46418, 46772, 47126, 47510, 47865, 48249, 48604, 48958, 49342};
|
||||
static constexpr size_t N = sizeof(chineseNewYearIndicators) / sizeof(chineseNewYearIndicators[0]);
|
||||
|
||||
/// All time zone names are acquired from https://www.iana.org/time-zones
|
||||
static constexpr const char * chineseNewYearTimeZoneIndicators[] = {
|
||||
/// Time zones celebrating Chinese new year.
|
||||
"Asia/Shanghai",
|
||||
"Asia/Chongqing",
|
||||
"Asia/Harbin",
|
||||
"Asia/Urumqi",
|
||||
"Asia/Hong_Kong",
|
||||
"Asia/Chungking",
|
||||
"Asia/Macao",
|
||||
"Asia/Macau",
|
||||
"Asia/Taipei",
|
||||
"Asia/Singapore",
|
||||
|
||||
/// Time zones celebrating Chinese new year but with different festival names. Let's not print the message for now.
|
||||
// "Asia/Brunei",
|
||||
// "Asia/Ho_Chi_Minh",
|
||||
// "Asia/Hovd",
|
||||
// "Asia/Jakarta",
|
||||
// "Asia/Jayapura",
|
||||
// "Asia/Kashgar",
|
||||
// "Asia/Kuala_Lumpur",
|
||||
// "Asia/Kuching",
|
||||
// "Asia/Makassar",
|
||||
// "Asia/Pontianak",
|
||||
// "Asia/Pyongyang",
|
||||
// "Asia/Saigon",
|
||||
// "Asia/Seoul",
|
||||
// "Asia/Ujung_Pandang",
|
||||
// "Asia/Ulaanbaatar",
|
||||
// "Asia/Ulan_Bator",
|
||||
};
|
||||
static constexpr size_t M = sizeof(chineseNewYearTimeZoneIndicators) / sizeof(chineseNewYearTimeZoneIndicators[0]);
|
||||
|
||||
time_t current_time = time(nullptr);
|
||||
|
||||
if (chineseNewYearTimeZoneIndicators + M
|
||||
== std::find_if(chineseNewYearTimeZoneIndicators, chineseNewYearTimeZoneIndicators + M, [&local_tz](const char * tz)
|
||||
{
|
||||
return tz == local_tz;
|
||||
}))
|
||||
return false;
|
||||
|
||||
/// It's bad to be intrusive.
|
||||
if (current_time % 3 != 0)
|
||||
return false;
|
||||
|
||||
auto days = DateLUT::instance().toDayNum(current_time).toUnderType();
|
||||
for (auto i = 0ul; i < N; ++i)
|
||||
{
|
||||
auto d = chineseNewYearIndicators[i];
|
||||
|
||||
/// Let's celebrate until Lantern Festival
|
||||
if (d <= days && d + 25u >= days)
|
||||
return true;
|
||||
else if (d > days)
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int mainImpl()
|
||||
{
|
||||
UseSSL use_ssl;
|
||||
@ -374,7 +446,7 @@ private:
|
||||
connect();
|
||||
|
||||
/// Initialize DateLUT here to avoid counting time spent here as query execution time.
|
||||
DateLUT::instance();
|
||||
const auto local_tz = DateLUT::instance().getTimeZone();
|
||||
if (!context.getSettingsRef().use_client_time_zone)
|
||||
{
|
||||
const auto & time_zone = connection->getServerTimezone(connection_parameters.timeouts);
|
||||
@ -540,7 +612,12 @@ private:
|
||||
|
||||
loop();
|
||||
|
||||
std::cout << (isNewYearMode() ? "Happy new year." : "Bye.") << std::endl;
|
||||
if (isNewYearMode())
|
||||
std::cout << "Happy new year." << std::endl;
|
||||
else if (isChineseNewYearMode(local_tz))
|
||||
std::cout << "Happy Chinese new year. 春节快乐!" << std::endl;
|
||||
else
|
||||
std::cout << "Bye." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
@ -714,7 +791,7 @@ private:
|
||||
|
||||
if (config().getBool("stacktrace", false))
|
||||
std::cerr << "Stack trace:" << std::endl
|
||||
<< e.getStackTrace().toString() << std::endl;
|
||||
<< e.getStackTraceString() << std::endl;
|
||||
|
||||
std::cerr << std::endl;
|
||||
|
||||
|
@ -115,7 +115,7 @@ void ODBCHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Ne
|
||||
catch (const Exception & ex)
|
||||
{
|
||||
process_error("Invalid 'columns' parameter in request body '" + ex.message() + "'");
|
||||
LOG_WARNING(log, ex.getStackTrace().toString());
|
||||
LOG_WARNING(log, ex.getStackTraceString());
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -85,16 +85,6 @@ bool PerformanceTest::checkPreconditions() const
|
||||
|
||||
for (const std::string & precondition : preconditions)
|
||||
{
|
||||
if (precondition == "flush_disk_cache")
|
||||
{
|
||||
if (system(
|
||||
"(>&2 echo 'Flushing disk cache...') && (sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches') && (>&2 echo 'Flushed.')"))
|
||||
{
|
||||
LOG_WARNING(log, "Failed to flush disk cache");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (precondition == "ram_size")
|
||||
{
|
||||
size_t ram_size_needed = config->getUInt64("preconditions.ram_size");
|
||||
@ -337,7 +327,7 @@ void PerformanceTest::runQueries(
|
||||
{
|
||||
statistics.exception = "Code: " + std::to_string(e.code()) + ", e.displayText() = " + e.displayText();
|
||||
LOG_WARNING(log, "Code: " << e.code() << ", e.displayText() = " << e.displayText()
|
||||
<< ", Stack trace:\n\n" << e.getStackTrace().toString());
|
||||
<< ", Stack trace:\n\n" << e.getStackTraceString());
|
||||
}
|
||||
|
||||
if (!statistics.got_SIGINT)
|
||||
|
@ -45,21 +45,11 @@ namespace fs = std::filesystem;
|
||||
|
||||
PerformanceTestInfo::PerformanceTestInfo(
|
||||
XMLConfigurationPtr config,
|
||||
const std::string & profiles_file_,
|
||||
const Settings & global_settings_)
|
||||
: profiles_file(profiles_file_)
|
||||
, settings(global_settings_)
|
||||
: settings(global_settings_)
|
||||
{
|
||||
path = config->getString("path");
|
||||
test_name = fs::path(path).stem().string();
|
||||
if (config->has("main_metric"))
|
||||
{
|
||||
Strings main_metrics;
|
||||
config->keys("main_metric", main_metrics);
|
||||
if (main_metrics.size())
|
||||
main_metric = main_metrics[0];
|
||||
}
|
||||
|
||||
applySettings(config);
|
||||
extractQueries(config);
|
||||
extractAuxiliaryQueries(config);
|
||||
@ -75,38 +65,8 @@ void PerformanceTestInfo::applySettings(XMLConfigurationPtr config)
|
||||
SettingsChanges settings_to_apply;
|
||||
Strings config_settings;
|
||||
config->keys("settings", config_settings);
|
||||
|
||||
auto settings_contain = [&config_settings] (const std::string & setting)
|
||||
{
|
||||
auto position = std::find(config_settings.begin(), config_settings.end(), setting);
|
||||
return position != config_settings.end();
|
||||
|
||||
};
|
||||
/// Preprocess configuration file
|
||||
if (settings_contain("profile"))
|
||||
{
|
||||
if (!profiles_file.empty())
|
||||
{
|
||||
std::string profile_name = config->getString("settings.profile");
|
||||
XMLConfigurationPtr profiles_config(new XMLConfiguration(profiles_file));
|
||||
|
||||
Strings profile_settings;
|
||||
profiles_config->keys("profiles." + profile_name, profile_settings);
|
||||
|
||||
extractSettings(profiles_config, "profiles." + profile_name, profile_settings, settings_to_apply);
|
||||
}
|
||||
}
|
||||
|
||||
extractSettings(config, "settings", config_settings, settings_to_apply);
|
||||
settings.applyChanges(settings_to_apply);
|
||||
|
||||
if (settings_contain("average_rows_speed_precision"))
|
||||
TestStats::avg_rows_speed_precision =
|
||||
config->getDouble("settings.average_rows_speed_precision");
|
||||
|
||||
if (settings_contain("average_bytes_speed_precision"))
|
||||
TestStats::avg_bytes_speed_precision =
|
||||
config->getDouble("settings.average_bytes_speed_precision");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -26,15 +26,13 @@ using StringToVector = std::map<std::string, Strings>;
|
||||
class PerformanceTestInfo
|
||||
{
|
||||
public:
|
||||
PerformanceTestInfo(XMLConfigurationPtr config, const std::string & profiles_file_, const Settings & global_settings_);
|
||||
PerformanceTestInfo(XMLConfigurationPtr config, const Settings & global_settings_);
|
||||
|
||||
std::string test_name;
|
||||
std::string path;
|
||||
std::string main_metric;
|
||||
|
||||
Strings queries;
|
||||
|
||||
std::string profiles_file;
|
||||
Settings settings;
|
||||
ExecutionType exec_type;
|
||||
StringToVector substitutions;
|
||||
|
@ -64,7 +64,6 @@ public:
|
||||
const std::string & password_,
|
||||
const Settings & cmd_settings,
|
||||
const bool lite_output_,
|
||||
const std::string & profiles_file_,
|
||||
Strings && input_files_,
|
||||
Strings && tests_tags_,
|
||||
Strings && skip_tags_,
|
||||
@ -86,7 +85,6 @@ public:
|
||||
, skip_names_regexp(std::move(skip_names_regexp_))
|
||||
, query_indexes(query_indexes_)
|
||||
, lite_output(lite_output_)
|
||||
, profiles_file(profiles_file_)
|
||||
, input_files(input_files_)
|
||||
, log(&Poco::Logger::get("PerformanceTestSuite"))
|
||||
{
|
||||
@ -139,7 +137,6 @@ private:
|
||||
using XMLConfigurationPtr = Poco::AutoPtr<XMLConfiguration>;
|
||||
|
||||
bool lite_output;
|
||||
std::string profiles_file;
|
||||
|
||||
Strings input_files;
|
||||
std::vector<XMLConfigurationPtr> tests_configurations;
|
||||
@ -197,7 +194,7 @@ private:
|
||||
|
||||
std::pair<std::string, bool> runTest(XMLConfigurationPtr & test_config)
|
||||
{
|
||||
PerformanceTestInfo info(test_config, profiles_file, global_context.getSettingsRef());
|
||||
PerformanceTestInfo info(test_config, global_context.getSettingsRef());
|
||||
LOG_INFO(log, "Config for test '" << info.test_name << "' parsed");
|
||||
PerformanceTest current(test_config, connection, timeouts, interrupt_listener, info, global_context, query_indexes[info.path]);
|
||||
|
||||
@ -332,7 +329,6 @@ try
|
||||
desc.add_options()
|
||||
("help", "produce help message")
|
||||
("lite", "use lite version of output")
|
||||
("profiles-file", value<std::string>()->default_value(""), "Specify a file with global profiles")
|
||||
("host,h", value<std::string>()->default_value("localhost"), "")
|
||||
("port", value<UInt16>()->default_value(9000), "")
|
||||
("secure,s", "Use TLS connection")
|
||||
@ -401,7 +397,6 @@ try
|
||||
options["password"].as<std::string>(),
|
||||
cmd_settings,
|
||||
options.count("lite") > 0,
|
||||
options["profiles-file"].as<std::string>(),
|
||||
std::move(input_files),
|
||||
std::move(tests_tags),
|
||||
std::move(skip_tags),
|
||||
|
@ -19,15 +19,10 @@ namespace
|
||||
{
|
||||
std::string getMainMetric(const PerformanceTestInfo & test_info)
|
||||
{
|
||||
std::string main_metric;
|
||||
if (test_info.main_metric.empty())
|
||||
if (test_info.exec_type == ExecutionType::Loop)
|
||||
main_metric = "min_time";
|
||||
else
|
||||
main_metric = "rows_per_second";
|
||||
if (test_info.exec_type == ExecutionType::Loop)
|
||||
return "min_time";
|
||||
else
|
||||
main_metric = test_info.main_metric;
|
||||
return main_metric;
|
||||
return "rows_per_second";
|
||||
}
|
||||
|
||||
bool isASCIIString(const std::string & str)
|
||||
@ -64,7 +59,6 @@ std::string ReportBuilder::buildFullReport(
|
||||
{
|
||||
FormatSettings settings;
|
||||
|
||||
|
||||
JSONString json_output;
|
||||
|
||||
json_output.set("hostname", hostname);
|
||||
@ -75,7 +69,6 @@ std::string ReportBuilder::buildFullReport(
|
||||
json_output.set("time", getCurrentTime());
|
||||
json_output.set("test_name", test_info.test_name);
|
||||
json_output.set("path", test_info.path);
|
||||
json_output.set("main_metric", getMainMetric(test_info));
|
||||
|
||||
if (!test_info.substitutions.empty())
|
||||
{
|
||||
|
@ -20,8 +20,6 @@
|
||||
#include <Compression/CompressedReadBuffer.h>
|
||||
#include <Compression/CompressedWriteBuffer.h>
|
||||
#include <IO/ReadBufferFromIStream.h>
|
||||
#include <IO/ZlibInflatingReadBuffer.h>
|
||||
#include <IO/BrotliReadBuffer.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/WriteBufferFromString.h>
|
||||
#include <IO/WriteBufferFromHTTPServerResponse.h>
|
||||
@ -300,32 +298,24 @@ void HTTPHandler::processQuery(
|
||||
|
||||
/// The client can pass a HTTP header indicating supported compression method (gzip or deflate).
|
||||
String http_response_compression_methods = request.get("Accept-Encoding", "");
|
||||
bool client_supports_http_compression = false;
|
||||
CompressionMethod http_response_compression_method {};
|
||||
CompressionMethod http_response_compression_method = CompressionMethod::None;
|
||||
|
||||
if (!http_response_compression_methods.empty())
|
||||
{
|
||||
/// If client supports brotli - it's preferred.
|
||||
/// Both gzip and deflate are supported. If the client supports both, gzip is preferred.
|
||||
/// NOTE parsing of the list of methods is slightly incorrect.
|
||||
if (std::string::npos != http_response_compression_methods.find("gzip"))
|
||||
{
|
||||
client_supports_http_compression = true;
|
||||
http_response_compression_method = CompressionMethod::Gzip;
|
||||
}
|
||||
else if (std::string::npos != http_response_compression_methods.find("deflate"))
|
||||
{
|
||||
client_supports_http_compression = true;
|
||||
http_response_compression_method = CompressionMethod::Zlib;
|
||||
}
|
||||
#if USE_BROTLI
|
||||
else if (http_response_compression_methods == "br")
|
||||
{
|
||||
client_supports_http_compression = true;
|
||||
|
||||
if (std::string::npos != http_response_compression_methods.find("br"))
|
||||
http_response_compression_method = CompressionMethod::Brotli;
|
||||
}
|
||||
#endif
|
||||
else if (std::string::npos != http_response_compression_methods.find("gzip"))
|
||||
http_response_compression_method = CompressionMethod::Gzip;
|
||||
else if (std::string::npos != http_response_compression_methods.find("deflate"))
|
||||
http_response_compression_method = CompressionMethod::Zlib;
|
||||
}
|
||||
|
||||
bool client_supports_http_compression = http_response_compression_method != CompressionMethod::None;
|
||||
|
||||
/// Client can pass a 'compress' flag in the query string. In this case the query result is
|
||||
/// compressed using internal algorithm. This is not reflected in HTTP headers.
|
||||
bool internal_compression = params.getParsed<bool>("compress", false);
|
||||
@ -344,8 +334,8 @@ void HTTPHandler::processQuery(
|
||||
unsigned keep_alive_timeout = config.getUInt("keep_alive_timeout", 10);
|
||||
|
||||
used_output.out = std::make_shared<WriteBufferFromHTTPServerResponse>(
|
||||
request, response, keep_alive_timeout,
|
||||
client_supports_http_compression, http_response_compression_method, buffer_size_http);
|
||||
request, response, keep_alive_timeout, client_supports_http_compression, http_response_compression_method);
|
||||
|
||||
if (internal_compression)
|
||||
used_output.out_maybe_compressed = std::make_shared<CompressedWriteBuffer>(*used_output.out);
|
||||
else
|
||||
@ -400,32 +390,9 @@ void HTTPHandler::processQuery(
|
||||
std::unique_ptr<ReadBuffer> in_post_raw = std::make_unique<ReadBufferFromIStream>(istr);
|
||||
|
||||
/// Request body can be compressed using algorithm specified in the Content-Encoding header.
|
||||
std::unique_ptr<ReadBuffer> in_post;
|
||||
String http_request_compression_method_str = request.get("Content-Encoding", "");
|
||||
if (!http_request_compression_method_str.empty())
|
||||
{
|
||||
if (http_request_compression_method_str == "gzip")
|
||||
{
|
||||
in_post = std::make_unique<ZlibInflatingReadBuffer>(std::move(in_post_raw), CompressionMethod::Gzip);
|
||||
}
|
||||
else if (http_request_compression_method_str == "deflate")
|
||||
{
|
||||
in_post = std::make_unique<ZlibInflatingReadBuffer>(std::move(in_post_raw), CompressionMethod::Zlib);
|
||||
}
|
||||
#if USE_BROTLI
|
||||
else if (http_request_compression_method_str == "br")
|
||||
{
|
||||
in_post = std::make_unique<BrotliReadBuffer>(std::move(in_post_raw));
|
||||
}
|
||||
#endif
|
||||
else
|
||||
{
|
||||
throw Exception("Unknown Content-Encoding of HTTP request: " + http_request_compression_method_str,
|
||||
ErrorCodes::UNKNOWN_COMPRESSION_METHOD);
|
||||
}
|
||||
}
|
||||
else
|
||||
in_post = std::move(in_post_raw);
|
||||
std::unique_ptr<ReadBuffer> in_post = wrapReadBufferWithCompressionMethod(
|
||||
std::make_unique<ReadBufferFromIStream>(istr), chooseCompressionMethod({}, http_request_compression_method_str));
|
||||
|
||||
/// The data can also be compressed using incompatible internal algorithm. This is indicated by
|
||||
/// 'decompress' query parameter.
|
||||
|
@ -112,7 +112,7 @@ void TCPHandler::runImpl()
|
||||
{
|
||||
Exception e("Database " + backQuote(default_database) + " doesn't exist", ErrorCodes::UNKNOWN_DATABASE);
|
||||
LOG_ERROR(log, "Code: " << e.code() << ", e.displayText() = " << e.displayText()
|
||||
<< ", Stack trace:\n\n" << e.getStackTrace().toString());
|
||||
<< ", Stack trace:\n\n" << e.getStackTraceString());
|
||||
sendException(e, connection_context.getSettingsRef().calculate_text_stack_trace);
|
||||
return;
|
||||
}
|
||||
@ -158,7 +158,7 @@ void TCPHandler::runImpl()
|
||||
/** An exception during the execution of request (it must be sent over the network to the client).
|
||||
* The client will be able to accept it, if it did not happen while sending another packet and the client has not disconnected yet.
|
||||
*/
|
||||
std::unique_ptr<Exception> exception;
|
||||
std::optional<DB::Exception> exception;
|
||||
bool network_error = false;
|
||||
|
||||
bool send_exception_with_stack_trace = connection_context.getSettingsRef().calculate_text_stack_trace;
|
||||
@ -280,7 +280,7 @@ void TCPHandler::runImpl()
|
||||
catch (const Exception & e)
|
||||
{
|
||||
state.io.onException();
|
||||
exception.reset(e.clone());
|
||||
exception.emplace(e);
|
||||
|
||||
if (e.code() == ErrorCodes::UNKNOWN_PACKET_FROM_CLIENT)
|
||||
throw;
|
||||
@ -298,22 +298,22 @@ void TCPHandler::runImpl()
|
||||
* We will try to send exception to the client in any case - see below.
|
||||
*/
|
||||
state.io.onException();
|
||||
exception = std::make_unique<Exception>(e.displayText(), ErrorCodes::POCO_EXCEPTION);
|
||||
exception.emplace(Exception::CreateFromPoco, e);
|
||||
}
|
||||
catch (const Poco::Exception & e)
|
||||
{
|
||||
state.io.onException();
|
||||
exception = std::make_unique<Exception>(e.displayText(), ErrorCodes::POCO_EXCEPTION);
|
||||
exception.emplace(Exception::CreateFromPoco, e);
|
||||
}
|
||||
catch (const std::exception & e)
|
||||
{
|
||||
state.io.onException();
|
||||
exception = std::make_unique<Exception>(e.what(), ErrorCodes::STD_EXCEPTION);
|
||||
exception.emplace(Exception::CreateFromSTD, e);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
state.io.onException();
|
||||
exception = std::make_unique<Exception>("Unknown exception", ErrorCodes::UNKNOWN_EXCEPTION);
|
||||
exception.emplace("Unknown exception", ErrorCodes::UNKNOWN_EXCEPTION);
|
||||
}
|
||||
|
||||
try
|
||||
|
@ -138,7 +138,6 @@ namespace ErrorCodes
|
||||
extern const int FUNCTION_IS_SPECIAL = 129;
|
||||
extern const int CANNOT_READ_ARRAY_FROM_TEXT = 130;
|
||||
extern const int TOO_LARGE_STRING_SIZE = 131;
|
||||
extern const int CANNOT_CREATE_TABLE_FROM_METADATA = 132;
|
||||
extern const int AGGREGATE_FUNCTION_DOESNT_ALLOW_PARAMETERS = 133;
|
||||
extern const int PARAMETERS_TO_AGGREGATE_FUNCTIONS_MUST_BE_LITERALS = 134;
|
||||
extern const int ZERO_ARRAY_OR_TUPLE_INDEX = 135;
|
||||
@ -474,7 +473,6 @@ namespace ErrorCodes
|
||||
extern const int NOT_ENOUGH_PRIVILEGES = 497;
|
||||
extern const int LIMIT_BY_WITH_TIES_IS_NOT_SUPPORTED = 498;
|
||||
extern const int S3_ERROR = 499;
|
||||
extern const int CANNOT_CREATE_DICTIONARY_FROM_METADATA = 500;
|
||||
extern const int CANNOT_CREATE_DATABASE = 501;
|
||||
extern const int CANNOT_SIGQUEUE = 502;
|
||||
extern const int AGGREGATE_FUNCTION_THROW = 503;
|
||||
|
@ -25,6 +25,55 @@ namespace ErrorCodes
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
|
||||
Exception::Exception()
|
||||
{
|
||||
}
|
||||
|
||||
Exception::Exception(const std::string & msg, int code)
|
||||
: Poco::Exception(msg, code)
|
||||
{
|
||||
}
|
||||
|
||||
Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc)
|
||||
: Poco::Exception(exc.displayText(), ErrorCodes::POCO_EXCEPTION)
|
||||
{
|
||||
#ifdef STD_EXCEPTION_HAS_STACK_TRACE
|
||||
set_stack_trace(exc.get_stack_trace_frames(), exc.get_stack_trace_size());
|
||||
#endif
|
||||
}
|
||||
|
||||
Exception::Exception(CreateFromSTDTag, const std::exception & exc)
|
||||
: Poco::Exception(String(typeid(exc).name()) + ": " + String(exc.what()), ErrorCodes::STD_EXCEPTION)
|
||||
{
|
||||
#ifdef STD_EXCEPTION_HAS_STACK_TRACE
|
||||
set_stack_trace(exc.get_stack_trace_frames(), exc.get_stack_trace_size());
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
std::string getExceptionStackTraceString(const std::exception & e)
|
||||
{
|
||||
#ifdef STD_EXCEPTION_HAS_STACK_TRACE
|
||||
return StackTrace::toString(e.get_stack_trace_frames(), 0, e.get_stack_trace_size());
|
||||
#else
|
||||
if (const auto * db_exception = dynamic_cast<const Exception *>(&e))
|
||||
return db_exception->getStackTraceString();
|
||||
return {};
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
std::string Exception::getStackTraceString() const
|
||||
{
|
||||
#ifdef STD_EXCEPTION_HAS_STACK_TRACE
|
||||
return StackTrace::toString(get_stack_trace_frames(), 0, get_stack_trace_size());
|
||||
#else
|
||||
return trace.toString();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
std::string errnoToString(int code, int e)
|
||||
{
|
||||
const size_t buf_size = 128;
|
||||
@ -141,6 +190,7 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded
|
||||
{
|
||||
stream << "Poco::Exception. Code: " << ErrorCodes::POCO_EXCEPTION << ", e.code() = " << e.code()
|
||||
<< ", e.displayText() = " << e.displayText()
|
||||
<< (with_stacktrace ? getExceptionStackTraceString(e) : "")
|
||||
<< (with_extra_info ? getExtraExceptionInfo(e) : "")
|
||||
<< " (version " << VERSION_STRING << VERSION_OFFICIAL;
|
||||
}
|
||||
@ -157,8 +207,9 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded
|
||||
name += " (demangling status: " + toString(status) + ")";
|
||||
|
||||
stream << "std::exception. Code: " << ErrorCodes::STD_EXCEPTION << ", type: " << name << ", e.what() = " << e.what()
|
||||
<< (with_extra_info ? getExtraExceptionInfo(e) : "")
|
||||
<< ", version = " << VERSION_STRING << VERSION_OFFICIAL;
|
||||
<< (with_stacktrace ? getExceptionStackTraceString(e) : "")
|
||||
<< (with_extra_info ? getExtraExceptionInfo(e) : "")
|
||||
<< ", version = " << VERSION_STRING << VERSION_OFFICIAL;
|
||||
}
|
||||
catch (...) {}
|
||||
}
|
||||
@ -261,7 +312,7 @@ std::string getExceptionMessage(const Exception & e, bool with_stacktrace, bool
|
||||
stream << "Code: " << e.code() << ", e.displayText() = " << text;
|
||||
|
||||
if (with_stacktrace && !has_embedded_stack_trace)
|
||||
stream << ", Stack trace (when copying this message, always include the lines below):\n\n" << e.getStackTrace().toString();
|
||||
stream << ", Stack trace (when copying this message, always include the lines below):\n\n" << e.getStackTraceString();
|
||||
}
|
||||
catch (...) {}
|
||||
|
||||
|
@ -22,13 +22,14 @@ namespace ErrorCodes
|
||||
class Exception : public Poco::Exception
|
||||
{
|
||||
public:
|
||||
Exception() {} /// For deferred initialization.
|
||||
Exception(const std::string & msg, int code) : Poco::Exception(msg, code) {}
|
||||
Exception(const std::string & msg, const Exception & nested_exception, int code)
|
||||
: Poco::Exception(msg, nested_exception, code), trace(nested_exception.trace) {}
|
||||
Exception();
|
||||
Exception(const std::string & msg, int code);
|
||||
|
||||
enum CreateFromPocoTag { CreateFromPoco };
|
||||
Exception(CreateFromPocoTag, const Poco::Exception & exc) : Poco::Exception(exc.displayText(), ErrorCodes::POCO_EXCEPTION) {}
|
||||
enum CreateFromSTDTag { CreateFromSTD };
|
||||
|
||||
Exception(CreateFromPocoTag, const Poco::Exception & exc);
|
||||
Exception(CreateFromSTDTag, const std::exception & exc);
|
||||
|
||||
Exception * clone() const override { return new Exception(*this); }
|
||||
void rethrow() const override { throw *this; }
|
||||
@ -38,15 +39,20 @@ public:
|
||||
/// Add something to the existing message.
|
||||
void addMessage(const std::string & arg) { extendedMessage(arg); }
|
||||
|
||||
const StackTrace & getStackTrace() const { return trace; }
|
||||
std::string getStackTraceString() const;
|
||||
|
||||
private:
|
||||
#ifndef STD_EXCEPTION_HAS_STACK_TRACE
|
||||
StackTrace trace;
|
||||
#endif
|
||||
|
||||
const char * className() const throw() override { return "DB::Exception"; }
|
||||
};
|
||||
|
||||
|
||||
std::string getExceptionStackTraceString(const std::exception & e);
|
||||
|
||||
|
||||
/// Contains an additional member `saved_errno`. See the throwFromErrno function.
|
||||
class ErrnoException : public Exception
|
||||
{
|
||||
|
@ -37,6 +37,8 @@
|
||||
M(CreatedReadBufferOrdinary, "") \
|
||||
M(CreatedReadBufferAIO, "") \
|
||||
M(CreatedReadBufferAIOFailed, "") \
|
||||
M(CreatedReadBufferMMap, "") \
|
||||
M(CreatedReadBufferMMapFailed, "") \
|
||||
M(CreatedWriteBufferOrdinary, "") \
|
||||
M(CreatedWriteBufferAIO, "") \
|
||||
M(CreatedWriteBufferAIOFailed, "") \
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <Common/Elf.h>
|
||||
#include <Common/SymbolIndex.h>
|
||||
#include <Common/config.h>
|
||||
#include <Common/MemorySanitizer.h>
|
||||
#include <common/SimpleCache.h>
|
||||
#include <common/demangle.h>
|
||||
#include <Core/Defines.h>
|
||||
@ -226,6 +227,7 @@ void StackTrace::tryCapture()
|
||||
size = 0;
|
||||
#if USE_UNWIND
|
||||
size = unw_backtrace(frames.data(), capacity);
|
||||
__msan_unpoison(frames.data(), size * sizeof(frames[0]));
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -328,3 +330,15 @@ std::string StackTrace::toString() const
|
||||
static SimpleCache<decltype(toStringImpl), &toStringImpl> func_cached;
|
||||
return func_cached(frames, offset, size);
|
||||
}
|
||||
|
||||
std::string StackTrace::toString(void ** frames_, size_t offset, size_t size)
|
||||
{
|
||||
__msan_unpoison(frames_, size * sizeof(*frames_));
|
||||
|
||||
StackTrace::Frames frames_copy{};
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
frames_copy[i] = frames_[i];
|
||||
|
||||
static SimpleCache<decltype(toStringImpl), &toStringImpl> func_cached;
|
||||
return func_cached(frames_copy, offset, size);
|
||||
}
|
||||
|
@ -41,6 +41,8 @@ public:
|
||||
const Frames & getFrames() const;
|
||||
std::string toString() const;
|
||||
|
||||
static std::string toString(void ** frames, size_t offset, size_t size);
|
||||
|
||||
void toStringEveryLine(std::function<void(const std::string &)> callback) const;
|
||||
|
||||
protected:
|
||||
|
@ -19,7 +19,7 @@ void CachedCompressedReadBuffer::initInput()
|
||||
{
|
||||
if (!file_in)
|
||||
{
|
||||
file_in = createReadBufferFromFileBase(path, estimated_size, aio_threshold, buf_size);
|
||||
file_in = createReadBufferFromFileBase(path, estimated_size, aio_threshold, mmap_threshold, buf_size);
|
||||
compressed_in = file_in.get();
|
||||
|
||||
if (profile_callback)
|
||||
@ -73,10 +73,11 @@ bool CachedCompressedReadBuffer::nextImpl()
|
||||
|
||||
|
||||
CachedCompressedReadBuffer::CachedCompressedReadBuffer(
|
||||
const std::string & path_, UncompressedCache * cache_, size_t estimated_size_, size_t aio_threshold_,
|
||||
const std::string & path_, UncompressedCache * cache_,
|
||||
size_t estimated_size_, size_t aio_threshold_, size_t mmap_threshold_,
|
||||
size_t buf_size_)
|
||||
: ReadBuffer(nullptr, 0), path(path_), cache(cache_), buf_size(buf_size_), estimated_size(estimated_size_),
|
||||
aio_threshold(aio_threshold_), file_pos(0)
|
||||
aio_threshold(aio_threshold_), mmap_threshold(mmap_threshold_), file_pos(0)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -26,6 +26,7 @@ private:
|
||||
size_t buf_size;
|
||||
size_t estimated_size;
|
||||
size_t aio_threshold;
|
||||
size_t mmap_threshold;
|
||||
|
||||
std::unique_ptr<ReadBufferFromFileBase> file_in;
|
||||
size_t file_pos;
|
||||
@ -42,7 +43,8 @@ private:
|
||||
|
||||
public:
|
||||
CachedCompressedReadBuffer(
|
||||
const std::string & path_, UncompressedCache * cache_, size_t estimated_size_, size_t aio_threshold_,
|
||||
const std::string & path_, UncompressedCache * cache_,
|
||||
size_t estimated_size_, size_t aio_threshold_, size_t mmap_threshold_,
|
||||
size_t buf_size_ = DBMS_DEFAULT_BUFFER_SIZE);
|
||||
|
||||
|
||||
|
@ -33,9 +33,9 @@ bool CompressedReadBufferFromFile::nextImpl()
|
||||
|
||||
|
||||
CompressedReadBufferFromFile::CompressedReadBufferFromFile(
|
||||
const std::string & path, size_t estimated_size, size_t aio_threshold, size_t buf_size)
|
||||
const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, size_t buf_size)
|
||||
: BufferWithOwnMemory<ReadBuffer>(0),
|
||||
p_file_in(createReadBufferFromFileBase(path, estimated_size, aio_threshold, buf_size)),
|
||||
p_file_in(createReadBufferFromFileBase(path, estimated_size, aio_threshold, mmap_threshold, buf_size)),
|
||||
file_in(*p_file_in)
|
||||
{
|
||||
compressed_in = &file_in;
|
||||
|
@ -30,7 +30,7 @@ private:
|
||||
|
||||
public:
|
||||
CompressedReadBufferFromFile(
|
||||
const std::string & path, size_t estimated_size, size_t aio_threshold, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
|
||||
const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
|
||||
|
||||
void seek(size_t offset_in_compressed_file, size_t offset_in_decompressed_block);
|
||||
|
||||
|
@ -26,7 +26,7 @@ extern const int CANNOT_DECOMPRESS;
|
||||
namespace
|
||||
{
|
||||
|
||||
Int64 getMaxValueForByteSize(UInt8 byte_size)
|
||||
inline Int64 getMaxValueForByteSize(Int8 byte_size)
|
||||
{
|
||||
switch (byte_size)
|
||||
{
|
||||
@ -51,11 +51,56 @@ struct WriteSpec
|
||||
const UInt8 data_bits;
|
||||
};
|
||||
|
||||
const std::array<UInt8, 5> DELTA_SIZES{7, 9, 12, 32, 64};
|
||||
// delta size prefix and data lengths based on few high bits peeked from binary stream
|
||||
static const WriteSpec WRITE_SPEC_LUT[32] = {
|
||||
// 0b0 - 1-bit prefix, no data to read
|
||||
/* 00000 */ {1, 0b0, 0},
|
||||
/* 00001 */ {1, 0b0, 0},
|
||||
/* 00010 */ {1, 0b0, 0},
|
||||
/* 00011 */ {1, 0b0, 0},
|
||||
/* 00100 */ {1, 0b0, 0},
|
||||
/* 00101 */ {1, 0b0, 0},
|
||||
/* 00110 */ {1, 0b0, 0},
|
||||
/* 00111 */ {1, 0b0, 0},
|
||||
/* 01000 */ {1, 0b0, 0},
|
||||
/* 01001 */ {1, 0b0, 0},
|
||||
/* 01010 */ {1, 0b0, 0},
|
||||
/* 01011 */ {1, 0b0, 0},
|
||||
/* 01100 */ {1, 0b0, 0},
|
||||
/* 01101 */ {1, 0b0, 0},
|
||||
/* 01110 */ {1, 0b0, 0},
|
||||
/* 01111 */ {1, 0b0, 0},
|
||||
|
||||
// 0b10 - 2 bit prefix, 7 bits of data
|
||||
/* 10000 */ {2, 0b10, 7},
|
||||
/* 10001 */ {2, 0b10, 7},
|
||||
/* 10010 */ {2, 0b10, 7},
|
||||
/* 10011 */ {2, 0b10, 7},
|
||||
/* 10100 */ {2, 0b10, 7},
|
||||
/* 10101 */ {2, 0b10, 7},
|
||||
/* 10110 */ {2, 0b10, 7},
|
||||
/* 10111 */ {2, 0b10, 7},
|
||||
|
||||
// 0b110 - 3 bit prefix, 9 bits of data
|
||||
/* 11000 */ {3, 0b110, 9},
|
||||
/* 11001 */ {3, 0b110, 9},
|
||||
/* 11010 */ {3, 0b110, 9},
|
||||
/* 11011 */ {3, 0b110, 9},
|
||||
|
||||
// 0b1110 - 4 bit prefix, 12 bits of data
|
||||
/* 11100 */ {4, 0b1110, 12},
|
||||
/* 11101 */ {4, 0b1110, 12},
|
||||
|
||||
// 5-bit prefixes
|
||||
/* 11110 */ {5, 0b11110, 32},
|
||||
/* 11111 */ {5, 0b11111, 64},
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
WriteSpec getDeltaWriteSpec(const T & value)
|
||||
{
|
||||
// TODO: to speed up things a bit by counting number of leading zeroes instead of doing lots of comparisons
|
||||
if (value > -63 && value < 64)
|
||||
{
|
||||
return WriteSpec{2, 0b10, 7};
|
||||
@ -107,14 +152,15 @@ UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)
|
||||
template <typename ValueType>
|
||||
UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
{
|
||||
// Since only unsinged int has granted 2-compliment overflow handling, we are doing math here on unsigned types.
|
||||
// To simplify and booletproof code, we operate enforce ValueType to be unsigned too.
|
||||
// Since only unsigned int has guaranteed 2's-complement overflow handling,
|
||||
// we are doing math here only on unsigned types.
|
||||
// To simplify and bulletproof the code, we enforce ValueType to be unsigned too.
|
||||
static_assert(is_unsigned_v<ValueType>, "ValueType must be unsigned.");
|
||||
using UnsignedDeltaType = ValueType;
|
||||
|
||||
// We use signed delta type to turn huge unsigned values into smaller signed:
|
||||
// ffffffff => -1
|
||||
using SignedDeltaType = typename std::make_signed<UnsignedDeltaType>::type;
|
||||
using SignedDeltaType = typename std::make_signed_t<UnsignedDeltaType>;
|
||||
|
||||
if (source_size % sizeof(ValueType) != 0)
|
||||
throw Exception("Cannot compress, data size " + toString(source_size)
|
||||
@ -149,8 +195,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
prev_value = curr_value;
|
||||
}
|
||||
|
||||
WriteBuffer buffer(dest, getCompressedDataSize(sizeof(ValueType), source_size - sizeof(ValueType)*2));
|
||||
BitWriter writer(buffer);
|
||||
BitWriter writer(dest, getCompressedDataSize(sizeof(ValueType), source_size - sizeof(ValueType)*2));
|
||||
|
||||
int item = 2;
|
||||
for (; source < source_end; source += sizeof(ValueType), ++item)
|
||||
@ -170,7 +215,8 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
else
|
||||
{
|
||||
const SignedDeltaType signed_dd = static_cast<SignedDeltaType>(double_delta);
|
||||
const auto sign = std::signbit(signed_dd);
|
||||
const auto sign = signed_dd < 0;
|
||||
|
||||
// -1 shrinks dd down to fit into the number of bits; dd can't be 0 here, so it is OK.
|
||||
const auto abs_value = static_cast<UnsignedDeltaType>(std::abs(signed_dd) - 1);
|
||||
const auto write_spec = getDeltaWriteSpec(signed_dd);
|
||||
@ -183,7 +229,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
|
||||
writer.flush();
|
||||
|
||||
return sizeof(items_count) + sizeof(prev_value) + sizeof(prev_delta) + buffer.count();
|
||||
return sizeof(items_count) + sizeof(prev_value) + sizeof(prev_delta) + writer.count() / 8;
|
||||
}
|
||||
|
||||
template <typename ValueType>
|
||||
@ -220,35 +266,28 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
dest += sizeof(prev_value);
|
||||
}
|
||||
|
||||
ReadBufferFromMemory buffer(source, source_size - sizeof(prev_value) - sizeof(prev_delta) - sizeof(items_count));
|
||||
BitReader reader(buffer);
|
||||
BitReader reader(source, source_size - sizeof(prev_value) - sizeof(prev_delta) - sizeof(items_count));
|
||||
|
||||
// since data is tightly packed, up to 1 bit per value, and last byte is padded with zeroes,
|
||||
// we have to keep track of items to avoid reading more than there is.
|
||||
for (UInt32 items_read = 2; items_read < items_count && !reader.eof(); ++items_read)
|
||||
{
|
||||
UnsignedDeltaType double_delta = 0;
|
||||
if (reader.readBit() == 1)
|
||||
{
|
||||
UInt8 i = 0;
|
||||
for (; i < sizeof(DELTA_SIZES) - 1; ++i)
|
||||
{
|
||||
const auto next_bit = reader.readBit();
|
||||
if (next_bit == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static_assert(sizeof(WRITE_SPEC_LUT)/sizeof(WRITE_SPEC_LUT[0]) == 32); // 5-bit prefix lookup table
|
||||
const auto write_spec = WRITE_SPEC_LUT[reader.peekByte() >> (8 - 5)]; // only 5 high bits of peeked byte value
|
||||
|
||||
reader.skipBufferedBits(write_spec.prefix_bits); // discard the prefix value, since we've already used it
|
||||
if (write_spec.data_bits != 0)
|
||||
{
|
||||
const UInt8 sign = reader.readBit();
|
||||
SignedDeltaType signed_dd = static_cast<SignedDeltaType>(reader.readBits(DELTA_SIZES[i] - 1) + 1);
|
||||
SignedDeltaType signed_dd = static_cast<SignedDeltaType>(reader.readBits(write_spec.data_bits - 1) + 1);
|
||||
if (sign)
|
||||
{
|
||||
signed_dd *= -1;
|
||||
}
|
||||
double_delta = static_cast<UnsignedDeltaType>(signed_dd);
|
||||
}
|
||||
// else if first bit is zero, no need to read more data.
|
||||
|
||||
const UnsignedDeltaType delta = double_delta + prev_delta;
|
||||
const ValueType curr_value = prev_value + delta;
|
||||
|
@ -5,6 +5,92 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/** DoubleDelta column codec implementation.
|
||||
*
|
||||
* Based on Gorilla paper: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf, which was extended
|
||||
* to support 64bit types. The drawback is 1 extra bit for 32-bit wide deltas: 5-bit prefix
|
||||
* instead of 4-bit prefix.
|
||||
*
|
||||
* This codec is best used against monotonic integer sequences with constant (or almost constant)
|
||||
* stride, like event timestamp for some monitoring application.
|
||||
*
|
||||
* Given input sequence a: [a0, a1, ... an]:
|
||||
*
|
||||
* First, write number of items (sizeof(int32)*8 bits): n
|
||||
* Then write first item as is (sizeof(a[0])*8 bits): a[0]
|
||||
* Second item is written as delta (sizeof(a[0])*8 bits): a[1] - a[0]
|
||||
* Loop over remaining items and calculate double delta:
|
||||
* double_delta = a[i] - 2 * a[i - 1] + a[i - 2]
|
||||
* Write it in compact binary form with `BitWriter`
|
||||
* if double_delta == 0:
|
||||
* write 1bit: 0
|
||||
* else if -63 < double_delta < 64:
|
||||
* write 2 bit prefix: 10
|
||||
* write sign bit (1 if signed): x
|
||||
* write 7-1 bits of abs(double_delta) - 1: xxxxxx
|
||||
* else if -255 < double_delta < 256:
|
||||
* write 3 bit prefix: 110
|
||||
* write sign bit (1 if signed): x
|
||||
* write 9-1 bits of abs(double_delta) - 1: xxxxxxxx
|
||||
* else if -2047 < double_delta < 2048:
|
||||
* write 4 bit prefix: 1110
|
||||
* write sign bit (1 if signed): x
|
||||
* write 12-1 bits of abs(double_delta) - 1: xxxxxxxxxxx
|
||||
* else if double_delta fits into 32-bit int:
|
||||
* write 5 bit prefix: 11110
|
||||
* write sign bit (1 if signed): x
|
||||
* write 32-1 bits of abs(double_delta) - 1: xxxxxxxxxxx...
|
||||
* else
|
||||
* write 5 bit prefix: 11111
|
||||
* write sign bit (1 if signed): x
|
||||
* write 64-1 bits of abs(double_delta) - 1: xxxxxxxxxxx...
|
||||
*
|
||||
* @example sequence of UInt8 values [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] is encoded as (codec header is omitted):
|
||||
*
|
||||
* .- 4-byte little-endian sequence length (10 == 0xa)
|
||||
* | .- 1 byte (sizeof(UInt8) a[0] : 0x01
|
||||
* | | .- 1 byte of delta: a[1] - a[0] = 2 - 1 = 1 : 0x01
|
||||
* | | | .- 8 zero bits since double delta for remaining 8 elements was 0 : 0x00
|
||||
* v_______________v___v___v___
|
||||
* \x0a\x00\x00\x00\x01\x01\x00
|
||||
*
|
||||
* @example sequence of Int16 values [-10, 10, -20, 20, -40, 40] is encoded as:
|
||||
*
|
||||
* .- 4-byte little endian sequence length = 6 : 0x00000006
|
||||
* | .- 2 bytes (sizeof(Int16) a[0] as UInt16 = -10 : 0xfff6
|
||||
* | | .- 2 bytes of delta: a[1] - a[0] = 10 - (-10) = 20 : 0x0014
|
||||
* | | | .- 4 encoded double deltas (see below)
|
||||
* v_______________ v______ v______ v______________________
|
||||
* \x06\x00\x00\x00\xf6\xff\x14\x00\xb8\xe2\x2e\xb1\xe4\x58
|
||||
*
|
||||
* 4 binary encoded double deltas (\xb8\xe2\x2e\xb1\xe4\x58):
|
||||
* double_delta (DD) = -20 - 2 * 10 + (-10) = -50
|
||||
* .- 2-bit prefix : 0b10
|
||||
* | .- sign-bit : 0b1
|
||||
* | |.- abs(DD) - 1 = 49 : 0b110001
|
||||
* | ||
|
||||
* | || DD = 20 - 2 * (-20) + 10 = 70
|
||||
* | || .- 3-bit prefix : 0b110
|
||||
* | || | .- sign bit : 0b0
|
||||
* | || | |.- abs(DD) - 1 = 69 : 0b1000101
|
||||
* | || | ||
|
||||
* | || | || DD = -40 - 2 * 20 + (-20) = -100
|
||||
* | || | || .- 3-bit prefix : 0b110
|
||||
* | || | || | .- sign-bit : 0b1
|
||||
* | || | || | |.- abs(DD) - 1 = 99 : 0b1100011
|
||||
* | || | || | ||
|
||||
* | || | || | || DD = 40 - 2 * (-40) + 20 = 140
|
||||
* | || | || | || .- 3-bit prefix : 0b110
|
||||
* | || | || | || | .- sign bit : 0b0
|
||||
* | || | || | || | |.- abs(DD) - 1 = 139 : 0b10001011
|
||||
* | || | || | || | ||
|
||||
* V_vv______V__vv________V____vv_______V__vv________,- padding bits
|
||||
* 10111000 11100010 00101110 10110001 11100100 01011000
|
||||
*
|
||||
* Please also see unit tests for:
|
||||
* * Examples on what output `BitWriter` produces on predefined input.
|
||||
* * Compatibility tests solidifying encoded binary output on set of predefined sequences.
|
||||
*/
|
||||
class CompressionCodecDoubleDelta : public ICompressionCodec
|
||||
{
|
||||
public:
|
||||
|
@ -112,8 +112,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest,
|
||||
dest += sizeof(prev_value);
|
||||
}
|
||||
|
||||
WriteBuffer buffer(dest, dest_end - dest);
|
||||
BitWriter writer(buffer);
|
||||
BitWriter writer(dest, dest_end - dest);
|
||||
|
||||
while (source < source_end)
|
||||
{
|
||||
@ -148,7 +147,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest,
|
||||
|
||||
writer.flush();
|
||||
|
||||
return sizeof(items_count) + sizeof(prev_value) + buffer.count();
|
||||
return sizeof(items_count) + sizeof(prev_value) + writer.count() / 8;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -174,8 +173,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
dest += sizeof(prev_value);
|
||||
}
|
||||
|
||||
ReadBufferFromMemory buffer(source, source_size - sizeof(items_count) - sizeof(prev_value));
|
||||
BitReader reader(buffer);
|
||||
BitReader reader(source, source_size - sizeof(items_count) - sizeof(prev_value));
|
||||
|
||||
binary_value_info prev_xored_info{0, 0, 0};
|
||||
|
||||
|
@ -5,6 +5,89 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/** Gorilla column codec implementation.
|
||||
*
|
||||
* Based on Gorilla paper: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf
|
||||
*
|
||||
* This codec is best used against monotonic floating sequences, like CPU usage percentage
|
||||
* or any other gauge.
|
||||
*
|
||||
* Given input sequence a: [a0, a1, ... an]
|
||||
*
|
||||
* First, write number of items (sizeof(int32)*8 bits): n
|
||||
* Then write first item as is (sizeof(a[0])*8 bits): a[0]
|
||||
* Loop over remaining items and calculate xor_diff:
|
||||
* xor_diff = a[i] ^ a[i - 1] (e.g. 00000011'10110100)
|
||||
* Write it in compact binary form with `BitWriter`
|
||||
* if xor_diff == 0:
|
||||
* write 1 bit: 0
|
||||
* else:
|
||||
* calculate leading zero bits (lzb)
|
||||
* and trailing zero bits (tzb) of xor_diff,
|
||||
* compare to lzb and tzb of previous xor_diff
|
||||
* (X = sizeof(a[i]) * 8, e.g. X = 16, lzb = 6, tzb = 2)
|
||||
* if lzb >= prev_lzb && tzb >= prev_tzb:
|
||||
* (e.g. prev_lzb=4, prev_tzb=1)
|
||||
* write 2 bit prefix: 0b10
|
||||
* write xor_diff >> prev_tzb (X - prev_lzb - prev_tzb bits):0b00111011010
|
||||
* (where X = sizeof(a[i]) * 8, e.g. 16)
|
||||
* else:
|
||||
* write 2 bit prefix: 0b11
|
||||
* write 5 bits of lzb: 0b00110
|
||||
* write 6 bits of (X - lzb - tzb)=(16-6-2)=8: 0b001000
|
||||
* write (X - lzb - tzb) non-zero bits of xor_diff: 0b11101101
|
||||
* prev_lzb = lzb
|
||||
* prev_tzb = tzb
|
||||
*
|
||||
* @example sequence of Float32 values [0.1, 0.1, 0.11, 0.2, 0.1] is encoded as:
|
||||
*
|
||||
* .- 4-byte little endian sequence length: 5 : 0x00000005
|
||||
* | .- 4 byte (sizeof(Float32) a[0] as UInt32 : 0.1 : 0x3dcccccd
|
||||
* | | .- 4 encoded xor diffs (see below)
|
||||
* v_______________ v______________ v__________________________________________________
|
||||
* \x05\x00\x00\x00\xcd\xcc\xcc\x3d\x6a\x5a\xd8\xb6\x3c\xcd\x75\xb1\x6c\x77\x00\x00\x00
|
||||
*
|
||||
* 4 binary encoded xor diffs (\x6a\x5a\xd8\xb6\x3c\xcd\x75\xb1\x6c\x77\x00\x00\x00):
|
||||
*
|
||||
* ...........................................
|
||||
* a[i-1] = 00111101110011001100110011001101
|
||||
* a[i] = 00111101110011001100110011001101
|
||||
* xor_diff = 00000000000000000000000000000000
|
||||
* .- 1-bit prefix : 0b0
|
||||
* |
|
||||
* | ...........................................
|
||||
* | a[i-1] = 00111101110011001100110011001101
|
||||
* ! a[i] = 00111101111000010100011110101110
|
||||
* | xor_diff = 00000000001011011000101101100011
|
||||
* | lzb = 10
|
||||
* | tzb = 0
|
||||
* |.- 2-bit prefix : 0b11
|
||||
* || .- lzb (10) : 0b01010
|
||||
* || | .- data length (32-10-0): 22 : 0b010110
|
||||
* || | | .- data : 0b1011011000101101100011
|
||||
* || | | |
|
||||
* || | | | ...........................................
|
||||
* || | | | a[i-1] = 00111101111000010100011110101110
|
||||
* || | | | a[i] = 00111110010011001100110011001101
|
||||
* || | | | xor_diff = 00000011101011011000101101100011
|
||||
* || | | | .- 2-bit prefix : 0b11
|
||||
* || | | | | .- lzb = 6 : 0b00110
|
||||
* || | | | | | .- data length = (32 - 6) = 26 : 0b011010
|
||||
* || | | | | | | .- data : 0b11101011011000101101100011
|
||||
* || | | | | | | |
|
||||
* || | | | | | | | ...........................................
|
||||
* || | | | | | | | a[i-1] = 00111110010011001100110011001101
|
||||
* || | | | | | | | a[i] = 00111101110011001100110011001101
|
||||
* || | | | | | | | xor_diff = 00000011100000000000000000000000
|
||||
* || | | | | | | | .- 2-bit prefix : 0b10
|
||||
* || | | | | | | | | .- data : 0b11100000000000000000000000
|
||||
* VV_v____ v_____v________________________V_v_____v______v____________________________V_v_____________________________
|
||||
* 01101010 01011010 11011000 10110110 00111100 11001101 01110101 10110001 01101100 01110111 00000000 00000000 00000000
|
||||
*
|
||||
* Please also see unit tests for:
|
||||
* * Examples on what output `BitWriter` produces on predefined input.
|
||||
* * Compatibility tests solidifying encoded binary output on set of predefined sequences.
|
||||
*/
|
||||
class CompressionCodecGorilla : public ICompressionCodec
|
||||
{
|
||||
public:
|
||||
|
@ -32,7 +32,7 @@ int main(int argc, char ** argv)
|
||||
|
||||
{
|
||||
Stopwatch watch;
|
||||
CachedCompressedReadBuffer in(path, &cache, 0, 0);
|
||||
CachedCompressedReadBuffer in(path, &cache, 0, 0, 0);
|
||||
WriteBufferFromFile out("/dev/null");
|
||||
copyData(in, out);
|
||||
|
||||
@ -44,7 +44,7 @@ int main(int argc, char ** argv)
|
||||
|
||||
{
|
||||
Stopwatch watch;
|
||||
CachedCompressedReadBuffer in(path, &cache, 0, 0);
|
||||
CachedCompressedReadBuffer in(path, &cache, 0, 0, 0);
|
||||
WriteBufferFromFile out("/dev/null");
|
||||
copyData(in, out);
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <Compression/CompressionFactory.h>
|
||||
|
||||
#include <Common/PODArray.h>
|
||||
#include <Common/Stopwatch.h>
|
||||
#include <Core/Types.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <DataTypes/IDataType.h>
|
||||
@ -62,6 +63,32 @@ std::vector<T> operator+(std::vector<T> && left, std::vector<T> && right)
|
||||
namespace
|
||||
{
|
||||
|
||||
/// Wrapper that tags a container of bytes so that streaming it prints a hex dump.
template <typename T>
struct AsHexStringHelper
{
    const T & container;
};

/// Writes every byte of the wrapped container as "\xNN" (two hex digits, zero-padded).
/// The stream's format flags and fill character are restored before returning, so that
/// values written to the same stream afterwards are not unexpectedly printed in hex
/// or padded with '0'.
template <typename T>
std::ostream & operator << (std::ostream & ostr, const AsHexStringHelper<T> & helper)
{
    const auto saved_flags = ostr.flags();
    const auto saved_fill = ostr.fill();

    ostr << std::hex << std::setfill('0');
    for (const auto & e : helper.container)
    {
        /// Mask to a single byte so that negative char values are not printed sign-extended.
        ostr << "\\x" << std::setw(2) << (static_cast<unsigned int>(e) & 0xFF);
    }

    ostr.fill(saved_fill);
    ostr.flags(saved_flags);

    return ostr;
}

/// Creates the hex-dump wrapper for `container`; usable only with containers of byte-sized POD elements.
/// The wrapper keeps a reference, so `container` must outlive the returned helper.
template <typename T>
AsHexStringHelper<T> AsHexString(const T & container)
{
    static_assert (sizeof(container[0]) == 1 && std::is_pod<std::decay_t<decltype(container[0])>>::value, "Only works on containers of byte-size PODs.");

    return AsHexStringHelper<T>{container};
}
|
||||
|
||||
template <typename T>
|
||||
std::string bin(const T & value, size_t bits = sizeof(T)*8)
|
||||
{
|
||||
@ -113,10 +140,71 @@ DataTypePtr makeDataType()
|
||||
|
||||
#undef MAKE_DATA_TYPE
|
||||
|
||||
assert(false && "unsupported size");
|
||||
assert(false && "unknown datatype");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <typename T, typename Container>
|
||||
class BinaryDataAsSequenceOfValuesIterator
|
||||
{
|
||||
const Container & container;
|
||||
const void * data;
|
||||
const void * data_end;
|
||||
|
||||
T current_value;
|
||||
|
||||
public:
|
||||
using Self = BinaryDataAsSequenceOfValuesIterator<T, Container>;
|
||||
|
||||
explicit BinaryDataAsSequenceOfValuesIterator(const Container & container_)
|
||||
: container(container_),
|
||||
data(&container[0]),
|
||||
data_end(reinterpret_cast<const char *>(data) + container.size()),
|
||||
current_value(T{})
|
||||
{
|
||||
static_assert(sizeof(container[0]) == 1 && std::is_pod<std::decay_t<decltype(container[0])>>::value, "Only works on containers of byte-size PODs.");
|
||||
read();
|
||||
}
|
||||
|
||||
const T & operator*() const
|
||||
{
|
||||
return current_value;
|
||||
}
|
||||
|
||||
size_t ItemsLeft() const
|
||||
{
|
||||
return reinterpret_cast<const char *>(data_end) - reinterpret_cast<const char *>(data);
|
||||
}
|
||||
|
||||
Self & operator++()
|
||||
{
|
||||
read();
|
||||
return *this;
|
||||
}
|
||||
|
||||
operator bool() const
|
||||
{
|
||||
return ItemsLeft() > 0;
|
||||
}
|
||||
|
||||
private:
|
||||
void read()
|
||||
{
|
||||
if (!*this)
|
||||
{
|
||||
throw std::runtime_error("No more data to read");
|
||||
}
|
||||
|
||||
current_value = unalignedLoad<T>(data);
|
||||
data = reinterpret_cast<const char *>(data) + sizeof(T);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename Container>
|
||||
BinaryDataAsSequenceOfValuesIterator<T, Container> AsSequenceOf(const Container & container)
|
||||
{
|
||||
return BinaryDataAsSequenceOfValuesIterator<T, Container>(container);
|
||||
}
|
||||
|
||||
template <typename T, typename ContainerLeft, typename ContainerRight>
|
||||
::testing::AssertionResult EqualByteContainersAs(const ContainerLeft & left, const ContainerRight & right)
|
||||
@ -126,9 +214,6 @@ template <typename T, typename ContainerLeft, typename ContainerRight>
|
||||
|
||||
::testing::AssertionResult result = ::testing::AssertionSuccess();
|
||||
|
||||
ReadBufferFromMemory left_read_buffer(left.data(), left.size());
|
||||
ReadBufferFromMemory right_read_buffer(right.data(), right.size());
|
||||
|
||||
const auto l_size = left.size() / sizeof(T);
|
||||
const auto r_size = right.size() / sizeof(T);
|
||||
const auto size = std::min(l_size, r_size);
|
||||
@ -137,16 +222,25 @@ template <typename T, typename ContainerLeft, typename ContainerRight>
|
||||
{
|
||||
result = ::testing::AssertionFailure() << "size mismatch" << " expected: " << l_size << " got:" << r_size;
|
||||
}
|
||||
if (l_size == 0 || r_size == 0)
|
||||
{
|
||||
return result;
|
||||
}
|
||||
|
||||
auto l = AsSequenceOf<T>(left);
|
||||
auto r = AsSequenceOf<T>(right);
|
||||
|
||||
const auto MAX_MISMATCHING_ITEMS = 5;
|
||||
int mismatching_items = 0;
|
||||
for (int i = 0; i < size; ++i)
|
||||
{
|
||||
T left_value{};
|
||||
left_read_buffer.readStrict(reinterpret_cast<char*>(&left_value), sizeof(left_value));
|
||||
size_t i = 0;
|
||||
|
||||
T right_value{};
|
||||
right_read_buffer.readStrict(reinterpret_cast<char*>(&right_value), sizeof(right_value));
|
||||
while (l && r)
|
||||
{
|
||||
const auto left_value = *l;
|
||||
const auto right_value = *r;
|
||||
++l;
|
||||
++r;
|
||||
++i;
|
||||
|
||||
if (left_value != right_value)
|
||||
{
|
||||
@ -157,25 +251,47 @@ template <typename T, typename ContainerLeft, typename ContainerRight>
|
||||
|
||||
if (++mismatching_items <= MAX_MISMATCHING_ITEMS)
|
||||
{
|
||||
result << "mismatching " << sizeof(T) << "-byte item #" << i
|
||||
result << "\nmismatching " << sizeof(T) << "-byte item #" << i
|
||||
<< "\nexpected: " << bin(left_value) << " (0x" << std::hex << left_value << ")"
|
||||
<< "\ngot : " << bin(right_value) << " (0x" << std::hex << right_value << ")"
|
||||
<< std::endl;
|
||||
<< "\ngot : " << bin(right_value) << " (0x" << std::hex << right_value << ")";
|
||||
if (mismatching_items == MAX_MISMATCHING_ITEMS)
|
||||
{
|
||||
result << "..." << std::endl;
|
||||
result << "\n..." << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mismatching_items > 0)
|
||||
{
|
||||
result << "\ntotal mismatching items:" << mismatching_items << " of " << size;
|
||||
result << "total mismatching items:" << mismatching_items << " of " << size;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename ContainerLeft, typename ContainerRight>
|
||||
::testing::AssertionResult EqualByteContainers(UInt8 element_size, const ContainerLeft & left, const ContainerRight & right)
|
||||
{
|
||||
switch (element_size)
|
||||
{
|
||||
case 1:
|
||||
return EqualByteContainersAs<UInt8>(left, right);
|
||||
break;
|
||||
case 2:
|
||||
return EqualByteContainersAs<UInt16>(left, right);
|
||||
break;
|
||||
case 4:
|
||||
return EqualByteContainersAs<UInt32>(left, right);
|
||||
break;
|
||||
case 8:
|
||||
return EqualByteContainersAs<UInt64>(left, right);
|
||||
break;
|
||||
default:
|
||||
assert(false && "Invalid element_size");
|
||||
return ::testing::AssertionFailure() << "Invalid element_size: " << element_size;
|
||||
}
|
||||
}
|
||||
|
||||
struct Codec
|
||||
{
|
||||
std::string codec_statement;
|
||||
@ -214,20 +330,23 @@ struct CodecTestSequence
|
||||
CodecTestSequence & operator=(const CodecTestSequence &) = default;
|
||||
CodecTestSequence(CodecTestSequence &&) = default;
|
||||
CodecTestSequence & operator=(CodecTestSequence &&) = default;
|
||||
|
||||
/// Concatenates `other` onto this sequence: its serialized bytes are added at the end,
/// and its name is appended to this one, separated by " + " when this name is not empty.
/// Both sequences are expected to have equal data types (checked with assert only).
CodecTestSequence & append(const CodecTestSequence & other)
{
    assert(data_type->equals(*other.data_type));

    const auto & extra_bytes = other.serialized_data;
    serialized_data.insert(serialized_data.end(), extra_bytes.begin(), extra_bytes.end());

    if (name.empty())
    {
        name += other.name;
    }
    else
    {
        name += " + ";
        name += other.name;
    }

    return *this;
}
|
||||
};
|
||||
|
||||
CodecTestSequence operator+(CodecTestSequence && left, CodecTestSequence && right)
|
||||
CodecTestSequence operator+(CodecTestSequence && left, const CodecTestSequence & right)
|
||||
{
|
||||
assert(left.data_type->equals(*right.data_type));
|
||||
|
||||
std::vector<char> data(std::move(left.serialized_data));
|
||||
data.insert(data.end(), right.serialized_data.begin(), right.serialized_data.end());
|
||||
|
||||
return CodecTestSequence{
|
||||
left.name + " + " + right.name,
|
||||
std::move(data),
|
||||
std::move(left.data_type)
|
||||
};
|
||||
return left.append(right);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -288,17 +407,22 @@ CodecTestSequence makeSeq(Args && ... args)
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T, typename Generator>
|
||||
CodecTestSequence generateSeq(Generator gen, const char* gen_name, size_t Begin = 0, size_t End = 10000)
|
||||
template <typename T, typename Generator, typename B = int, typename E = int>
|
||||
CodecTestSequence generateSeq(Generator gen, const char* gen_name, B Begin = 0, E End = 10000)
|
||||
{
|
||||
assert (End >= Begin);
|
||||
|
||||
const auto direction = std::signbit(End - Begin) ? -1 : 1;
|
||||
std::vector<char> data(sizeof(T) * (End - Begin));
|
||||
char * write_pos = data.data();
|
||||
|
||||
for (size_t i = Begin; i < End; ++i)
|
||||
for (auto i = Begin; i < End; i += direction)
|
||||
{
|
||||
const T v = gen(static_cast<T>(i));
|
||||
|
||||
// if constexpr (debug_log_items)
|
||||
// {
|
||||
// std::cerr << "#" << i << " " << type_name<T>() << "(" << sizeof(T) << " bytes) : " << v << std::endl;
|
||||
// }
|
||||
|
||||
unalignedStore<T>(write_pos, v);
|
||||
write_pos += sizeof(v);
|
||||
}
|
||||
@ -310,6 +434,96 @@ CodecTestSequence generateSeq(Generator gen, const char* gen_name, size_t Begin
|
||||
};
|
||||
}
|
||||
|
||||
/// Timer with the same start()/report() interface as StopwatchTimer that measures nothing.
/// Intended for callers that are templated on the timer type but do not need timings.
struct NoOpTimer
{
    /// Does nothing.
    void start()
    {
    }

    /// Ignores the mark.
    void report(const char * /*mark*/)
    {
    }
};
|
||||
|
||||
struct StopwatchTimer
|
||||
{
|
||||
explicit StopwatchTimer(clockid_t clock_type, size_t estimated_marks = 32)
|
||||
: stopwatch(clock_type)
|
||||
{
|
||||
results.reserve(estimated_marks);
|
||||
}
|
||||
|
||||
void start()
|
||||
{
|
||||
stopwatch.restart();
|
||||
}
|
||||
|
||||
void report(const char * mark)
|
||||
{
|
||||
results.emplace_back(mark, stopwatch.elapsed());
|
||||
}
|
||||
|
||||
void stop()
|
||||
{
|
||||
stopwatch.stop();
|
||||
}
|
||||
|
||||
const std::vector<std::tuple<const char*, UInt64>> & getResults() const
|
||||
{
|
||||
return results;
|
||||
}
|
||||
|
||||
private:
|
||||
Stopwatch stopwatch;
|
||||
std::vector<std::tuple<const char*, UInt64>> results;
|
||||
};
|
||||
|
||||
/// Builds a codec from its textual description (e.g. the text that goes inside `CODEC(...)`).
/// `codec_string` is wrapped in parentheses, parsed with ParserCodec, and the resulting AST is
/// passed to CompressionCodecFactory together with `data_type`.
/// Throws std::runtime_error if parsing produced no AST.
CompressionCodecPtr makeCodec(const std::string & codec_string, const DataTypePtr data_type)
{
    const std::string codec_statement = "(" + codec_string + ")";

    /// Use data()/size() rather than the implementation-specific iterator::base() to get raw pointers.
    const char * const statement_begin = codec_statement.data();
    Tokens tokens(statement_begin, statement_begin + codec_statement.size());
    IParser::Pos token_iterator(tokens);

    Expected expected;
    ASTPtr codec_ast;
    ParserCodec parser;

    parser.parse(token_iterator, codec_ast, expected);

    /// Fail with a readable message instead of handing a null AST to the factory.
    if (!codec_ast)
        throw std::runtime_error("Cannot parse codec statement: " + codec_statement);

    return CompressionCodecFactory::instance().get(codec_ast, data_type);
}
|
||||
|
||||
template <typename Timer>
|
||||
void testTranscoding(Timer & timer, ICompressionCodec & codec, const CodecTestSequence & test_sequence, std::optional<double> expected_compression_ratio = std::optional<double>{})
|
||||
{
|
||||
const auto & source_data = test_sequence.serialized_data;
|
||||
|
||||
const UInt32 encoded_max_size = codec.getCompressedReserveSize(source_data.size());
|
||||
PODArray<char> encoded(encoded_max_size);
|
||||
|
||||
timer.start();
|
||||
|
||||
const UInt32 encoded_size = codec.compress(source_data.data(), source_data.size(), encoded.data());
|
||||
timer.report("encoding");
|
||||
|
||||
encoded.resize(encoded_size);
|
||||
|
||||
PODArray<char> decoded(source_data.size());
|
||||
|
||||
timer.start();
|
||||
const UInt32 decoded_size = codec.decompress(encoded.data(), encoded.size(), decoded.data());
|
||||
timer.report("decoding");
|
||||
|
||||
decoded.resize(decoded_size);
|
||||
|
||||
ASSERT_TRUE(EqualByteContainers(test_sequence.data_type->getSizeOfValueInMemory(), source_data, decoded));
|
||||
|
||||
const auto header_size = codec.getHeaderSize();
|
||||
const auto compression_ratio = (encoded_size - header_size) / (source_data.size() * 1.0);
|
||||
|
||||
if (expected_compression_ratio)
|
||||
{
|
||||
ASSERT_LE(compression_ratio, *expected_compression_ratio)
|
||||
<< "\n\tdecoded size: " << source_data.size()
|
||||
<< "\n\tencoded size: " << encoded_size
|
||||
<< "(no header: " << encoded_size - header_size << ")";
|
||||
}
|
||||
}
|
||||
|
||||
class CodecTest : public ::testing::TestWithParam<std::tuple<Codec, CodecTestSequence>>
|
||||
{
|
||||
@ -320,67 +534,18 @@ public:
|
||||
CODEC_WITHOUT_DATA_TYPE,
|
||||
};
|
||||
|
||||
CompressionCodecPtr makeCodec(MakeCodecParam with_data_type) const
|
||||
CompressionCodecPtr makeCodec(MakeCodecParam with_data_type)
|
||||
{
|
||||
const auto & codec_string = std::get<0>(GetParam()).codec_statement;
|
||||
const auto & data_type = with_data_type == CODEC_WITH_DATA_TYPE ? std::get<1>(GetParam()).data_type : nullptr;
|
||||
|
||||
const std::string codec_statement = "(" + codec_string + ")";
|
||||
Tokens tokens(codec_statement.begin().base(), codec_statement.end().base());
|
||||
IParser::Pos token_iterator(tokens);
|
||||
|
||||
Expected expected;
|
||||
ASTPtr codec_ast;
|
||||
ParserCodec parser;
|
||||
|
||||
parser.parse(token_iterator, codec_ast, expected);
|
||||
|
||||
return CompressionCodecFactory::instance().get(codec_ast, data_type);
|
||||
return ::makeCodec(codec_string, data_type);
|
||||
}
|
||||
|
||||
void testTranscoding(ICompressionCodec & codec)
|
||||
{
|
||||
const auto & test_sequence = std::get<1>(GetParam());
|
||||
const auto & source_data = test_sequence.serialized_data;
|
||||
|
||||
const UInt32 encoded_max_size = codec.getCompressedReserveSize(source_data.size());
|
||||
PODArray<char> encoded(encoded_max_size);
|
||||
|
||||
const UInt32 encoded_size = codec.compress(source_data.data(), source_data.size(), encoded.data());
|
||||
encoded.resize(encoded_size);
|
||||
|
||||
PODArray<char> decoded(source_data.size());
|
||||
const UInt32 decoded_size = codec.decompress(encoded.data(), encoded.size(), decoded.data());
|
||||
decoded.resize(decoded_size);
|
||||
|
||||
switch (test_sequence.data_type->getSizeOfValueInMemory())
|
||||
{
|
||||
case 1:
|
||||
ASSERT_TRUE(EqualByteContainersAs<UInt8>(source_data, decoded));
|
||||
break;
|
||||
case 2:
|
||||
ASSERT_TRUE(EqualByteContainersAs<UInt16>(source_data, decoded));
|
||||
break;
|
||||
case 4:
|
||||
ASSERT_TRUE(EqualByteContainersAs<UInt32>(source_data, decoded));
|
||||
break;
|
||||
case 8:
|
||||
ASSERT_TRUE(EqualByteContainersAs<UInt64>(source_data, decoded));
|
||||
break;
|
||||
default:
|
||||
FAIL() << "Invalid test sequence data type: " << test_sequence.data_type->getName();
|
||||
}
|
||||
const auto header_size = codec.getHeaderSize();
|
||||
const auto compression_ratio = (encoded_size - header_size) / (source_data.size() * 1.0);
|
||||
|
||||
const auto & codec_spec = std::get<0>(GetParam());
|
||||
if (codec_spec.expected_compression_ratio)
|
||||
{
|
||||
ASSERT_LE(compression_ratio, *codec_spec.expected_compression_ratio)
|
||||
<< "\n\tdecoded size: " << source_data.size()
|
||||
<< "\n\tencoded size: " << encoded_size
|
||||
<< "(no header: " << encoded_size - header_size << ")";
|
||||
}
|
||||
NoOpTimer timer;
|
||||
::testTranscoding(timer, codec, std::get<1>(GetParam()), std::get<0>(GetParam()).expected_compression_ratio);
|
||||
}
|
||||
};
|
||||
|
||||
@ -396,10 +561,121 @@ TEST_P(CodecTest, TranscodingWithoutDataType)
|
||||
testTranscoding(*codec);
|
||||
}
|
||||
|
||||
// Param is tuple-of-tuple to simplify instantiating with values, since typically group of cases test only one codec.
|
||||
class CodecTest_Compatibility : public ::testing::TestWithParam<std::tuple<Codec, std::tuple<CodecTestSequence, std::string>>>
|
||||
{};
|
||||
|
||||
// Check that the input sequence, when encoded, matches the expected encoded binary string.
|
||||
TEST_P(CodecTest_Compatibility, Encoding)
|
||||
{
|
||||
const auto & codec_spec = std::get<0>(GetParam());
|
||||
const auto & [data_sequence, expected] = std::get<1>(GetParam());
|
||||
const auto codec = makeCodec(codec_spec.codec_statement, data_sequence.data_type);
|
||||
|
||||
const auto & source_data = data_sequence.serialized_data;
|
||||
|
||||
// Just encode the data with codec
|
||||
const UInt32 encoded_max_size = codec->getCompressedReserveSize(source_data.size());
|
||||
PODArray<char> encoded(encoded_max_size);
|
||||
|
||||
const UInt32 encoded_size = codec->compress(source_data.data(), source_data.size(), encoded.data());
|
||||
encoded.resize(encoded_size);
|
||||
SCOPED_TRACE(::testing::Message("encoded: ") << AsHexString(encoded));
|
||||
|
||||
ASSERT_TRUE(EqualByteContainersAs<UInt8>(expected, encoded));
|
||||
}
|
||||
|
||||
// Check that binary string is exactly decoded into input sequence.
|
||||
TEST_P(CodecTest_Compatibility, Decoding)
|
||||
{
|
||||
const auto & codec_spec = std::get<0>(GetParam());
|
||||
const auto & [expected, encoded_data] = std::get<1>(GetParam());
|
||||
const auto codec = makeCodec(codec_spec.codec_statement, expected.data_type);
|
||||
|
||||
PODArray<char> decoded(expected.serialized_data.size());
|
||||
const UInt32 decoded_size = codec->decompress(encoded_data.c_str(), encoded_data.size(), decoded.data());
|
||||
decoded.resize(decoded_size);
|
||||
|
||||
ASSERT_TRUE(EqualByteContainers(expected.data_type->getSizeOfValueInMemory(), expected.serialized_data, decoded));
|
||||
}
|
||||
|
||||
class CodecTest_Performance : public ::testing::TestWithParam<std::tuple<Codec, CodecTestSequence>>
|
||||
{};
|
||||
|
||||
TEST_P(CodecTest_Performance, TranscodingWithDataType)
|
||||
{
|
||||
const auto & [codec_spec, test_seq] = GetParam();
|
||||
const auto codec = ::makeCodec(codec_spec.codec_statement, test_seq.data_type);
|
||||
|
||||
const auto runs = 10;
|
||||
std::map<std::string, std::vector<UInt64>> results;
|
||||
|
||||
for (size_t i = 0; i < runs; ++i)
|
||||
{
|
||||
StopwatchTimer timer{CLOCK_THREAD_CPUTIME_ID};
|
||||
::testTranscoding(timer, *codec, test_seq);
|
||||
timer.stop();
|
||||
|
||||
for (const auto & [label, value] : timer.getResults())
|
||||
{
|
||||
results[label].push_back(value);
|
||||
}
|
||||
}
|
||||
|
||||
auto computeMeanAndStdDev = [](const auto & values)
|
||||
{
|
||||
double mean{};
|
||||
|
||||
if (values.size() < 2)
|
||||
return std::make_tuple(mean, double{});
|
||||
|
||||
using ValueType = typename std::decay_t<decltype(values)>::value_type;
|
||||
std::vector<ValueType> tmp_v(std::begin(values), std::end(values));
|
||||
std::sort(tmp_v.begin(), tmp_v.end());
|
||||
|
||||
// remove min and max
|
||||
tmp_v.erase(tmp_v.begin());
|
||||
tmp_v.erase(tmp_v.end() - 1);
|
||||
|
||||
for (const auto & v : tmp_v)
|
||||
{
|
||||
mean += v;
|
||||
}
|
||||
|
||||
mean = mean / tmp_v.size();
|
||||
double std_dev = 0.0;
|
||||
for (const auto & v : tmp_v)
|
||||
{
|
||||
const auto d = (v - mean);
|
||||
std_dev += (d * d);
|
||||
}
|
||||
std_dev = std::sqrt(std_dev / tmp_v.size());
|
||||
|
||||
return std::make_tuple(mean, std_dev);
|
||||
};
|
||||
|
||||
std::cerr << codec_spec.codec_statement
|
||||
<< " " << test_seq.data_type->getName()
|
||||
<< " (" << test_seq.serialized_data.size() << " bytes, "
|
||||
<< std::hex << CityHash_v1_0_2::CityHash64(test_seq.serialized_data.data(), test_seq.serialized_data.size()) << std::dec
|
||||
<< ", average of " << runs << " runs, μs)";
|
||||
|
||||
for (const auto & k : {"encoding", "decoding"})
|
||||
{
|
||||
const auto & values = results[k];
|
||||
const auto & [mean, std_dev] = computeMeanAndStdDev(values);
|
||||
// Ensure that Coefficient of variation is reasonably low, otherwise these numbers are meaningless
|
||||
EXPECT_GT(0.05, std_dev / mean);
|
||||
std::cerr << "\t" << std::fixed << std::setprecision(1) << mean / 1000.0;
|
||||
}
|
||||
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Here we use generators to produce test payload for codecs.
|
||||
// Generator is a callable that can produce infinite number of values,
|
||||
// output value MUST be of the same type input value.
|
||||
// output value MUST be of the same type as input value.
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
auto SameValueGenerator = [](auto value)
|
||||
@ -543,6 +819,23 @@ std::vector<CodecTestSequence> generatePyramidOfSequences(const size_t sequences
|
||||
return sequences;
|
||||
};
|
||||
|
||||
// Just as if all sequences from generatePyramidOfSequences were appended one-by-one to the first one.
|
||||
template <typename T, typename Generator>
|
||||
CodecTestSequence generatePyramidSequence(const size_t sequences_count, Generator && generator, const char* generator_name)
|
||||
{
|
||||
CodecTestSequence sequence;
|
||||
sequence.data_type = makeDataType<T>();
|
||||
sequence.serialized_data.reserve(sequences_count * sequences_count * sizeof(T));
|
||||
|
||||
for (size_t i = 1; i < sequences_count; ++i)
|
||||
{
|
||||
std::string name = generator_name + std::string(" from 0 to ") + std::to_string(i);
|
||||
sequence.append(generateSeq<T>(std::forward<decltype(generator)>(generator), name.c_str(), 0, i));
|
||||
}
|
||||
|
||||
return sequence;
|
||||
};
|
||||
|
||||
|
||||
// helper macro to produce human-friendly sequence name from generator
|
||||
#define G(generator) generator, #generator
|
||||
@ -575,7 +868,7 @@ INSTANTIATE_TEST_CASE_P(SmallSequences,
|
||||
::testing::Combine(
|
||||
DefaultCodecsToTest,
|
||||
::testing::ValuesIn(
|
||||
generatePyramidOfSequences<Int8 >(42, G(SequentialGenerator(1)))
|
||||
generatePyramidOfSequences<Int8 >(42, G(SequentialGenerator(1)))
|
||||
+ generatePyramidOfSequences<Int16 >(42, G(SequentialGenerator(1)))
|
||||
+ generatePyramidOfSequences<Int32 >(42, G(SequentialGenerator(1)))
|
||||
+ generatePyramidOfSequences<Int64 >(42, G(SequentialGenerator(1)))
|
||||
@ -609,7 +902,7 @@ INSTANTIATE_TEST_CASE_P(SameValueInt,
|
||||
::testing::Combine(
|
||||
DefaultCodecsToTest,
|
||||
::testing::Values(
|
||||
generateSeq<Int8 >(G(SameValueGenerator(1000))),
|
||||
generateSeq<Int8>(G(SameValueGenerator(1000))),
|
||||
generateSeq<Int16 >(G(SameValueGenerator(1000))),
|
||||
generateSeq<Int32 >(G(SameValueGenerator(1000))),
|
||||
generateSeq<Int64 >(G(SameValueGenerator(1000))),
|
||||
@ -626,7 +919,7 @@ INSTANTIATE_TEST_CASE_P(SameNegativeValueInt,
|
||||
::testing::Combine(
|
||||
DefaultCodecsToTest,
|
||||
::testing::Values(
|
||||
generateSeq<Int8 >(G(SameValueGenerator(-1000))),
|
||||
generateSeq<Int8>(G(SameValueGenerator(-1000))),
|
||||
generateSeq<Int16 >(G(SameValueGenerator(-1000))),
|
||||
generateSeq<Int32 >(G(SameValueGenerator(-1000))),
|
||||
generateSeq<Int64 >(G(SameValueGenerator(-1000))),
|
||||
@ -671,7 +964,7 @@ INSTANTIATE_TEST_CASE_P(SequentialInt,
|
||||
::testing::Combine(
|
||||
DefaultCodecsToTest,
|
||||
::testing::Values(
|
||||
generateSeq<Int8 >(G(SequentialGenerator(1))),
|
||||
generateSeq<Int8>(G(SequentialGenerator(1))),
|
||||
generateSeq<Int16 >(G(SequentialGenerator(1))),
|
||||
generateSeq<Int32 >(G(SequentialGenerator(1))),
|
||||
generateSeq<Int64 >(G(SequentialGenerator(1))),
|
||||
@ -690,7 +983,7 @@ INSTANTIATE_TEST_CASE_P(SequentialReverseInt,
|
||||
::testing::Combine(
|
||||
DefaultCodecsToTest,
|
||||
::testing::Values(
|
||||
generateSeq<Int8 >(G(SequentialGenerator(-1))),
|
||||
generateSeq<Int8>(G(SequentialGenerator(-1))),
|
||||
generateSeq<Int16 >(G(SequentialGenerator(-1))),
|
||||
generateSeq<Int32 >(G(SequentialGenerator(-1))),
|
||||
generateSeq<Int64 >(G(SequentialGenerator(-1))),
|
||||
@ -735,10 +1028,10 @@ INSTANTIATE_TEST_CASE_P(MonotonicInt,
|
||||
::testing::Combine(
|
||||
DefaultCodecsToTest,
|
||||
::testing::Values(
|
||||
generateSeq<Int8 >(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<Int16 >(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<Int32 >(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<Int64 >(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<Int8>(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<Int16>(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<Int32>(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<Int64>(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<UInt8 >(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<UInt16>(G(MonotonicGenerator(1, 5))),
|
||||
generateSeq<UInt32>(G(MonotonicGenerator(1, 5))),
|
||||
@ -752,11 +1045,11 @@ INSTANTIATE_TEST_CASE_P(MonotonicReverseInt,
|
||||
::testing::Combine(
|
||||
DefaultCodecsToTest,
|
||||
::testing::Values(
|
||||
generateSeq<Int8 >(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<Int16 >(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<Int32 >(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<Int64 >(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<UInt8 >(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<Int8>(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<Int16>(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<Int32>(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<Int64>(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<UInt8>(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<UInt16>(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<UInt32>(G(MonotonicGenerator(-1, 5))),
|
||||
generateSeq<UInt64>(G(MonotonicGenerator(-1, 5)))
|
||||
@ -862,4 +1155,191 @@ INSTANTIATE_TEST_CASE_P(OverflowFloat,
|
||||
),
|
||||
);
|
||||
|
||||
template <typename ValueType>
|
||||
auto DDCompatibilityTestSequence()
|
||||
{
|
||||
// Generates sequences with double delta in given range.
|
||||
auto ddGenerator = [prev_delta = static_cast<Int64>(0), prev = static_cast<Int64>(0)](auto dd) mutable
|
||||
{
|
||||
const auto curr = dd + prev + prev_delta;
|
||||
prev = curr;
|
||||
prev_delta = dd + prev_delta;
|
||||
return curr;
|
||||
};
|
||||
|
||||
auto ret = generateSeq<ValueType>(G(SameValueGenerator(42)), 0, 3);
|
||||
|
||||
// These values are from DoubleDelta paper (and implementation) and represent points at which DD encoded length is changed.
|
||||
// DD value less that this point is encoded in shorter binary form (bigger - longer binary).
|
||||
const Int64 dd_corner_points[] = {-63, 64, -255, 256, -2047, 2048, std::numeric_limits<Int32>::min(), std::numeric_limits<Int32>::max()};
|
||||
for (const auto & p : dd_corner_points)
|
||||
{
|
||||
if (std::abs(p) > std::numeric_limits<ValueType>::max())
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
// - 4 is to allow DD value to settle before transitioning through important point,
|
||||
// since DD depends on 2 previous values of data, + 2 is arbitrary.
|
||||
ret.append(generateSeq<ValueType>(G(ddGenerator), p - 4, p + 2));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define BIN_STR(x) std::string{x, sizeof(x) - 1}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(DoubleDelta,
|
||||
CodecTest_Compatibility,
|
||||
::testing::Combine(
|
||||
::testing::Values(Codec("DoubleDelta")),
|
||||
::testing::ValuesIn(std::initializer_list<std::tuple<CodecTestSequence, std::string>>{
|
||||
{
|
||||
DDCompatibilityTestSequence<Int8>(),
|
||||
BIN_STR("\x94\x21\x00\x00\x00\x0f\x00\x00\x00\x01\x00\x0f\x00\x00\x00\x2a\x00\x6b\x65\x5f\x50\x34\xff\x4f\xaf\xb1\xaa\xf4\xf6\x7d\x87\xf8\x80")
|
||||
},
|
||||
{
|
||||
DDCompatibilityTestSequence<UInt8>(),
|
||||
BIN_STR("\x94\x27\x00\x00\x00\x15\x00\x00\x00\x01\x00\x15\x00\x00\x00\x2a\x00\x6b\x65\x5f\x50\x34\xff\x4f\xaf\xb1\xaa\xf4\xf6\x7d\x87\xf8\x81\x8e\xd0\xca\x02\x01\x01")
|
||||
},
|
||||
{
|
||||
DDCompatibilityTestSequence<Int16>(),
|
||||
BIN_STR("\x94\x70\x00\x00\x00\x4e\x00\x00\x00\x02\x00\x27\x00\x00\x00\x2a\x00\x00\x00\x6b\x65\x5f\x50\x34\xff\x4f\xaf\xbc\xe3\x5d\xa3\xd3\xd9\xf6\x1f\xe2\x07\x7c\x47\x20\x67\x48\x07\x47\xff\x47\xf6\xfe\xf8\x00\x00\x70\x6b\xd0\x00\x02\x83\xd9\xfb\x9f\xdc\x1f\xfc\x20\x1e\x80\x00\x22\xc8\xf0\x00\x00\x66\x67\xa0\x00\x02\x00\x3d\x00\x00\x0f\xff\xe8\x00\x00\x7f\xee\xff\xdf\x40\x00\x0f\xf2\x78\x00\x01\x7f\x83\x9f\xf7\x9f\xfb\xc0\x00\x00\xff\xfe\x00\x00\x08\x00")
|
||||
},
|
||||
{
|
||||
DDCompatibilityTestSequence<UInt16>(),
|
||||
BIN_STR("\x94\x70\x00\x00\x00\x4e\x00\x00\x00\x02\x00\x27\x00\x00\x00\x2a\x00\x00\x00\x6b\x65\x5f\x50\x34\xff\x4f\xaf\xbc\xe3\x5d\xa3\xd3\xd9\xf6\x1f\xe2\x07\x7c\x47\x20\x67\x48\x07\x47\xff\x47\xf6\xfe\xf8\x00\x00\x70\x6b\xd0\x00\x02\x83\xd9\xfb\x9f\xdc\x1f\xfc\x20\x1e\x80\x00\x22\xc8\xf0\x00\x00\x66\x67\xa0\x00\x02\x00\x3d\x00\x00\x0f\xff\xe8\x00\x00\x7f\xee\xff\xdf\x40\x00\x0f\xf2\x78\x00\x01\x7f\x83\x9f\xf7\x9f\xfb\xc0\x00\x00\xff\xfe\x00\x00\x08\x00")
|
||||
},
|
||||
{
|
||||
DDCompatibilityTestSequence<Int32>(),
|
||||
BIN_STR("\x94\x74\x00\x00\x00\x9c\x00\x00\x00\x04\x00\x27\x00\x00\x00\x2a\x00\x00\x00\x00\x00\x00\x00\x6b\x65\x5f\x50\x34\xff\x4f\xaf\xbc\xe3\x5d\xa3\xd3\xd9\xf6\x1f\xe2\x07\x7c\x47\x20\x67\x48\x07\x47\xff\x47\xf6\xfe\xf8\x00\x00\x70\x6b\xd0\x00\x02\x83\xd9\xfb\x9f\xdc\x1f\xfc\x20\x1e\x80\x00\x22\xc8\xf0\x00\x00\x66\x67\xa0\x00\x02\x00\x3d\x00\x00\x0f\xff\xe8\x00\x00\x7f\xee\xff\xdf\x00\x00\x70\x0d\x7a\x00\x02\x80\x7b\x9f\xf7\x9f\xfb\xc0\x00\x00\xff\xfe\x00\x00\x08\x00")
|
||||
},
|
||||
{
|
||||
DDCompatibilityTestSequence<UInt32>(),
|
||||
BIN_STR("\x94\xb5\x00\x00\x00\xcc\x00\x00\x00\x04\x00\x33\x00\x00\x00\x2a\x00\x00\x00\x00\x00\x00\x00\x6b\x65\x5f\x50\x34\xff\x4f\xaf\xbc\xe3\x5d\xa3\xd3\xd9\xf6\x1f\xe2\x07\x7c\x47\x20\x67\x48\x07\x47\xff\x47\xf6\xfe\xf8\x00\x00\x70\x6b\xd0\x00\x02\x83\xd9\xfb\x9f\xdc\x1f\xfc\x20\x1e\x80\x00\x22\xc8\xf0\x00\x00\x66\x67\xa0\x00\x02\x00\x3d\x00\x00\x0f\xff\xe8\x00\x00\x7f\xee\xff\xdf\x00\x00\x70\x0d\x7a\x00\x02\x80\x7b\x9f\xf7\x9f\xfb\xc0\x00\x00\xff\xfe\x00\x00\x08\x00\xf3\xff\xf9\x41\xaf\xbf\xff\xd6\x0c\xfc\xff\xff\xff\xfb\xf0\x00\x00\x00\x07\xff\xff\xff\xef\xc0\x00\x00\x00\x3f\xff\xff\xff\xfb\xff\xff\xff\xfa\x69\x74\xf3\xff\xff\xff\xe7\x9f\xff\xff\xff\x7e\x00\x00\x00\x00\xff\xff\xff\xfd\xf8\x00\x00\x00\x07\xff\xff\xff\xf0")
|
||||
},
|
||||
{
|
||||
DDCompatibilityTestSequence<Int64>(),
|
||||
BIN_STR("\x94\xd4\x00\x00\x00\x98\x01\x00\x00\x08\x00\x33\x00\x00\x00\x2a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x6b\x65\x5f\x50\x34\xff\x4f\xaf\xbc\xe3\x5d\xa3\xd3\xd9\xf6\x1f\xe2\x07\x7c\x47\x20\x67\x48\x07\x47\xff\x47\xf6\xfe\xf8\x00\x00\x70\x6b\xd0\x00\x02\x83\xd9\xfb\x9f\xdc\x1f\xfc\x20\x1e\x80\x00\x22\xc8\xf0\x00\x00\x66\x67\xa0\x00\x02\x00\x3d\x00\x00\x0f\xff\xe8\x00\x00\x7f\xee\xff\xdf\x00\x00\x70\x0d\x7a\x00\x02\x80\x7b\x9f\xf7\x9f\xfb\xc0\x00\x00\xff\xfe\x00\x00\x08\x00\xfc\x00\x00\x00\x04\x00\x06\xbe\x4f\xbf\xff\xd6\x0c\xff\x00\x00\x00\x01\x00\x00\x00\x03\xf8\x00\x00\x00\x08\x00\x00\x00\x0f\xc0\x00\x00\x00\x3f\xff\xff\xff\xfb\xff\xff\xff\xfb\xe0\x00\x00\x01\xc0\x00\x00\x06\x9f\x80\x00\x00\x0a\x00\x00\x00\x34\xf3\xff\xff\xff\xe7\x9f\xff\xff\xff\x7e\x00\x00\x00\x00\xff\xff\xff\xfd\xf0\x00\x00\x00\x07\xff\xff\xff\xf0")
|
||||
},
|
||||
{
|
||||
DDCompatibilityTestSequence<UInt64>(),
|
||||
BIN_STR("\x94\xd4\x00\x00\x00\x98\x01\x00\x00\x08\x00\x33\x00\x00\x00\x2a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x6b\x65\x5f\x50\x34\xff\x4f\xaf\xbc\xe3\x5d\xa3\xd3\xd9\xf6\x1f\xe2\x07\x7c\x47\x20\x67\x48\x07\x47\xff\x47\xf6\xfe\xf8\x00\x00\x70\x6b\xd0\x00\x02\x83\xd9\xfb\x9f\xdc\x1f\xfc\x20\x1e\x80\x00\x22\xc8\xf0\x00\x00\x66\x67\xa0\x00\x02\x00\x3d\x00\x00\x0f\xff\xe8\x00\x00\x7f\xee\xff\xdf\x00\x00\x70\x0d\x7a\x00\x02\x80\x7b\x9f\xf7\x9f\xfb\xc0\x00\x00\xff\xfe\x00\x00\x08\x00\xfc\x00\x00\x00\x04\x00\x06\xbe\x4f\xbf\xff\xd6\x0c\xff\x00\x00\x00\x01\x00\x00\x00\x03\xf8\x00\x00\x00\x08\x00\x00\x00\x0f\xc0\x00\x00\x00\x3f\xff\xff\xff\xfb\xff\xff\xff\xfb\xe0\x00\x00\x01\xc0\x00\x00\x06\x9f\x80\x00\x00\x0a\x00\x00\x00\x34\xf3\xff\xff\xff\xe7\x9f\xff\xff\xff\x7e\x00\x00\x00\x00\xff\xff\xff\xfd\xf0\x00\x00\x00\x07\xff\xff\xff\xf0")
|
||||
},
|
||||
})
|
||||
),
|
||||
);
|
||||
|
||||
template <typename ValueType>
|
||||
auto DDperformanceTestSequence()
|
||||
{
|
||||
const auto times = 100'000;
|
||||
return DDCompatibilityTestSequence<ValueType>() * times // average case
|
||||
+ generateSeq<ValueType>(G(MinMaxGenerator()), 0, times) // worst
|
||||
+ generateSeq<ValueType>(G(SameValueGenerator(42)), 0, times); // best
|
||||
}
|
||||
|
||||
// Prime numbers in ascending order with some random repetitions hit all the cases of Gorilla.
|
||||
auto PrimesWithMultiplierGenerator = [](int multiplier = 1)
|
||||
{
|
||||
return [multiplier](auto i)
|
||||
{
|
||||
static const int vals[] = {
|
||||
2, 3, 5, 7, 11, 11, 13, 17, 19, 23, 29, 29, 31, 37, 41, 43,
|
||||
47, 47, 53, 59, 61, 61, 67, 71, 73, 79, 83, 89, 89, 97, 101, 103,
|
||||
107, 107, 109, 113, 113, 127, 127, 127
|
||||
};
|
||||
static const size_t count = sizeof(vals)/sizeof(vals[0]);
|
||||
|
||||
using T = decltype(i);
|
||||
return static_cast<T>(vals[i % count] * static_cast<T>(multiplier));
|
||||
};
|
||||
};
|
||||
|
||||
template <typename ValueType>
|
||||
auto GCompatibilityTestSequence()
|
||||
{
|
||||
// Also multiply result by some factor to test large values on types that can hold those.
|
||||
return generateSeq<ValueType>(G(PrimesWithMultiplierGenerator(intExp10(sizeof(ValueType)))), 0, 42);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Gorilla,
|
||||
CodecTest_Compatibility,
|
||||
::testing::Combine(
|
||||
::testing::Values(Codec("Gorilla")),
|
||||
::testing::ValuesIn(std::initializer_list<std::tuple<CodecTestSequence, std::string>>{
|
||||
{
|
||||
GCompatibilityTestSequence<Int8>(),
|
||||
BIN_STR("\x95\x35\x00\x00\x00\x2a\x00\x00\x00\x01\x00\x2a\x00\x00\x00\x14\xe1\xdd\x25\xe5\x7b\x29\x86\xee\x2a\x16\x5a\xc5\x0b\x23\x75\x1b\x3c\xb1\x97\x8b\x5f\xcb\x43\xd9\xc5\x48\xab\x23\xaf\x62\x93\x71\x4a\x73\x0f\xc6\x0a")
|
||||
},
|
||||
{
|
||||
GCompatibilityTestSequence<UInt8>(),
|
||||
BIN_STR("\x95\x35\x00\x00\x00\x2a\x00\x00\x00\x01\x00\x2a\x00\x00\x00\x14\xe1\xdd\x25\xe5\x7b\x29\x86\xee\x2a\x16\x5a\xc5\x0b\x23\x75\x1b\x3c\xb1\x97\x8b\x5f\xcb\x43\xd9\xc5\x48\xab\x23\xaf\x62\x93\x71\x4a\x73\x0f\xc6\x0a")
|
||||
},
|
||||
{
|
||||
GCompatibilityTestSequence<Int16>(),
|
||||
BIN_STR("\x95\x52\x00\x00\x00\x54\x00\x00\x00\x02\x00\x2a\x00\x00\x00\xc8\x00\xdc\xfe\x66\xdb\x1f\x4e\xa7\xde\xdc\xd5\xec\x6e\xf7\x37\x3a\x23\xe7\x63\xf5\x6a\x8e\x99\x37\x34\xf9\xf8\x2e\x76\x35\x2d\x51\xbb\x3b\xc3\x6d\x13\xbf\x86\x53\x9e\x25\xe4\xaf\xaf\x63\xd5\x6a\x6e\x76\x35\x3a\x27\xd3\x0f\x91\xae\x6b\x33\x57\x6e\x64\xcc\x55\x81\xe4")
|
||||
},
|
||||
{
|
||||
GCompatibilityTestSequence<UInt16>(),
|
||||
BIN_STR("\x95\x52\x00\x00\x00\x54\x00\x00\x00\x02\x00\x2a\x00\x00\x00\xc8\x00\xdc\xfe\x66\xdb\x1f\x4e\xa7\xde\xdc\xd5\xec\x6e\xf7\x37\x3a\x23\xe7\x63\xf5\x6a\x8e\x99\x37\x34\xf9\xf8\x2e\x76\x35\x2d\x51\xbb\x3b\xc3\x6d\x13\xbf\x86\x53\x9e\x25\xe4\xaf\xaf\x63\xd5\x6a\x6e\x76\x35\x3a\x27\xd3\x0f\x91\xae\x6b\x33\x57\x6e\x64\xcc\x55\x81\xe4")
|
||||
},
|
||||
{
|
||||
GCompatibilityTestSequence<Int32>(),
|
||||
BIN_STR("\x95\x65\x00\x00\x00\xa8\x00\x00\x00\x04\x00\x2a\x00\x00\x00\x20\x4e\x00\x00\xe4\x57\x63\xc0\xbb\x67\xbc\xce\x91\x97\x99\x15\x9e\xe3\x36\x3f\x89\x5f\x8e\xf2\xec\x8e\xd3\xbf\x75\x43\x58\xc4\x7e\xcf\x93\x43\x38\xc6\x91\x36\x1f\xe7\xb6\x11\x6f\x02\x73\x46\xef\xe0\xec\x50\xfb\x79\xcb\x9c\x14\xfa\x13\xea\x8d\x66\x43\x48\xa0\xde\x3a\xcf\xff\x26\xe0\x5f\x93\xde\x5e\x7f\x6e\x36\x5e\xe6\xb4\x66\x5d\xb0\x0e\xc4")
|
||||
},
|
||||
{
|
||||
GCompatibilityTestSequence<UInt32>(),
|
||||
BIN_STR("\x95\x65\x00\x00\x00\xa8\x00\x00\x00\x04\x00\x2a\x00\x00\x00\x20\x4e\x00\x00\xe4\x57\x63\xc0\xbb\x67\xbc\xce\x91\x97\x99\x15\x9e\xe3\x36\x3f\x89\x5f\x8e\xf2\xec\x8e\xd3\xbf\x75\x43\x58\xc4\x7e\xcf\x93\x43\x38\xc6\x91\x36\x1f\xe7\xb6\x11\x6f\x02\x73\x46\xef\xe0\xec\x50\xfb\x79\xcb\x9c\x14\xfa\x13\xea\x8d\x66\x43\x48\xa0\xde\x3a\xcf\xff\x26\xe0\x5f\x93\xde\x5e\x7f\x6e\x36\x5e\xe6\xb4\x66\x5d\xb0\x0e\xc4")
|
||||
},
|
||||
{
|
||||
GCompatibilityTestSequence<Int64>(),
|
||||
BIN_STR("\x95\x91\x00\x00\x00\x50\x01\x00\x00\x08\x00\x2a\x00\x00\x00\x00\xc2\xeb\x0b\x00\x00\x00\x00\xe3\x2b\xa0\xa6\x19\x85\x98\xdc\x45\x74\x74\x43\xc2\x57\x41\x4c\x6e\x42\x79\xd9\x8f\x88\xa5\x05\xf3\xf1\x94\xa3\x62\x1e\x02\xdf\x05\x10\xf1\x15\x97\x35\x2a\x50\x71\x0f\x09\x6c\x89\xf7\x65\x1d\x11\xb7\xcc\x7d\x0b\x70\xc1\x86\x88\x48\x47\x87\xb6\x32\x26\xa7\x86\x87\x88\xd3\x93\x3d\xfc\x28\x68\x85\x05\x0b\x13\xc6\x5f\xd4\x70\xe1\x5e\x76\xf1\x9f\xf3\x33\x2a\x14\x14\x5e\x40\xc1\x5c\x28\x3f\xec\x43\x03\x05\x11\x91\xe8\xeb\x8e\x0a\x0e\x27\x21\x55\xcb\x39\xbc\x6a\xff\x11\x5d\x81\xa0\xa6\x10")
|
||||
},
|
||||
{
|
||||
GCompatibilityTestSequence<UInt64>(),
|
||||
BIN_STR("\x95\x91\x00\x00\x00\x50\x01\x00\x00\x08\x00\x2a\x00\x00\x00\x00\xc2\xeb\x0b\x00\x00\x00\x00\xe3\x2b\xa0\xa6\x19\x85\x98\xdc\x45\x74\x74\x43\xc2\x57\x41\x4c\x6e\x42\x79\xd9\x8f\x88\xa5\x05\xf3\xf1\x94\xa3\x62\x1e\x02\xdf\x05\x10\xf1\x15\x97\x35\x2a\x50\x71\x0f\x09\x6c\x89\xf7\x65\x1d\x11\xb7\xcc\x7d\x0b\x70\xc1\x86\x88\x48\x47\x87\xb6\x32\x26\xa7\x86\x87\x88\xd3\x93\x3d\xfc\x28\x68\x85\x05\x0b\x13\xc6\x5f\xd4\x70\xe1\x5e\x76\xf1\x9f\xf3\x33\x2a\x14\x14\x5e\x40\xc1\x5c\x28\x3f\xec\x43\x03\x05\x11\x91\xe8\xeb\x8e\x0a\x0e\x27\x21\x55\xcb\x39\xbc\x6a\xff\x11\x5d\x81\xa0\xa6\x10")
|
||||
},
|
||||
})
|
||||
),
|
||||
);
|
||||
|
||||
// These 'tests' try to measure performance of encoding and decoding and hence only make sense to be run locally,
|
||||
// also they require pretty big data to run against and generating this data slows down startup of unit test process.
|
||||
// So un-comment only at your discretion.
|
||||
|
||||
//INSTANTIATE_TEST_CASE_P(DoubleDelta,
|
||||
// CodecTest_Performance,
|
||||
// ::testing::Combine(
|
||||
// ::testing::Values(Codec("DoubleDelta")),
|
||||
// ::testing::Values(
|
||||
// DDperformanceTestSequence<Int8 >(),
|
||||
// DDperformanceTestSequence<UInt8 >(),
|
||||
// DDperformanceTestSequence<Int16 >(),
|
||||
// DDperformanceTestSequence<UInt16>(),
|
||||
// DDperformanceTestSequence<Int32 >(),
|
||||
// DDperformanceTestSequence<UInt32>(),
|
||||
// DDperformanceTestSequence<Int64 >(),
|
||||
// DDperformanceTestSequence<UInt64>()
|
||||
// )
|
||||
// ),
|
||||
//);
|
||||
|
||||
//INSTANTIATE_TEST_CASE_P(Gorilla,
|
||||
// CodecTest_Performance,
|
||||
// ::testing::Combine(
|
||||
// ::testing::Values(Codec("Gorilla")),
|
||||
// ::testing::Values(
|
||||
// generatePyramidSequence<Int8 >(42, G(PrimesWithMultiplierGenerator())) * 6'000,
|
||||
// generatePyramidSequence<UInt8 >(42, G(PrimesWithMultiplierGenerator())) * 6'000,
|
||||
// generatePyramidSequence<Int16 >(42, G(PrimesWithMultiplierGenerator())) * 6'000,
|
||||
// generatePyramidSequence<UInt16>(42, G(PrimesWithMultiplierGenerator())) * 6'000,
|
||||
// generatePyramidSequence<Int32 >(42, G(PrimesWithMultiplierGenerator())) * 6'000,
|
||||
// generatePyramidSequence<UInt32>(42, G(PrimesWithMultiplierGenerator())) * 6'000,
|
||||
// generatePyramidSequence<Int64 >(42, G(PrimesWithMultiplierGenerator())) * 6'000,
|
||||
// generatePyramidSequence<UInt64>(42, G(PrimesWithMultiplierGenerator())) * 6'000
|
||||
// )
|
||||
// ),
|
||||
//);
|
||||
|
||||
}
|
||||
|
@ -127,6 +127,7 @@ struct Settings : public SettingsCollection<Settings>
|
||||
M(SettingUInt64, optimize_min_equality_disjunction_chain_length, 3, "The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization ", 0) \
|
||||
\
|
||||
M(SettingUInt64, min_bytes_to_use_direct_io, 0, "The minimum number of bytes for reading the data with O_DIRECT option during SELECT queries execution. 0 - disabled.", 0) \
|
||||
M(SettingUInt64, min_bytes_to_use_mmap_io, 0, "The minimum number of bytes for reading the data with mmap option during SELECT queries execution. 0 - disabled.", 0) \
|
||||
\
|
||||
M(SettingBool, force_index_by_date, 0, "Throw an exception if there is a partition key in a table, and it is not used.", 0) \
|
||||
M(SettingBool, force_primary_key, 0, "Throw an exception if there is primary key in a table, and it is not used.", 0) \
|
||||
|
@ -22,8 +22,8 @@ namespace DB
|
||||
*/
|
||||
struct SortCursorImpl
|
||||
{
|
||||
ColumnRawPtrs all_columns;
|
||||
ColumnRawPtrs sort_columns;
|
||||
ColumnRawPtrs all_columns;
|
||||
SortDescription desc;
|
||||
size_t sort_columns_size = 0;
|
||||
size_t pos = 0;
|
||||
@ -110,21 +110,52 @@ using SortCursorImpls = std::vector<SortCursorImpl>;
|
||||
|
||||
|
||||
/// For easy copying.
|
||||
struct SortCursor
|
||||
template <typename Derived>
|
||||
struct SortCursorHelper
|
||||
{
|
||||
SortCursorImpl * impl;
|
||||
|
||||
SortCursor(SortCursorImpl * impl_) : impl(impl_) {}
|
||||
const Derived & derived() const { return static_cast<const Derived &>(*this); }
|
||||
|
||||
SortCursorHelper(SortCursorImpl * impl_) : impl(impl_) {}
|
||||
SortCursorImpl * operator-> () { return impl; }
|
||||
const SortCursorImpl * operator-> () const { return impl; }
|
||||
|
||||
bool greater(const SortCursorHelper & rhs) const
|
||||
{
|
||||
return derived().greaterAt(rhs.derived(), impl->pos, rhs.impl->pos);
|
||||
}
|
||||
|
||||
/// Inverted so that the priority queue elements are removed in ascending order.
|
||||
bool operator< (const SortCursorHelper & rhs) const
|
||||
{
|
||||
return derived().greater(rhs.derived());
|
||||
}
|
||||
|
||||
/// Checks that all rows in the current block of this cursor are less than or equal to all the rows of the current block of another cursor.
|
||||
bool totallyLessOrEquals(const SortCursorHelper & rhs) const
|
||||
{
|
||||
if (impl->rows == 0 || rhs.impl->rows == 0)
|
||||
return false;
|
||||
|
||||
/// The last row of this cursor is no larger than the first row of the another cursor.
|
||||
return !derived().greaterAt(rhs.derived(), impl->rows - 1, 0);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct SortCursor : SortCursorHelper<SortCursor>
|
||||
{
|
||||
using SortCursorHelper<SortCursor>::SortCursorHelper;
|
||||
|
||||
/// The specified row of this cursor is greater than the specified row of another cursor.
|
||||
bool greaterAt(const SortCursor & rhs, size_t lhs_pos, size_t rhs_pos) const
|
||||
{
|
||||
for (size_t i = 0; i < impl->sort_columns_size; ++i)
|
||||
{
|
||||
int direction = impl->desc[i].direction;
|
||||
int nulls_direction = impl->desc[i].nulls_direction;
|
||||
const auto & desc = impl->desc[i];
|
||||
int direction = desc.direction;
|
||||
int nulls_direction = desc.nulls_direction;
|
||||
int res = direction * impl->sort_columns[i]->compareAt(lhs_pos, rhs_pos, *(rhs.impl->sort_columns[i]), nulls_direction);
|
||||
if (res > 0)
|
||||
return true;
|
||||
@ -133,45 +164,37 @@ struct SortCursor
|
||||
}
|
||||
return impl->order > rhs.impl->order;
|
||||
}
|
||||
};
|
||||
|
||||
/// Checks that all rows in the current block of this cursor are less than or equal to all the rows of the current block of another cursor.
|
||||
bool totallyLessOrEquals(const SortCursor & rhs) const
|
||||
|
||||
/// For the case with a single column and when there is no order between different cursors.
|
||||
struct SimpleSortCursor : SortCursorHelper<SimpleSortCursor>
|
||||
{
|
||||
using SortCursorHelper<SimpleSortCursor>::SortCursorHelper;
|
||||
|
||||
bool greaterAt(const SimpleSortCursor & rhs, size_t lhs_pos, size_t rhs_pos) const
|
||||
{
|
||||
if (impl->rows == 0 || rhs.impl->rows == 0)
|
||||
return false;
|
||||
|
||||
/// The last row of this cursor is no larger than the first row of the another cursor.
|
||||
return !greaterAt(rhs, impl->rows - 1, 0);
|
||||
}
|
||||
|
||||
bool greater(const SortCursor & rhs) const
|
||||
{
|
||||
return greaterAt(rhs, impl->pos, rhs.impl->pos);
|
||||
}
|
||||
|
||||
/// Inverted so that the priority queue elements are removed in ascending order.
|
||||
bool operator< (const SortCursor & rhs) const
|
||||
{
|
||||
return greater(rhs);
|
||||
const auto & desc = impl->desc[0];
|
||||
int direction = desc.direction;
|
||||
int nulls_direction = desc.nulls_direction;
|
||||
int res = impl->sort_columns[0]->compareAt(lhs_pos, rhs_pos, *(rhs.impl->sort_columns[0]), nulls_direction);
|
||||
return res != 0 && ((res > 0) == (direction > 0));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Separate comparator for locale-sensitive string comparisons
|
||||
struct SortCursorWithCollation
|
||||
struct SortCursorWithCollation : SortCursorHelper<SortCursorWithCollation>
|
||||
{
|
||||
SortCursorImpl * impl;
|
||||
|
||||
SortCursorWithCollation(SortCursorImpl * impl_) : impl(impl_) {}
|
||||
SortCursorImpl * operator-> () { return impl; }
|
||||
const SortCursorImpl * operator-> () const { return impl; }
|
||||
using SortCursorHelper<SortCursorWithCollation>::SortCursorHelper;
|
||||
|
||||
bool greaterAt(const SortCursorWithCollation & rhs, size_t lhs_pos, size_t rhs_pos) const
|
||||
{
|
||||
for (size_t i = 0; i < impl->sort_columns_size; ++i)
|
||||
{
|
||||
int direction = impl->desc[i].direction;
|
||||
int nulls_direction = impl->desc[i].nulls_direction;
|
||||
const auto & desc = impl->desc[i];
|
||||
int direction = desc.direction;
|
||||
int nulls_direction = desc.nulls_direction;
|
||||
int res;
|
||||
if (impl->need_collation[i])
|
||||
{
|
||||
@ -189,29 +212,11 @@ struct SortCursorWithCollation
|
||||
}
|
||||
return impl->order > rhs.impl->order;
|
||||
}
|
||||
|
||||
bool totallyLessOrEquals(const SortCursorWithCollation & rhs) const
|
||||
{
|
||||
if (impl->rows == 0 || rhs.impl->rows == 0)
|
||||
return false;
|
||||
|
||||
/// The last row of this cursor is no larger than the first row of the another cursor.
|
||||
return !greaterAt(rhs, impl->rows - 1, 0);
|
||||
}
|
||||
|
||||
bool greater(const SortCursorWithCollation & rhs) const
|
||||
{
|
||||
return greaterAt(rhs, impl->pos, rhs.impl->pos);
|
||||
}
|
||||
|
||||
bool operator< (const SortCursorWithCollation & rhs) const
|
||||
{
|
||||
return greater(rhs);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/** Allows to fetch data from multiple sort cursors in sorted order (merging sorted data streams).
|
||||
* TODO: Replace with "Loser Tree", see https://en.wikipedia.org/wiki/K-way_merge_algorithm
|
||||
*/
|
||||
template <typename Cursor>
|
||||
class SortingHeap
|
||||
@ -225,7 +230,8 @@ public:
|
||||
size_t size = cursors.size();
|
||||
queue.reserve(size);
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
queue.emplace_back(&cursors[i]);
|
||||
if (!cursors[i].empty())
|
||||
queue.emplace_back(&cursors[i]);
|
||||
std::make_heap(queue.begin(), queue.end());
|
||||
}
|
||||
|
||||
@ -233,6 +239,10 @@ public:
|
||||
|
||||
/// Returns a reference to the first element of the underlying container (the top of the heap).
Cursor & current() { return queue.front(); }
|
||||
|
||||
/// Number of cursors currently stored in the heap.
size_t size() { return queue.size(); }
|
||||
|
||||
/// Returns the element at the index reported by nextChildIndex() (1 or 2).
/// No bounds check is done here, so the heap must hold at least two elements when this is called.
Cursor & nextChild() { return queue[nextChildIndex()]; }
|
||||
|
||||
void next()
|
||||
{
|
||||
assert(isValid());
|
||||
@ -246,34 +256,67 @@ public:
|
||||
removeTop();
|
||||
}
|
||||
|
||||
void replaceTop(Cursor new_top)
{
    /// Overwrite the top element in place, then let updateTop() move it to its proper position.
    queue.front() = new_top;
    updateTop();
}
|
||||
|
||||
void removeTop()
|
||||
{
|
||||
std::pop_heap(queue.begin(), queue.end());
|
||||
queue.pop_back();
|
||||
next_idx = 0;
|
||||
}
|
||||
|
||||
void push(SortCursorImpl & cursor)
{
    /// Append a Cursor built from the address of `cursor`, then sift it up with push_heap.
    queue.push_back(Cursor(&cursor));
    std::push_heap(queue.begin(), queue.end());

    /// Drop the cached child index: the layout of the heap has changed.
    next_idx = 0;
}
|
||||
|
||||
private:
|
||||
using Container = std::vector<Cursor>;
|
||||
Container queue;
|
||||
|
||||
/// Cache comparison between first and second child if the order in queue has not been changed.
|
||||
size_t next_idx = 0;
|
||||
|
||||
size_t nextChildIndex()
|
||||
{
|
||||
if (next_idx == 0)
|
||||
{
|
||||
next_idx = 1;
|
||||
|
||||
if (queue.size() > 2 && queue[1] < queue[2])
|
||||
++next_idx;
|
||||
}
|
||||
|
||||
return next_idx;
|
||||
}
|
||||
|
||||
/// This is an adapted version of the function __sift_down from libc++.
|
||||
/// Why cannot simply use std::priority_queue?
|
||||
/// - because it doesn't support updating the top element and requires pop and push instead.
|
||||
/// Also look at "Boost.Heap" library.
|
||||
void updateTop()
|
||||
{
|
||||
size_t size = queue.size();
|
||||
if (size < 2)
|
||||
return;
|
||||
|
||||
size_t child_idx = 1;
|
||||
auto begin = queue.begin();
|
||||
auto child_it = begin + 1;
|
||||
|
||||
/// Right child exists and is greater than left child.
|
||||
if (size > 2 && *child_it < *(child_it + 1))
|
||||
{
|
||||
++child_it;
|
||||
++child_idx;
|
||||
}
|
||||
size_t child_idx = nextChildIndex();
|
||||
auto child_it = begin + child_idx;
|
||||
|
||||
/// Check if we are in order.
|
||||
if (*child_it < *begin)
|
||||
return;
|
||||
|
||||
next_idx = 0;
|
||||
|
||||
auto curr_it = begin;
|
||||
auto top(std::move(*begin));
|
||||
do
|
||||
@ -282,11 +325,12 @@ private:
|
||||
*curr_it = std::move(*child_it);
|
||||
curr_it = child_it;
|
||||
|
||||
if ((size - 2) / 2 < child_idx)
|
||||
break;
|
||||
|
||||
// recompute the child based off of the updated parent
|
||||
child_idx = 2 * child_idx + 1;
|
||||
|
||||
if (child_idx >= size)
|
||||
break;
|
||||
|
||||
child_it = begin + child_idx;
|
||||
|
||||
if ((child_idx + 1) < size && *child_it < *(child_it + 1))
|
||||
@ -300,12 +344,6 @@ private:
|
||||
} while (!(*child_it < top));
|
||||
*curr_it = std::move(top);
|
||||
}
|
||||
|
||||
void removeTop()
|
||||
{
|
||||
std::pop_heap(queue.begin(), queue.end());
|
||||
queue.pop_back();
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -138,14 +138,14 @@ Block AggregatingSortedBlockInputStream::readImpl()
|
||||
}
|
||||
|
||||
|
||||
void AggregatingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue)
|
||||
void AggregatingSortedBlockInputStream::merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue)
|
||||
{
|
||||
size_t merged_rows = 0;
|
||||
|
||||
/// We take the rows in the correct order and put them in `merged_block`, while the rows are no more than `max_block_size`
|
||||
while (!queue.empty())
|
||||
while (queue.isValid())
|
||||
{
|
||||
SortCursor current = queue.top();
|
||||
SortCursor current = queue.current();
|
||||
|
||||
setPrimaryKeyRef(next_key, current);
|
||||
|
||||
@ -167,8 +167,6 @@ void AggregatingSortedBlockInputStream::merge(MutableColumns & merged_columns, s
|
||||
return;
|
||||
}
|
||||
|
||||
queue.pop();
|
||||
|
||||
if (key_differs)
|
||||
{
|
||||
current_key.swap(next_key);
|
||||
@ -202,8 +200,7 @@ void AggregatingSortedBlockInputStream::merge(MutableColumns & merged_columns, s
|
||||
|
||||
if (!current->isLast())
|
||||
{
|
||||
current->next();
|
||||
queue.push(current);
|
||||
queue.next();
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -55,7 +55,7 @@ private:
|
||||
/** We support two different cursors - with Collation and without.
|
||||
* Templates are used instead of polymorphic SortCursor and calls to virtual functions.
|
||||
*/
|
||||
void merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue);
|
||||
void merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue);
|
||||
|
||||
/** Extract all states of aggregate functions and merge them with the current group.
|
||||
*/
|
||||
|
@ -105,15 +105,15 @@ Block CollapsingSortedBlockInputStream::readImpl()
|
||||
}
|
||||
|
||||
|
||||
void CollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue)
|
||||
void CollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue)
|
||||
{
|
||||
|
||||
MergeStopCondition stop_condition(average_block_sizes, max_block_size);
|
||||
size_t current_block_granularity;
|
||||
/// Take rows in correct order and put them into `merged_columns` until the rows no more than `max_block_size`
|
||||
for (; !queue.empty(); ++current_pos)
|
||||
for (; queue.isValid(); ++current_pos)
|
||||
{
|
||||
SortCursor current = queue.top();
|
||||
SortCursor current = queue.current();
|
||||
current_block_granularity = current->rows;
|
||||
|
||||
if (current_key.empty())
|
||||
@ -131,8 +131,6 @@ void CollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, st
|
||||
return;
|
||||
}
|
||||
|
||||
queue.pop();
|
||||
|
||||
if (key_differs)
|
||||
{
|
||||
/// We write data for the previous primary key.
|
||||
@ -185,8 +183,7 @@ void CollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, st
|
||||
|
||||
if (!current->isLast())
|
||||
{
|
||||
current->next();
|
||||
queue.push(current);
|
||||
queue.next();
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -73,7 +73,7 @@ private:
|
||||
/** We support two different cursors - with Collation and without.
|
||||
* Templates are used instead of polymorphic SortCursors and calls to virtual functions.
|
||||
*/
|
||||
void merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue);
|
||||
void merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue);
|
||||
|
||||
/// Output to result rows for the current primary key.
|
||||
void insertRows(MutableColumns & merged_columns, size_t block_size, MergeStopCondition & condition);
|
||||
|
@ -161,7 +161,7 @@ Block GraphiteRollupSortedBlockInputStream::readImpl()
|
||||
}
|
||||
|
||||
|
||||
void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue)
|
||||
void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue)
|
||||
{
|
||||
const DateLUTImpl & date_lut = DateLUT::instance();
|
||||
|
||||
@ -173,9 +173,9 @@ void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns
|
||||
/// contribute towards current output row.
|
||||
/// Variables starting with next_* refer to the row at the top of the queue.
|
||||
|
||||
while (!queue.empty())
|
||||
while (queue.isValid())
|
||||
{
|
||||
SortCursor next_cursor = queue.top();
|
||||
SortCursor next_cursor = queue.current();
|
||||
|
||||
StringRef next_path = next_cursor->all_columns[path_column_num]->getDataAt(next_cursor->pos);
|
||||
bool new_path = is_first || next_path != current_group_path;
|
||||
@ -253,12 +253,9 @@ void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns
|
||||
current_group_path = next_path;
|
||||
}
|
||||
|
||||
queue.pop();
|
||||
|
||||
if (!next_cursor->isLast())
|
||||
{
|
||||
next_cursor->next();
|
||||
queue.push(next_cursor);
|
||||
queue.next();
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -225,7 +225,7 @@ private:
|
||||
UInt32 selectPrecision(const Graphite::Retentions & retentions, time_t time) const;
|
||||
|
||||
|
||||
void merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue);
|
||||
void merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue);
|
||||
|
||||
/// Insert the values into the resulting columns, which will not be changed in the future.
|
||||
template <typename TSortCursor>
|
||||
|
@ -150,10 +150,12 @@ MergeSortingBlocksBlockInputStream::MergeSortingBlocksBlockInputStream(
|
||||
|
||||
blocks.swap(nonempty_blocks);
|
||||
|
||||
if (!has_collation)
|
||||
if (has_collation)
|
||||
queue_with_collation = SortingHeap<SortCursorWithCollation>(cursors);
|
||||
else if (description.size() > 1)
|
||||
queue_without_collation = SortingHeap<SortCursor>(cursors);
|
||||
else
|
||||
queue_with_collation = SortingHeap<SortCursorWithCollation>(cursors);
|
||||
queue_simple = SortingHeap<SimpleSortCursor>(cursors);
|
||||
}
|
||||
|
||||
|
||||
@ -169,9 +171,12 @@ Block MergeSortingBlocksBlockInputStream::readImpl()
|
||||
return res;
|
||||
}
|
||||
|
||||
return !has_collation
|
||||
? mergeImpl(queue_without_collation)
|
||||
: mergeImpl(queue_with_collation);
|
||||
if (has_collation)
|
||||
return mergeImpl(queue_with_collation);
|
||||
else if (description.size() > 1)
|
||||
return mergeImpl(queue_without_collation);
|
||||
else
|
||||
return mergeImpl(queue_simple);
|
||||
}
|
||||
|
||||
|
||||
@ -179,9 +184,18 @@ template <typename TSortingHeap>
|
||||
Block MergeSortingBlocksBlockInputStream::mergeImpl(TSortingHeap & queue)
|
||||
{
|
||||
size_t num_columns = header.columns();
|
||||
|
||||
MutableColumns merged_columns = header.cloneEmptyColumns();
|
||||
/// TODO: reserve (in each column)
|
||||
|
||||
/// Reserve
|
||||
if (queue.isValid() && !blocks.empty())
|
||||
{
|
||||
/// The expected size of output block is the same as input block
|
||||
size_t size_to_reserve = blocks[0].rows();
|
||||
for (auto & column : merged_columns)
|
||||
column->reserve(size_to_reserve);
|
||||
}
|
||||
|
||||
/// TODO: Optimization when a single block left.
|
||||
|
||||
/// Take rows from queue in right order and push to 'merged'.
|
||||
size_t merged_rows = 0;
|
||||
@ -210,6 +224,9 @@ Block MergeSortingBlocksBlockInputStream::mergeImpl(TSortingHeap & queue)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!queue.isValid())
|
||||
blocks.clear();
|
||||
|
||||
if (merged_rows == 0)
|
||||
return {};
|
||||
|
||||
|
@ -59,6 +59,7 @@ private:
|
||||
bool has_collation = false;
|
||||
|
||||
SortingHeap<SortCursor> queue_without_collation;
|
||||
SortingHeap<SimpleSortCursor> queue_simple;
|
||||
SortingHeap<SortCursorWithCollation> queue_with_collation;
|
||||
|
||||
/** Two different cursors are supported - with and without Collation.
|
||||
|
@ -59,9 +59,9 @@ void MergingSortedBlockInputStream::init(MutableColumns & merged_columns)
|
||||
}
|
||||
|
||||
if (has_collation)
|
||||
initQueue(queue_with_collation);
|
||||
queue_with_collation = SortingHeap<SortCursorWithCollation>(cursors);
|
||||
else
|
||||
initQueue(queue_without_collation);
|
||||
queue_without_collation = SortingHeap<SortCursor>(cursors);
|
||||
}
|
||||
|
||||
/// Let's check that all source blocks have the same structure.
|
||||
@ -82,15 +82,6 @@ void MergingSortedBlockInputStream::init(MutableColumns & merged_columns)
|
||||
}
|
||||
|
||||
|
||||
template <typename TSortCursor>
|
||||
void MergingSortedBlockInputStream::initQueue(std::priority_queue<TSortCursor> & queue)
|
||||
{
|
||||
for (size_t i = 0; i < cursors.size(); ++i)
|
||||
if (!cursors[i].empty())
|
||||
queue.push(TSortCursor(&cursors[i]));
|
||||
}
|
||||
|
||||
|
||||
Block MergingSortedBlockInputStream::readImpl()
|
||||
{
|
||||
if (finished)
|
||||
@ -115,7 +106,7 @@ Block MergingSortedBlockInputStream::readImpl()
|
||||
|
||||
|
||||
template <typename TSortCursor>
|
||||
void MergingSortedBlockInputStream::fetchNextBlock(const TSortCursor & current, std::priority_queue<TSortCursor> & queue)
|
||||
void MergingSortedBlockInputStream::fetchNextBlock(const TSortCursor & current, SortingHeap<TSortCursor> & queue)
|
||||
{
|
||||
size_t order = current->order;
|
||||
size_t size = cursors.size();
|
||||
@ -125,15 +116,19 @@ void MergingSortedBlockInputStream::fetchNextBlock(const TSortCursor & current,
|
||||
|
||||
while (true)
|
||||
{
|
||||
source_blocks[order] = new detail::SharedBlock(children[order]->read());
|
||||
source_blocks[order] = new detail::SharedBlock(children[order]->read()); /// intrusive ptr
|
||||
|
||||
if (!*source_blocks[order])
|
||||
{
|
||||
queue.removeTop();
|
||||
break;
|
||||
}
|
||||
|
||||
if (source_blocks[order]->rows())
|
||||
{
|
||||
cursors[order].reset(*source_blocks[order]);
|
||||
queue.push(TSortCursor(&cursors[order]));
|
||||
queue.replaceTop(&cursors[order]);
|
||||
|
||||
source_blocks[order]->all_columns = cursors[order].all_columns;
|
||||
source_blocks[order]->sort_columns = cursors[order].sort_columns;
|
||||
break;
|
||||
@ -154,19 +149,14 @@ bool MergingSortedBlockInputStream::MergeStopCondition::checkStop() const
|
||||
return sum_rows_count >= average;
|
||||
}
|
||||
|
||||
template
|
||||
void MergingSortedBlockInputStream::fetchNextBlock<SortCursor>(const SortCursor & current, std::priority_queue<SortCursor> & queue);
|
||||
|
||||
template
|
||||
void MergingSortedBlockInputStream::fetchNextBlock<SortCursorWithCollation>(const SortCursorWithCollation & current, std::priority_queue<SortCursorWithCollation> & queue);
|
||||
|
||||
|
||||
template <typename TSortCursor>
|
||||
void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<TSortCursor> & queue)
|
||||
template <typename TSortingHeap>
|
||||
void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, TSortingHeap & queue)
|
||||
{
|
||||
size_t merged_rows = 0;
|
||||
|
||||
MergeStopCondition stop_condition(average_block_sizes, max_block_size);
|
||||
|
||||
/** Increase row counters.
|
||||
* Return true if it's time to finish generating the current data block.
|
||||
*/
|
||||
@ -186,123 +176,100 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
|
||||
return stop_condition.checkStop();
|
||||
};
|
||||
|
||||
/// Take rows in required order and put them into `merged_columns`, while the rows are no more than `max_block_size`
|
||||
while (!queue.empty())
|
||||
/// Take rows in required order and put them into `merged_columns`, while the number of rows is no more than `max_block_size`
|
||||
while (queue.isValid())
|
||||
{
|
||||
TSortCursor current = queue.top();
|
||||
auto current = queue.current();
|
||||
size_t current_block_granularity = current->rows;
|
||||
queue.pop();
|
||||
|
||||
while (true)
|
||||
/** And what if the block is totally less or equal than the rest for the current cursor?
|
||||
* Or is there only one data source left in the queue? Then you can take the entire block on current cursor.
|
||||
*/
|
||||
if (current->isFirst()
|
||||
&& (queue.size() == 1
|
||||
|| (queue.size() >= 2 && current.totallyLessOrEquals(queue.nextChild()))))
|
||||
{
|
||||
/** And what if the block is totally less or equal than the rest for the current cursor?
|
||||
* Or is there only one data source left in the queue? Then you can take the entire block on current cursor.
|
||||
*/
|
||||
if (current->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top())))
|
||||
// std::cerr << "current block is totally less or equals\n";
|
||||
|
||||
/// If there are already data in the current block, we first return it. We'll get here again the next time we call the merge function.
|
||||
if (merged_rows != 0)
|
||||
{
|
||||
// std::cerr << "current block is totally less or equals\n";
|
||||
|
||||
/// If there are already data in the current block, we first return it. We'll get here again the next time we call the merge function.
|
||||
if (merged_rows != 0)
|
||||
{
|
||||
//std::cerr << "merged rows is non-zero\n";
|
||||
queue.push(current);
|
||||
return;
|
||||
}
|
||||
|
||||
/// Actually, current->order stores source number (i.e. cursors[current->order] == current)
|
||||
size_t source_num = current->order;
|
||||
|
||||
if (source_num >= cursors.size())
|
||||
throw Exception("Logical error in MergingSortedBlockInputStream", ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
merged_columns[i] = (*std::move(source_blocks[source_num]->getByPosition(i).column)).mutate();
|
||||
|
||||
// std::cerr << "copied columns\n";
|
||||
|
||||
merged_rows = merged_columns.at(0)->size();
|
||||
|
||||
/// Limit output
|
||||
if (limit && total_merged_rows + merged_rows > limit)
|
||||
{
|
||||
merged_rows = limit - total_merged_rows;
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
{
|
||||
auto & column = merged_columns[i];
|
||||
column = (*column->cut(0, merged_rows)).mutate();
|
||||
}
|
||||
|
||||
cancel(false);
|
||||
finished = true;
|
||||
}
|
||||
|
||||
/// Write order of rows for other columns
|
||||
/// this data will be used in gather stream
|
||||
if (out_row_sources_buf)
|
||||
{
|
||||
RowSourcePart row_source(source_num);
|
||||
for (size_t i = 0; i < merged_rows; ++i)
|
||||
out_row_sources_buf->write(row_source.data);
|
||||
}
|
||||
|
||||
//std::cerr << "fetching next block\n";
|
||||
|
||||
total_merged_rows += merged_rows;
|
||||
fetchNextBlock(current, queue);
|
||||
//std::cerr << "merged rows is non-zero\n";
|
||||
return;
|
||||
}
|
||||
|
||||
// std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n";
|
||||
// std::cerr << "Inserting row\n";
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
merged_columns[i]->insertFrom(*current->all_columns[i], current->pos);
|
||||
/// Actually, current->order stores source number (i.e. cursors[current->order] == current)
|
||||
size_t source_num = current->order;
|
||||
|
||||
if (source_num >= cursors.size())
|
||||
throw Exception("Logical error in MergingSortedBlockInputStream", ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
merged_columns[i] = (*std::move(source_blocks[source_num]->getByPosition(i).column)).mutate();
|
||||
|
||||
// std::cerr << "copied columns\n";
|
||||
|
||||
merged_rows = merged_columns.at(0)->size();
|
||||
|
||||
/// Limit output
|
||||
if (limit && total_merged_rows + merged_rows > limit)
|
||||
{
|
||||
merged_rows = limit - total_merged_rows;
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
{
|
||||
auto & column = merged_columns[i];
|
||||
column = (*column->cut(0, merged_rows)).mutate();
|
||||
}
|
||||
|
||||
cancel(false);
|
||||
finished = true;
|
||||
}
|
||||
|
||||
/// Write order of rows for other columns
|
||||
/// this data will be used in gather stream
|
||||
if (out_row_sources_buf)
|
||||
{
|
||||
/// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl)
|
||||
RowSourcePart row_source(current->order);
|
||||
out_row_sources_buf->write(row_source.data);
|
||||
RowSourcePart row_source(source_num);
|
||||
for (size_t i = 0; i < merged_rows; ++i)
|
||||
out_row_sources_buf->write(row_source.data);
|
||||
}
|
||||
|
||||
if (!current->isLast())
|
||||
{
|
||||
// std::cerr << "moving to next row\n";
|
||||
current->next();
|
||||
//std::cerr << "fetching next block\n";
|
||||
|
||||
if (queue.empty() || !(current.greater(queue.top())))
|
||||
{
|
||||
if (count_row_and_check_limit(current_block_granularity))
|
||||
{
|
||||
// std::cerr << "pushing back to queue\n";
|
||||
queue.push(current);
|
||||
return;
|
||||
}
|
||||
total_merged_rows += merged_rows;
|
||||
fetchNextBlock(current, queue);
|
||||
return;
|
||||
}
|
||||
|
||||
/// Do not put the cursor back in the queue, but continue to work with the current cursor.
|
||||
// std::cerr << "current is still on top, using current row\n";
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
// std::cerr << "next row is not least, pushing back to queue\n";
|
||||
queue.push(current);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/// We get the next block from the corresponding source, if there is one.
|
||||
// std::cerr << "It was last row, fetching next block\n";
|
||||
fetchNextBlock(current, queue);
|
||||
}
|
||||
// std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n";
|
||||
// std::cerr << "Inserting row\n";
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
merged_columns[i]->insertFrom(*current->all_columns[i], current->pos);
|
||||
|
||||
break;
|
||||
if (out_row_sources_buf)
|
||||
{
|
||||
/// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl)
|
||||
RowSourcePart row_source(current->order);
|
||||
out_row_sources_buf->write(row_source.data);
|
||||
}
|
||||
|
||||
if (!current->isLast())
|
||||
{
|
||||
// std::cerr << "moving to next row\n";
|
||||
queue.next();
|
||||
}
|
||||
else
|
||||
{
|
||||
/// We get the next block from the corresponding source, if there is one.
|
||||
// std::cerr << "It was last row, fetching next block\n";
|
||||
fetchNextBlock(current, queue);
|
||||
}
|
||||
|
||||
if (count_row_and_check_limit(current_block_granularity))
|
||||
return;
|
||||
}
|
||||
|
||||
/// We have read all data. Ask children to cancel providing more data.
|
||||
cancel(false);
|
||||
finished = true;
|
||||
}
|
||||
|
@ -1,7 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include <queue>
|
||||
|
||||
#include <boost/smart_ptr/intrusive_ptr.hpp>
|
||||
|
||||
#include <common/logger_useful.h>
|
||||
@ -87,7 +85,7 @@ protected:
|
||||
|
||||
/// Gets the next block from the source corresponding to the `current`.
|
||||
template <typename TSortCursor>
|
||||
void fetchNextBlock(const TSortCursor & current, std::priority_queue<TSortCursor> & queue);
|
||||
void fetchNextBlock(const TSortCursor & current, SortingHeap<TSortCursor> & queue);
|
||||
|
||||
|
||||
Block header;
|
||||
@ -109,14 +107,10 @@ protected:
|
||||
size_t num_columns = 0;
|
||||
std::vector<SharedBlockPtr> source_blocks;
|
||||
|
||||
using CursorImpls = std::vector<SortCursorImpl>;
|
||||
CursorImpls cursors;
|
||||
SortCursorImpls cursors;
|
||||
|
||||
using Queue = std::priority_queue<SortCursor>;
|
||||
Queue queue_without_collation;
|
||||
|
||||
using QueueWithCollation = std::priority_queue<SortCursorWithCollation>;
|
||||
QueueWithCollation queue_with_collation;
|
||||
SortingHeap<SortCursor> queue_without_collation;
|
||||
SortingHeap<SortCursorWithCollation> queue_with_collation;
|
||||
|
||||
/// Used in Vertical merge algorithm to gather non-PK/non-index columns (on next step)
|
||||
/// If it is not nullptr then it should be populated during execution
|
||||
@ -177,13 +171,10 @@ protected:
|
||||
private:
|
||||
|
||||
/** We support two different cursors - with Collation and without.
|
||||
* Templates are used instead of polymorphic SortCursor and calls to virtual functions.
|
||||
*/
|
||||
template <typename TSortCursor>
|
||||
void initQueue(std::priority_queue<TSortCursor> & queue);
|
||||
|
||||
template <typename TSortCursor>
|
||||
void merge(MutableColumns & merged_columns, std::priority_queue<TSortCursor> & queue);
|
||||
* Templates are used instead of polymorphic SortCursor and calls to virtual functions.
|
||||
*/
|
||||
template <typename TSortingHeap>
|
||||
void merge(MutableColumns & merged_columns, TSortingHeap & queue);
|
||||
|
||||
Logger * log = &Logger::get("MergingSortedBlockInputStream");
|
||||
|
||||
|
@ -48,13 +48,14 @@ Block ReplacingSortedBlockInputStream::readImpl()
|
||||
}
|
||||
|
||||
|
||||
void ReplacingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue)
|
||||
void ReplacingSortedBlockInputStream::merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue)
|
||||
{
|
||||
MergeStopCondition stop_condition(average_block_sizes, max_block_size);
|
||||
|
||||
/// Take the rows in needed order and put them into `merged_columns` until rows no more than `max_block_size`
|
||||
while (!queue.empty())
|
||||
while (queue.isValid())
|
||||
{
|
||||
SortCursor current = queue.top();
|
||||
SortCursor current = queue.current();
|
||||
size_t current_block_granularity = current->rows;
|
||||
|
||||
if (current_key.empty())
|
||||
@ -68,8 +69,6 @@ void ReplacingSortedBlockInputStream::merge(MutableColumns & merged_columns, std
|
||||
if (key_differs && stop_condition.checkStop())
|
||||
return;
|
||||
|
||||
queue.pop();
|
||||
|
||||
if (key_differs)
|
||||
{
|
||||
/// Write the data for the previous primary key.
|
||||
@ -98,8 +97,7 @@ void ReplacingSortedBlockInputStream::merge(MutableColumns & merged_columns, std
|
||||
|
||||
if (!current->isLast())
|
||||
{
|
||||
current->next();
|
||||
queue.push(current);
|
||||
queue.next();
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -52,7 +52,7 @@ private:
|
||||
/// Sources of rows with the current primary key.
|
||||
PODArray<RowSourcePart> current_row_sources;
|
||||
|
||||
void merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue);
|
||||
void merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue);
|
||||
|
||||
/// Output into result the rows for current primary key.
|
||||
void insertRow(MutableColumns & merged_columns);
|
||||
|
@ -314,14 +314,14 @@ Block SummingSortedBlockInputStream::readImpl()
|
||||
}
|
||||
|
||||
|
||||
void SummingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue)
|
||||
void SummingSortedBlockInputStream::merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue)
|
||||
{
|
||||
merged_rows = 0;
|
||||
|
||||
/// Take the rows in needed order and put them in `merged_columns` until rows no more than `max_block_size`
|
||||
while (!queue.empty())
|
||||
while (queue.isValid())
|
||||
{
|
||||
SortCursor current = queue.top();
|
||||
SortCursor current = queue.current();
|
||||
|
||||
setPrimaryKeyRef(next_key, current);
|
||||
|
||||
@ -383,12 +383,9 @@ void SummingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
|
||||
current_row_is_zero = false;
|
||||
}
|
||||
|
||||
queue.pop();
|
||||
|
||||
if (!current->isLast())
|
||||
{
|
||||
current->next();
|
||||
queue.push(current);
|
||||
queue.next();
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1,5 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <queue>
|
||||
|
||||
#include <Core/Row.h>
|
||||
#include <Core/ColumnNumbers.h>
|
||||
#include <Common/AlignedBuffer.h>
|
||||
@ -140,7 +142,7 @@ private:
|
||||
/** We support two different cursors - with Collation and without.
|
||||
* Templates are used instead of polymorphic SortCursor and calls to virtual functions.
|
||||
*/
|
||||
void merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue);
|
||||
void merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue);
|
||||
|
||||
/// Insert the summed row for the current group into the result and updates some of per-block flags if the row is not "zero".
|
||||
void insertCurrentRowIfNeeded(MutableColumns & merged_columns);
|
||||
|
@ -82,21 +82,18 @@ Block VersionedCollapsingSortedBlockInputStream::readImpl()
|
||||
}
|
||||
|
||||
|
||||
void VersionedCollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue)
|
||||
void VersionedCollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue)
|
||||
{
|
||||
MergeStopCondition stop_condition(average_block_sizes, max_block_size);
|
||||
|
||||
auto update_queue = [this, & queue](SortCursor & cursor)
|
||||
{
|
||||
queue.pop();
|
||||
|
||||
if (out_row_sources_buf)
|
||||
current_row_sources.emplace(cursor->order, true);
|
||||
|
||||
if (!cursor->isLast())
|
||||
{
|
||||
cursor->next();
|
||||
queue.push(cursor);
|
||||
queue.next();
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -106,9 +103,9 @@ void VersionedCollapsingSortedBlockInputStream::merge(MutableColumns & merged_co
|
||||
};
|
||||
|
||||
/// Take rows in correct order and put them into `merged_columns` until the rows no more than `max_block_size`
|
||||
while (!queue.empty())
|
||||
while (queue.isValid())
|
||||
{
|
||||
SortCursor current = queue.top();
|
||||
SortCursor current = queue.current();
|
||||
size_t current_block_granularity = current->rows;
|
||||
|
||||
SharedBlockRowRef next_key;
|
||||
|
@ -5,7 +5,7 @@
|
||||
#include <DataStreams/MergingSortedBlockInputStream.h>
|
||||
#include <DataStreams/ColumnGathererStream.h>
|
||||
|
||||
#include <deque>
|
||||
#include <queue>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -204,7 +204,7 @@ private:
|
||||
/// Sources of rows for VERTICAL merge algorithm. Size equals to (size + number of gaps) in current_keys.
|
||||
std::queue<RowSourcePart> current_row_sources;
|
||||
|
||||
void merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue);
|
||||
void merge(MutableColumns & merged_columns, SortingHeap<SortCursor> & queue);
|
||||
|
||||
/// Output to result row for the current primary key.
|
||||
void insertRow(size_t skip_rows, const SharedBlockRowRef & row, MutableColumns & merged_columns);
|
||||
|
@ -57,6 +57,6 @@ catch (const Exception & e)
|
||||
std::cerr << e.what() << ", " << e.displayText() << std::endl
|
||||
<< std::endl
|
||||
<< "Stack trace:" << std::endl
|
||||
<< e.getStackTrace().toString();
|
||||
<< e.getStackTraceString();
|
||||
return 1;
|
||||
}
|
||||
|
@ -23,7 +23,6 @@ namespace ErrorCodes
|
||||
extern const int TABLE_ALREADY_EXISTS;
|
||||
extern const int UNKNOWN_TABLE;
|
||||
extern const int UNSUPPORTED_METHOD;
|
||||
extern const int CANNOT_CREATE_TABLE_FROM_METADATA;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
@ -255,10 +254,10 @@ StoragePtr DatabaseLazy::loadTable(const Context & context, const String & table
|
||||
return it->second.table = table;
|
||||
}
|
||||
}
|
||||
catch (const Exception & e)
|
||||
catch (Exception & e)
|
||||
{
|
||||
throw Exception("Cannot create table from metadata file " + table_metadata_path + ". Error: " + DB::getCurrentExceptionMessage(true),
|
||||
e, DB::ErrorCodes::CANNOT_CREATE_TABLE_FROM_METADATA);
|
||||
e.addMessage("Cannot create table from metadata file " + table_metadata_path);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -36,7 +36,6 @@ namespace DB
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int CANNOT_CREATE_TABLE_FROM_METADATA;
|
||||
extern const int CANNOT_CREATE_DICTIONARY_FROM_METADATA;
|
||||
extern const int EMPTY_LIST_OF_COLUMNS_PASSED;
|
||||
extern const int CANNOT_PARSE_TEXT;
|
||||
@ -66,13 +65,10 @@ namespace
|
||||
= createTableFromAST(query, database_name, database.getTableDataPath(query), context, has_force_restore_data_flag);
|
||||
database.attachTable(table_name, table);
|
||||
}
|
||||
catch (const Exception & e)
|
||||
catch (Exception & e)
|
||||
{
|
||||
throw Exception(
|
||||
"Cannot attach table '" + query.table + "' from query " + serializeAST(query)
|
||||
+ ". Error: " + DB::getCurrentExceptionMessage(true),
|
||||
e,
|
||||
DB::ErrorCodes::CANNOT_CREATE_TABLE_FROM_METADATA);
|
||||
e.addMessage("Cannot attach table '" + backQuote(query.table) + "' from query " + serializeAST(query));
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
@ -87,13 +83,10 @@ namespace
|
||||
{
|
||||
database.attachDictionary(query.table, context);
|
||||
}
|
||||
catch (const Exception & e)
|
||||
catch (Exception & e)
|
||||
{
|
||||
throw Exception(
|
||||
"Cannot create dictionary '" + query.table + "' from query " + serializeAST(query)
|
||||
+ ". Error: " + DB::getCurrentExceptionMessage(true),
|
||||
e,
|
||||
DB::ErrorCodes::CANNOT_CREATE_DICTIONARY_FROM_METADATA);
|
||||
e.addMessage("Cannot attach table '" + backQuote(query.table) + "' from query " + serializeAST(query));
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
@ -142,10 +135,10 @@ void DatabaseOrdinary::loadStoredObjects(
|
||||
total_dictionaries += create_query->is_dictionary;
|
||||
}
|
||||
}
|
||||
catch (const Exception & e)
|
||||
catch (Exception & e)
|
||||
{
|
||||
throw Exception(
|
||||
"Cannot parse definition from metadata file " + full_path + ". Error: " + DB::getCurrentExceptionMessage(true), e, ErrorCodes::CANNOT_PARSE_TEXT);
|
||||
e.addMessage("Cannot parse definition from metadata file " + full_path);
|
||||
throw;
|
||||
}
|
||||
|
||||
});
|
||||
|
@ -74,6 +74,7 @@ public:
|
||||
data_to.resize(next_offset);
|
||||
offsets_to[row_num] = next_offset;
|
||||
|
||||
auto * data_to_ptr = data_to.data(); /// avoid assert on array indexing after end
|
||||
for (size_t pos = offset, end = offset + length; pos < end; pos += 4) /// We have padding in column buffers that we can overwrite.
|
||||
{
|
||||
UInt64 rand = thread_local_rng();
|
||||
@ -86,10 +87,10 @@ public:
|
||||
/// Printable characters are from range [32; 126].
|
||||
/// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
|
||||
|
||||
data_to[pos + 0] = 32 + ((rand1 * 95) >> 16);
|
||||
data_to[pos + 1] = 32 + ((rand2 * 95) >> 16);
|
||||
data_to[pos + 2] = 32 + ((rand3 * 95) >> 16);
|
||||
data_to[pos + 3] = 32 + ((rand4 * 95) >> 16);
|
||||
data_to_ptr[pos + 0] = 32 + ((rand1 * 95) >> 16);
|
||||
data_to_ptr[pos + 1] = 32 + ((rand2 * 95) >> 16);
|
||||
data_to_ptr[pos + 2] = 32 + ((rand3 * 95) >> 16);
|
||||
data_to_ptr[pos + 3] = 32 + ((rand4 * 95) >> 16);
|
||||
|
||||
/// NOTE gcc failed to vectorize this code (aliasing of char?)
|
||||
/// TODO Implement SIMD optimizations from Danila Kutenin.
|
||||
|
@ -124,6 +124,10 @@ public:
|
||||
t1.join();
|
||||
t2.join();
|
||||
}
|
||||
else if (mode == "throw exception")
|
||||
{
|
||||
std::vector<int>().at(0);
|
||||
}
|
||||
else if (mode == "access context")
|
||||
{
|
||||
(void)context.getCurrentQueryId();
|
||||
|
@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include <IO/ReadBuffer.h>
|
||||
#include <IO/WriteBuffer.h>
|
||||
#include <Core/Types.h>
|
||||
#include <Common/BitHelpers.h>
|
||||
#include <Common/Exception.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#if defined(__OpenBSD__) || defined(__FreeBSD__)
|
||||
# include <sys/endian.h>
|
||||
@ -14,9 +15,16 @@
|
||||
# define be64toh(x) OSSwapBigToHostInt64(x)
|
||||
#endif
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int CANNOT_WRITE_AFTER_END_OF_BUFFER;
|
||||
extern const int ATTEMPT_TO_READ_AFTER_EOF;
|
||||
}
|
||||
|
||||
/** Reads data from underlying ReadBuffer bit by bit, max 64 bits at once.
|
||||
*
|
||||
* reads MSB bits first, imagine that you have a data:
|
||||
@ -34,15 +42,20 @@ namespace DB
|
||||
|
||||
class BitReader
|
||||
{
|
||||
ReadBuffer & buf;
|
||||
using BufferType = unsigned __int128;
|
||||
|
||||
UInt64 bits_buffer;
|
||||
const char * source_begin;
|
||||
const char * source_current;
|
||||
const char * source_end;
|
||||
|
||||
BufferType bits_buffer;
|
||||
UInt8 bits_count;
|
||||
static constexpr UInt8 BIT_BUFFER_SIZE = sizeof(bits_buffer) * 8;
|
||||
|
||||
public:
|
||||
BitReader(ReadBuffer & buf_)
|
||||
: buf(buf_),
|
||||
BitReader(const char * begin, size_t size)
|
||||
: source_begin(begin),
|
||||
source_current(begin),
|
||||
source_end(begin + size),
|
||||
bits_buffer(0),
|
||||
bits_count(0)
|
||||
{}
|
||||
@ -50,44 +63,21 @@ public:
|
||||
~BitReader()
|
||||
{}
|
||||
|
||||
inline UInt64 readBits(UInt8 bits)
|
||||
// reads bits_to_read high-bits from bits_buffer
|
||||
inline UInt64 readBits(UInt8 bits_to_read)
|
||||
{
|
||||
UInt64 result = 0;
|
||||
bits = std::min(static_cast<UInt8>(sizeof(result) * 8), bits);
|
||||
if (bits_to_read > bits_count)
|
||||
fillBitBuffer();
|
||||
|
||||
while (bits != 0)
|
||||
{
|
||||
if (bits_count == 0)
|
||||
{
|
||||
fillBuffer();
|
||||
if (bits_count == 0)
|
||||
{
|
||||
// EOF.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const auto to_read = std::min(bits, bits_count);
|
||||
|
||||
const UInt64 v = bits_buffer >> (bits_count - to_read);
|
||||
const UInt64 mask = maskLowBits<UInt64>(to_read);
|
||||
const UInt64 value = v & mask;
|
||||
result |= value;
|
||||
|
||||
// unset bits that were read
|
||||
bits_buffer &= ~(mask << (bits_count - to_read));
|
||||
bits_count -= to_read;
|
||||
bits -= to_read;
|
||||
|
||||
result <<= std::min(bits, BIT_BUFFER_SIZE);
|
||||
}
|
||||
|
||||
return result;
|
||||
return getBitsFromBitBuffer<CONSUME>(bits_to_read);
|
||||
}
|
||||
|
||||
inline UInt64 peekBits(UInt8 /*bits*/)
|
||||
inline UInt8 peekByte()
|
||||
{
|
||||
return 0;
|
||||
if (bits_count < 8)
|
||||
fillBitBuffer();
|
||||
|
||||
return getBitsFromBitBuffer<PEEK>(8);
|
||||
}
|
||||
|
||||
inline UInt8 readBit()
|
||||
@ -95,34 +85,95 @@ public:
|
||||
return static_cast<UInt8>(readBits(1));
|
||||
}
|
||||
|
||||
// skip bits from bits_buffer
|
||||
inline void skipBufferedBits(UInt8 bits)
|
||||
{
|
||||
bits_buffer <<= bits;
|
||||
bits_count -= bits;
|
||||
}
|
||||
|
||||
|
||||
inline bool eof() const
|
||||
{
|
||||
return bits_count == 0 && buf.eof();
|
||||
return bits_count == 0 && source_current >= source_end;
|
||||
}
|
||||
|
||||
// number of bits that were already read by clients with readBits()
|
||||
inline UInt64 count() const
|
||||
{
|
||||
return (source_current - source_begin) * 8 - bits_count;
|
||||
}
|
||||
|
||||
inline UInt64 remaining() const
|
||||
{
|
||||
return (source_end - source_current) * 8 + bits_count;
|
||||
}
|
||||
|
||||
private:
|
||||
void fillBuffer()
|
||||
enum GetBitsMode {CONSUME, PEEK};
|
||||
// read data from internal buffer, if it has not enough bits, result is undefined.
|
||||
template <GetBitsMode mode>
|
||||
inline UInt64 getBitsFromBitBuffer(UInt8 bits_to_read)
|
||||
{
|
||||
auto read = buf.read(reinterpret_cast<char *>(&bits_buffer), BIT_BUFFER_SIZE / 8);
|
||||
bits_buffer = be64toh(bits_buffer);
|
||||
bits_buffer >>= BIT_BUFFER_SIZE - read * 8;
|
||||
// push down the high-bits
|
||||
const UInt64 result = static_cast<UInt64>(bits_buffer >> (sizeof(bits_buffer) * 8 - bits_to_read));
|
||||
|
||||
bits_count = static_cast<UInt8>(read) * 8;
|
||||
if constexpr (mode == CONSUME)
|
||||
{
|
||||
// 'erase' the high bits that have just been read
|
||||
skipBufferedBits(bits_to_read);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// Fills internal bits_buffer with data from source, reads at most 64 bits
|
||||
size_t fillBitBuffer()
|
||||
{
|
||||
const size_t available = source_end - source_current;
|
||||
const auto bytes_to_read = std::min<size_t>(64 / 8, available);
|
||||
if (available == 0)
|
||||
{
|
||||
if (bytes_to_read == 0)
|
||||
return 0;
|
||||
|
||||
throw Exception("Buffer is empty, but requested to read "
|
||||
+ std::to_string(bytes_to_read) + " more bytes.",
|
||||
ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF);
|
||||
}
|
||||
|
||||
UInt64 tmp_buffer = 0;
|
||||
memcpy(&tmp_buffer, source_current, bytes_to_read);
|
||||
source_current += bytes_to_read;
|
||||
|
||||
tmp_buffer = be64toh(tmp_buffer);
|
||||
|
||||
bits_buffer |= BufferType(tmp_buffer) << ((sizeof(BufferType) - sizeof(tmp_buffer)) * 8 - bits_count);
|
||||
bits_count += static_cast<UInt8>(bytes_to_read) * 8;
|
||||
|
||||
return bytes_to_read;
|
||||
}
|
||||
};
|
||||
|
||||
class BitWriter
|
||||
{
|
||||
WriteBuffer & buf;
|
||||
using BufferType = unsigned __int128;
|
||||
|
||||
UInt64 bits_buffer;
|
||||
char * dest_begin;
|
||||
char * dest_current;
|
||||
char * dest_end;
|
||||
|
||||
BufferType bits_buffer;
|
||||
UInt8 bits_count;
|
||||
|
||||
static constexpr UInt8 BIT_BUFFER_SIZE = sizeof(bits_buffer) * 8;
|
||||
|
||||
public:
|
||||
BitWriter(WriteBuffer & buf_)
|
||||
: buf(buf_),
|
||||
BitWriter(char * begin, size_t size)
|
||||
: dest_begin(begin),
|
||||
dest_current(begin),
|
||||
dest_end(begin + size),
|
||||
bits_buffer(0),
|
||||
bits_count(0)
|
||||
{}
|
||||
@ -132,54 +183,59 @@ public:
|
||||
flush();
|
||||
}
|
||||
|
||||
inline void writeBits(UInt8 bits, UInt64 value)
|
||||
// write `bits_to_write` low-bits of `value` to the buffer
|
||||
inline void writeBits(UInt8 bits_to_write, UInt64 value)
|
||||
{
|
||||
bits = std::min(static_cast<UInt8>(sizeof(value) * 8), bits);
|
||||
|
||||
while (bits > 0)
|
||||
UInt32 capacity = BIT_BUFFER_SIZE - bits_count;
|
||||
if (capacity < bits_to_write)
|
||||
{
|
||||
auto v = value;
|
||||
auto to_write = bits;
|
||||
|
||||
const UInt8 capacity = BIT_BUFFER_SIZE - bits_count;
|
||||
if (capacity < bits)
|
||||
{
|
||||
v >>= bits - capacity;
|
||||
to_write = capacity;
|
||||
}
|
||||
|
||||
const UInt64 mask = maskLowBits<UInt64>(to_write);
|
||||
v &= mask;
|
||||
|
||||
bits_buffer <<= to_write;
|
||||
bits_buffer |= v;
|
||||
bits_count += to_write;
|
||||
|
||||
if (bits_count < BIT_BUFFER_SIZE)
|
||||
break;
|
||||
|
||||
doFlush();
|
||||
bits -= to_write;
|
||||
capacity = BIT_BUFFER_SIZE - bits_count;
|
||||
}
|
||||
|
||||
// write low bits of value as high bits of bits_buffer
|
||||
const UInt64 mask = maskLowBits<UInt64>(bits_to_write);
|
||||
BufferType v = value & mask;
|
||||
v <<= capacity - bits_to_write;
|
||||
|
||||
bits_buffer |= v;
|
||||
bits_count += bits_to_write;
|
||||
}
|
||||
|
||||
// flush contents of bits_buffer to the dest_current, partial bytes are completed with zeroes.
|
||||
inline void flush()
|
||||
{
|
||||
if (bits_count != 0)
|
||||
{
|
||||
bits_buffer <<= (BIT_BUFFER_SIZE - bits_count);
|
||||
bits_count = (bits_count + 8 - 1) & ~(8 - 1); // align UP to 8-bytes, so doFlush will write ALL data from bits_buffer
|
||||
while (bits_count != 0)
|
||||
doFlush();
|
||||
}
|
||||
}
|
||||
|
||||
inline UInt64 count() const
|
||||
{
|
||||
return (dest_current - dest_begin) * 8 + bits_count;
|
||||
}
|
||||
|
||||
private:
|
||||
void doFlush()
|
||||
{
|
||||
bits_buffer = htobe64(bits_buffer);
|
||||
buf.write(reinterpret_cast<const char *>(&bits_buffer), (bits_count + 7) / 8);
|
||||
// write whole bytes to the dest_current, leaving partial bits in bits_buffer
|
||||
const size_t available = dest_end - dest_current;
|
||||
const size_t to_write = std::min<size_t>(sizeof(UInt64), bits_count / 8); // align to 8-bit boundary
|
||||
|
||||
bits_count = 0;
|
||||
bits_buffer = 0;
|
||||
if (available < to_write)
|
||||
{
|
||||
throw Exception("Can not write past end of buffer. Space available "
|
||||
+ std::to_string(available) + " bytes, required to write: "
|
||||
+ std::to_string(to_write) + ".",
|
||||
ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER);
|
||||
}
|
||||
|
||||
const auto tmp_buffer = htobe64(static_cast<UInt64>(bits_buffer >> (sizeof(bits_buffer) - sizeof(UInt64)) * 8));
|
||||
memcpy(dest_current, &tmp_buffer, to_write);
|
||||
dest_current += to_write;
|
||||
|
||||
bits_buffer <<= to_write * 8;
|
||||
bits_count -= to_write * 8;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -30,14 +30,14 @@ public:
|
||||
BrotliEncoderState * state;
|
||||
};
|
||||
|
||||
BrotliWriteBuffer::BrotliWriteBuffer(WriteBuffer & out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment)
|
||||
: BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment)
|
||||
, brotli(std::make_unique<BrotliStateWrapper>())
|
||||
, in_available(0)
|
||||
, in_data(nullptr)
|
||||
, out_capacity(0)
|
||||
, out_data(nullptr)
|
||||
, out(out_)
|
||||
BrotliWriteBuffer::BrotliWriteBuffer(std::unique_ptr<WriteBuffer> out_, int compression_level, size_t buf_size, char * existing_memory, size_t alignment)
|
||||
: BufferWithOwnMemory<WriteBuffer>(buf_size, existing_memory, alignment)
|
||||
, brotli(std::make_unique<BrotliStateWrapper>())
|
||||
, in_available(0)
|
||||
, in_data(nullptr)
|
||||
, out_capacity(0)
|
||||
, out_data(nullptr)
|
||||
, out(std::move(out_))
|
||||
{
|
||||
BrotliEncoderSetParameter(brotli->state, BROTLI_PARAM_QUALITY, static_cast<uint32_t>(compression_level));
|
||||
// Set LZ77 window size. According to brotli sources default value is 24 (c/tools/brotli.c:81)
|
||||
@ -68,9 +68,9 @@ void BrotliWriteBuffer::nextImpl()
|
||||
|
||||
do
|
||||
{
|
||||
out.nextIfAtEnd();
|
||||
out_data = reinterpret_cast<unsigned char *>(out.position());
|
||||
out_capacity = out.buffer().end() - out.position();
|
||||
out->nextIfAtEnd();
|
||||
out_data = reinterpret_cast<unsigned char *>(out->position());
|
||||
out_capacity = out->buffer().end() - out->position();
|
||||
|
||||
int result = BrotliEncoderCompressStream(
|
||||
brotli->state,
|
||||
@ -81,7 +81,7 @@ void BrotliWriteBuffer::nextImpl()
|
||||
&out_data,
|
||||
nullptr);
|
||||
|
||||
out.position() = out.buffer().end() - out_capacity;
|
||||
out->position() = out->buffer().end() - out_capacity;
|
||||
|
||||
if (result == 0)
|
||||
{
|
||||
@ -100,9 +100,9 @@ void BrotliWriteBuffer::finish()
|
||||
|
||||
while (true)
|
||||
{
|
||||
out.nextIfAtEnd();
|
||||
out_data = reinterpret_cast<unsigned char *>(out.position());
|
||||
out_capacity = out.buffer().end() - out.position();
|
||||
out->nextIfAtEnd();
|
||||
out_data = reinterpret_cast<unsigned char *>(out->position());
|
||||
out_capacity = out->buffer().end() - out->position();
|
||||
|
||||
int result = BrotliEncoderCompressStream(
|
||||
brotli->state,
|
||||
@ -113,7 +113,7 @@ void BrotliWriteBuffer::finish()
|
||||
&out_data,
|
||||
nullptr);
|
||||
|
||||
out.position() = out.buffer().end() - out_capacity;
|
||||
out->position() = out->buffer().end() - out_capacity;
|
||||
|
||||
if (BrotliEncoderIsFinished(brotli->state))
|
||||
{
|
||||
|
@ -10,11 +10,11 @@ class BrotliWriteBuffer : public BufferWithOwnMemory<WriteBuffer>
|
||||
{
|
||||
public:
|
||||
BrotliWriteBuffer(
|
||||
WriteBuffer & out_,
|
||||
int compression_level,
|
||||
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
|
||||
char * existing_memory = nullptr,
|
||||
size_t alignment = 0);
|
||||
std::unique_ptr<WriteBuffer> out_,
|
||||
int compression_level,
|
||||
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
|
||||
char * existing_memory = nullptr,
|
||||
size_t alignment = 0);
|
||||
|
||||
~BrotliWriteBuffer() override;
|
||||
|
||||
@ -30,9 +30,9 @@ private:
|
||||
const uint8_t * in_data;
|
||||
|
||||
size_t out_capacity;
|
||||
uint8_t * out_data;
|
||||
uint8_t * out_data;
|
||||
|
||||
WriteBuffer & out;
|
||||
std::unique_ptr<WriteBuffer> out;
|
||||
|
||||
bool finished = false;
|
||||
};
|
||||
|
104
dbms/src/IO/CompressionMethod.cpp
Normal file
104
dbms/src/IO/CompressionMethod.cpp
Normal file
@ -0,0 +1,104 @@
|
||||
#include <IO/CompressionMethod.h>
|
||||
|
||||
#include <IO/ReadBuffer.h>
|
||||
#include <IO/WriteBuffer.h>
|
||||
#include <IO/ZlibInflatingReadBuffer.h>
|
||||
#include <IO/ZlibDeflatingWriteBuffer.h>
|
||||
#include <IO/BrotliReadBuffer.h>
|
||||
#include <IO/BrotliWriteBuffer.h>
|
||||
|
||||
#include <Common/config.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
|
||||
std::string toContentEncodingName(CompressionMethod method)
|
||||
{
|
||||
switch (method)
|
||||
{
|
||||
case CompressionMethod::Gzip: return "gzip";
|
||||
case CompressionMethod::Zlib: return "deflate";
|
||||
case CompressionMethod::Brotli: return "br";
|
||||
case CompressionMethod::None: return "";
|
||||
}
|
||||
__builtin_unreachable();
|
||||
}
|
||||
|
||||
|
||||
/** Picks the compression method for `path` given the user-supplied `hint`.
  *
  * When `hint` is empty or "auto", the text after the last '.' in `path` (if any) is used as the method name;
  * otherwise `hint` itself is the method name.
  * Recognised names: "gzip"/"gz", "deflate", "brotli"/"br".
  * An unrecognised name gives CompressionMethod::None when `hint` is empty, "auto" or "none";
  * any other hint throws Exception with ErrorCodes::NOT_IMPLEMENTED.
  */
CompressionMethod chooseCompressionMethod(const std::string & path, const std::string & hint)
{
    std::string file_extension;
    if (hint.empty() || hint == "auto")
    {
        auto pos = path.find_last_of('.');
        if (pos != std::string::npos)
            file_extension = path.substr(pos + 1, std::string::npos);
    }

    /// An empty extension (no dot in the path, or a trailing dot) falls back to the hint.
    const std::string & method_str = file_extension.empty() ? hint : file_extension;

    if (method_str == "gzip" || method_str == "gz")
        return CompressionMethod::Gzip;
    if (method_str == "deflate")
        return CompressionMethod::Zlib;
    if (method_str == "brotli" || method_str == "br")
        return CompressionMethod::Brotli;

    /// Unknown extension with automatic detection, or an explicit "none": no compression.
    if (hint.empty() || hint == "auto" || hint == "none")
        return CompressionMethod::None;

    /// The list below names every hint accepted above ('deflate' was previously missing from it).
    throw Exception("Unknown compression method " + hint + ". Only 'auto', 'none', 'gzip', 'deflate', 'br' are supported as compression methods",
        ErrorCodes::NOT_IMPLEMENTED);
}
|
||||
|
||||
|
||||
std::unique_ptr<ReadBuffer> wrapReadBufferWithCompressionMethod(
|
||||
std::unique_ptr<ReadBuffer> nested,
|
||||
CompressionMethod method,
|
||||
size_t buf_size,
|
||||
char * existing_memory,
|
||||
size_t alignment)
|
||||
{
|
||||
if (method == CompressionMethod::Gzip || method == CompressionMethod::Zlib)
|
||||
return std::make_unique<ZlibInflatingReadBuffer>(std::move(nested), method, buf_size, existing_memory, alignment);
|
||||
#if USE_BROTLI
|
||||
if (method == CompressionMethod::Brotli)
|
||||
return std::make_unique<BrotliReadBuffer>(std::move(nested), buf_size, existing_memory, alignment);
|
||||
#endif
|
||||
|
||||
if (method == CompressionMethod::None)
|
||||
return nested;
|
||||
|
||||
throw Exception("Unsupported compression method", ErrorCodes::NOT_IMPLEMENTED);
|
||||
}
|
||||
|
||||
|
||||
std::unique_ptr<WriteBuffer> wrapWriteBufferWithCompressionMethod(
|
||||
std::unique_ptr<WriteBuffer> nested,
|
||||
CompressionMethod method,
|
||||
int level,
|
||||
size_t buf_size,
|
||||
char * existing_memory,
|
||||
size_t alignment)
|
||||
{
|
||||
if (method == DB::CompressionMethod::Gzip || method == CompressionMethod::Zlib)
|
||||
return std::make_unique<ZlibDeflatingWriteBuffer>(std::move(nested), method, level, buf_size, existing_memory, alignment);
|
||||
|
||||
#if USE_BROTLI
|
||||
if (method == DB::CompressionMethod::Brotli)
|
||||
return std::make_unique<BrotliWriteBuffer>(std::move(nested), level, buf_size, existing_memory, alignment);
|
||||
#endif
|
||||
|
||||
if (method == CompressionMethod::None)
|
||||
return nested;
|
||||
|
||||
throw Exception("Unsupported compression method", ErrorCodes::NOT_IMPLEMENTED);
|
||||
}
|
||||
|
||||
}
|
@ -1,18 +1,57 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <memory>
|
||||
|
||||
#include <Core/Defines.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ReadBuffer;
|
||||
class WriteBuffer;
|
||||
|
||||
/** These are "generally recognizable" compression methods for data import/export.
|
||||
* Do not confuse them with the more efficient compression methods used by ClickHouse internally
|
||||
* (they use non-standard framing, indexes, checksums...)
|
||||
*/
|
||||
|
||||
enum class CompressionMethod
|
||||
{
|
||||
None,
|
||||
/// DEFLATE compression with gzip header and CRC32 checksum.
|
||||
/// This option corresponds to files produced by gzip(1) or HTTP Content-Encoding: gzip.
|
||||
Gzip,
|
||||
/// DEFLATE compression with zlib header and Adler32 checksum.
|
||||
/// This option corresponds to HTTP Content-Encoding: deflate.
|
||||
Zlib,
|
||||
Brotli,
|
||||
None
|
||||
Brotli
|
||||
};
|
||||
|
||||
/// How the compression method is named in HTTP.
|
||||
std::string toContentEncodingName(CompressionMethod method);
|
||||
|
||||
/** Choose compression method from path and hint.
|
||||
* if hint is "auto" or empty string, then path is analyzed,
|
||||
* otherwise path parameter is ignored and hint is used as compression method name.
|
||||
* path is arbitrary string that will be analyzed for file extension (gz, br...) that determines compression.
|
||||
*/
|
||||
CompressionMethod chooseCompressionMethod(const std::string & path, const std::string & hint);
|
||||
|
||||
std::unique_ptr<ReadBuffer> wrapReadBufferWithCompressionMethod(
|
||||
std::unique_ptr<ReadBuffer> nested,
|
||||
CompressionMethod method,
|
||||
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
|
||||
char * existing_memory = nullptr,
|
||||
size_t alignment = 0);
|
||||
|
||||
std::unique_ptr<WriteBuffer> wrapWriteBufferWithCompressionMethod(
|
||||
std::unique_ptr<WriteBuffer> nested,
|
||||
CompressionMethod method,
|
||||
int level,
|
||||
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
|
||||
char * existing_memory = nullptr,
|
||||
size_t alignment = 0);
|
||||
|
||||
}
|
||||
|
@ -22,7 +22,7 @@ namespace ErrorCodes
|
||||
}
|
||||
|
||||
|
||||
void MMapReadBufferFromFile::open(const std::string & file_name)
|
||||
void MMapReadBufferFromFile::open()
|
||||
{
|
||||
ProfileEvents::increment(ProfileEvents::FileOpen);
|
||||
|
||||
@ -34,16 +34,24 @@ void MMapReadBufferFromFile::open(const std::string & file_name)
|
||||
}
|
||||
|
||||
|
||||
MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name, size_t offset, size_t length_)
|
||||
std::string MMapReadBufferFromFile::getFileName() const
|
||||
{
|
||||
open(file_name);
|
||||
return file_name;
|
||||
}
|
||||
|
||||
|
||||
MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name_, size_t offset, size_t length_)
|
||||
: file_name(file_name_)
|
||||
{
|
||||
open();
|
||||
init(fd, offset, length_);
|
||||
}
|
||||
|
||||
|
||||
MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name, size_t offset)
|
||||
MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name_, size_t offset)
|
||||
: file_name(file_name_)
|
||||
{
|
||||
open(file_name);
|
||||
open();
|
||||
init(fd, offset);
|
||||
}
|
||||
|
||||
|
@ -16,21 +16,24 @@ namespace DB
|
||||
class MMapReadBufferFromFile : public MMapReadBufferFromFileDescriptor
|
||||
{
|
||||
public:
|
||||
MMapReadBufferFromFile(const std::string & file_name, size_t offset, size_t length_);
|
||||
MMapReadBufferFromFile(const std::string & file_name_, size_t offset, size_t length_);
|
||||
|
||||
/// Map till end of file.
|
||||
MMapReadBufferFromFile(const std::string & file_name, size_t offset);
|
||||
MMapReadBufferFromFile(const std::string & file_name_, size_t offset);
|
||||
|
||||
~MMapReadBufferFromFile() override;
|
||||
|
||||
void close();
|
||||
|
||||
std::string getFileName() const override;
|
||||
|
||||
private:
|
||||
int fd = -1;
|
||||
std::string file_name;
|
||||
|
||||
CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead};
|
||||
|
||||
void open(const std::string & file_name);
|
||||
void open();
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -5,6 +5,8 @@
|
||||
|
||||
#include <Common/ProfileEvents.h>
|
||||
#include <Common/formatReadable.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/MMapReadBufferFromFileDescriptor.h>
|
||||
|
||||
|
||||
@ -18,6 +20,8 @@ namespace ErrorCodes
|
||||
extern const int CANNOT_STAT;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||
extern const int CANNOT_SEEK_THROUGH_FILE;
|
||||
}
|
||||
|
||||
|
||||
@ -34,6 +38,7 @@ void MMapReadBufferFromFileDescriptor::init(int fd_, size_t offset, size_t lengt
|
||||
ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||
|
||||
BufferBase::set(static_cast<char *>(buf), length, 0);
|
||||
ReadBuffer::padded = (length % 4096) > 0 && (length % 4096) <= (4096 - 15); /// TODO determine page size
|
||||
}
|
||||
}
|
||||
|
||||
@ -58,14 +63,12 @@ void MMapReadBufferFromFileDescriptor::init(int fd_, size_t offset)
|
||||
|
||||
|
||||
MMapReadBufferFromFileDescriptor::MMapReadBufferFromFileDescriptor(int fd_, size_t offset_, size_t length_)
|
||||
: MMapReadBufferFromFileDescriptor()
|
||||
{
|
||||
init(fd_, offset_, length_);
|
||||
}
|
||||
|
||||
|
||||
MMapReadBufferFromFileDescriptor::MMapReadBufferFromFileDescriptor(int fd_, size_t offset_)
|
||||
: MMapReadBufferFromFileDescriptor()
|
||||
{
|
||||
init(fd_, offset_);
|
||||
}
|
||||
@ -87,4 +90,39 @@ void MMapReadBufferFromFileDescriptor::finish()
|
||||
length = 0;
|
||||
}
|
||||
|
||||
std::string MMapReadBufferFromFileDescriptor::getFileName() const
|
||||
{
|
||||
return "(fd = " + toString(fd) + ")";
|
||||
}
|
||||
|
||||
int MMapReadBufferFromFileDescriptor::getFD() const
|
||||
{
|
||||
return fd;
|
||||
}
|
||||
|
||||
off_t MMapReadBufferFromFileDescriptor::getPositionInFile()
|
||||
{
|
||||
return count();
|
||||
}
|
||||
|
||||
off_t MMapReadBufferFromFileDescriptor::doSeek(off_t offset, int whence)
|
||||
{
|
||||
off_t new_pos;
|
||||
if (whence == SEEK_SET)
|
||||
new_pos = offset;
|
||||
else if (whence == SEEK_CUR)
|
||||
new_pos = count() + offset;
|
||||
else
|
||||
throw Exception("MMapReadBufferFromFileDescriptor::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
|
||||
|
||||
working_buffer = internal_buffer;
|
||||
if (new_pos < 0 || new_pos > off_t(working_buffer.size()))
|
||||
throw Exception("Cannot seek through file " + getFileName()
|
||||
+ " because seek position (" + toString(new_pos) + ") is out of bounds [0, " + toString(working_buffer.size()) + "]",
|
||||
ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
|
||||
|
||||
position() = working_buffer.begin() + new_pos;
|
||||
return new_pos;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <IO/ReadBuffer.h>
|
||||
#include <IO/ReadBufferFromFileBase.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -11,14 +11,16 @@ namespace DB
|
||||
* Also you cannot control whether and how long actual IO take place,
|
||||
* so this method is not manageable and not recommended for anything except benchmarks.
|
||||
*/
|
||||
class MMapReadBufferFromFileDescriptor : public ReadBuffer
|
||||
class MMapReadBufferFromFileDescriptor : public ReadBufferFromFileBase
|
||||
{
|
||||
protected:
|
||||
MMapReadBufferFromFileDescriptor() : ReadBuffer(nullptr, 0) {}
|
||||
MMapReadBufferFromFileDescriptor() {}
|
||||
|
||||
void init(int fd_, size_t offset, size_t length_);
|
||||
void init(int fd_, size_t offset);
|
||||
|
||||
off_t doSeek(off_t off, int whence) override;
|
||||
|
||||
public:
|
||||
MMapReadBufferFromFileDescriptor(int fd_, size_t offset_, size_t length_);
|
||||
|
||||
@ -30,6 +32,10 @@ public:
|
||||
/// unmap memory before call to destructor
|
||||
void finish();
|
||||
|
||||
off_t getPositionInFile() override;
|
||||
std::string getFileName() const override;
|
||||
int getFD() const override;
|
||||
|
||||
private:
|
||||
size_t length = 0;
|
||||
int fd = -1;
|
||||
|
@ -3,6 +3,11 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
ReadBufferFromFileBase::ReadBufferFromFileBase()
|
||||
: BufferWithOwnMemory<ReadBuffer>(0)
|
||||
{
|
||||
}
|
||||
|
||||
ReadBufferFromFileBase::ReadBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment)
|
||||
: BufferWithOwnMemory<ReadBuffer>(buf_size, existing_memory, alignment)
|
||||
{
|
||||
|
@ -14,6 +14,7 @@ namespace DB
|
||||
class ReadBufferFromFileBase : public BufferWithOwnMemory<ReadBuffer>
|
||||
{
|
||||
public:
|
||||
ReadBufferFromFileBase();
|
||||
ReadBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment);
|
||||
ReadBufferFromFileBase(ReadBufferFromFileBase &&) = default;
|
||||
~ReadBufferFromFileBase() override;
|
||||
|
@ -101,10 +101,12 @@ bool ReadBufferFromFileDescriptor::nextImpl()
|
||||
/// If 'offset' is small enough to stay in buffer after seek, then true seek in file does not happen.
|
||||
off_t ReadBufferFromFileDescriptor::doSeek(off_t offset, int whence)
|
||||
{
|
||||
off_t new_pos = offset;
|
||||
if (whence == SEEK_CUR)
|
||||
off_t new_pos;
|
||||
if (whence == SEEK_SET)
|
||||
new_pos = offset;
|
||||
else if (whence == SEEK_CUR)
|
||||
new_pos = pos_in_file - (working_buffer.end() - pos) + offset;
|
||||
else if (whence != SEEK_SET)
|
||||
else
|
||||
throw Exception("ReadBufferFromFileDescriptor::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
|
||||
|
||||
/// Position is unchanged.
|
||||
|
@ -965,7 +965,7 @@ void readException(Exception & e, ReadBuffer & buf, const String & additional_me
|
||||
String name;
|
||||
String message;
|
||||
String stack_trace;
|
||||
bool has_nested = false;
|
||||
bool has_nested = false; /// Obsolete
|
||||
|
||||
readBinary(code, buf);
|
||||
readBinary(name, buf);
|
||||
@ -986,14 +986,7 @@ void readException(Exception & e, ReadBuffer & buf, const String & additional_me
|
||||
if (!stack_trace.empty())
|
||||
out << " Stack trace:\n\n" << stack_trace;
|
||||
|
||||
if (has_nested)
|
||||
{
|
||||
Exception nested;
|
||||
readException(nested, buf);
|
||||
e = Exception(out.str(), nested, code);
|
||||
}
|
||||
else
|
||||
e = Exception(out.str(), code);
|
||||
e = Exception(out.str(), code);
|
||||
}
|
||||
|
||||
void readAndThrowException(ReadBuffer & buf, const String & additional_message)
|
||||
|
@ -29,22 +29,13 @@
|
||||
#include <IO/CompressionMethod.h>
|
||||
#include <IO/ReadBuffer.h>
|
||||
#include <IO/ReadBufferFromMemory.h>
|
||||
#include <IO/BufferWithOwnMemory.h>
|
||||
#include <IO/VarInt.h>
|
||||
#include <IO/ZlibInflatingReadBuffer.h>
|
||||
|
||||
#include <DataTypes/DataTypeDateTime.h>
|
||||
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wdouble-promotion"
|
||||
#endif
|
||||
|
||||
#include <double-conversion/double-conversion.h>
|
||||
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif
|
||||
|
||||
|
||||
/// 1 GiB
|
||||
#define DEFAULT_MAX_STRING_SIZE (1ULL << 30)
|
||||
@ -1024,21 +1015,11 @@ void skipToNextLineOrEOF(ReadBuffer & buf);
|
||||
/// Skip to next character after next unescaped \n. If no \n in stream, skip to end. Does not throw on invalid escape sequences.
|
||||
void skipToUnescapedNextLineOrEOF(ReadBuffer & buf);
|
||||
|
||||
template <class TReadBuffer, class... Types>
|
||||
std::unique_ptr<ReadBuffer> getReadBuffer(const DB::CompressionMethod method, Types&&... args)
|
||||
{
|
||||
if (method == DB::CompressionMethod::Gzip)
|
||||
{
|
||||
auto read_buf = std::make_unique<TReadBuffer>(std::forward<Types>(args)...);
|
||||
return std::make_unique<ZlibInflatingReadBuffer>(std::move(read_buf), method);
|
||||
}
|
||||
return std::make_unique<TReadBuffer>(args...);
|
||||
}
|
||||
|
||||
/** This function just copies the data from buffer's internal position (in.position())
|
||||
* to current position (from arguments) into memory.
|
||||
*/
|
||||
void saveUpToPosition(ReadBuffer & in, DB::Memory<> & memory, char * current);
|
||||
void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current);
|
||||
|
||||
/** This function is negative to eof().
|
||||
* In fact it returns whether the data was loaded to internal ReadBuffers's buffer or not.
|
||||
@ -1047,6 +1028,6 @@ void saveUpToPosition(ReadBuffer & in, DB::Memory<> & memory, char * current);
|
||||
* of our buffer and the current cursor in the end of the buffer. When we call eof() it calls next().
|
||||
* And this function can fill the buffer with new data, so we will lose the data from previous buffer state.
|
||||
*/
|
||||
bool loadAtPosition(ReadBuffer & in, DB::Memory<> & memory, char * & current);
|
||||
bool loadAtPosition(ReadBuffer & in, Memory<> & memory, char * & current);
|
||||
|
||||
}
|
||||
|
@ -105,67 +105,41 @@ void WriteBufferFromHTTPServerResponse::nextImpl()
|
||||
{
|
||||
if (compress)
|
||||
{
|
||||
if (compression_method == CompressionMethod::Gzip)
|
||||
{
|
||||
#if defined(POCO_CLICKHOUSE_PATCH)
|
||||
*response_header_ostr << "Content-Encoding: gzip\r\n";
|
||||
#else
|
||||
response.set("Content-Encoding", "gzip");
|
||||
response_body_ostr = &(response.send());
|
||||
#endif
|
||||
out_raw = std::make_unique<WriteBufferFromOStream>(*response_body_ostr);
|
||||
deflating_buf.emplace(std::move(out_raw), compression_method, compression_level, working_buffer.size(), working_buffer.begin());
|
||||
out = &*deflating_buf;
|
||||
}
|
||||
else if (compression_method == CompressionMethod::Zlib)
|
||||
{
|
||||
#if defined(POCO_CLICKHOUSE_PATCH)
|
||||
*response_header_ostr << "Content-Encoding: deflate\r\n";
|
||||
#else
|
||||
response.set("Content-Encoding", "deflate");
|
||||
response_body_ostr = &(response.send());
|
||||
#endif
|
||||
out_raw = std::make_unique<WriteBufferFromOStream>(*response_body_ostr);
|
||||
deflating_buf.emplace(std::move(out_raw), compression_method, compression_level, working_buffer.size(), working_buffer.begin());
|
||||
out = &*deflating_buf;
|
||||
}
|
||||
#if USE_BROTLI
|
||||
else if (compression_method == CompressionMethod::Brotli)
|
||||
{
|
||||
#if defined(POCO_CLICKHOUSE_PATCH)
|
||||
*response_header_ostr << "Content-Encoding: br\r\n";
|
||||
#else
|
||||
response.set("Content-Encoding", "br");
|
||||
response_body_ostr = &(response.send());
|
||||
#endif
|
||||
out_raw = std::make_unique<WriteBufferFromOStream>(*response_body_ostr);
|
||||
brotli_buf.emplace(*out_raw, compression_level, working_buffer.size(), working_buffer.begin());
|
||||
out = &*brotli_buf;
|
||||
}
|
||||
#endif
|
||||
auto content_encoding_name = toContentEncodingName(compression_method);
|
||||
|
||||
else
|
||||
throw Exception("Logical error: unknown compression method passed to WriteBufferFromHTTPServerResponse",
|
||||
ErrorCodes::LOGICAL_ERROR);
|
||||
/// Use memory allocated for the outer buffer in the buffer pointed to by out. This avoids extra allocation and copy.
|
||||
#if defined(POCO_CLICKHOUSE_PATCH)
|
||||
*response_header_ostr << "Content-Encoding: " << content_encoding_name << "\r\n";
|
||||
#else
|
||||
response.set("Content-Encoding", content_encoding_name);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
#if !defined(POCO_CLICKHOUSE_PATCH)
|
||||
response_body_ostr = &(response.send());
|
||||
response_body_ostr = &(response.send());
|
||||
#endif
|
||||
|
||||
out_raw = std::make_unique<WriteBufferFromOStream>(*response_body_ostr, working_buffer.size(), working_buffer.begin());
|
||||
out = &*out_raw;
|
||||
}
|
||||
/// We reuse our buffer in "out" to avoid extra allocations and copies.
|
||||
|
||||
if (compress)
|
||||
out = wrapWriteBufferWithCompressionMethod(
|
||||
std::make_unique<WriteBufferFromOStream>(*response_body_ostr),
|
||||
compress ? compression_method : CompressionMethod::None,
|
||||
compression_level,
|
||||
working_buffer.size(),
|
||||
working_buffer.begin());
|
||||
else
|
||||
out = std::make_unique<WriteBufferFromOStream>(
|
||||
*response_body_ostr,
|
||||
working_buffer.size(),
|
||||
working_buffer.begin());
|
||||
}
|
||||
|
||||
finishSendHeaders();
|
||||
|
||||
}
|
||||
|
||||
if (out)
|
||||
{
|
||||
out->buffer() = buffer();
|
||||
out->position() = position();
|
||||
out->next();
|
||||
}
|
||||
@ -177,9 +151,8 @@ WriteBufferFromHTTPServerResponse::WriteBufferFromHTTPServerResponse(
|
||||
Poco::Net::HTTPServerResponse & response_,
|
||||
unsigned keep_alive_timeout_,
|
||||
bool compress_,
|
||||
CompressionMethod compression_method_,
|
||||
size_t size)
|
||||
: BufferWithOwnMemory<WriteBuffer>(size)
|
||||
CompressionMethod compression_method_)
|
||||
: BufferWithOwnMemory<WriteBuffer>(DBMS_DEFAULT_BUFFER_SIZE)
|
||||
, request(request_)
|
||||
, response(response_)
|
||||
, keep_alive_timeout(keep_alive_timeout_)
|
||||
@ -215,6 +188,9 @@ void WriteBufferFromHTTPServerResponse::finalize()
|
||||
if (offset())
|
||||
{
|
||||
next();
|
||||
|
||||
if (out)
|
||||
out.reset();
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -8,8 +8,6 @@
|
||||
#include <IO/WriteBuffer.h>
|
||||
#include <IO/BufferWithOwnMemory.h>
|
||||
#include <IO/WriteBufferFromOStream.h>
|
||||
#include <IO/ZlibDeflatingWriteBuffer.h>
|
||||
#include <IO/BrotliWriteBuffer.h>
|
||||
#include <IO/HTTPCommon.h>
|
||||
#include <IO/Progress.h>
|
||||
#include <Common/NetException.h>
|
||||
@ -52,7 +50,7 @@ private:
|
||||
unsigned keep_alive_timeout = 0;
|
||||
bool compress = false;
|
||||
CompressionMethod compression_method;
|
||||
int compression_level = Z_DEFAULT_COMPRESSION;
|
||||
int compression_level = 1;
|
||||
|
||||
std::ostream * response_body_ostr = nullptr;
|
||||
|
||||
@ -60,13 +58,7 @@ private:
|
||||
std::ostream * response_header_ostr = nullptr;
|
||||
#endif
|
||||
|
||||
std::unique_ptr<WriteBufferFromOStream> out_raw;
|
||||
std::optional<ZlibDeflatingWriteBuffer> deflating_buf;
|
||||
#if USE_BROTLI
|
||||
std::optional<BrotliWriteBuffer> brotli_buf;
|
||||
#endif
|
||||
|
||||
WriteBuffer * out = nullptr; /// Uncompressed HTTP body is written to this buffer. Points to out_raw or possibly to deflating_buf.
|
||||
std::unique_ptr<WriteBuffer> out;
|
||||
|
||||
bool headers_started_sending = false;
|
||||
bool headers_finished_sending = false; /// If true, you could not add any headers.
|
||||
@ -99,8 +91,7 @@ public:
|
||||
Poco::Net::HTTPServerResponse & response_,
|
||||
unsigned keep_alive_timeout_,
|
||||
bool compress_ = false, /// If true - set Content-Encoding header and compress the result.
|
||||
CompressionMethod compression_method_ = CompressionMethod::Gzip,
|
||||
size_t size = DBMS_DEFAULT_BUFFER_SIZE);
|
||||
CompressionMethod compression_method_ = CompressionMethod::None);
|
||||
|
||||
/// Writes progress in repeating HTTP headers.
|
||||
void onProgress(const Progress & progress);
|
||||
|
@ -48,7 +48,6 @@ void formatUUID(std::reverse_iterator<const UInt8 *> src16, UInt8 * dst36)
|
||||
}
|
||||
|
||||
|
||||
|
||||
void writeException(const Exception & e, WriteBuffer & buf, bool with_stack_trace)
|
||||
{
|
||||
writeBinary(e.code(), buf);
|
||||
@ -56,14 +55,11 @@ void writeException(const Exception & e, WriteBuffer & buf, bool with_stack_trac
|
||||
writeBinary(e.displayText(), buf);
|
||||
|
||||
if (with_stack_trace)
|
||||
writeBinary(e.getStackTrace().toString(), buf);
|
||||
writeBinary(e.getStackTraceString(), buf);
|
||||
else
|
||||
writeBinary(String(), buf);
|
||||
|
||||
bool has_nested = e.nested() != nullptr;
|
||||
bool has_nested = false;
|
||||
writeBinary(has_nested, buf);
|
||||
|
||||
if (has_nested)
|
||||
writeException(Exception(Exception::CreateFromPoco, *e.nested()), buf, with_stack_trace);
|
||||
}
|
||||
}
|
||||
|
@ -26,10 +26,12 @@
|
||||
#include <IO/VarInt.h>
|
||||
#include <IO/DoubleConverter.h>
|
||||
#include <IO/WriteBufferFromString.h>
|
||||
#include <IO/ZlibDeflatingWriteBuffer.h>
|
||||
|
||||
#include <ryu/ryu.h>
|
||||
|
||||
#include <Formats/FormatSettings.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
@ -115,21 +117,108 @@ inline void writeBoolText(bool x, WriteBuffer & buf)
|
||||
writeChar(x ? '1' : '0', buf);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t writeFloatTextFastPath(T x, char * buffer, int len)
|
||||
|
||||
struct DecomposedFloat64
|
||||
{
|
||||
using Converter = DoubleConverter<false>;
|
||||
double_conversion::StringBuilder builder{buffer, len};
|
||||
DecomposedFloat64(double x)
|
||||
{
|
||||
memcpy(&x_uint, &x, sizeof(x));
|
||||
}
|
||||
|
||||
uint64_t x_uint;
|
||||
|
||||
bool sign() const
|
||||
{
|
||||
return x_uint >> 63;
|
||||
}
|
||||
|
||||
uint16_t exponent() const
|
||||
{
|
||||
return (x_uint >> 52) & 0x7FF;
|
||||
}
|
||||
|
||||
int16_t normalized_exponent() const
|
||||
{
|
||||
return int16_t(exponent()) - 1023;
|
||||
}
|
||||
|
||||
uint64_t mantissa() const
|
||||
{
|
||||
return x_uint & 0x5affffffffffffful;
|
||||
}
|
||||
|
||||
/// NOTE Probably floating point instructions can be better.
|
||||
bool is_inside_int64() const
|
||||
{
|
||||
return x_uint == 0
|
||||
|| (normalized_exponent() >= 0 && normalized_exponent() <= 52
|
||||
&& ((mantissa() & ((1ULL << (52 - normalized_exponent())) - 1)) == 0));
|
||||
}
|
||||
};
|
||||
|
||||
/// Bit-level view of a `float`: the value is copied into a 32-bit integer
/// and the sign / exponent / fraction fields are extracted with shifts and masks.
struct DecomposedFloat32
{
    /// Raw bit pattern of the wrapped value.
    uint32_t x_uint;

    DecomposedFloat32(float x)
    {
        memcpy(&x_uint, &x, sizeof(x));
    }

    /// Highest bit (bit 31).
    bool sign() const
    {
        return x_uint >> 31;
    }

    /// Biased exponent: the 8 bits starting at bit 23.
    uint16_t exponent() const
    {
        const uint32_t shifted = x_uint >> 23;
        return shifted & 0xFF;
    }

    /// Exponent with the bias of 127 removed.
    int16_t normalized_exponent() const
    {
        const int16_t biased = int16_t(exponent());
        return biased - 127;
    }

    /// Low 23 bits of the encoding.
    uint32_t mantissa() const
    {
        return x_uint & 0x7fffff;
    }

    /// True when the bit pattern is all zeros, or when the unbiased exponent is in [0, 23]
    /// and every fraction bit below the binary point is zero (i.e. the value has no fractional part).
    bool is_inside_int32() const
    {
        if (x_uint == 0)
            return true;

        const int16_t unbiased = normalized_exponent();
        if (unbiased < 0 || unbiased > 23)
            return false;

        /// Bits of the fraction that lie below the binary point for this exponent.
        const auto below_point = (1ULL << (23 - unbiased)) - 1;
        return (mantissa() & below_point) == 0;
    }
};
|
||||
|
||||
template <typename T>
|
||||
inline size_t writeFloatTextFastPath(T x, char * buffer)
|
||||
{
|
||||
int result = 0;
|
||||
|
||||
bool result = false;
|
||||
if constexpr (std::is_same_v<T, double>)
|
||||
result = Converter::instance().ToShortest(x, &builder);
|
||||
else
|
||||
result = Converter::instance().ToShortestSingle(x, &builder);
|
||||
{
|
||||
/// The library Ryu has low performance on integers.
|
||||
/// This workaround improves performance 6..10 times.
|
||||
|
||||
if (!result)
|
||||
if (DecomposedFloat64(x).is_inside_int64())
|
||||
result = itoa(Int64(x), buffer) - buffer;
|
||||
else
|
||||
result = d2s_buffered_n(x, buffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (DecomposedFloat32(x).is_inside_int32())
|
||||
result = itoa(Int32(x), buffer) - buffer;
|
||||
else
|
||||
result = f2s_buffered_n(x, buffer);
|
||||
}
|
||||
|
||||
if (result <= 0)
|
||||
throw Exception("Cannot print floating point number", ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER);
|
||||
return builder.position();
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -140,23 +229,13 @@ inline void writeFloatText(T x, WriteBuffer & buf)
|
||||
using Converter = DoubleConverter<false>;
|
||||
if (likely(buf.available() >= Converter::MAX_REPRESENTATION_LENGTH))
|
||||
{
|
||||
buf.position() += writeFloatTextFastPath(x, buf.position(), Converter::MAX_REPRESENTATION_LENGTH);
|
||||
buf.position() += writeFloatTextFastPath(x, buf.position());
|
||||
return;
|
||||
}
|
||||
|
||||
Converter::BufferType buffer;
|
||||
double_conversion::StringBuilder builder{buffer, sizeof(buffer)};
|
||||
|
||||
bool result = false;
|
||||
if constexpr (std::is_same_v<T, double>)
|
||||
result = Converter::instance().ToShortest(x, &builder);
|
||||
else
|
||||
result = Converter::instance().ToShortestSingle(x, &builder);
|
||||
|
||||
if (!result)
|
||||
throw Exception("Cannot print floating point number", ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER);
|
||||
|
||||
buf.write(buffer, builder.position());
|
||||
size_t result = writeFloatTextFastPath(x, buffer);
|
||||
buf.write(buffer, result);
|
||||
}
|
||||
|
||||
|
||||
@ -955,15 +1034,4 @@ inline String toString(const T & x)
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
template <class TWriteBuffer, class... Types>
|
||||
std::unique_ptr<WriteBuffer> getWriteBuffer(const DB::CompressionMethod method, Types&&... args)
|
||||
{
|
||||
if (method == DB::CompressionMethod::Gzip)
|
||||
{
|
||||
auto write_buf = std::make_unique<TWriteBuffer>(std::forward<Types>(args)...);
|
||||
return std::make_unique<ZlibDeflatingWriteBuffer>(std::move(write_buf), method, 1 /* compression level */);
|
||||
}
|
||||
return std::make_unique<TWriteBuffer>(args...);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,6 +5,12 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ZLIB_DEFLATE_FAILED;
|
||||
}
|
||||
|
||||
|
||||
ZlibDeflatingWriteBuffer::ZlibDeflatingWriteBuffer(
|
||||
std::unique_ptr<WriteBuffer> out_,
|
||||
CompressionMethod compression_method,
|
||||
@ -84,6 +90,21 @@ void ZlibDeflatingWriteBuffer::finish()
|
||||
|
||||
next();
|
||||
|
||||
/// https://github.com/zlib-ng/zlib-ng/issues/494
|
||||
do
|
||||
{
|
||||
out->nextIfAtEnd();
|
||||
zstr.next_out = reinterpret_cast<unsigned char *>(out->position());
|
||||
zstr.avail_out = out->buffer().end() - out->position();
|
||||
|
||||
int rc = deflate(&zstr, Z_FULL_FLUSH);
|
||||
out->position() = out->buffer().end() - zstr.avail_out;
|
||||
|
||||
if (rc != Z_OK)
|
||||
throw Exception(std::string("deflate failed: ") + zError(rc), ErrorCodes::ZLIB_DEFLATE_FAILED);
|
||||
}
|
||||
while (zstr.avail_out == 0);
|
||||
|
||||
while (true)
|
||||
{
|
||||
out->nextIfAtEnd();
|
||||
|
@ -10,11 +10,6 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ZLIB_DEFLATE_FAILED;
|
||||
}
|
||||
|
||||
/// Performs compression using zlib library and writes compressed data to out_ WriteBuffer.
|
||||
class ZlibDeflatingWriteBuffer : public BufferWithOwnMemory<WriteBuffer>
|
||||
{
|
||||
|
@ -3,6 +3,7 @@
|
||||
#if defined(__linux__) || defined(__FreeBSD__)
|
||||
#include <IO/ReadBufferAIO.h>
|
||||
#endif
|
||||
#include <IO/MMapReadBufferFromFile.h>
|
||||
#include <Common/ProfileEvents.h>
|
||||
|
||||
|
||||
@ -11,13 +12,17 @@ namespace ProfileEvents
|
||||
extern const Event CreatedReadBufferOrdinary;
|
||||
extern const Event CreatedReadBufferAIO;
|
||||
extern const Event CreatedReadBufferAIOFailed;
|
||||
extern const Event CreatedReadBufferMMap;
|
||||
extern const Event CreatedReadBufferMMapFailed;
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(const std::string & filename_, size_t estimated_size,
|
||||
size_t aio_threshold, size_t buffer_size_, int flags_, char * existing_memory_, size_t alignment)
|
||||
std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(
|
||||
const std::string & filename_,
|
||||
size_t estimated_size, size_t aio_threshold, size_t mmap_threshold,
|
||||
size_t buffer_size_, int flags_, char * existing_memory_, size_t alignment)
|
||||
{
|
||||
#if defined(__linux__) || defined(__FreeBSD__)
|
||||
if (aio_threshold && estimated_size >= aio_threshold)
|
||||
@ -40,6 +45,21 @@ std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(const std::
|
||||
(void)estimated_size;
|
||||
#endif
|
||||
|
||||
if (!existing_memory_ && mmap_threshold && estimated_size >= mmap_threshold)
|
||||
{
|
||||
try
|
||||
{
|
||||
auto res = std::make_unique<MMapReadBufferFromFile>(filename_, 0);
|
||||
ProfileEvents::increment(ProfileEvents::CreatedReadBufferMMap);
|
||||
return res;
|
||||
}
|
||||
catch (const ErrnoException &)
|
||||
{
|
||||
/// Fallback if mmap is not supported (example: pipe).
|
||||
ProfileEvents::increment(ProfileEvents::CreatedReadBufferMMapFailed);
|
||||
}
|
||||
}
|
||||
|
||||
ProfileEvents::increment(ProfileEvents::CreatedReadBufferOrdinary);
|
||||
return std::make_unique<ReadBufferFromFile>(filename_, buffer_size_, flags_, existing_memory_, alignment);
|
||||
}
|
||||
|
@ -19,6 +19,7 @@ std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(
|
||||
const std::string & filename_,
|
||||
size_t estimated_size,
|
||||
size_t aio_threshold,
|
||||
size_t mmap_threshold,
|
||||
size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE,
|
||||
int flags_ = -1,
|
||||
char * existing_memory_ = nullptr,
|
||||
|
@ -78,7 +78,7 @@ add_executable (parse_date_time_best_effort parse_date_time_best_effort.cpp)
|
||||
target_link_libraries (parse_date_time_best_effort PRIVATE clickhouse_common_io)
|
||||
|
||||
add_executable (zlib_ng_bug zlib_ng_bug.cpp)
|
||||
target_link_libraries (zlib_ng_bug PRIVATE ${Poco_Foundation_LIBRARY})
|
||||
if(NOT USE_INTERNAL_POCO_LIBRARY)
|
||||
target_include_directories(zlib_ng_bug SYSTEM BEFORE PRIVATE ${Poco_INCLUDE_DIRS})
|
||||
endif()
|
||||
target_link_libraries (zlib_ng_bug PRIVATE ${Poco_Foundation_LIBRARY} ${ZLIB_LIBRARY})
|
||||
|
||||
add_executable (ryu_test ryu_test.cpp)
|
||||
target_link_libraries (ryu_test PRIVATE ryu)
|
||||
|
@ -36,11 +36,11 @@ std::string bin(const T & value, size_t bits = sizeof(T)*8)
|
||||
.to_string().substr(MAX_BITS - bits, bits);
|
||||
}
|
||||
|
||||
// gets N low bits of value
|
||||
template <typename T>
|
||||
T getBits(UInt8 bits, const T & value)
|
||||
{
|
||||
const T mask = ((static_cast<T>(1) << static_cast<T>(bits)) - 1);
|
||||
return value & mask;
|
||||
return value & maskLowBits<T>(bits);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -83,12 +83,36 @@ std::string dumpContents(const T& container,
|
||||
return sstr.str();
|
||||
}
|
||||
|
||||
template <typename ValueLeft, typename ValueRight>
|
||||
::testing::AssertionResult BinaryEqual(const ValueLeft & left, const ValueRight & right)
|
||||
{
|
||||
// ::testing::AssertionResult result = ::testing::AssertionSuccess();
|
||||
if (sizeof(left) != sizeof(right))
|
||||
return ::testing::AssertionFailure()
|
||||
<< "Sizes do not match, expected: " << sizeof(left) << " actual: " << sizeof(right);
|
||||
|
||||
const auto size = std::min(sizeof(left), sizeof(right));
|
||||
if (memcmp(&left, &right, size) != 0)
|
||||
{
|
||||
const auto l_bits = left ? static_cast<size_t>(std::log2(left)) : 0;
|
||||
const auto r_bits = right ? static_cast<size_t>(std::log2(right)) : 0;
|
||||
const size_t bits = std::max(l_bits, r_bits) + 1;
|
||||
|
||||
return ::testing::AssertionFailure()
|
||||
<< "Values are binary different,\n"
|
||||
<< "\texpected: 0b" << bin(left, bits) << " (" << std::hex << left << "),\n"
|
||||
<< "\tactual : 0b" << bin(right, bits) << " (" <<std::hex << right << ").";
|
||||
}
|
||||
|
||||
return ::testing::AssertionSuccess();
|
||||
}
|
||||
|
||||
struct TestCaseParameter
|
||||
{
|
||||
std::vector<std::pair<UInt8, UInt64>> bits_and_vals;
|
||||
std::string expected_buffer_binary;
|
||||
|
||||
explicit TestCaseParameter(std::vector<std::pair<UInt8, UInt64>> vals, std::string binary = std::string{})
|
||||
TestCaseParameter(std::vector<std::pair<UInt8, UInt64>> vals, std::string binary = std::string{})
|
||||
: bits_and_vals(std::move(vals)),
|
||||
expected_buffer_binary(binary)
|
||||
{}
|
||||
@ -114,8 +138,7 @@ TEST_P(BitIO, WriteAndRead)
|
||||
PODArray<char> data(max_buffer_size);
|
||||
|
||||
{
|
||||
WriteBuffer write_buffer(data.data(), data.size());
|
||||
BitWriter writer(write_buffer);
|
||||
BitWriter writer(data.data(), data.size());
|
||||
for (const auto & bv : bits_and_vals)
|
||||
{
|
||||
writer.writeBits(bv.first, bv.second);
|
||||
@ -133,38 +156,73 @@ TEST_P(BitIO, WriteAndRead)
|
||||
ASSERT_EQ(expected_buffer_binary, actual_buffer_binary);
|
||||
}
|
||||
|
||||
BitReader reader(read_buffer);
|
||||
BitReader reader(data.data(), data.size());
|
||||
|
||||
int bitpos = 0;
|
||||
int item = 0;
|
||||
for (const auto & bv : bits_and_vals)
|
||||
{
|
||||
SCOPED_TRACE(::testing::Message()
|
||||
<< "item #" << item << ", width: " << static_cast<UInt32>(bv.first)
|
||||
<< ", value: " << bin(bv.second)
|
||||
<< ".\n\n\nBuffer memory:\n" << dumpContents(data));
|
||||
<< "item #" << item << " of " << bits_and_vals.size() << ", width: " << static_cast<UInt32>(bv.first)
|
||||
<< ", value: " << bv.second << "(" << bin(bv.second) << ")"
|
||||
<< ", at bit position: " << std::dec << reader.count()
|
||||
<< ".\nBuffer memory:\n" << dumpContents(data));
|
||||
|
||||
//EXPECT_EQ(getBits(bv.first, bv.second), reader.peekBits(bv.first));
|
||||
EXPECT_EQ(getBits(bv.first, bv.second), reader.readBits(bv.first));
|
||||
// const UInt8 next_byte = getBits(bv.first, bv.second) &
|
||||
ASSERT_TRUE(BinaryEqual(getBits(bv.first, bv.second), reader.readBits(bv.first)));
|
||||
|
||||
++item;
|
||||
bitpos += bv.first;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Simple,
|
||||
BitIO,
|
||||
::testing::Values(
|
||||
TestCaseParameter(
|
||||
{{9, 0xFFFFFFFF}, {9, 0x00}, {9, 0xFFFFFFFF}, {9, 0x00}, {9, 0xFFFFFFFF}},
|
||||
"11111111 10000000 00111111 11100000 00001111 11111000 "),
|
||||
TestCaseParameter(
|
||||
{{7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {3, 0xFFFF}},
|
||||
"01111110 11111101 11111011 11110111 11101111 11011111 10111111 01111111 11000000 "),
|
||||
TestCaseParameter({{33, 0xFF110d0b07050300}, {33, 0xAAEE29251f1d1713}}),
|
||||
TestCaseParameter({{33, BIT_PATTERN}, {33, BIT_PATTERN}}),
|
||||
TestCaseParameter({{24, 0xFFFFFFFF}},
|
||||
"11111111 11111111 11111111 ")
|
||||
),);
|
||||
BitIO,
|
||||
::testing::ValuesIn(std::initializer_list<TestCaseParameter>{
|
||||
{
|
||||
{{9, 0xFFFFFFFF}, {9, 0x00}, {9, 0xFFFFFFFF}, {9, 0x00}, {9, 0xFFFFFFFF}},
|
||||
"11111111 10000000 00111111 11100000 00001111 11111000 "
|
||||
},
|
||||
{
|
||||
{{7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {7, 0x3f}, {3, 0xFFFF}},
|
||||
"01111110 11111101 11111011 11110111 11101111 11011111 10111111 01111111 11000000 "
|
||||
},
|
||||
{
|
||||
{{33, 0xFF110d0b07050300}, {33, 0xAAEE29251f1d1713}}
|
||||
},
|
||||
{
|
||||
{{33, BIT_PATTERN}, {33, BIT_PATTERN}}
|
||||
},
|
||||
{
|
||||
{{24, 0xFFFFFFFF}},
|
||||
"11111111 11111111 11111111 "
|
||||
},
|
||||
{
|
||||
// Note that we take only N lower bits of the number: {3, 0b01011} => 011
|
||||
{{5, 0b01010}, {3, 0b111}, {7, 0b11001100}, {6, 0}, {5, 0b11111111}, {4, 0}, {3, 0b101}, {2, 0}, {1, 0b11111111}},
|
||||
"01010111 10011000 00000111 11000010 10010000 "
|
||||
},
|
||||
{
|
||||
{{64, BIT_PATTERN}, {56, BIT_PATTERN} , {4, 0b1111}, {4, 0}, // 128
|
||||
{8, 0b11111111}, {64, BIT_PATTERN}, {48, BIT_PATTERN}, {8, 0}}, // 256
|
||||
"11101011 11101111 10111010 11101111 10101111 10111010 11101011 10101001 " // 64
|
||||
"11101111 10111010 11101111 10101111 10111010 11101011 10101001 11110000 " // 128
|
||||
"11111111 11101011 11101111 10111010 11101111 10101111 10111010 11101011 " // 192
|
||||
"10101001 10111010 11101111 10101111 10111010 11101011 10101001 00000000 " // 256
|
||||
},
|
||||
{
|
||||
{{64, BIT_PATTERN}, {56, BIT_PATTERN} , {5, 0b11111}, {3, 0}, // 128
|
||||
{8, 0b11111111}, {64, BIT_PATTERN}, {48, BIT_PATTERN}, {8, 0}, //256
|
||||
{32, BIT_PATTERN}, {12, 0xff}, {8, 0}, {12, 0xAEff}},
|
||||
"11101011 11101111 10111010 11101111 10101111 10111010 11101011 10101001 " // 64
|
||||
"11101111 10111010 11101111 10101111 10111010 11101011 10101001 11111000 " // 128
|
||||
"11111111 11101011 11101111 10111010 11101111 10101111 10111010 11101011 " // 192
|
||||
"10101001 10111010 11101111 10101111 10111010 11101011 10101001 00000000 " // 256
|
||||
"10101111 10111010 11101011 10101001 00001111 11110000 00001110 11111111 " // 320
|
||||
}
|
||||
}),
|
||||
);
|
||||
|
||||
TestCaseParameter primes_case(UInt8 repeat_times, UInt64 pattern)
|
||||
{
|
||||
|
92
dbms/src/IO/tests/ryu_test.cpp
Normal file
92
dbms/src/IO/tests/ryu_test.cpp
Normal file
@ -0,0 +1,92 @@
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <ryu/ryu.h>
|
||||
|
||||
|
||||
/// Bit-level view of a `double`: the value is copied into a 64-bit integer
/// and the sign / exponent / fraction fields are extracted with shifts and masks.
struct DecomposedFloat64
{
    DecomposedFloat64(double x)
    {
        memcpy(&x_uint, &x, sizeof(x));
    }

    /// Raw bit pattern of the wrapped value.
    uint64_t x_uint;

    /// Highest bit (bit 63).
    bool sign() const
    {
        return x_uint >> 63;
    }

    /// Biased exponent: the 11 bits starting at bit 52.
    uint16_t exponent() const
    {
        return (x_uint >> 52) & 0x7FF;
    }

    /// Exponent with the bias of 1023 removed.
    int16_t normalized_exponent() const
    {
        return int16_t(exponent()) - 1023;
    }

    /// Fraction field: the low 52 bits of the encoding.
    /// The mask must be exactly 52 one-bits; a wider mask lets exponent bits leak into the result.
    uint64_t mantissa() const
    {
        return x_uint & 0x000fffffffffffffull;
    }

    /// True when the bit pattern is all zeros, or when the unbiased exponent is in [0, 52]
    /// and every fraction bit below the binary point is zero (i.e. the value has no fractional part).
    /// Negative zero has a non-zero bit pattern and an exponent below the range, so it yields false.
    bool is_inside_int64() const
    {
        return x_uint == 0
            || (normalized_exponent() >= 0 && normalized_exponent() <= 52
                && ((mantissa() & ((1ULL << (52 - normalized_exponent())) - 1)) == 0));
    }
};
|
||||
|
||||
/// Bit-level view of a `float`: the value is copied into a 32-bit integer
/// and the sign / exponent / fraction fields are extracted with shifts and masks.
struct DecomposedFloat32
{
    /// Raw bit pattern of the wrapped value.
    uint32_t x_uint;

    DecomposedFloat32(float x)
    {
        memcpy(&x_uint, &x, sizeof(x));
    }

    /// Highest bit (bit 31).
    bool sign() const
    {
        return x_uint >> 31;
    }

    /// Biased exponent: the 8 bits starting at bit 23.
    uint16_t exponent() const
    {
        const uint32_t shifted = x_uint >> 23;
        return shifted & 0xFF;
    }

    /// Exponent with the bias of 127 removed.
    int16_t normalized_exponent() const
    {
        const int16_t biased = int16_t(exponent());
        return biased - 127;
    }

    /// Low 23 bits of the encoding.
    uint32_t mantissa() const
    {
        return x_uint & 0x7fffff;
    }

    /// True when the bit pattern is all zeros, or when the unbiased exponent is in [0, 23]
    /// and every fraction bit below the binary point is zero (i.e. the value has no fractional part).
    bool is_inside_int32() const
    {
        if (x_uint == 0)
            return true;

        const int16_t unbiased = normalized_exponent();
        if (unbiased < 0 || unbiased > 23)
            return false;

        /// Bits of the fraction that lie below the binary point for this exponent.
        const auto below_point = (1ULL << (23 - unbiased)) - 1;
        return (mantissa() & below_point) == 0;
    }
};
|
||||
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
double x = argc > 1 ? std::stod(argv[1]) : 0;
|
||||
char buf[32];
|
||||
|
||||
d2s_buffered(x, buf);
|
||||
std::cout << buf << "\n";
|
||||
|
||||
std::cout << DecomposedFloat64(x).is_inside_int64() << "\n";
|
||||
|
||||
return 0;
|
||||
}
|
@ -1,32 +1,50 @@
|
||||
#include <Poco/FileStream.h>
|
||||
#include <Poco/NullStream.h>
|
||||
#include <Poco/StreamCopier.h>
|
||||
#include <Poco/DeflatingStream.h>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
#include <zlib.h>
|
||||
|
||||
/** This script reproduces the bug in zlib-ng library.
|
||||
* Put the following content to "data.bin" file:
|
||||
abcdefghijklmn!@Aab#AAabcdefghijklmn$%
|
||||
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
* There are two lines. First line make sense. Second line contains padding to make file size large enough.
|
||||
* Compile with
|
||||
* cmake -D SANITIZE=address
|
||||
* and run:
|
||||
#pragma GCC diagnostic ignored "-Wold-style-cast"
|
||||
|
||||
./zlib_ng_bug data2.bin
|
||||
=================================================================
|
||||
==204952==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x6310000147ff at pc 0x000000596d7a bp 0x7ffd139edd50 sp 0x7ffd139edd48
|
||||
READ of size 1 at 0x6310000147ff thread T0
|
||||
*/
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
/// https://github.com/zlib-ng/zlib-ng/issues/494
|
||||
int main(int, char **)
|
||||
{
|
||||
using namespace Poco;
|
||||
std::vector<unsigned char> in(1048576);
|
||||
std::vector<unsigned char> out(1048576);
|
||||
|
||||
std::string filename(argc >= 2 ? argv[1] : "data.bin");
|
||||
FileInputStream istr(filename);
|
||||
NullOutputStream ostr;
|
||||
DeflatingOutputStream deflater(ostr, DeflatingStreamBuf::STREAM_GZIP);
|
||||
StreamCopier::copyStream(istr, deflater);
|
||||
ssize_t in_size = read(STDIN_FILENO, in.data(), 1048576);
|
||||
if (in_size < 0)
|
||||
throw std::runtime_error("Cannot read");
|
||||
in.resize(in_size);
|
||||
|
||||
z_stream zstr{};
|
||||
if (Z_OK != deflateInit2(&zstr, 1, Z_DEFLATED, 15 + 16, 8, Z_DEFAULT_STRATEGY))
|
||||
throw std::runtime_error("Cannot deflateInit2");
|
||||
|
||||
zstr.next_in = in.data();
|
||||
zstr.avail_in = in.size();
|
||||
zstr.next_out = out.data();
|
||||
zstr.avail_out = out.size();
|
||||
|
||||
while (zstr.avail_in > 0)
|
||||
if (Z_OK != deflate(&zstr, Z_NO_FLUSH))
|
||||
throw std::runtime_error("Cannot deflate");
|
||||
|
||||
while (true)
|
||||
{
|
||||
int rc = deflate(&zstr, Z_FINISH);
|
||||
|
||||
if (rc == Z_STREAM_END)
|
||||
break;
|
||||
|
||||
if (rc != Z_OK)
|
||||
throw std::runtime_error("Cannot finish deflate");
|
||||
}
|
||||
|
||||
deflateEnd(&zstr);
|
||||
|
||||
if (ssize_t(zstr.total_out) != write(STDOUT_FILENO, out.data(), zstr.total_out))
|
||||
throw std::runtime_error("Cannot write");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -514,14 +514,21 @@ void InterpreterCreateQuery::setEngine(ASTCreateQuery & create) const
|
||||
ASTPtr as_create_ptr = context.getDatabase(as_database_name)->getCreateTableQuery(context, as_table_name);
|
||||
const auto & as_create = as_create_ptr->as<ASTCreateQuery &>();
|
||||
|
||||
const String qualified_name = backQuoteIfNeed(as_database_name) + "." + backQuoteIfNeed(as_table_name);
|
||||
|
||||
if (as_create.is_view)
|
||||
throw Exception(
|
||||
"Cannot CREATE a table AS " + as_database_name + "." + as_table_name + ", it is a View",
|
||||
"Cannot CREATE a table AS " + qualified_name + ", it is a View",
|
||||
ErrorCodes::INCORRECT_QUERY);
|
||||
|
||||
if (as_create.is_live_view)
|
||||
throw Exception(
|
||||
"Cannot CREATE a table AS " + as_database_name + "." + as_table_name + ", it is a Live View",
|
||||
"Cannot CREATE a table AS " + qualified_name + ", it is a Live View",
|
||||
ErrorCodes::INCORRECT_QUERY);
|
||||
|
||||
if (as_create.is_dictionary)
|
||||
throw Exception(
|
||||
"Cannot CREATE a table AS " + qualified_name + ", it is a Dictionary",
|
||||
ErrorCodes::INCORRECT_QUERY);
|
||||
|
||||
create.set(create.storage, as_create.storage->ptr());
|
||||
|
@ -2267,17 +2267,17 @@ void InterpreterSelectQuery::executeOrder(Pipeline & pipeline, InputSortingInfoP
|
||||
limits.size_limits = SizeLimits(settings.max_rows_to_sort, settings.max_bytes_to_sort, settings.sort_overflow_mode);
|
||||
sorting_stream->setLimits(limits);
|
||||
|
||||
stream = sorting_stream;
|
||||
auto merging_stream = std::make_shared<MergeSortingBlockInputStream>(
|
||||
sorting_stream, output_order_descr, settings.max_block_size, limit,
|
||||
settings.max_bytes_before_remerge_sort,
|
||||
settings.max_bytes_before_external_sort / pipeline.streams.size(),
|
||||
context->getTemporaryPath(), settings.min_free_disk_space_for_temporary_data);
|
||||
|
||||
stream = merging_stream;
|
||||
});
|
||||
|
||||
/// If there are several streams, we merge them into one
|
||||
executeUnion(pipeline, {});
|
||||
|
||||
/// Merge the sorted blocks.
|
||||
pipeline.firstStream() = std::make_shared<MergeSortingBlockInputStream>(
|
||||
pipeline.firstStream(), output_order_descr, settings.max_block_size, limit,
|
||||
settings.max_bytes_before_remerge_sort,
|
||||
settings.max_bytes_before_external_sort, context->getTemporaryPath(), settings.min_free_disk_space_for_temporary_data);
|
||||
executeMergeSorted(pipeline, output_order_descr, limit);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <DataTypes/DataTypeFactory.h>
|
||||
#include <Formats/FormatSettings.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Parsers/ASTLiteral.h>
|
||||
#include <Parsers/ASTQueryParameter.h>
|
||||
#include <Interpreters/ReplaceQueryParameterVisitor.h>
|
||||
@ -54,10 +55,12 @@ void ReplaceQueryParameterVisitor::visitQueryParameter(ASTPtr & ast)
|
||||
IColumn & temp_column = *temp_column_ptr;
|
||||
ReadBufferFromString read_buffer{value};
|
||||
FormatSettings format_settings;
|
||||
data_type->deserializeAsWholeText(temp_column, read_buffer, format_settings);
|
||||
data_type->deserializeAsTextEscaped(temp_column, read_buffer, format_settings);
|
||||
|
||||
if (!read_buffer.eof())
|
||||
throw Exception("Value " + value + " cannot be parsed as " + type_name + " for query parameter '" + ast_param.name + "'", ErrorCodes::BAD_QUERY_PARAMETER);
|
||||
throw Exception("Value " + value + " cannot be parsed as " + type_name + " for query parameter '" + ast_param.name + "'"
|
||||
" because it isn't parsed completely: only " + toString(read_buffer.count()) + " of " + toString(value.size()) + " bytes was parsed: "
|
||||
+ value.substr(0, read_buffer.count()), ErrorCodes::BAD_QUERY_PARAMETER);
|
||||
|
||||
ast = addTypeConversionToAST(std::make_shared<ASTLiteral>(temp_column[0]), type_name);
|
||||
}
|
||||
|
@ -129,9 +129,9 @@ static void setExceptionStackTrace(QueryLogElement & elem)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (const Exception & e)
|
||||
catch (const std::exception & e)
|
||||
{
|
||||
elem.stack_trace = e.getStackTrace().toString();
|
||||
elem.stack_trace = getExceptionStackTraceString(e);
|
||||
}
|
||||
catch (...) {}
|
||||
}
|
||||
|
@ -97,6 +97,6 @@ catch (const Exception & e)
|
||||
std::cerr << e.what() << ", " << e.displayText() << std::endl
|
||||
<< std::endl
|
||||
<< "Stack trace:" << std::endl
|
||||
<< e.getStackTrace().toString();
|
||||
<< e.getStackTraceString();
|
||||
return 1;
|
||||
}
|
||||
|
@ -55,6 +55,6 @@ catch (const Exception & e)
|
||||
std::cerr << e.what() << ", " << e.displayText() << std::endl
|
||||
<< std::endl
|
||||
<< "Stack trace:" << std::endl
|
||||
<< e.getStackTrace().toString();
|
||||
<< e.getStackTraceString();
|
||||
return 1;
|
||||
}
|
||||
|
@ -1,11 +1,10 @@
|
||||
#include <Poco/Version.h>
|
||||
#include <Processors/Transforms/MergeSortingTransform.h>
|
||||
#include <Processors/IAccumulatingTransform.h>
|
||||
#include <Processors/Transforms/MergingSortedTransform.h>
|
||||
#include <Common/formatReadable.h>
|
||||
#include <Common/ProfileEvents.h>
|
||||
#include <common/config_common.h>
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <Compression/CompressedReadBuffer.h>
|
||||
#include <Compression/CompressedWriteBuffer.h>
|
||||
#include <DataStreams/NativeBlockInputStream.h>
|
||||
#include <DataStreams/NativeBlockOutputStream.h>
|
||||
@ -21,6 +20,13 @@ namespace ProfileEvents
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NOT_ENOUGH_SPACE;
|
||||
}
|
||||
class MergeSorter;
|
||||
|
||||
|
||||
class BufferingToFileTransform : public IAccumulatingTransform
|
||||
{
|
||||
public:
|
||||
|
@ -1,25 +1,14 @@
|
||||
#pragma once
|
||||
|
||||
#include <Processors/Transforms/SortingTransform.h>
|
||||
#include <Core/SortDescription.h>
|
||||
#include <Common/filesystemHelpers.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <Compression/CompressedReadBuffer.h>
|
||||
#include <DataStreams/IBlockInputStream.h>
|
||||
#include <DataStreams/NativeBlockInputStream.h>
|
||||
|
||||
#include <common/logger_useful.h>
|
||||
|
||||
#include <queue>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NOT_ENOUGH_SPACE;
|
||||
}
|
||||
class MergeSorter;
|
||||
|
||||
class MergeSortingTransform : public SortingTransform
|
||||
{
|
||||
public:
|
||||
|
@ -148,9 +148,9 @@ IProcessor::Status MergingSortedTransform::prepare()
|
||||
return Status::NeedData;
|
||||
|
||||
if (has_collation)
|
||||
initQueue(queue_with_collation);
|
||||
queue_with_collation = SortingHeap<SortCursorWithCollation>(cursors);
|
||||
else
|
||||
initQueue(queue_without_collation);
|
||||
queue_without_collation = SortingHeap<SortCursor>(cursors);
|
||||
|
||||
is_initialized = true;
|
||||
return Status::Ready;
|
||||
@ -169,7 +169,6 @@ IProcessor::Status MergingSortedTransform::prepare()
|
||||
|
||||
if (need_data)
|
||||
{
|
||||
|
||||
auto & input = *std::next(inputs.begin(), next_input_to_read);
|
||||
if (!input.isFinished())
|
||||
{
|
||||
@ -183,7 +182,11 @@ IProcessor::Status MergingSortedTransform::prepare()
|
||||
return Status::NeedData;
|
||||
|
||||
updateCursor(std::move(chunk), next_input_to_read);
|
||||
pushToQueue(next_input_to_read);
|
||||
|
||||
if (has_collation)
|
||||
queue_with_collation.push(cursors[next_input_to_read]);
|
||||
else
|
||||
queue_without_collation.push(cursors[next_input_to_read]);
|
||||
}
|
||||
|
||||
need_data = false;
|
||||
@ -201,8 +204,8 @@ void MergingSortedTransform::work()
|
||||
merge(queue_without_collation);
|
||||
}
|
||||
|
||||
template <typename TSortCursor>
|
||||
void MergingSortedTransform::merge(std::priority_queue<TSortCursor> & queue)
|
||||
template <typename TSortingHeap>
|
||||
void MergingSortedTransform::merge(TSortingHeap & queue)
|
||||
{
|
||||
/// Returns MergeStatus which we should return if we are going to finish now.
|
||||
auto can_read_another_row = [&, this]()
|
||||
@ -224,77 +227,66 @@ void MergingSortedTransform::merge(std::priority_queue<TSortCursor> & queue)
|
||||
};
|
||||
|
||||
/// Take rows in required order and put them into `merged_data`, while the rows are no more than `max_block_size`
|
||||
while (!queue.empty())
|
||||
while (queue.isValid())
|
||||
{
|
||||
/// Shouldn't happen at first iteration, but check just in case.
|
||||
if (!can_read_another_row())
|
||||
return;
|
||||
|
||||
TSortCursor current = queue.top();
|
||||
queue.pop();
|
||||
bool first_iteration = true;
|
||||
auto current = queue.current();
|
||||
|
||||
while (true)
|
||||
/** And what if the block is totally less or equal than the rest for the current cursor?
|
||||
* Or is there only one data source left in the queue? Then you can take the entire block on current cursor.
|
||||
*/
|
||||
if (current.impl->isFirst()
|
||||
&& (queue.size() == 1
|
||||
|| (queue.size() >= 2 && current.totallyLessOrEquals(queue.nextChild()))))
|
||||
{
|
||||
if (!first_iteration && !can_read_another_row())
|
||||
//std::cerr << "current block is totally less or equals\n";
|
||||
|
||||
/// If there are already data in the current block, we first return it. We'll get here again the next time we call the merge function.
|
||||
if (merged_data.mergedRows() != 0)
|
||||
{
|
||||
queue.push(current);
|
||||
return;
|
||||
}
|
||||
first_iteration = false;
|
||||
|
||||
/** And what if the block is totally less or equal than the rest for the current cursor?
|
||||
* Or is there only one data source left in the queue? Then you can take the entire block on current cursor.
|
||||
*/
|
||||
if (current.impl->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top())))
|
||||
{
|
||||
//std::cerr << "current block is totally less or equals\n";
|
||||
|
||||
/// If there are already data in the current block, we first return it. We'll get here again the next time we call the merge function.
|
||||
if (merged_data.mergedRows() != 0)
|
||||
{
|
||||
//std::cerr << "merged rows is non-zero\n";
|
||||
queue.push(current);
|
||||
return;
|
||||
}
|
||||
|
||||
/// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl)
|
||||
size_t source_num = current.impl->order;
|
||||
insertFromChunk(source_num);
|
||||
//std::cerr << "merged rows is non-zero\n";
|
||||
return;
|
||||
}
|
||||
|
||||
//std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n";
|
||||
//std::cerr << "Inserting row\n";
|
||||
merged_data.insertRow(current->all_columns, current->pos);
|
||||
/// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl)
|
||||
size_t source_num = current.impl->order;
|
||||
insertFromChunk(source_num);
|
||||
queue.removeTop();
|
||||
return;
|
||||
}
|
||||
|
||||
if (out_row_sources_buf)
|
||||
{
|
||||
/// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl)
|
||||
RowSourcePart row_source(current.impl->order);
|
||||
out_row_sources_buf->write(row_source.data);
|
||||
}
|
||||
//std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n";
|
||||
//std::cerr << "Inserting row\n";
|
||||
merged_data.insertRow(current->all_columns, current->pos);
|
||||
|
||||
if (current->isLast())
|
||||
{
|
||||
need_data = true;
|
||||
next_input_to_read = current.impl->order;
|
||||
if (out_row_sources_buf)
|
||||
{
|
||||
/// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl)
|
||||
RowSourcePart row_source(current.impl->order);
|
||||
out_row_sources_buf->write(row_source.data);
|
||||
}
|
||||
|
||||
if (limit && merged_data.totalMergedRows() >= limit)
|
||||
is_finished = true;
|
||||
if (!current->isLast())
|
||||
{
|
||||
// std::cerr << "moving to next row\n";
|
||||
queue.next();
|
||||
}
|
||||
else
|
||||
{
|
||||
/// We will get the next block from the corresponding source, if there is one.
|
||||
queue.removeTop();
|
||||
|
||||
return;
|
||||
}
|
||||
// std::cerr << "It was last row, fetching next block\n";
|
||||
need_data = true;
|
||||
next_input_to_read = current.impl->order;
|
||||
|
||||
//std::cerr << "moving to next row\n";
|
||||
current->next();
|
||||
if (limit && merged_data.totalMergedRows() >= limit)
|
||||
is_finished = true;
|
||||
|
||||
if (!queue.empty() && current.greater(queue.top()))
|
||||
{
|
||||
//std::cerr << "next row is not least, pushing back to queue\n";
|
||||
queue.push(current);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
is_finished = true;
|
||||
|
@ -1,10 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include <Processors/IProcessor.h>
|
||||
#include <Core/SortDescription.h>
|
||||
#include <Core/SortCursor.h>
|
||||
#include <Processors/SharedChunk.h>
|
||||
|
||||
#include <queue>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -111,14 +111,10 @@ protected:
|
||||
/// Chunks currently being merged.
|
||||
std::vector<SharedChunkPtr> source_chunks;
|
||||
|
||||
using CursorImpls = std::vector<SortCursorImpl>;
|
||||
CursorImpls cursors;
|
||||
SortCursorImpls cursors;
|
||||
|
||||
using Queue = std::priority_queue<SortCursor>;
|
||||
Queue queue_without_collation;
|
||||
|
||||
using QueueWithCollation = std::priority_queue<SortCursorWithCollation>;
|
||||
QueueWithCollation queue_with_collation;
|
||||
SortingHeap<SortCursor> queue_without_collation;
|
||||
SortingHeap<SortCursorWithCollation> queue_with_collation;
|
||||
|
||||
private:
|
||||
|
||||
@ -128,8 +124,8 @@ private:
|
||||
bool need_data = false;
|
||||
size_t next_input_to_read = 0;
|
||||
|
||||
template <typename TSortCursor>
|
||||
void merge(std::priority_queue<TSortCursor> & queue);
|
||||
template <typename TSortingHeap>
|
||||
void merge(TSortingHeap & queue);
|
||||
|
||||
void insertFromChunk(size_t source_num);
|
||||
|
||||
@ -159,22 +155,6 @@ private:
|
||||
shared_chunk_ptr->all_columns = cursors[source_num].all_columns;
|
||||
shared_chunk_ptr->sort_columns = cursors[source_num].sort_columns;
|
||||
}
|
||||
|
||||
void pushToQueue(size_t source_num)
|
||||
{
|
||||
if (has_collation)
|
||||
queue_with_collation.push(SortCursorWithCollation(&cursors[source_num]));
|
||||
else
|
||||
queue_without_collation.push(SortCursor(&cursors[source_num]));
|
||||
}
|
||||
|
||||
template <typename TSortCursor>
|
||||
void initQueue(std::priority_queue<TSortCursor> & queue)
|
||||
{
|
||||
for (auto & cursor : cursors)
|
||||
if (!cursor.empty())
|
||||
queue.push(TSortCursor(&cursor));
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -40,16 +40,12 @@ MergeSorter::MergeSorter(Chunks chunks_, SortDescription & description_, size_t
|
||||
|
||||
chunks.swap(nonempty_chunks);
|
||||
|
||||
if (!has_collation)
|
||||
{
|
||||
for (auto & cursor : cursors)
|
||||
queue_without_collation.push(SortCursor(&cursor));
|
||||
}
|
||||
if (has_collation)
|
||||
queue_with_collation = SortingHeap<SortCursorWithCollation>(cursors);
|
||||
else if (description.size() > 1)
|
||||
queue_without_collation = SortingHeap<SortCursor>(cursors);
|
||||
else
|
||||
{
|
||||
for (auto & cursor : cursors)
|
||||
queue_with_collation.push(SortCursorWithCollation(&cursor));
|
||||
}
|
||||
queue_simple = SortingHeap<SimpleSortCursor>(cursors);
|
||||
}
|
||||
|
||||
|
||||
@ -65,50 +61,61 @@ Chunk MergeSorter::read()
|
||||
return res;
|
||||
}
|
||||
|
||||
return !has_collation
|
||||
? mergeImpl<SortCursor>(queue_without_collation)
|
||||
: mergeImpl<SortCursorWithCollation>(queue_with_collation);
|
||||
if (has_collation)
|
||||
return mergeImpl(queue_with_collation);
|
||||
else if (description.size() > 1)
|
||||
return mergeImpl(queue_without_collation);
|
||||
else
|
||||
return mergeImpl(queue_simple);
|
||||
}
|
||||
|
||||
|
||||
template <typename TSortCursor>
|
||||
Chunk MergeSorter::mergeImpl(std::priority_queue<TSortCursor> & queue)
|
||||
template <typename TSortingHeap>
|
||||
Chunk MergeSorter::mergeImpl(TSortingHeap & queue)
|
||||
{
|
||||
size_t num_columns = chunks[0].getNumColumns();
|
||||
|
||||
MutableColumns merged_columns = chunks[0].cloneEmptyColumns();
|
||||
/// TODO: reserve (in each column)
|
||||
|
||||
/// Reserve
|
||||
if (queue.isValid())
|
||||
{
|
||||
/// The expected size of output block is the same as input block
|
||||
size_t size_to_reserve = chunks[0].getNumRows();
|
||||
for (auto & column : merged_columns)
|
||||
column->reserve(size_to_reserve);
|
||||
}
|
||||
|
||||
/// TODO: Optimization when a single block left.
|
||||
|
||||
/// Take rows from queue in right order and push to 'merged'.
|
||||
size_t merged_rows = 0;
|
||||
while (!queue.empty())
|
||||
while (queue.isValid())
|
||||
{
|
||||
TSortCursor current = queue.top();
|
||||
queue.pop();
|
||||
auto current = queue.current();
|
||||
|
||||
/// Append a row from queue.
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
merged_columns[i]->insertFrom(*current->all_columns[i], current->pos);
|
||||
|
||||
++total_merged_rows;
|
||||
++merged_rows;
|
||||
|
||||
if (!current->isLast())
|
||||
{
|
||||
current->next();
|
||||
queue.push(current);
|
||||
}
|
||||
|
||||
/// We don't need more rows because of limit has reached.
|
||||
if (limit && total_merged_rows == limit)
|
||||
{
|
||||
chunks.clear();
|
||||
return Chunk(std::move(merged_columns), merged_rows);
|
||||
break;
|
||||
}
|
||||
|
||||
queue.next();
|
||||
|
||||
/// It's enough for current output block but we will continue.
|
||||
if (merged_rows == max_merged_block_size)
|
||||
return Chunk(std::move(merged_columns), merged_rows);
|
||||
break;
|
||||
}
|
||||
|
||||
chunks.clear();
|
||||
if (!queue.isValid())
|
||||
chunks.clear();
|
||||
|
||||
if (merged_rows == 0)
|
||||
return {};
|
||||
|
@ -1,10 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include <Processors/IProcessor.h>
|
||||
#include <Core/SortDescription.h>
|
||||
#include <Core/SortCursor.h>
|
||||
#include <DataStreams/IBlockInputStream.h>
|
||||
#include <Processors/ISource.h>
|
||||
#include <queue>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -27,19 +27,19 @@ private:
|
||||
UInt64 limit;
|
||||
size_t total_merged_rows = 0;
|
||||
|
||||
using CursorImpls = std::vector<SortCursorImpl>;
|
||||
CursorImpls cursors;
|
||||
SortCursorImpls cursors;
|
||||
|
||||
bool has_collation = false;
|
||||
|
||||
std::priority_queue<SortCursor> queue_without_collation;
|
||||
std::priority_queue<SortCursorWithCollation> queue_with_collation;
|
||||
SortingHeap<SortCursor> queue_without_collation;
|
||||
SortingHeap<SimpleSortCursor> queue_simple;
|
||||
SortingHeap<SortCursorWithCollation> queue_with_collation;
|
||||
|
||||
/** Two different cursors are supported - with and without Collation.
|
||||
* Templates are used (instead of virtual functions in SortCursor) for zero-overhead.
|
||||
*/
|
||||
template <typename TSortCursor>
|
||||
Chunk mergeImpl(std::priority_queue<TSortCursor> & queue);
|
||||
template <typename TSortingHeap>
|
||||
Chunk mergeImpl(TSortingHeap & queue);
|
||||
};
|
||||
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user