From 188d5cfbd8208644db8cc8d4f6d5e99fdc2205ae Mon Sep 17 00:00:00 2001 From: Date: Fri, 1 Sep 2017 16:43:05 +0300 Subject: [PATCH 01/84] Getting external dictionary from MySQL through the sockets. --- docs/ru/dicts/external_dicts_dict_sources.rst | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/docs/ru/dicts/external_dicts_dict_sources.rst b/docs/ru/dicts/external_dicts_dict_sources.rst index 251f89adb34..821208ed475 100644 --- a/docs/ru/dicts/external_dicts_dict_sources.rst +++ b/docs/ru/dicts/external_dicts_dict_sources.rst @@ -125,7 +125,7 @@ ODBC DatabaseName TableName
DSN=some_parameters - SQL_REQUEST + SQL_QUERY Поля настройки: @@ -307,10 +307,10 @@ MySQL example01-2 1 - conv_main - counters
+ db_name + table_name
id=10 - SQL_REQUEST + SQL_QUERY @@ -328,6 +328,26 @@ MySQL * ``table`` - имя таблицы. * ``where`` - условие выбора. Необязательный параметр. * ``invalidate_query`` - запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе :ref:`dicts-external_dicts_dict_lifetime`. + +MySQL можно подключить на локальном хосте через сокеты, для этого необходимо задать ``host`` и ``socket``. + +Пример настройки: + +.. code-block:: xml + + + + localhost + /path/to/socket/file.sock + clickhouse + qwerty + db_name + table_name
+ id=10 + SQL_QUERY +
+ + .. _dicts-external_dicts_dict_sources-clickhouse: From 6885f6cfaa59a3c507072b986a6c21edad80c9f7 Mon Sep 17 00:00:00 2001 From: Date: Wed, 6 Sep 2017 08:48:35 +0300 Subject: [PATCH 02/84] Minor text edits. --- docs/ru/query_language/queries.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/ru/query_language/queries.rst b/docs/ru/query_language/queries.rst index 2ed59af3c80..13ac431b20f 100644 --- a/docs/ru/query_language/queries.rst +++ b/docs/ru/query_language/queries.rst @@ -627,7 +627,7 @@ ClickHouse отсекает все пробелы и один перенос с SELECT ~~~~~~ -Его величество, запрос SELECT. +Выборка данных. .. code-block:: sql @@ -645,6 +645,7 @@ SELECT [UNION ALL ...] [INTO OUTFILE filename] [FORMAT format] + [LIMIT n BY columns] Все секции, кроме списка выражений сразу после SELECT, являются необязательными. Ниже секции будут описаны в порядке, почти соответствующем конвейеру выполнения запроса. @@ -1172,7 +1173,7 @@ GROUP BY во внешней памяти Модификатор LIMIT N BY ^^^^^^^^^^^^^^^^^^^^^^ -LIMIT N BY COLUMNS позволяет выбрать топ N строк для каждой группы COLUMNS. LIMIT N BY не связан с LIMIT и они могут использоваться в одном запросе. Ключ для LIMIT N BY может содержать произвольное число колонок или выражений. +LIMIT N BY COLUMNS выбирает топ N строк для каждой группы COLUMNS. LIMIT N BY не связан с LIMIT и они могут использоваться в одном запросе. Ключ для LIMIT N BY может содержать произвольное число колонок или выражений. Пример: @@ -1189,7 +1190,9 @@ LIMIT N BY COLUMNS позволяет выбрать топ N строк для LIMIT 5 BY domain, device_type LIMIT 100 -выберет топ 5 рефереров для каждой пары domain - device type. Ограничить общее число строк результата 100. +Запрос выберет топ 5 рефереров для каждой пары ``domain - device type``, но не более 100 строк (``LIMIT n BY + LIMIT``). 
+ + Секция HAVING """"""""""""" From 12cf25bbe2f3c13e4d4d65b04478e4886fbe242e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 Sep 2017 22:43:30 +0300 Subject: [PATCH 03/84] Added link to RPM packages in documentation [#CLICKHOUSE-3]. --- docs/en/getting_started/index.rst | 2 ++ docs/ru/getting_started/index.rst | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/getting_started/index.rst b/docs/en/getting_started/index.rst index 9a322ae6807..41a5dcf7182 100644 --- a/docs/en/getting_started/index.rst +++ b/docs/en/getting_started/index.rst @@ -74,6 +74,8 @@ Other methods of installation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The Docker image is located here: https://hub.docker.com/r/yandex/clickhouse-server/ +There are RPM packages for CentOS, RHEL: https://github.com/Altinity/clickhouse-rpm-install + There is Gentoo overlay located here: https://github.com/kmeaw/clickhouse-overlay diff --git a/docs/ru/getting_started/index.rst b/docs/ru/getting_started/index.rst index dead1211166..36bd2824e6e 100644 --- a/docs/ru/getting_started/index.rst +++ b/docs/ru/getting_started/index.rst @@ -76,6 +76,8 @@ ClickHouse содержит настройки ограничения досту ~~~~~~~~~~~~~~~~~~~~~~~ Docker образ: https://hub.docker.com/r/yandex/clickhouse-server/ +RPM пакеты для CentOS, RHEL: https://github.com/Altinity/clickhouse-rpm-install + Gentoo overlay: https://github.com/kmeaw/clickhouse-overlay From a4c5cf000917b4b39ba5c0091f8741a59e04d9d2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 Sep 2017 22:47:39 +0300 Subject: [PATCH 04/84] Fixed bad translation [#CLICKHOUSE-3]. --- docs/en/getting_started/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/getting_started/index.rst b/docs/en/getting_started/index.rst index 41a5dcf7182..af8946f76b4 100644 --- a/docs/en/getting_started/index.rst +++ b/docs/en/getting_started/index.rst @@ -58,7 +58,7 @@ You can compile packages and install them. 
You can also use programs without ins Client: dbms/src/Client/ Server: dbms/src/Server/ -For the server, create a catalog with data, such as: +For the server, create a directory with data, such as: .. code-block:: text @@ -88,7 +88,7 @@ To start the server (as a daemon), run: sudo service clickhouse-server start -View the logs in the catalog `/var/log/clickhouse-server/` +View the logs in the directory `/var/log/clickhouse-server/` If the server doesn't start, check the configurations in the file `/etc/clickhouse-server/config.xml` From 9a58e26b8c652445ad5f2a6c8c590578cf35cc27 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 13 Sep 2017 00:06:11 +0300 Subject: [PATCH 05/84] Added comment [#CLICKHOUSE-2]. --- dbms/src/Common/RadixSort.h | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Common/RadixSort.h b/dbms/src/Common/RadixSort.h index e22fdd01c2b..4f9cf260a64 100644 --- a/dbms/src/Common/RadixSort.h +++ b/dbms/src/Common/RadixSort.h @@ -185,6 +185,7 @@ public: Element * swap_buffer = reinterpret_cast(allocator.allocate(size * sizeof(Element))); /// Transform the array and calculate the histogram. + /// NOTE This is slightly suboptimal. 
Look at https://github.com/powturbo/TurboHist for (size_t i = 0; i < size; ++i) { if (!Traits::Transform::transform_is_simple) From 68f48253c93ec74494de09e12cb433a32d381c5f Mon Sep 17 00:00:00 2001 From: proller Date: Wed, 13 Sep 2017 17:24:54 +0300 Subject: [PATCH 06/84] Tests: catch xml internal error --- dbms/tests/clickhouse-test | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dbms/tests/clickhouse-test b/dbms/tests/clickhouse-test index 58304f1768e..13a3b422d9c 100755 --- a/dbms/tests/clickhouse-test +++ b/dbms/tests/clickhouse-test @@ -203,9 +203,12 @@ def main(args): failure = et.Element("failure", attrib = {"message": "result differs with reference"}) report_testcase.append(failure) - stdout_element = et.Element("system-out") - stdout_element.text = et.CDATA(diff) - report_testcase.append(stdout_element) + try: + stdout_element = et.Element("system-out") + stdout_element.text = et.CDATA(diff) + report_testcase.append(stdout_element) + except Exception as e: + print("{0} - test internal error\n{1}".format(MSG_FAIL, e)) failures = failures + 1 print("{0} - result differs with reference:\n{1}".format(MSG_FAIL, diff.encode('utf-8'))) From 8a7e604a2a5dd2540e73706bd329126ca7fcd65c Mon Sep 17 00:00:00 2001 From: proller Date: Wed, 13 Sep 2017 21:08:38 +0300 Subject: [PATCH 07/84] Tests fixes (#1238) * Tests: catch xml internal error * Tests fixes --- dbms/tests/clickhouse-test | 3 ++- ...6_clear_column_in_partition_concurrent_zookeeper.reference} | 0 ...=> 00446_clear_column_in_partition_concurrent_zookeeper.sh} | 0 ...nce => 00446_clear_column_in_partition_zookeeper.reference} | 0 ...ition.sql => 00446_clear_column_in_partition_zookeeper.sql} | 0 5 files changed, 2 insertions(+), 1 deletion(-) rename dbms/tests/queries/0_stateless/{00446_clear_column_in_partition_concurrent.reference => 00446_clear_column_in_partition_concurrent_zookeeper.reference} (100%) rename 
dbms/tests/queries/0_stateless/{00446_clear_column_in_partition_concurrent.sh => 00446_clear_column_in_partition_concurrent_zookeeper.sh} (100%) rename dbms/tests/queries/0_stateless/{00446_clear_column_in_partition.reference => 00446_clear_column_in_partition_zookeeper.reference} (100%) rename dbms/tests/queries/0_stateless/{00446_clear_column_in_partition.sql => 00446_clear_column_in_partition_zookeeper.sql} (100%) diff --git a/dbms/tests/clickhouse-test b/dbms/tests/clickhouse-test index 13a3b422d9c..86d6fed4e12 100755 --- a/dbms/tests/clickhouse-test +++ b/dbms/tests/clickhouse-test @@ -238,13 +238,14 @@ if __name__ == '__main__': parser.add_argument('-o', '--output', help = 'Output xUnit compliant test report directory') parser.add_argument('-t', '--timeout', type = int, default = 600, help = 'Timeout for each test case in seconds') parser.add_argument('test', nargs = '?', help = 'Optional test case name regex') + parser.add_argument('--stop', action = 'store_true', default = None, dest = 'stop', help = 'Stop on network errors ') group = parser.add_mutually_exclusive_group(required = False) group.add_argument('--zookeeper', action = 'store_true', default = None, dest = 'zookeeper', help = 'Run zookeeper related tests') group.add_argument('--no-zookeeper', action = 'store_false', default = None, dest = 'zookeeper', help = 'Do not run zookeeper related tests') + group = parser.add_mutually_exclusive_group(required = False) group.add_argument('--shard', action = 'store_true', default = None, dest = 'shard', help = 'Run sharding related tests (required to clickhouse-server listen 127.0.0.2 127.0.0.3)') group.add_argument('--no-shard', action = 'store_false', default = None, dest = 'shard', help = 'Do not run shard related tests') - group.add_argument('--stop', action = 'store_true', default = None, dest = 'stop', help = 'Stop on network errors ') args = parser.parse_args() diff --git 
a/dbms/tests/queries/0_stateless/00446_clear_column_in_partition_concurrent.reference b/dbms/tests/queries/0_stateless/00446_clear_column_in_partition_concurrent_zookeeper.reference similarity index 100% rename from dbms/tests/queries/0_stateless/00446_clear_column_in_partition_concurrent.reference rename to dbms/tests/queries/0_stateless/00446_clear_column_in_partition_concurrent_zookeeper.reference diff --git a/dbms/tests/queries/0_stateless/00446_clear_column_in_partition_concurrent.sh b/dbms/tests/queries/0_stateless/00446_clear_column_in_partition_concurrent_zookeeper.sh similarity index 100% rename from dbms/tests/queries/0_stateless/00446_clear_column_in_partition_concurrent.sh rename to dbms/tests/queries/0_stateless/00446_clear_column_in_partition_concurrent_zookeeper.sh diff --git a/dbms/tests/queries/0_stateless/00446_clear_column_in_partition.reference b/dbms/tests/queries/0_stateless/00446_clear_column_in_partition_zookeeper.reference similarity index 100% rename from dbms/tests/queries/0_stateless/00446_clear_column_in_partition.reference rename to dbms/tests/queries/0_stateless/00446_clear_column_in_partition_zookeeper.reference diff --git a/dbms/tests/queries/0_stateless/00446_clear_column_in_partition.sql b/dbms/tests/queries/0_stateless/00446_clear_column_in_partition_zookeeper.sql similarity index 100% rename from dbms/tests/queries/0_stateless/00446_clear_column_in_partition.sql rename to dbms/tests/queries/0_stateless/00446_clear_column_in_partition_zookeeper.sql From 2bb487c333e994947f04fc54d7ef08fbfbccb8ee Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 13 Sep 2017 22:02:24 +0300 Subject: [PATCH 08/84] fixed string concat wih first array argument in block --- dbms/src/Functions/FunctionsString.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index de1d3686fc0..0766ee47587 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ 
b/dbms/src/Functions/FunctionsString.cpp @@ -705,7 +705,7 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, const size_t result) override { - if (!is_injective && !arguments.empty() && checkDataType(block.getByPosition(0).type.get())) + if (!is_injective && !arguments.empty() && checkDataType(block.getByPosition(arguments[0]).type.get())) return FunctionArrayConcat().executeImpl(block, arguments, result); if (arguments.size() == 2) From c14263a9fa8f2bc14c2531fcff227b8ccd57f2e4 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 13 Sep 2017 22:09:33 +0300 Subject: [PATCH 09/84] added test --- .../0_stateless/00502_string_concat_with_array.reference | 2 ++ .../queries/0_stateless/00502_string_concat_with_array.sql | 1 + 2 files changed, 3 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/00502_string_concat_with_array.reference create mode 100644 dbms/tests/queries/0_stateless/00502_string_concat_with_array.sql diff --git a/dbms/tests/queries/0_stateless/00502_string_concat_with_array.reference b/dbms/tests/queries/0_stateless/00502_string_concat_with_array.reference new file mode 100644 index 00000000000..3ae3ec73944 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00502_string_concat_with_array.reference @@ -0,0 +1,2 @@ +[0] 00 +[1] 11 diff --git a/dbms/tests/queries/0_stateless/00502_string_concat_with_array.sql b/dbms/tests/queries/0_stateless/00502_string_concat_with_array.sql new file mode 100644 index 00000000000..8bfcaa8daaf --- /dev/null +++ b/dbms/tests/queries/0_stateless/00502_string_concat_with_array.sql @@ -0,0 +1 @@ +select a, b || b from (select [number] as a, toString(number) as b from system.numbers limit 2); From 5e25c40a26f5499afa09916cad2df243a1a4daef Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 Sep 2017 07:24:20 +0300 Subject: [PATCH 10/84] Fixed error found by Coverity [#CLICKHOUSE-2]. 
--- dbms/src/Common/ConcurrentBoundedQueue.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dbms/src/Common/ConcurrentBoundedQueue.h b/dbms/src/Common/ConcurrentBoundedQueue.h index 52338286fac..d3f67c5d4f1 100644 --- a/dbms/src/Common/ConcurrentBoundedQueue.h +++ b/dbms/src/Common/ConcurrentBoundedQueue.h @@ -39,7 +39,7 @@ namespace detail } }; -/** A very simple thread-safe queue of limited length. +/** A very simple thread-safe queue of limited size. * If you try to pop an item from an empty queue, the thread is blocked until the queue becomes nonempty. * If you try to push an element into an overflowed queue, the thread is blocked until space appears in the queue. */ @@ -47,7 +47,6 @@ template class ConcurrentBoundedQueue { private: - size_t max_fill; std::queue queue; Poco::FastMutex mutex; Poco::Semaphore fill_count; From a43b9ec398f68300f125509583d03b29b2926bc6 Mon Sep 17 00:00:00 2001 From: proller Date: Wed, 13 Sep 2017 21:09:21 +0300 Subject: [PATCH 11/84] Revert "Simplification [#CLICKHOUSE-2]." This reverts commit 98ad6a5db3239ac6567ee96e90973b935dcac39b. --- .../AsynchronousBlockInputStream.h | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/dbms/src/DataStreams/AsynchronousBlockInputStream.h b/dbms/src/DataStreams/AsynchronousBlockInputStream.h index cd4116a8325..e72b2f6489f 100644 --- a/dbms/src/DataStreams/AsynchronousBlockInputStream.h +++ b/dbms/src/DataStreams/AsynchronousBlockInputStream.h @@ -57,7 +57,10 @@ public: if (started) { pool.wait(); + if (exception) + std::rethrow_exception(exception); children.back()->readSuffix(); + started = false; } } @@ -79,23 +82,18 @@ public: ~AsynchronousBlockInputStream() override { - try - { - pool.wait(); /// It's ok to call wait even if there is no active threads. - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + if (started) + pool.wait(); } protected: - ThreadPool pool{1}; /// Rethrows exceptions automatically on wait. 
+ ThreadPool pool{1}; Poco::Event ready; bool started = false; bool first = true; Block block; + std::exception_ptr exception; Block readImpl() override @@ -103,12 +101,15 @@ protected: /// If there were no calculations yet, calculate the first block synchronously if (!started) { - started = true; calculate(current_memory_tracker); + started = true; } else /// If the calculations are already in progress - wait for the result pool.wait(); + if (exception) + std::rethrow_exception(exception); + Block res = block; if (!res) return res; @@ -133,15 +134,22 @@ protected: { CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread}; - if (first) + try { - first = false; - setThreadName("AsyncBlockInput"); - current_memory_tracker = memory_tracker; - children.back()->readPrefix(); - } + if (first) + { + first = false; + setThreadName("AsyncBlockInput"); + current_memory_tracker = memory_tracker; + children.back()->readPrefix(); + } - block = children.back()->read(); + block = children.back()->read(); + } + catch (...) + { + exception = std::current_exception(); + } ready.set(); } From 826c354ff5fbf687baf2b7bf6d7cab31d59f5653 Mon Sep 17 00:00:00 2001 From: Vitaliy Lyudvichenko Date: Thu, 14 Sep 2017 16:56:54 +0300 Subject: [PATCH 12/84] Fixed merges progress bar. 
[#CLICKHOUSE-2] --- dbms/src/DataStreams/ColumnGathererStream.cpp | 7 ++++++- .../src/Storages/MergeTree/MergeTreeDataMerger.cpp | 14 +++++++------- dbms/src/Storages/System/StorageSystemMerges.cpp | 2 +- dbms/tests/queries/0_stateless/00155_merges.sh | 5 +++++ 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/dbms/src/DataStreams/ColumnGathererStream.cpp b/dbms/src/DataStreams/ColumnGathererStream.cpp index fef47a9d175..760af89e0b2 100644 --- a/dbms/src/DataStreams/ColumnGathererStream.cpp +++ b/dbms/src/DataStreams/ColumnGathererStream.cpp @@ -118,8 +118,13 @@ void ColumnGathererStream::fetchNewBlock(Source & source, size_t source_num) void ColumnGathererStream::readSuffixImpl() { const BlockStreamProfileInfo & profile_info = getProfileInfo(); + + /// Don't print info for small parts (< 10M rows) + if (profile_info.rows < 10000000) + return; + double seconds = profile_info.total_stopwatch.elapsedSeconds(); - LOG_DEBUG(log, std::fixed << std::setprecision(2) + LOG_TRACE(log, std::fixed << std::setprecision(2) << "Gathered column " << name << " (" << static_cast(profile_info.bytes) / profile_info.rows << " bytes/elem.)" << " in " << seconds << " sec., " diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp index c9638a1137f..e0961430e58 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp @@ -386,7 +386,7 @@ public: * - amount of merged rows and their size (PK columns subset is used in case of Vertical merge) * - time elapsed for current merge. 
*/ -class MergeProgressCallback : public ProgressCallback +class MergeProgressCallback { public: MergeProgressCallback(MergeList::Entry & merge_entry_, UInt64 & watch_prev_elapsed_) @@ -540,14 +540,14 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMerger::mergePartsToTemporaryPart BlockInputStreams src_streams; UInt64 watch_prev_elapsed = 0; - for (size_t i = 0; i < parts.size(); ++i) + for (const auto & part : parts) { auto input = std::make_unique( - data, parts[i], DEFAULT_MERGE_BLOCK_SIZE, 0, 0, merging_column_names, MarkRanges(1, MarkRange(0, parts[i]->size)), + data, part, DEFAULT_MERGE_BLOCK_SIZE, 0, 0, merging_column_names, MarkRanges(1, MarkRange(0, part->size)), false, nullptr, "", true, aio_threshold, DBMS_DEFAULT_BUFFER_SIZE, false); - input->setProgressCallback( - MergeProgressCallback{merge_entry, sum_input_rows_upper_bound, column_sizes, watch_prev_elapsed, merge_alg}); + input->setProgressCallback(MergeProgressCallback( + merge_entry, sum_input_rows_upper_bound, column_sizes, watch_prev_elapsed, merge_alg)); if (data.merging_params.mode != MergeTreeData::MergingParams::Unsorted) src_streams.emplace_back(std::make_shared( @@ -680,8 +680,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMerger::mergePartsToTemporaryPart data, parts[part_num], DEFAULT_MERGE_BLOCK_SIZE, 0, 0, column_name_, MarkRanges{MarkRange(0, parts[part_num]->size)}, false, nullptr, "", true, aio_threshold, DBMS_DEFAULT_BUFFER_SIZE, false, Names{}, 0, true); - column_part_stream->setProgressCallback( - MergeProgressCallbackVerticalStep{merge_entry, sum_input_rows_exact, column_sizes, column_name, watch_prev_elapsed}); + column_part_stream->setProgressCallback(MergeProgressCallbackVerticalStep( + merge_entry, sum_input_rows_exact, column_sizes, column_name, watch_prev_elapsed)); column_part_streams[part_num] = std::move(column_part_stream); } diff --git a/dbms/src/Storages/System/StorageSystemMerges.cpp b/dbms/src/Storages/System/StorageSystemMerges.cpp index 
81f20aab9e6..8b1781ad46d 100644 --- a/dbms/src/Storages/System/StorageSystemMerges.cpp +++ b/dbms/src/Storages/System/StorageSystemMerges.cpp @@ -54,7 +54,7 @@ BlockInputStreams StorageSystemMerges::read( block.getByPosition(i++).column->insert(merge.database); block.getByPosition(i++).column->insert(merge.table); block.getByPosition(i++).column->insert(merge.elapsed); - block.getByPosition(i++).column->insert(std::min(1., merge.progress)); /// little cheat + block.getByPosition(i++).column->insert(merge.progress); block.getByPosition(i++).column->insert(merge.num_parts); block.getByPosition(i++).column->insert(merge.source_part_names); block.getByPosition(i++).column->insert(merge.result_part_name); diff --git a/dbms/tests/queries/0_stateless/00155_merges.sh b/dbms/tests/queries/0_stateless/00155_merges.sh index 9e18cb8061f..36602d5b87d 100755 --- a/dbms/tests/queries/0_stateless/00155_merges.sh +++ b/dbms/tests/queries/0_stateless/00155_merges.sh @@ -63,6 +63,8 @@ function test { echo } +merged_rows_0=`clickhouse-client -q "select value from system.events where event = 'MergedRows'"` + test 8191 8191 test 8191 8192 test 8192 8191 @@ -76,4 +78,7 @@ test 8193 8194 test 8194 8193 test 8194 8194 +merged_rows_1=`clickhouse-client -q "select value from system.events where event = 'MergedRows'"` +[[ $merged_rows_1 -le $merged_rows_0 ]] + cleanup From 7284ac69c057b8ffcf7d77c97a7d41fc9e39ffa0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 02:33:46 +0300 Subject: [PATCH 13/84] Updated changelog [#CLICKHOUSE-2]. 
--- CHANGELOG_RU.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/CHANGELOG_RU.md b/CHANGELOG_RU.md index 6ac2620cbac..b85828d6a7b 100644 --- a/CHANGELOG_RU.md +++ b/CHANGELOG_RU.md @@ -1,3 +1,44 @@ +# Релиз ClickHouse 1.1.54289 + +## Новые возможности: +* Запросы `SYSTEM` для административных действий с сервером: `SYSTEM RELOAD DICTIONARY`, `SYSTEM RELOAD DICTIONARIES`, `SYSTEM DROP DNS CACHE`, `SYSTEM SHUTDOWN`, `SYSTEM KILL`. +* Добавлены функции для работы с массивами: `concat`, `arraySlice`, `arrayPushBack`, `arrayPushFront`, `arrayPopBack`, `arrayPopFront`. +* Добавлены параметры `root` и `identity` для конфигурации ZooKeeper. Это позволяет использовать разных пользователей одного ZooKeeper кластера. +* Добавлены агрегатные функции `groupBitAnd`, `groupBitOr`, `groupBitXor` (для совместимости доступны также под именами `BIT_AND`, `BIT_OR`, `BIT_XOR`). +* Возможность загрузки внешних словарей из MySQL с указанием сокета на файловой системе. +* Возможность загрузки внешних словарей из MySQL через SSL соединение (параметры `ssl_cert`, `ssl_key`, `ssl_ca`). +* Добавлена настройка `max_network_bandwidth_for_user` для ограничения общего потребления сети для всех запросов одного пользователя. +* Поддержка `DROP TABLE` для временных таблиц. +* Поддержка чтения значений типа `DateTime` в формате unix timestamp из форматов `CSV` и `JSONEachRow`. +* Включено по-умолчанию отключение отстающих реплик при распределённых запросах (по-умолчанию порог равен 5 минутам). +* Используются FIFO блокировки при ALTER: выполнение ALTER не будет неограниченно блокироваться при непрерывно выполняющихся запросах. +* Возможность задать `umask` в конфигурационном файле. +* Увеличена производительность запросов с `DISTINCT`. + +## Исправления ошибок: +* Более оптимальная процедура удаления старых нод в ZooKeeper. 
Ранее в случае очень частых вставок, старые ноды могли не успевать удаляться, что приводило, в том числе, к очень долгому завершению сервера. +* Исправлена рандомизация при выборе хостов для соединения с ZooKeeper. +* Исправлено отключение отстающей реплики при распределённых запросах, если реплика является localhost. +* Исправлена ошибка, в связи с которой кусок данных таблицы типа `ReplicatedMergeTree` мог становиться битым после выполнения `ALTER MODIFY` элемента `Nested` структуры. +* Исправлена ошибка приводящая к возможному зависанию SELECT запросов. +* Доработки распределённых DDL запросов. +* Исправлен запрос `CREATE TABLE ... AS `. +* Исправлен дедлок при запросе `ALTER ... CLEAR COLUMN IN PARTITION` для `Buffer` таблиц. +* Исправлено использование неправильного значения по-умолчанию для `Enum`-ов (0 вместо минимального) при использовании форматов `JSONEachRow` и `TSKV`. +* Исправлено появление zombie процессов при работе со словарём с источником `executable`. +* Исправлен segfault при запросе HEAD. + +## Обратно несовместимые изменения: +* Изменён генератор случайных чисел для агрегатных функций `quantile`, `quantileDeterministic`. Функция `quantileDeterministic` может выдавать другой результат. Если вам важно, чтобы функция `quantileDeterministic` возвращала тот же результат, что и раньше, обращайтесь к разработчикам ClickHouse. + +## Улучшения процесса разработки и сборки ClickHouse: +* Возможность сборки с помощью `pbuilder`. +* Возможность сборки с использованием `libc++` вместо `libstdc++` под Linux. +* Добавлены инструкции для использования статических анализаторов кода `Coverity`, `clang-tidy`, `cppcheck`. + +## На что обратить внимание при обновлении: +* Увеличено значение по-умолчанию для настройки MergeTree `max_bytes_to_merge_at_max_space_in_pool` (максимальный суммарный размер кусков в байтах для мержа) со 100 GiB до 150 GiB. 
Это может привести к запуску больших мержей после обновления сервера, что может вызвать повышенную нагрузку на дисковую подсистему. Если же на серверах, где это происходит, количество свободного места менее чем в два раза больше суммарного объёма выполняющихся мержей, то в связи с этим перестанут выполняться какие-либо другие мержи, включая мержи мелких кусков. Это приведёт к тому, что INSERT-ы будут отклоняться с сообщением "Merges are processing significantly slower than inserts". Для наблюдения, используйте запрос `SELECT * FROM system.merges`. Вы также можете смотреть на метрику `DiskSpaceReservedForMerge` в таблице `system.metrics` или в Graphite. Для исправления этой ситуации можно ничего не делать, так как она нормализуется сама после завершения больших мержей. Если же вас это не устраивает, вы можете вернуть настройку `max_bytes_to_merge_at_max_space_in_pool` в старое значение, прописав в config.xml в секции `` ` Date: Fri, 15 Sep 2017 02:34:57 +0300 Subject: [PATCH 14/84] Updated changelog [#CLICKHOUSE-2]. --- CHANGELOG_RU.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG_RU.md b/CHANGELOG_RU.md index b85828d6a7b..f846e60714b 100644 --- a/CHANGELOG_RU.md +++ b/CHANGELOG_RU.md @@ -37,7 +37,7 @@ * Добавлены инструкции для использования статических анализаторов кода `Coverity`, `clang-tidy`, `cppcheck`. ## На что обратить внимание при обновлении: -* Увеличено значение по-умолчанию для настройки MergeTree `max_bytes_to_merge_at_max_space_in_pool` (максимальный суммарный размер кусков в байтах для мержа) со 100 GiB до 150 GiB. Это может привести к запуску больших мержей после обновления сервера, что может вызвать повышенную нагрузку на дисковую подсистему. Если же на серверах, где это происходит, количество свободного места менее чем в два раза больше суммарного объёма выполняющихся мержей, то в связи с этим перестанут выполняться какие-либо другие мержи, включая мержи мелких кусков. 
Это приведёт к тому, что INSERT-ы будут отклоняться с сообщением "Merges are processing significantly slower than inserts". Для наблюдения, используйте запрос `SELECT * FROM system.merges`. Вы также можете смотреть на метрику `DiskSpaceReservedForMerge` в таблице `system.metrics` или в Graphite. Для исправления этой ситуации можно ничего не делать, так как она нормализуется сама после завершения больших мержей. Если же вас это не устраивает, вы можете вернуть настройку `max_bytes_to_merge_at_max_space_in_pool` в старое значение, прописав в config.xml в секции `` `` `107374182400` и перезапустить сервер. # Релиз ClickHouse 1.1.54284 From 2fe24c0ca120026dc4d0ae54049d9d93d7747919 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 02:36:42 +0300 Subject: [PATCH 15/84] Updated changelog [#CLICKHOUSE-2]. --- CHANGELOG_RU.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG_RU.md b/CHANGELOG_RU.md index f846e60714b..95acfef6466 100644 --- a/CHANGELOG_RU.md +++ b/CHANGELOG_RU.md @@ -3,7 +3,7 @@ ## Новые возможности: * Запросы `SYSTEM` для административных действий с сервером: `SYSTEM RELOAD DICTIONARY`, `SYSTEM RELOAD DICTIONARIES`, `SYSTEM DROP DNS CACHE`, `SYSTEM SHUTDOWN`, `SYSTEM KILL`. * Добавлены функции для работы с массивами: `concat`, `arraySlice`, `arrayPushBack`, `arrayPushFront`, `arrayPopBack`, `arrayPopFront`. -* Добавлены параметры `root` и `identity` для конфигурации ZooKeeper. Это позволяет использовать разных пользователей одного ZooKeeper кластера. +* Добавлены параметры `root` и `identity` для конфигурации ZooKeeper. Это позволяет изолировать разных пользователей одного ZooKeeper кластера. * Добавлены агрегатные функции `groupBitAnd`, `groupBitOr`, `groupBitXor` (для совместимости доступны также под именами `BIT_AND`, `BIT_OR`, `BIT_XOR`). * Возможность загрузки внешних словарей из MySQL с указанием сокета на файловой системе. 
* Возможность загрузки внешних словарей из MySQL через SSL соединение (параметры `ssl_cert`, `ssl_key`, `ssl_ca`). From 2a5391fda8cbba62857626aed33bc88b1ae97b58 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 02:45:45 +0300 Subject: [PATCH 16/84] Updated changelog [#CLICKHOUSE-2]. --- CHANGELOG_RU.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG_RU.md b/CHANGELOG_RU.md index 95acfef6466..4003edde1fd 100644 --- a/CHANGELOG_RU.md +++ b/CHANGELOG_RU.md @@ -28,9 +28,6 @@ * Исправлено появление zombie процессов при работе со словарём с источником `executable`. * Исправлен segfault при запросе HEAD. -## Обратно несовместимые изменения: -* Изменён генератор случайных чисел для агрегатных функций `quantile`, `quantileDeterministic`. Функция `quantileDeterministic` может выдавать другой результат. Если вам важно, чтобы функция `quantileDeterministic` возвращала тот же результат, что и раньше, обращайтесь к разработчикам ClickHouse. - ## Улучшения процесса разработки и сборки ClickHouse: * Возможность сборки с помощью `pbuilder`. * Возможность сборки с использованием `libc++` вместо `libstdc++` под Linux. From c37a387ff8c50b2d87cf2d143a973399576442e8 Mon Sep 17 00:00:00 2001 From: Evgeny Konkov Date: Thu, 14 Sep 2017 20:13:40 +0300 Subject: [PATCH 17/84] remove control characters from diff output in order to properly save diff output in junit xml report files. 
--- dbms/tests/clickhouse-test | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/dbms/tests/clickhouse-test b/dbms/tests/clickhouse-test index 86d6fed4e12..64ae054c181 100755 --- a/dbms/tests/clickhouse-test +++ b/dbms/tests/clickhouse-test @@ -29,6 +29,19 @@ MSG_OK = OP_SQUARE_BRACKET + colored(" OK ", "green", attrs=['bold']) + CL_SQUAR MSG_SKIPPED = OP_SQUARE_BRACKET + colored(" SKIPPED ", "cyan", attrs=['bold']) + CL_SQUARE_BRACKET +def remove_control_characters(s): + """ + https://github.com/html5lib/html5lib-python/issues/96#issuecomment-43438438 + """ + def str_to_int(s, default, base=10): + if int(s, base) < 0x10000: + return unichr(int(s, base)) + return default + s = re.sub(ur"&#(\d+);?", lambda c: str_to_int(c.group(1), c.group(0)), s) + s = re.sub(ur"&#[xX]([0-9a-fA-F]+);?", lambda c: str_to_int(c.group(1), c.group(0), base=16), s) + s = re.sub(ur"[\x00-\x08\x0b\x0e-\x1f\x7f]", "", s) + return s + def main(args): SERVER_DIED = False @@ -202,14 +215,14 @@ def main(args): failure = et.Element("failure", attrib = {"message": "result differs with reference"}) report_testcase.append(failure) - + + stdout_element = et.Element("system-out") try: - stdout_element = et.Element("system-out") stdout_element.text = et.CDATA(diff) - report_testcase.append(stdout_element) except Exception as e: - print("{0} - test internal error\n{1}".format(MSG_FAIL, e)) - + stdout_element.text = et.CDATA(remove_control_characters(diff)) + + report_testcase.append(stdout_element) failures = failures + 1 print("{0} - result differs with reference:\n{1}".format(MSG_FAIL, diff.encode('utf-8'))) else: From e95ab368328ba8bd59f0ceb4525039e35a9cb814 Mon Sep 17 00:00:00 2001 From: Evgeny Konkov Date: Fri, 15 Sep 2017 00:35:26 +0300 Subject: [PATCH 18/84] handle test errors --- dbms/tests/clickhouse-test | 218 +++++++++++++++++++------------------ 1 file changed, 114 insertions(+), 104 deletions(-) diff --git a/dbms/tests/clickhouse-test 
b/dbms/tests/clickhouse-test index 64ae054c181..43b45e1ec29 100755 --- a/dbms/tests/clickhouse-test +++ b/dbms/tests/clickhouse-test @@ -120,119 +120,129 @@ def main(args): (name, ext) = os.path.splitext(case) report_testcase = et.Element("testcase", attrib = {"name": name}) - print "{0:70}".format(name + ": "), - sys.stdout.flush() - - if not args.zookeeper and 'zookeeper' in name: - report_testcase.append(et.Element("skipped", attrib = {"message": "no zookeeper"})) - print(MSG_SKIPPED + " - no zookeeper") - elif not args.shard and 'shard' in name: - report_testcase.append(et.Element("skipped", attrib = {"message": "no shard"})) - print(MSG_SKIPPED + " - no shard") - else: - reference_file = os.path.join(suite_dir, name) + '.reference' - stdout_file = os.path.join(suite_dir, name) + '.stdout' - stderr_file = os.path.join(suite_dir, name) + '.stderr' - - if ext == '.sql': - command = "{0} --multiquery < {1} > {2} 2> {3}".format(args.client, case_file, stdout_file, stderr_file) + try: + print "{0:70}".format(name + ": "), + sys.stdout.flush() + + print 1/0 + + if not args.zookeeper and 'zookeeper' in name: + report_testcase.append(et.Element("skipped", attrib = {"message": "no zookeeper"})) + print(MSG_SKIPPED + " - no zookeeper") + elif not args.shard and 'shard' in name: + report_testcase.append(et.Element("skipped", attrib = {"message": "no shard"})) + print(MSG_SKIPPED + " - no shard") else: - command = "{0} > {1} 2> {2}".format(case_file, stdout_file, stderr_file) - - proc = Popen(command, shell = True) - start_time = datetime.now() - while (datetime.now() - start_time).total_seconds() < args.timeout and proc.poll() is None: - sleep(0) - - if proc.returncode is None: - try: - proc.kill() - except OSError as e: - if e.errno != ESRCH: - raise - - failure = et.Element("failure", attrib = {"message": "Timeout"}) - report_testcase.append(failure) - - failures = failures + 1 - print("{0} - Timeout!".format(MSG_FAIL)) - else: - stdout = open(stdout_file, 'r').read() 
if os.path.exists(stdout_file) else '' - stdout = unicode(stdout, errors='replace', encoding='utf-8') - stderr = open(stderr_file, 'r').read() if os.path.exists(stderr_file) else '' - stderr = unicode(stderr, errors='replace', encoding='utf-8') - - if proc.returncode != 0: - failure = et.Element("failure", attrib = {"message": "return code {}".format(proc.returncode)}) + reference_file = os.path.join(suite_dir, name) + '.reference' + stdout_file = os.path.join(suite_dir, name) + '.stdout' + stderr_file = os.path.join(suite_dir, name) + '.stderr' + + if ext == '.sql': + command = "{0} --multiquery < {1} > {2} 2> {3}".format(args.client, case_file, stdout_file, stderr_file) + else: + command = "{0} > {1} 2> {2}".format(case_file, stdout_file, stderr_file) + + proc = Popen(command, shell = True) + start_time = datetime.now() + while (datetime.now() - start_time).total_seconds() < args.timeout and proc.poll() is None: + sleep(0) + + if proc.returncode is None: + try: + proc.kill() + except OSError as e: + if e.errno != ESRCH: + raise + + failure = et.Element("failure", attrib = {"message": "Timeout"}) report_testcase.append(failure) - - stdout_element = et.Element("system-out") - stdout_element.text = et.CDATA(stdout) - report_testcase.append(stdout_element) - + failures = failures + 1 - print("{0} - return code {1}".format(MSG_FAIL, proc.returncode)) - - if stderr: + print("{0} - Timeout!".format(MSG_FAIL)) + else: + stdout = open(stdout_file, 'r').read() if os.path.exists(stdout_file) else '' + stdout = unicode(stdout, errors='replace', encoding='utf-8') + stderr = open(stderr_file, 'r').read() if os.path.exists(stderr_file) else '' + stderr = unicode(stderr, errors='replace', encoding='utf-8') + + if proc.returncode != 0: + failure = et.Element("failure", attrib = {"message": "return code {}".format(proc.returncode)}) + report_testcase.append(failure) + + stdout_element = et.Element("system-out") + stdout_element.text = et.CDATA(stdout) + 
report_testcase.append(stdout_element) + + failures = failures + 1 + print("{0} - return code {1}".format(MSG_FAIL, proc.returncode)) + + if stderr: + stderr_element = et.Element("system-err") + stderr_element.text = et.CDATA(stderr) + report_testcase.append(stderr_element) + print(stderr) + + if args.stop and ('Connection refused' in stderr or 'Attempt to read after eof' in stderr) and not 'Received exception from server' in stderr: + SERVER_DIED = True + + elif stderr: + failure = et.Element("failure", attrib = {"message": "having stderror"}) + report_testcase.append(failure) + stderr_element = et.Element("system-err") stderr_element.text = et.CDATA(stderr) report_testcase.append(stderr_element) - print(stderr) - - if args.stop and ('Connection refused' in stderr or 'Attempt to read after eof' in stderr) and not 'Received exception from server' in stderr: - SERVER_DIED = True - - elif stderr: - failure = et.Element("failure", attrib = {"message": "having stderror"}) - report_testcase.append(failure) - - stderr_element = et.Element("system-err") - stderr_element.text = et.CDATA(stderr) - report_testcase.append(stderr_element) - - failures = failures + 1 - print("{0} - having stderror:\n{1}".format(MSG_FAIL, stderr.encode('utf-8'))) - elif 'Exception' in stdout: - failure = et.Element("error", attrib = {"message": "having exception"}) - report_testcase.append(failure) - - stdout_element = et.Element("system-out") - stdout_element.text = et.CDATA(stdout) - report_testcase.append(stdout_element) - - failures = failures + 1 - print("{0} - having exception:\n{1}".format(MSG_FAIL, stdout.encode('utf-8'))) - elif not os.path.isfile(reference_file): - skipped = et.Element("skipped", attrib = {"message": "no reference file"}) - report_testcase.append(skipped) - print("{0} - no reference file".format(MSG_UNKNOWN)) - else: - result_is_different = subprocess.call(['cmp', '-s', reference_file, stdout_file], stdout = PIPE) - - if result_is_different: - (diff, _) = 
Popen(['diff', '--side-by-side', reference_file, stdout_file], stdout = PIPE).communicate() - diff = unicode(diff, errors='replace', encoding='utf-8') - - failure = et.Element("failure", attrib = {"message": "result differs with reference"}) - report_testcase.append(failure) - - stdout_element = et.Element("system-out") - try: - stdout_element.text = et.CDATA(diff) - except Exception as e: - stdout_element.text = et.CDATA(remove_control_characters(diff)) - - report_testcase.append(stdout_element) + failures = failures + 1 - print("{0} - result differs with reference:\n{1}".format(MSG_FAIL, diff.encode('utf-8'))) + print("{0} - having stderror:\n{1}".format(MSG_FAIL, stderr.encode('utf-8'))) + elif 'Exception' in stdout: + failure = et.Element("error", attrib = {"message": "having exception"}) + report_testcase.append(failure) + + stdout_element = et.Element("system-out") + stdout_element.text = et.CDATA(stdout) + report_testcase.append(stdout_element) + + failures = failures + 1 + print("{0} - having exception:\n{1}".format(MSG_FAIL, stdout.encode('utf-8'))) + elif not os.path.isfile(reference_file): + skipped = et.Element("skipped", attrib = {"message": "no reference file"}) + report_testcase.append(skipped) + print("{0} - no reference file".format(MSG_UNKNOWN)) else: - print(MSG_OK) - if os.path.exists(stdout_file): - os.remove(stdout_file) - if os.path.exists(stderr_file): - os.remove(stderr_file) + result_is_different = subprocess.call(['cmp', '-s', reference_file, stdout_file], stdout = PIPE) + + if result_is_different: + (diff, _) = Popen(['diff', '--side-by-side', reference_file, stdout_file], stdout = PIPE).communicate() + diff = unicode(diff, errors='replace', encoding='utf-8') + + failure = et.Element("failure", attrib = {"message": "result differs with reference"}) + report_testcase.append(failure) + + stdout_element = et.Element("system-out") + try: + stdout_element.text = et.CDATA(diff) + except: + stdout_element.text = 
et.CDATA(remove_control_characters(diff)) + + report_testcase.append(stdout_element) + failures = failures + 1 + print("{0} - result differs with reference:\n{1}".format(MSG_FAIL, diff.encode('utf-8'))) + else: + print(MSG_OK) + if os.path.exists(stdout_file): + os.remove(stdout_file) + if os.path.exists(stderr_file): + os.remove(stderr_file) + except: + (exc_type, exc_value) = sys.exc_info()[:2] + error = et.Element("error", attrib = {"type": exc_type.__name__, "message": str(exc_value)}) + report_testcase.append(error) - dump_report(args.output, suite, name, report_testcase) + failures = failures + 1 + print("{0} - Test internal error: {1}\n{2}".format(MSG_FAIL, exc_type.__name__, exc_value)) + finally: + dump_report(args.output, suite, name, report_testcase) failures_total = failures_total + failures From 6d41f6859c3f02a0b6aae1f684f127ff0228fbcd Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 05:58:17 +0300 Subject: [PATCH 19/84] Imported https://github.com/skywind3000/FastMemcpy at a74a33a1fb6d400252ab73e417d12f622dd8fe61 [#CLICKHOUSE-2]. --- libs/libmemcpy/FastMemcpy.h | 692 ++++++++++++++++++++++++++++++++++++ libs/libmemcpy/LICENSE | 22 ++ libs/libmemcpy/README.md | 97 +++++ 3 files changed, 811 insertions(+) create mode 100644 libs/libmemcpy/FastMemcpy.h create mode 100644 libs/libmemcpy/LICENSE create mode 100644 libs/libmemcpy/README.md diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/FastMemcpy.h new file mode 100644 index 00000000000..65c249dabc1 --- /dev/null +++ b/libs/libmemcpy/FastMemcpy.h @@ -0,0 +1,692 @@ +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. 
vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== +#ifndef __FAST_MEMCPY_H__ +#define __FAST_MEMCPY_H__ + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + + + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_sse2_16(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); +} + +static INLINE void memcpy_sse2_32(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); +} + +static INLINE void memcpy_sse2_64(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); + __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); + _mm_storeu_si128(((__m128i*)dst) + 2, m2); + _mm_storeu_si128(((__m128i*)dst) + 3, m3); +} + +static INLINE void memcpy_sse2_128(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const 
__m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); + __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); + __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); + __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); + __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); + __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); + _mm_storeu_si128(((__m128i*)dst) + 2, m2); + _mm_storeu_si128(((__m128i*)dst) + 3, m3); + _mm_storeu_si128(((__m128i*)dst) + 4, m4); + _mm_storeu_si128(((__m128i*)dst) + 5, m5); + _mm_storeu_si128(((__m128i*)dst) + 6, m6); + _mm_storeu_si128(((__m128i*)dst) + 7, m7); +} + + +//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { + unsigned char *dd = ((unsigned char*)dst) + size; + const unsigned char *ss = ((const unsigned char*)src) + size; + + switch (size) { + case 64: + memcpy_sse2_64(dd - 64, ss - 64); + case 0: + break; + + case 65: + memcpy_sse2_64(dd - 65, ss - 65); + case 1: + dd[-1] = ss[-1]; + break; + + case 66: + memcpy_sse2_64(dd - 66, ss - 66); + case 2: + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 67: + memcpy_sse2_64(dd - 67, ss - 67); + case 3: + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 68: + memcpy_sse2_64(dd - 68, ss - 68); + case 4: + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 69: + memcpy_sse2_64(dd - 69, ss - 69); + case 5: + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 70: + memcpy_sse2_64(dd - 70, ss - 70); + case 6: + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + 
*((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 71: + memcpy_sse2_64(dd - 71, ss - 71); + case 7: + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 72: + memcpy_sse2_64(dd - 72, ss - 72); + case 8: + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 73: + memcpy_sse2_64(dd - 73, ss - 73); + case 9: + *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 74: + memcpy_sse2_64(dd - 74, ss - 74); + case 10: + *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 75: + memcpy_sse2_64(dd - 75, ss - 75); + case 11: + *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 76: + memcpy_sse2_64(dd - 76, ss - 76); + case 12: + *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 77: + memcpy_sse2_64(dd - 77, ss - 77); + case 13: + *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 78: + memcpy_sse2_64(dd - 78, ss - 78); + case 14: + *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 79: + memcpy_sse2_64(dd - 79, ss - 79); + case 15: + *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 80: + memcpy_sse2_64(dd - 80, ss - 80); + case 16: + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 81: + memcpy_sse2_64(dd - 81, ss - 81); + case 17: + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 82: + memcpy_sse2_64(dd - 82, ss - 82); + case 18: + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 83: + memcpy_sse2_64(dd - 83, ss - 83); + case 19: + 
memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 84: + memcpy_sse2_64(dd - 84, ss - 84); + case 20: + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 85: + memcpy_sse2_64(dd - 85, ss - 85); + case 21: + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 86: + memcpy_sse2_64(dd - 86, ss - 86); + case 22: + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 87: + memcpy_sse2_64(dd - 87, ss - 87); + case 23: + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 88: + memcpy_sse2_64(dd - 88, ss - 88); + case 24: + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 89: + memcpy_sse2_64(dd - 89, ss - 89); + case 25: + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 90: + memcpy_sse2_64(dd - 90, ss - 90); + case 26: + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 91: + memcpy_sse2_64(dd - 91, ss - 91); + case 27: + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 92: + memcpy_sse2_64(dd - 92, ss - 92); + case 28: + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 93: + memcpy_sse2_64(dd - 93, ss - 93); + case 29: + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 94: + memcpy_sse2_64(dd - 94, ss - 94); + case 30: + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 95: + memcpy_sse2_64(dd - 95, ss - 95); + case 31: + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 96: 
+ memcpy_sse2_64(dd - 96, ss - 96); + case 32: + memcpy_sse2_32(dd - 32, ss - 32); + break; + + case 97: + memcpy_sse2_64(dd - 97, ss - 97); + case 33: + memcpy_sse2_32(dd - 33, ss - 33); + dd[-1] = ss[-1]; + break; + + case 98: + memcpy_sse2_64(dd - 98, ss - 98); + case 34: + memcpy_sse2_32(dd - 34, ss - 34); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 99: + memcpy_sse2_64(dd - 99, ss - 99); + case 35: + memcpy_sse2_32(dd - 35, ss - 35); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 100: + memcpy_sse2_64(dd - 100, ss - 100); + case 36: + memcpy_sse2_32(dd - 36, ss - 36); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 101: + memcpy_sse2_64(dd - 101, ss - 101); + case 37: + memcpy_sse2_32(dd - 37, ss - 37); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 102: + memcpy_sse2_64(dd - 102, ss - 102); + case 38: + memcpy_sse2_32(dd - 38, ss - 38); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 103: + memcpy_sse2_64(dd - 103, ss - 103); + case 39: + memcpy_sse2_32(dd - 39, ss - 39); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 104: + memcpy_sse2_64(dd - 104, ss - 104); + case 40: + memcpy_sse2_32(dd - 40, ss - 40); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 105: + memcpy_sse2_64(dd - 105, ss - 105); + case 41: + memcpy_sse2_32(dd - 41, ss - 41); + *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 106: + memcpy_sse2_64(dd - 106, ss - 106); + case 42: + memcpy_sse2_32(dd - 42, ss - 42); + *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 107: + memcpy_sse2_64(dd - 107, ss - 107); + case 43: + memcpy_sse2_32(dd - 43, ss - 43); + *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 
11)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 108: + memcpy_sse2_64(dd - 108, ss - 108); + case 44: + memcpy_sse2_32(dd - 44, ss - 44); + *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 109: + memcpy_sse2_64(dd - 109, ss - 109); + case 45: + memcpy_sse2_32(dd - 45, ss - 45); + *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 110: + memcpy_sse2_64(dd - 110, ss - 110); + case 46: + memcpy_sse2_32(dd - 46, ss - 46); + *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 111: + memcpy_sse2_64(dd - 111, ss - 111); + case 47: + memcpy_sse2_32(dd - 47, ss - 47); + *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 112: + memcpy_sse2_64(dd - 112, ss - 112); + case 48: + memcpy_sse2_32(dd - 48, ss - 48); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 113: + memcpy_sse2_64(dd - 113, ss - 113); + case 49: + memcpy_sse2_32(dd - 49, ss - 49); + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 114: + memcpy_sse2_64(dd - 114, ss - 114); + case 50: + memcpy_sse2_32(dd - 50, ss - 50); + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 115: + memcpy_sse2_64(dd - 115, ss - 115); + case 51: + memcpy_sse2_32(dd - 51, ss - 51); + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 116: + memcpy_sse2_64(dd - 116, ss - 116); + case 52: + memcpy_sse2_32(dd - 52, ss - 52); + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 117: + memcpy_sse2_64(dd - 117, ss - 117); + case 53: + memcpy_sse2_32(dd - 53, ss - 53); + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_t*)(dd - 
5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 118: + memcpy_sse2_64(dd - 118, ss - 118); + case 54: + memcpy_sse2_32(dd - 54, ss - 54); + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 119: + memcpy_sse2_64(dd - 119, ss - 119); + case 55: + memcpy_sse2_32(dd - 55, ss - 55); + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 120: + memcpy_sse2_64(dd - 120, ss - 120); + case 56: + memcpy_sse2_32(dd - 56, ss - 56); + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 121: + memcpy_sse2_64(dd - 121, ss - 121); + case 57: + memcpy_sse2_32(dd - 57, ss - 57); + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 122: + memcpy_sse2_64(dd - 122, ss - 122); + case 58: + memcpy_sse2_32(dd - 58, ss - 58); + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 123: + memcpy_sse2_64(dd - 123, ss - 123); + case 59: + memcpy_sse2_32(dd - 59, ss - 59); + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 124: + memcpy_sse2_64(dd - 124, ss - 124); + case 60: + memcpy_sse2_32(dd - 60, ss - 60); + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 125: + memcpy_sse2_64(dd - 125, ss - 125); + case 61: + memcpy_sse2_32(dd - 61, ss - 61); + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 126: + memcpy_sse2_64(dd - 126, ss - 126); + case 62: + memcpy_sse2_32(dd - 62, ss - 62); + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 127: + memcpy_sse2_64(dd - 127, ss - 127); + case 63: + memcpy_sse2_32(dd - 63, ss - 63); + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 
128: + memcpy_sse2_128(dd - 128, ss - 128); + break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +static void* memcpy_fast(void *destination, const void *source, size_t size) +{ + unsigned char *dst = (unsigned char*)destination; + const unsigned char *src = (const unsigned char*)source; + static size_t cachesize = 0x200000; // L2-cache size + size_t padding; + + // small memory copy + if (size <= 128) { + return memcpy_tiny(dst, src, size); + } + + // align destination to 16 bytes boundary + padding = (16 - (((size_t)dst) & 15)) & 15; + + if (padding > 0) { + __m128i head = _mm_loadu_si128((const __m128i*)src); + _mm_storeu_si128((__m128i*)dst, head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + if (size <= cachesize) { + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128(((const __m128i*)src) + 0); + c1 = _mm_loadu_si128(((const __m128i*)src) + 1); + c2 = _mm_loadu_si128(((const __m128i*)src) + 2); + c3 = _mm_loadu_si128(((const __m128i*)src) + 3); + c4 = _mm_loadu_si128(((const __m128i*)src) + 4); + c5 = _mm_loadu_si128(((const __m128i*)src) + 5); + c6 = _mm_loadu_si128(((const __m128i*)src) + 6); + c7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_store_si128((((__m128i*)dst) + 0), c0); + _mm_store_si128((((__m128i*)dst) + 1), c1); + _mm_store_si128((((__m128i*)dst) + 2), c2); + _mm_store_si128((((__m128i*)dst) + 3), c3); + _mm_store_si128((((__m128i*)dst) + 4), c4); + _mm_store_si128((((__m128i*)dst) + 5), c5); + _mm_store_si128((((__m128i*)dst) + 6), c6); + _mm_store_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + else { // big memory copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + _mm_prefetch((const char*)(src), _MM_HINT_NTA); + + if 
((((size_t)src) & 15) == 0) { // source aligned + for (; size >= 128; size -= 128) { + c0 = _mm_load_si128(((const __m128i*)src) + 0); + c1 = _mm_load_si128(((const __m128i*)src) + 1); + c2 = _mm_load_si128(((const __m128i*)src) + 2); + c3 = _mm_load_si128(((const __m128i*)src) + 3); + c4 = _mm_load_si128(((const __m128i*)src) + 4); + c5 = _mm_load_si128(((const __m128i*)src) + 5); + c6 = _mm_load_si128(((const __m128i*)src) + 6); + c7 = _mm_load_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128((((__m128i*)dst) + 0), c0); + _mm_stream_si128((((__m128i*)dst) + 1), c1); + _mm_stream_si128((((__m128i*)dst) + 2), c2); + _mm_stream_si128((((__m128i*)dst) + 3), c3); + _mm_stream_si128((((__m128i*)dst) + 4), c4); + _mm_stream_si128((((__m128i*)dst) + 5), c5); + _mm_stream_si128((((__m128i*)dst) + 6), c6); + _mm_stream_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + else { // source unaligned + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128(((const __m128i*)src) + 0); + c1 = _mm_loadu_si128(((const __m128i*)src) + 1); + c2 = _mm_loadu_si128(((const __m128i*)src) + 2); + c3 = _mm_loadu_si128(((const __m128i*)src) + 3); + c4 = _mm_loadu_si128(((const __m128i*)src) + 4); + c5 = _mm_loadu_si128(((const __m128i*)src) + 5); + c6 = _mm_loadu_si128(((const __m128i*)src) + 6); + c7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128((((__m128i*)dst) + 0), c0); + _mm_stream_si128((((__m128i*)dst) + 1), c1); + _mm_stream_si128((((__m128i*)dst) + 2), c2); + _mm_stream_si128((((__m128i*)dst) + 3), c3); + _mm_stream_si128((((__m128i*)dst) + 4), c4); + _mm_stream_si128((((__m128i*)dst) + 5), c5); + _mm_stream_si128((((__m128i*)dst) + 6), c6); + _mm_stream_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + _mm_sfence(); + } + + memcpy_tiny(dst, src, size); + + return destination; +} + + +#endif + + + diff 
--git a/libs/libmemcpy/LICENSE b/libs/libmemcpy/LICENSE new file mode 100644 index 00000000000..c449da6aa8a --- /dev/null +++ b/libs/libmemcpy/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Linwei + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/README.md new file mode 100644 index 00000000000..2c63dc21a81 --- /dev/null +++ b/libs/libmemcpy/README.md @@ -0,0 +1,97 @@ +Build +===== + +with gcc: +> gcc -O3 -msse2 FastMemcpy.c -o FastMemcpy + +with msvc: +> cl -nologo -O2 FastMemcpy.c + +Features +======== + +* 50% speedup in avg. 
vs traditional memcpy in msvc 2012 or gcc 4.9 +* small size copy optimized with jump table +* medium size copy optimized with sse2 vector copy +* huge size copy optimized with cache prefetch & movntdq + +Reference +========= + +[Using Block Prefetch for Optimized Memory Performance](http://files.rsdn.ru/23380/AMD_block_prefetch_paper.pdf) + +The article only focuses on aligned huge memory copy. You need to handle other conditions yourself. + + +Results +======= + +``` +result: gcc4.9 (msvc 2012 got a similar result): + +benchmark(size=32 bytes, times=16777216): +result(dst aligned, src aligned): memcpy_fast=81ms memcpy=281 ms +result(dst aligned, src unalign): memcpy_fast=88ms memcpy=254 ms +result(dst unalign, src aligned): memcpy_fast=87ms memcpy=245 ms +result(dst unalign, src unalign): memcpy_fast=81ms memcpy=258 ms + +benchmark(size=64 bytes, times=16777216): +result(dst aligned, src aligned): memcpy_fast=91ms memcpy=364 ms +result(dst aligned, src unalign): memcpy_fast=95ms memcpy=336 ms +result(dst unalign, src aligned): memcpy_fast=96ms memcpy=353 ms +result(dst unalign, src unalign): memcpy_fast=99ms memcpy=346 ms + +benchmark(size=512 bytes, times=8388608): +result(dst aligned, src aligned): memcpy_fast=124ms memcpy=242 ms +result(dst aligned, src unalign): memcpy_fast=166ms memcpy=555 ms +result(dst unalign, src aligned): memcpy_fast=168ms memcpy=602 ms +result(dst unalign, src unalign): memcpy_fast=174ms memcpy=614 ms + +benchmark(size=1024 bytes, times=4194304): +result(dst aligned, src aligned): memcpy_fast=119ms memcpy=171 ms +result(dst aligned, src unalign): memcpy_fast=182ms memcpy=442 ms +result(dst unalign, src aligned): memcpy_fast=163ms memcpy=466 ms +result(dst unalign, src unalign): memcpy_fast=168ms memcpy=472 ms + +benchmark(size=4096 bytes, times=524288): +result(dst aligned, src aligned): memcpy_fast=68ms memcpy=82 ms +result(dst aligned, src unalign): memcpy_fast=94ms memcpy=226 ms +result(dst unalign, src aligned): memcpy_fast=134ms
memcpy=216 ms +result(dst unalign, src unalign): memcpy_fast=84ms memcpy=188 ms + +benchmark(size=8192 bytes, times=262144): +result(dst aligned, src aligned): memcpy_fast=55ms memcpy=70 ms +result(dst aligned, src unalign): memcpy_fast=75ms memcpy=192 ms +result(dst unalign, src aligned): memcpy_fast=79ms memcpy=223 ms +result(dst unalign, src unalign): memcpy_fast=91ms memcpy=219 ms + +benchmark(size=1048576 bytes, times=2048): +result(dst aligned, src aligned): memcpy_fast=181ms memcpy=165 ms +result(dst aligned, src unalign): memcpy_fast=192ms memcpy=303 ms +result(dst unalign, src aligned): memcpy_fast=218ms memcpy=310 ms +result(dst unalign, src unalign): memcpy_fast=183ms memcpy=307 ms + +benchmark(size=4194304 bytes, times=512): +result(dst aligned, src aligned): memcpy_fast=263ms memcpy=398 ms +result(dst aligned, src unalign): memcpy_fast=269ms memcpy=433 ms +result(dst unalign, src aligned): memcpy_fast=306ms memcpy=497 ms +result(dst unalign, src unalign): memcpy_fast=285ms memcpy=417 ms + +benchmark(size=8388608 bytes, times=256): +result(dst aligned, src aligned): memcpy_fast=287ms memcpy=421 ms +result(dst aligned, src unalign): memcpy_fast=288ms memcpy=430 ms +result(dst unalign, src aligned): memcpy_fast=285ms memcpy=510 ms +result(dst unalign, src unalign): memcpy_fast=291ms memcpy=440 ms + +benchmark random access: +memcpy_fast=487ms memcpy=1000ms + +``` + + +About +===== + +skywind + +http://www.skywind.me From 347cf3092b8ef9c4c7c91dc401c9e24c8a835d35 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:00:06 +0300 Subject: [PATCH 20/84] Modified whitespaces [#CLICKHOUSE-2]. 
--- libs/libmemcpy/FastMemcpy.h | 1384 +++++++++++++++++------------------ libs/libmemcpy/README.md | 16 +- 2 files changed, 700 insertions(+), 700 deletions(-) diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/FastMemcpy.h index 65c249dabc1..306bd19ac58 100644 --- a/libs/libmemcpy/FastMemcpy.h +++ b/libs/libmemcpy/FastMemcpy.h @@ -1,692 +1,692 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - - - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_sse2_16(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -} - -static INLINE void memcpy_sse2_32(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); -} - -static INLINE void memcpy_sse2_64(void *dst, const void *src) { - 
__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); -} - -static INLINE void memcpy_sse2_128(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); - __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); - __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); - __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); - _mm_storeu_si128(((__m128i*)dst) + 4, m4); - _mm_storeu_si128(((__m128i*)dst) + 5, m5); - _mm_storeu_si128(((__m128i*)dst) + 6, m6); - _mm_storeu_si128(((__m128i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 64: - memcpy_sse2_64(dd - 64, ss - 64); - case 0: - break; - - case 65: - memcpy_sse2_64(dd - 65, ss - 65); - case 1: - dd[-1] = ss[-1]; - break; - - case 66: - memcpy_sse2_64(dd - 66, ss - 66); - case 2: - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - 
break; - - case 67: - memcpy_sse2_64(dd - 67, ss - 67); - case 3: - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 68: - memcpy_sse2_64(dd - 68, ss - 68); - case 4: - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 69: - memcpy_sse2_64(dd - 69, ss - 69); - case 5: - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 70: - memcpy_sse2_64(dd - 70, ss - 70); - case 6: - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 71: - memcpy_sse2_64(dd - 71, ss - 71); - case 7: - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 72: - memcpy_sse2_64(dd - 72, ss - 72); - case 8: - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 73: - memcpy_sse2_64(dd - 73, ss - 73); - case 9: - *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 74: - memcpy_sse2_64(dd - 74, ss - 74); - case 10: - *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 75: - memcpy_sse2_64(dd - 75, ss - 75); - case 11: - *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 76: - memcpy_sse2_64(dd - 76, ss - 76); - case 12: - *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 77: - memcpy_sse2_64(dd - 77, ss - 77); - case 13: - *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 78: - memcpy_sse2_64(dd - 78, ss - 78); - case 14: - *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 79: - memcpy_sse2_64(dd - 79, ss - 79); - case 15: - *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); - 
*((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 80: - memcpy_sse2_64(dd - 80, ss - 80); - case 16: - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 81: - memcpy_sse2_64(dd - 81, ss - 81); - case 17: - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 82: - memcpy_sse2_64(dd - 82, ss - 82); - case 18: - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 83: - memcpy_sse2_64(dd - 83, ss - 83); - case 19: - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 84: - memcpy_sse2_64(dd - 84, ss - 84); - case 20: - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 85: - memcpy_sse2_64(dd - 85, ss - 85); - case 21: - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 86: - memcpy_sse2_64(dd - 86, ss - 86); - case 22: - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 87: - memcpy_sse2_64(dd - 87, ss - 87); - case 23: - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 88: - memcpy_sse2_64(dd - 88, ss - 88); - case 24: - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 89: - memcpy_sse2_64(dd - 89, ss - 89); - case 25: - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 90: - memcpy_sse2_64(dd - 90, ss - 90); - case 26: - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 91: - memcpy_sse2_64(dd - 91, ss - 91); - case 27: - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 92: - memcpy_sse2_64(dd - 92, ss - 92); - case 28: - 
memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 93: - memcpy_sse2_64(dd - 93, ss - 93); - case 29: - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 94: - memcpy_sse2_64(dd - 94, ss - 94); - case 30: - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 95: - memcpy_sse2_64(dd - 95, ss - 95); - case 31: - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 96: - memcpy_sse2_64(dd - 96, ss - 96); - case 32: - memcpy_sse2_32(dd - 32, ss - 32); - break; - - case 97: - memcpy_sse2_64(dd - 97, ss - 97); - case 33: - memcpy_sse2_32(dd - 33, ss - 33); - dd[-1] = ss[-1]; - break; - - case 98: - memcpy_sse2_64(dd - 98, ss - 98); - case 34: - memcpy_sse2_32(dd - 34, ss - 34); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 99: - memcpy_sse2_64(dd - 99, ss - 99); - case 35: - memcpy_sse2_32(dd - 35, ss - 35); - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 100: - memcpy_sse2_64(dd - 100, ss - 100); - case 36: - memcpy_sse2_32(dd - 36, ss - 36); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 101: - memcpy_sse2_64(dd - 101, ss - 101); - case 37: - memcpy_sse2_32(dd - 37, ss - 37); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 102: - memcpy_sse2_64(dd - 102, ss - 102); - case 38: - memcpy_sse2_32(dd - 38, ss - 38); - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 103: - memcpy_sse2_64(dd - 103, ss - 103); - case 39: - memcpy_sse2_32(dd - 39, ss - 39); - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 104: - memcpy_sse2_64(dd - 104, ss - 104); - case 40: - memcpy_sse2_32(dd - 40, ss - 40); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 105: - 
memcpy_sse2_64(dd - 105, ss - 105); - case 41: - memcpy_sse2_32(dd - 41, ss - 41); - *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 106: - memcpy_sse2_64(dd - 106, ss - 106); - case 42: - memcpy_sse2_32(dd - 42, ss - 42); - *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 107: - memcpy_sse2_64(dd - 107, ss - 107); - case 43: - memcpy_sse2_32(dd - 43, ss - 43); - *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 108: - memcpy_sse2_64(dd - 108, ss - 108); - case 44: - memcpy_sse2_32(dd - 44, ss - 44); - *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 109: - memcpy_sse2_64(dd - 109, ss - 109); - case 45: - memcpy_sse2_32(dd - 45, ss - 45); - *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 110: - memcpy_sse2_64(dd - 110, ss - 110); - case 46: - memcpy_sse2_32(dd - 46, ss - 46); - *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 111: - memcpy_sse2_64(dd - 111, ss - 111); - case 47: - memcpy_sse2_32(dd - 47, ss - 47); - *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 112: - memcpy_sse2_64(dd - 112, ss - 112); - case 48: - memcpy_sse2_32(dd - 48, ss - 48); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 113: - memcpy_sse2_64(dd - 113, ss - 113); - case 49: - memcpy_sse2_32(dd - 49, ss - 49); - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 114: - memcpy_sse2_64(dd - 114, ss - 114); - case 50: - memcpy_sse2_32(dd - 50, ss - 50); - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 115: - memcpy_sse2_64(dd - 115, ss 
- 115); - case 51: - memcpy_sse2_32(dd - 51, ss - 51); - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 116: - memcpy_sse2_64(dd - 116, ss - 116); - case 52: - memcpy_sse2_32(dd - 52, ss - 52); - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 117: - memcpy_sse2_64(dd - 117, ss - 117); - case 53: - memcpy_sse2_32(dd - 53, ss - 53); - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 118: - memcpy_sse2_64(dd - 118, ss - 118); - case 54: - memcpy_sse2_32(dd - 54, ss - 54); - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 119: - memcpy_sse2_64(dd - 119, ss - 119); - case 55: - memcpy_sse2_32(dd - 55, ss - 55); - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 120: - memcpy_sse2_64(dd - 120, ss - 120); - case 56: - memcpy_sse2_32(dd - 56, ss - 56); - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 121: - memcpy_sse2_64(dd - 121, ss - 121); - case 57: - memcpy_sse2_32(dd - 57, ss - 57); - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 122: - memcpy_sse2_64(dd - 122, ss - 122); - case 58: - memcpy_sse2_32(dd - 58, ss - 58); - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 123: - memcpy_sse2_64(dd - 123, ss - 123); - case 59: - memcpy_sse2_32(dd - 59, ss - 59); - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 124: - memcpy_sse2_64(dd - 124, ss - 124); - case 60: - memcpy_sse2_32(dd - 60, ss - 60); - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 125: - 
memcpy_sse2_64(dd - 125, ss - 125); - case 61: - memcpy_sse2_32(dd - 61, ss - 61); - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 126: - memcpy_sse2_64(dd - 126, ss - 126); - case 62: - memcpy_sse2_32(dd - 62, ss - 62); - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 127: - memcpy_sse2_64(dd - 127, ss - 127); - case 63: - memcpy_sse2_32(dd - 63, ss - 63); - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 128: - memcpy_sse2_128(dd - 128, ss - 128); - break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L2-cache size - size_t padding; - - // small memory copy - if (size <= 128) { - return memcpy_tiny(dst, src, size); - } - - // align destination to 16 bytes boundary - padding = (16 - (((size_t)dst) & 15)) & 15; - - if (padding > 0) { - __m128i head = _mm_loadu_si128((const __m128i*)src); - _mm_storeu_si128((__m128i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } - - // medium size copy - if (size <= cachesize) { - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - 
src += 128; - _mm_store_si128((((__m128i*)dst) + 0), c0); - _mm_store_si128((((__m128i*)dst) + 1), c1); - _mm_store_si128((((__m128i*)dst) + 2), c2); - _mm_store_si128((((__m128i*)dst) + 3), c3); - _mm_store_si128((((__m128i*)dst) + 4), c4); - _mm_store_si128((((__m128i*)dst) + 5), c5); - _mm_store_si128((((__m128i*)dst) + 6), c6); - _mm_store_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // big memory copy - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 15) == 0) { // source aligned - for (; size >= 128; size -= 128) { - c0 = _mm_load_si128(((const __m128i*)src) + 0); - c1 = _mm_load_si128(((const __m128i*)src) + 1); - c2 = _mm_load_si128(((const __m128i*)src) + 2); - c3 = _mm_load_si128(((const __m128i*)src) + 3); - c4 = _mm_load_si128(((const __m128i*)src) + 4); - c5 = _mm_load_si128(((const __m128i*)src) + 5); - c6 = _mm_load_si128(((const __m128i*)src) + 6); - c7 = _mm_load_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // source unaligned - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), 
_MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - - return destination; -} - - -#endif - - - +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== +#ifndef __FAST_MEMCPY_H__ +#define __FAST_MEMCPY_H__ + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + + + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_sse2_16(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); +} + +static INLINE void memcpy_sse2_32(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); 
+ _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); +} + +static INLINE void memcpy_sse2_64(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); + __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); + _mm_storeu_si128(((__m128i*)dst) + 2, m2); + _mm_storeu_si128(((__m128i*)dst) + 3, m3); +} + +static INLINE void memcpy_sse2_128(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); + __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); + __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); + __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); + __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); + __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); + _mm_storeu_si128(((__m128i*)dst) + 2, m2); + _mm_storeu_si128(((__m128i*)dst) + 3, m3); + _mm_storeu_si128(((__m128i*)dst) + 4, m4); + _mm_storeu_si128(((__m128i*)dst) + 5, m5); + _mm_storeu_si128(((__m128i*)dst) + 6, m6); + _mm_storeu_si128(((__m128i*)dst) + 7, m7); +} + + +//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { + unsigned char *dd = ((unsigned char*)dst) + size; + const unsigned char *ss = ((const unsigned char*)src) + size; + + switch (size) { + case 64: + memcpy_sse2_64(dd - 64, ss - 64); + case 0: + break; + + case 65: + memcpy_sse2_64(dd - 65, ss 
- 65); + case 1: + dd[-1] = ss[-1]; + break; + + case 66: + memcpy_sse2_64(dd - 66, ss - 66); + case 2: + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 67: + memcpy_sse2_64(dd - 67, ss - 67); + case 3: + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 68: + memcpy_sse2_64(dd - 68, ss - 68); + case 4: + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 69: + memcpy_sse2_64(dd - 69, ss - 69); + case 5: + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 70: + memcpy_sse2_64(dd - 70, ss - 70); + case 6: + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 71: + memcpy_sse2_64(dd - 71, ss - 71); + case 7: + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 72: + memcpy_sse2_64(dd - 72, ss - 72); + case 8: + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 73: + memcpy_sse2_64(dd - 73, ss - 73); + case 9: + *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 74: + memcpy_sse2_64(dd - 74, ss - 74); + case 10: + *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 75: + memcpy_sse2_64(dd - 75, ss - 75); + case 11: + *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 76: + memcpy_sse2_64(dd - 76, ss - 76); + case 12: + *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 77: + memcpy_sse2_64(dd - 77, ss - 77); + case 13: + *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 78: + memcpy_sse2_64(dd - 78, ss - 78); + case 14: + *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); + *((uint64_t*)(dd 
- 8)) = *((uint64_t*)(ss - 8)); + break; + + case 79: + memcpy_sse2_64(dd - 79, ss - 79); + case 15: + *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 80: + memcpy_sse2_64(dd - 80, ss - 80); + case 16: + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 81: + memcpy_sse2_64(dd - 81, ss - 81); + case 17: + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 82: + memcpy_sse2_64(dd - 82, ss - 82); + case 18: + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 83: + memcpy_sse2_64(dd - 83, ss - 83); + case 19: + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 84: + memcpy_sse2_64(dd - 84, ss - 84); + case 20: + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 85: + memcpy_sse2_64(dd - 85, ss - 85); + case 21: + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 86: + memcpy_sse2_64(dd - 86, ss - 86); + case 22: + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 87: + memcpy_sse2_64(dd - 87, ss - 87); + case 23: + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 88: + memcpy_sse2_64(dd - 88, ss - 88); + case 24: + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 89: + memcpy_sse2_64(dd - 89, ss - 89); + case 25: + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 90: + memcpy_sse2_64(dd - 90, ss - 90); + case 26: + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 91: + memcpy_sse2_64(dd - 91, ss - 91); + case 
27: + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 92: + memcpy_sse2_64(dd - 92, ss - 92); + case 28: + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 93: + memcpy_sse2_64(dd - 93, ss - 93); + case 29: + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 94: + memcpy_sse2_64(dd - 94, ss - 94); + case 30: + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 95: + memcpy_sse2_64(dd - 95, ss - 95); + case 31: + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 96: + memcpy_sse2_64(dd - 96, ss - 96); + case 32: + memcpy_sse2_32(dd - 32, ss - 32); + break; + + case 97: + memcpy_sse2_64(dd - 97, ss - 97); + case 33: + memcpy_sse2_32(dd - 33, ss - 33); + dd[-1] = ss[-1]; + break; + + case 98: + memcpy_sse2_64(dd - 98, ss - 98); + case 34: + memcpy_sse2_32(dd - 34, ss - 34); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 99: + memcpy_sse2_64(dd - 99, ss - 99); + case 35: + memcpy_sse2_32(dd - 35, ss - 35); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 100: + memcpy_sse2_64(dd - 100, ss - 100); + case 36: + memcpy_sse2_32(dd - 36, ss - 36); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 101: + memcpy_sse2_64(dd - 101, ss - 101); + case 37: + memcpy_sse2_32(dd - 37, ss - 37); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 102: + memcpy_sse2_64(dd - 102, ss - 102); + case 38: + memcpy_sse2_32(dd - 38, ss - 38); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 103: + memcpy_sse2_64(dd - 103, ss - 103); + case 39: + memcpy_sse2_32(dd - 39, ss - 39); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 104: + memcpy_sse2_64(dd 
- 104, ss - 104); + case 40: + memcpy_sse2_32(dd - 40, ss - 40); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 105: + memcpy_sse2_64(dd - 105, ss - 105); + case 41: + memcpy_sse2_32(dd - 41, ss - 41); + *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 106: + memcpy_sse2_64(dd - 106, ss - 106); + case 42: + memcpy_sse2_32(dd - 42, ss - 42); + *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 107: + memcpy_sse2_64(dd - 107, ss - 107); + case 43: + memcpy_sse2_32(dd - 43, ss - 43); + *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 108: + memcpy_sse2_64(dd - 108, ss - 108); + case 44: + memcpy_sse2_32(dd - 44, ss - 44); + *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 109: + memcpy_sse2_64(dd - 109, ss - 109); + case 45: + memcpy_sse2_32(dd - 45, ss - 45); + *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 110: + memcpy_sse2_64(dd - 110, ss - 110); + case 46: + memcpy_sse2_32(dd - 46, ss - 46); + *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 111: + memcpy_sse2_64(dd - 111, ss - 111); + case 47: + memcpy_sse2_32(dd - 47, ss - 47); + *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 112: + memcpy_sse2_64(dd - 112, ss - 112); + case 48: + memcpy_sse2_32(dd - 48, ss - 48); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 113: + memcpy_sse2_64(dd - 113, ss - 113); + case 49: + memcpy_sse2_32(dd - 49, ss - 49); + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 114: + memcpy_sse2_64(dd - 114, ss - 114); + case 50: + memcpy_sse2_32(dd - 50, ss - 50); 
+ memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 115: + memcpy_sse2_64(dd - 115, ss - 115); + case 51: + memcpy_sse2_32(dd - 51, ss - 51); + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 116: + memcpy_sse2_64(dd - 116, ss - 116); + case 52: + memcpy_sse2_32(dd - 52, ss - 52); + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 117: + memcpy_sse2_64(dd - 117, ss - 117); + case 53: + memcpy_sse2_32(dd - 53, ss - 53); + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 118: + memcpy_sse2_64(dd - 118, ss - 118); + case 54: + memcpy_sse2_32(dd - 54, ss - 54); + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 119: + memcpy_sse2_64(dd - 119, ss - 119); + case 55: + memcpy_sse2_32(dd - 55, ss - 55); + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 120: + memcpy_sse2_64(dd - 120, ss - 120); + case 56: + memcpy_sse2_32(dd - 56, ss - 56); + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 121: + memcpy_sse2_64(dd - 121, ss - 121); + case 57: + memcpy_sse2_32(dd - 57, ss - 57); + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 122: + memcpy_sse2_64(dd - 122, ss - 122); + case 58: + memcpy_sse2_32(dd - 58, ss - 58); + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 123: + memcpy_sse2_64(dd - 123, ss - 123); + case 59: + memcpy_sse2_32(dd - 59, ss - 59); + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 124: + memcpy_sse2_64(dd - 124, ss - 124); + case 60: + 
memcpy_sse2_32(dd - 60, ss - 60); + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 125: + memcpy_sse2_64(dd - 125, ss - 125); + case 61: + memcpy_sse2_32(dd - 61, ss - 61); + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 126: + memcpy_sse2_64(dd - 126, ss - 126); + case 62: + memcpy_sse2_32(dd - 62, ss - 62); + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 127: + memcpy_sse2_64(dd - 127, ss - 127); + case 63: + memcpy_sse2_32(dd - 63, ss - 63); + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 128: + memcpy_sse2_128(dd - 128, ss - 128); + break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +static void* memcpy_fast(void *destination, const void *source, size_t size) +{ + unsigned char *dst = (unsigned char*)destination; + const unsigned char *src = (const unsigned char*)source; + static size_t cachesize = 0x200000; // L2-cache size + size_t padding; + + // small memory copy + if (size <= 128) { + return memcpy_tiny(dst, src, size); + } + + // align destination to 16 bytes boundary + padding = (16 - (((size_t)dst) & 15)) & 15; + + if (padding > 0) { + __m128i head = _mm_loadu_si128((const __m128i*)src); + _mm_storeu_si128((__m128i*)dst, head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + if (size <= cachesize) { + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128(((const __m128i*)src) + 0); + c1 = _mm_loadu_si128(((const __m128i*)src) + 1); + c2 = _mm_loadu_si128(((const __m128i*)src) + 2); + c3 = _mm_loadu_si128(((const __m128i*)src) + 3); + c4 = _mm_loadu_si128(((const __m128i*)src) + 4); + c5 = _mm_loadu_si128(((const __m128i*)src) + 5); + c6 = _mm_loadu_si128(((const 
__m128i*)src) + 6); + c7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_store_si128((((__m128i*)dst) + 0), c0); + _mm_store_si128((((__m128i*)dst) + 1), c1); + _mm_store_si128((((__m128i*)dst) + 2), c2); + _mm_store_si128((((__m128i*)dst) + 3), c3); + _mm_store_si128((((__m128i*)dst) + 4), c4); + _mm_store_si128((((__m128i*)dst) + 5), c5); + _mm_store_si128((((__m128i*)dst) + 6), c6); + _mm_store_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + else { // big memory copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + _mm_prefetch((const char*)(src), _MM_HINT_NTA); + + if ((((size_t)src) & 15) == 0) { // source aligned + for (; size >= 128; size -= 128) { + c0 = _mm_load_si128(((const __m128i*)src) + 0); + c1 = _mm_load_si128(((const __m128i*)src) + 1); + c2 = _mm_load_si128(((const __m128i*)src) + 2); + c3 = _mm_load_si128(((const __m128i*)src) + 3); + c4 = _mm_load_si128(((const __m128i*)src) + 4); + c5 = _mm_load_si128(((const __m128i*)src) + 5); + c6 = _mm_load_si128(((const __m128i*)src) + 6); + c7 = _mm_load_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128((((__m128i*)dst) + 0), c0); + _mm_stream_si128((((__m128i*)dst) + 1), c1); + _mm_stream_si128((((__m128i*)dst) + 2), c2); + _mm_stream_si128((((__m128i*)dst) + 3), c3); + _mm_stream_si128((((__m128i*)dst) + 4), c4); + _mm_stream_si128((((__m128i*)dst) + 5), c5); + _mm_stream_si128((((__m128i*)dst) + 6), c6); + _mm_stream_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + else { // source unaligned + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128(((const __m128i*)src) + 0); + c1 = _mm_loadu_si128(((const __m128i*)src) + 1); + c2 = _mm_loadu_si128(((const __m128i*)src) + 2); + c3 = _mm_loadu_si128(((const __m128i*)src) + 3); + c4 = _mm_loadu_si128(((const __m128i*)src) + 4); + c5 = _mm_loadu_si128(((const __m128i*)src) + 5); + c6 = 
_mm_loadu_si128(((const __m128i*)src) + 6); + c7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128((((__m128i*)dst) + 0), c0); + _mm_stream_si128((((__m128i*)dst) + 1), c1); + _mm_stream_si128((((__m128i*)dst) + 2), c2); + _mm_stream_si128((((__m128i*)dst) + 3), c3); + _mm_stream_si128((((__m128i*)dst) + 4), c4); + _mm_stream_si128((((__m128i*)dst) + 5), c5); + _mm_stream_si128((((__m128i*)dst) + 6), c6); + _mm_stream_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + _mm_sfence(); + } + + memcpy_tiny(dst, src, size); + + return destination; +} + + +#endif + + + diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/README.md index 2c63dc21a81..91e01d4a5a0 100644 --- a/libs/libmemcpy/README.md +++ b/libs/libmemcpy/README.md @@ -1,8 +1,8 @@ Build ===== - -with gcc: -> gcc -O3 -msse2 FastMemcpy.c -o FastMemcpy + +with gcc: +> gcc -O3 -msse2 FastMemcpy.c -o FastMemcpy with msvc: > cl -nologo -O2 FastMemcpy.c @@ -10,10 +10,10 @@ with msvc: Features ======== -* 50% speedup in avg. vs traditional memcpy in msvc 2012 or gcc 4.9 -* small size copy optimized with jump table -* medium size copy optimized with sse2 vector copy -* huge size copy optimized with cache prefetch & movntdq +* 50% speedup in avg. vs traditional memcpy in msvc 2012 or gcc 4.9 +* small size copy optimized with jump table +* medium size copy optimized with sse2 vector copy +* huge size copy optimized with cache prefetch & movntdq Reference ========= @@ -28,7 +28,7 @@ Results ``` result: gcc4.9 (msvc 2012 got a similar result): - + benchmark(size=32 bytes, times=16777216): result(dst aligned, src aligned): memcpy_fast=81ms memcpy=281 ms result(dst aligned, src unalign): memcpy_fast=88ms memcpy=254 ms From d9a4acb779ef8d58f52cdb26bafbac16bdcd918d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:01:16 +0300 Subject: [PATCH 21/84] Fixed comment [#CLICKHOUSE-2]. 
--- libs/libmemcpy/FastMemcpy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/FastMemcpy.h index 306bd19ac58..9cb8bb2d019 100644 --- a/libs/libmemcpy/FastMemcpy.h +++ b/libs/libmemcpy/FastMemcpy.h @@ -581,7 +581,7 @@ static void* memcpy_fast(void *destination, const void *source, size_t size) { unsigned char *dst = (unsigned char*)destination; const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L2-cache size + static size_t cachesize = 0x200000; // something around half of LL-cache size size_t padding; // small memory copy From 2695407a44e81a06ecda27c7c01049e4403e95a1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:26:40 +0300 Subject: [PATCH 22/84] Preparation [#CLICKHOUSE-2]. --- libs/libmemcpy/FastMemcpy.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/FastMemcpy.h index 9cb8bb2d019..f6c79eaba60 100644 --- a/libs/libmemcpy/FastMemcpy.h +++ b/libs/libmemcpy/FastMemcpy.h @@ -577,11 +577,10 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { //--------------------------------------------------------------------- // main routine //--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) +static INLINE void* memcpy_fast(void *destination, const void *source, size_t size) { unsigned char *dst = (unsigned char*)destination; const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // something around half of LL-cache size size_t padding; // small memory copy @@ -601,7 +600,8 @@ static void* memcpy_fast(void *destination, const void *source, size_t size) } // medium size copy - if (size <= cachesize) { + if (size <= 0x200000) // something around half of LL-cache size + { __m128i c0, c1, c2, c3, c4, c5, c6, c7; for 
(; size >= 128; size -= 128) { From 4331b52a1592536d8a0c22ec29d15b065f363877 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:27:37 +0300 Subject: [PATCH 23/84] Added wrapper [#CLICKHOUSE-2]. --- libs/CMakeLists.txt | 1 + libs/libmemcpy/CMakeLists.txt | 1 + libs/libmemcpy/memcpy.c | 3 +++ libs/libmemcpy/memcpy.h | 17 +++++++++++++++++ 4 files changed, 22 insertions(+) create mode 100644 libs/libmemcpy/CMakeLists.txt create mode 100644 libs/libmemcpy/memcpy.c create mode 100644 libs/libmemcpy/memcpy.h diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt index e5887d0813f..3420ba2ebc8 100644 --- a/libs/CMakeLists.txt +++ b/libs/CMakeLists.txt @@ -6,6 +6,7 @@ endif () add_subdirectory (libcommon) add_subdirectory (libpocoext) add_subdirectory (libdaemon) +add_subdirectory (libmemcpy) if (USE_MYSQL) add_subdirectory (libmysqlxx) diff --git a/libs/libmemcpy/CMakeLists.txt b/libs/libmemcpy/CMakeLists.txt new file mode 100644 index 00000000000..c06085aacca --- /dev/null +++ b/libs/libmemcpy/CMakeLists.txt @@ -0,0 +1 @@ +add_library (memcpy memcpy.c) diff --git a/libs/libmemcpy/memcpy.c b/libs/libmemcpy/memcpy.c new file mode 100644 index 00000000000..9e1b175bc57 --- /dev/null +++ b/libs/libmemcpy/memcpy.c @@ -0,0 +1,3 @@ +#include "memcpy.h" + +/// This is needed to generate an object file for linking. 
diff --git a/libs/libmemcpy/memcpy.h b/libs/libmemcpy/memcpy.h new file mode 100644 index 00000000000..902133ed3ae --- /dev/null +++ b/libs/libmemcpy/memcpy.h @@ -0,0 +1,17 @@ +#pragma once + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "FastMemcpy.h" + +void * __attribute__((__weak__)) memcpy(void * __restrict destination, const void * __restrict source, size_t size) +{ + return memcpy_fast(destination, source, size); +} + +#ifdef __cplusplus +} +#endif From 1f818dfa78a47ce10b378459d02b4d837419cc2e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:28:57 +0300 Subject: [PATCH 24/84] Moved third-party sources deeper [#CLICKHOUSE-2]. --- libs/libmemcpy/{ => impl}/FastMemcpy.h | 0 libs/libmemcpy/{ => impl}/LICENSE | 0 libs/libmemcpy/{ => impl}/README.md | 0 libs/libmemcpy/memcpy.h | 2 +- 4 files changed, 1 insertion(+), 1 deletion(-) rename libs/libmemcpy/{ => impl}/FastMemcpy.h (100%) rename libs/libmemcpy/{ => impl}/LICENSE (100%) rename libs/libmemcpy/{ => impl}/README.md (100%) diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/impl/FastMemcpy.h similarity index 100% rename from libs/libmemcpy/FastMemcpy.h rename to libs/libmemcpy/impl/FastMemcpy.h diff --git a/libs/libmemcpy/LICENSE b/libs/libmemcpy/impl/LICENSE similarity index 100% rename from libs/libmemcpy/LICENSE rename to libs/libmemcpy/impl/LICENSE diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/impl/README.md similarity index 100% rename from libs/libmemcpy/README.md rename to libs/libmemcpy/impl/README.md diff --git a/libs/libmemcpy/memcpy.h b/libs/libmemcpy/memcpy.h index 902133ed3ae..1f898d4aff6 100644 --- a/libs/libmemcpy/memcpy.h +++ b/libs/libmemcpy/memcpy.h @@ -5,7 +5,7 @@ extern "C" { #endif -#include "FastMemcpy.h" +#include "impl/FastMemcpy.h" void * __attribute__((__weak__)) memcpy(void * __restrict destination, const void * __restrict source, size_t size) { From b700af1d2764e75db1da83b4bc2af0b87f14c427 Mon Sep 17 00:00:00 2001 From: Alexey 
Milovidov Date: Fri, 15 Sep 2017 06:44:34 +0300 Subject: [PATCH 25/84] Removed build of shared library and test executables for libzlib-ng [#CLICKHOUSE-2]. --- contrib/libzlib-ng/CMakeLists.txt | 57 ++----------------------------- 1 file changed, 2 insertions(+), 55 deletions(-) diff --git a/contrib/libzlib-ng/CMakeLists.txt b/contrib/libzlib-ng/CMakeLists.txt index 7054ab81115..cd2eeb97c55 100644 --- a/contrib/libzlib-ng/CMakeLists.txt +++ b/contrib/libzlib-ng/CMakeLists.txt @@ -489,39 +489,15 @@ if(MINGW OR MSYS) set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) endif(MINGW OR MSYS) -add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_GZFILE_SRCS} ${ZLIB_ARCH_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_GZFILE_SRCS} ${ZLIB_ARCH_SRCS} ${ZLIB_ASMS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) -set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) -set_target_properties(zlib PROPERTIES SOVERSION 1) - -if(NOT CYGWIN) - # This property causes shared libraries on Linux to have the full version - # encoded into their final filename. We disable this on Cygwin because - # it causes cygz-${ZLIB_FULL_VERSION}.dll to be created when cygz.dll - # seems to be the default. 
- # - # This has no effect with MSVC, on that platform the version info for - # the DLL comes from the resource file win32/zlib1.rc - set_target_properties(zlib PROPERTIES VERSION ${ZLIB_FULL_VERSION}) -endif() - if(UNIX) # On unix-like platforms the library is almost always called libz - set_target_properties(zlib zlibstatic PROPERTIES OUTPUT_NAME z) - if(NOT APPLE) - set_target_properties(zlib PROPERTIES LINK_FLAGS "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib.map\"") - endif() -elseif(MSYS) - # Suppress version number from shared library name - set(CMAKE_SHARED_LIBRARY_NAME_WITH_VERSION 0) -elseif(BUILD_SHARED_LIBS AND WIN32) - # Creates zlib1.dll when building shared library version - set_target_properties(zlib PROPERTIES SUFFIX "1.dll") + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME z) endif() if(NOT SKIP_INSTALL_LIBRARIES AND NOT SKIP_INSTALL_ALL ) - install(TARGETS zlib zlibstatic + install(TARGETS zlibstatic RUNTIME DESTINATION "${INSTALL_BIN_DIR}" ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" LIBRARY DESTINATION "${INSTALL_LIB_DIR}" ) @@ -529,35 +505,6 @@ endif() if(NOT SKIP_INSTALL_HEADERS AND NOT SKIP_INSTALL_ALL ) install(FILES ${ZLIB_PUBLIC_HDRS} DESTINATION "${INSTALL_INC_DIR}") endif() -if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL ) - install(FILES zlib.3 DESTINATION "${INSTALL_MAN_DIR}/man3") -endif() if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL ) install(FILES ${ZLIB_PC} DESTINATION "${INSTALL_PKGCONFIG_DIR}") endif() - -#============================================================================ -# Example binaries -#============================================================================ - -if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set (CMAKE_EXE_LINKER_FLAGS "") -endif () - -add_executable(example test/example.c) -target_link_libraries(example zlib) -add_test(example example) - -add_executable(minigzip test/minigzip.c) -target_link_libraries(minigzip zlib) - -if(HAVE_OFF64_T) - add_executable(example64 
test/example.c) - target_link_libraries(example64 zlib) - set_target_properties(example64 PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") - add_test(example64 example64) - - add_executable(minigzip64 test/minigzip.c) - target_link_libraries(minigzip64 zlib) - set_target_properties(minigzip64 PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") -endif() From 09002844f209fd5f140f353e428deebe959077e8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 08:06:52 +0300 Subject: [PATCH 26/84] Using internal memcpy [#CLICKHOUSE-2]. --- libs/libcommon/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index a3a4ff14326..3b6489d0bc7 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -90,7 +90,8 @@ target_link_libraries ( ${Boost_FILESYSTEM_LIBRARY} ${MALLOC_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} - ${RT_LIBRARIES}) + ${RT_LIBRARIES} + memcpy) if (ENABLE_TESTS) add_subdirectory (src/tests) From c7f036e2bb86086aa3bb1fb5e6e6059f6a500a46 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 08:07:42 +0300 Subject: [PATCH 27/84] Added README [#CLICKHOUSE-2]. --- libs/libmemcpy/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 libs/libmemcpy/README.md diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/README.md new file mode 100644 index 00000000000..d3dd14ac55b --- /dev/null +++ b/libs/libmemcpy/README.md @@ -0,0 +1,20 @@ +Internal implementation of `memcpy` function. 
+ +It has the following advantages over `libc`-supplied implementation: +- it is linked statically, so the function is called directly, not through a `PLT` (procedure linkage table of shared library); +- it is linked statically, so the function can have position-dependent code; +- your binaries will not depend on `glibc`'s memcpy, which forces a dependency on a specific symbol version like `memcpy@@GLIBC_2.14` and consequently on a specific version of the `glibc` library; +- you can include `memcpy.h` directly and the function has the chance to be inlined, which is beneficial for memory regions whose sizes are small but unknown at compile time; +- this version of `memcpy` claims to be faster (in our benchmarks, the difference is within a few percent). + +Currently it uses the implementation from **Linwei** (skywind3000@163.com). +Look at https://www.zhihu.com/question/35172305 for discussion. + +Drawbacks: +- it uses only SSE 2 and doesn't use wider (AVX, AVX 512) vector registers when available; +- no CPU dispatching; doesn't take into account actual cache size. + +Also worth looking at: +- simple implementation from Facebook: https://github.com/facebook/folly/blob/master/folly/memcpy.S +- implementation from Agner Fog: http://www.agner.org/optimize/ +- glibc source code. From a44208ba2517b0803c65c53b477b45ba1fcf9008 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 08:27:09 +0300 Subject: [PATCH 28/84] Refined "GLIBC_COMPATIBILITY" option [#CLICKHOUSE-3275]. 
--- CMakeLists.txt | 6 +- libs/CMakeLists.txt | 9 ++- libs/libcommon/CMakeLists.txt | 11 ++- .../include/common/glibc_compatibility.h | 79 ------------------- libs/libglibc-compatibility/CMakeLists.txt | 1 + .../glibc-compatibility.c | 52 ++++++++++++ .../glibc-compatibility.h | 37 +++++++++ 7 files changed, 111 insertions(+), 84 deletions(-) delete mode 100644 libs/libcommon/include/common/glibc_compatibility.h create mode 100644 libs/libglibc-compatibility/CMakeLists.txt create mode 100644 libs/libglibc-compatibility/glibc-compatibility.c create mode 100644 libs/libglibc-compatibility/glibc-compatibility.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e0fe38e84a..a671fe1b775 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,11 +71,11 @@ if (USE_STATIC_LIBRARIES) list(REVERSE CMAKE_FIND_LIBRARY_SUFFIXES) endif () -option (GLIBC_COMPATIBILITY "Set to TRUE to enable compatibility with older glibc libraries. Note that it is not compatible with ASan." OFF) +option (USE_INTERNAL_MEMCPY "Use internal implementation of 'memcpy' function instead of provided by libc. Only for x86_64." ON) +option (GLIBC_COMPATIBILITY "Set to TRUE to enable compatibility with older glibc libraries. Implies USE_INTERNAL_MEMCPY." 
OFF) if (GLIBC_COMPATIBILITY) - set (GLIBC_COMPATIBILITY_COMPILE_FLAGS "-include ${ClickHouse_SOURCE_DIR}/libs/libcommon/include/common/glibc_compatibility.h") - set (GLIBC_COMPATIBILITY_LINK_FLAGS "-Wl,--wrap=memcpy") + set (USE_INTERNAL_MEMCPY ON) endif () if (CXX11_ABI STREQUAL ENABLE) diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt index 3420ba2ebc8..970d2be15b4 100644 --- a/libs/CMakeLists.txt +++ b/libs/CMakeLists.txt @@ -6,7 +6,14 @@ endif () add_subdirectory (libcommon) add_subdirectory (libpocoext) add_subdirectory (libdaemon) -add_subdirectory (libmemcpy) + +if (USE_INTERNAL_MEMCPY) + add_subdirectory (libmemcpy) +endif() + +if (GLIBC_COMPATIBILITY) + add_subdirectory (libglibc-compatibility) +endif () if (USE_MYSQL) add_subdirectory (libmysqlxx) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 3b6489d0bc7..3ba10c85636 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -75,6 +75,14 @@ else () message (STATUS "Disabling libtcmalloc for valgrind better analysis") endif () +if (GLIBC_COMPATIBILITY) + set (GLIBC_COMPATIBILITY_LIBRARIES glibc-compatibility) +endif () + +if (USE_INTERNAL_MEMCPY) + set (MEMCPY_LIBRARIES memcpy) +endif () + find_package (Threads) target_include_directories (common BEFORE PRIVATE ${CCTZ_INCLUDE_DIR}) @@ -91,7 +99,8 @@ target_link_libraries ( ${MALLOC_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${RT_LIBRARIES} - memcpy) + ${GLIBC_COMPATIBILITY_LIBRARIES} + ${MEMCPY_LIBRARIES}) if (ENABLE_TESTS) add_subdirectory (src/tests) diff --git a/libs/libcommon/include/common/glibc_compatibility.h b/libs/libcommon/include/common/glibc_compatibility.h deleted file mode 100644 index b2a13176f27..00000000000 --- a/libs/libcommon/include/common/glibc_compatibility.h +++ /dev/null @@ -1,79 +0,0 @@ -/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, - * such as Ubuntu Lucid or CentOS 6. 
- * - * Highly experimental, not recommended, disabled by default. - * - * To use, include this file with -include compiler parameter. - * And add -Wl,--wrap=memcpy for linking. - * - * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc - * - * If you want even older systems, such as Ubuntu Hardy, - * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. - */ - -#if defined (__cplusplus) -extern "C" { -#endif - - -#include -#include - -__attribute__((__weak__)) long int __fdelt_chk(long int d) -{ - if (d < 0 || d >= FD_SETSIZE) - abort(); - return d / __NFDBITS; -} - -#include - -__attribute__((__weak__)) int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, __SIZE_TYPE__ fdslen) -{ - if (fdslen / sizeof(*fds) < nfds) - abort(); - return poll(fds, nfds, timeout); -} - - -__attribute__((__weak__)) void * __memcpy_glibc_2_2_5(void *, const void *, size_t); - -__asm__(".symver __memcpy_glibc_2_2_5, memcpy@GLIBC_2.2.5"); - -__attribute__((__weak__)) void * __wrap_memcpy(void * dest, const void * src, size_t n) -{ - return __memcpy_glibc_2_2_5(dest, src, n); -} - - -__attribute__((__weak__)) size_t __pthread_get_minstack(const pthread_attr_t * attr) -{ - return 1048576; /// This is a guess. Don't sure it is correct. -} - -#include -#include -#include -#include - -extern long int syscall (long int __sysno, ...) 
__THROW; - -__attribute__((__weak__)) int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid) -{ - siginfo_t info; - - memset(&info, 0, sizeof(siginfo_t)); - info.si_signo = sig; - info.si_code = SI_ASYNCNL; - info.si_pid = caller_pid; - info.si_uid = getuid(); - info.si_value = val; - - return syscall(__NR_rt_sigqueueinfo, info.si_pid, sig, &info); -} - - -#if defined (__cplusplus) -} -#endif diff --git a/libs/libglibc-compatibility/CMakeLists.txt b/libs/libglibc-compatibility/CMakeLists.txt new file mode 100644 index 00000000000..e8c5a4ba757 --- /dev/null +++ b/libs/libglibc-compatibility/CMakeLists.txt @@ -0,0 +1 @@ +add_library (glibc-compatibility glibc-compatibility.c) diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c new file mode 100644 index 00000000000..5524017b61f --- /dev/null +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -0,0 +1,52 @@ +#include "glibc-compatibility.h" + +#if defined (__cplusplus) +extern "C" { +#endif + +#include +#include + +long int __fdelt_chk(long int d) +{ + if (d < 0 || d >= FD_SETSIZE) + abort(); + return d / __NFDBITS; +} + + +int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen) +{ + if (fdslen / sizeof(*fds) < nfds) + abort(); + return poll(fds, nfds, timeout); +} + + +size_t __pthread_get_minstack(const pthread_attr_t * attr) +{ + return 1048576; /// This is a guess. Don't sure it is correct. +} + +#include +#include + +extern long int syscall (long int __sysno, ...) 
__THROW; + +int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid) +{ + siginfo_t info; + + memset(&info, 0, sizeof(siginfo_t)); + info.si_signo = sig; + info.si_code = SI_ASYNCNL; + info.si_pid = caller_pid; + info.si_uid = getuid(); + info.si_value = val; + + return syscall(__NR_rt_sigqueueinfo, info.si_pid, sig, &info); +} + +#if defined (__cplusplus) +} +#endif diff --git a/libs/libglibc-compatibility/glibc-compatibility.h b/libs/libglibc-compatibility/glibc-compatibility.h new file mode 100644 index 00000000000..aeea2fc4adb --- /dev/null +++ b/libs/libglibc-compatibility/glibc-compatibility.h @@ -0,0 +1,37 @@ +#pragma once + +/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, + * such as Ubuntu Lucid or CentOS 6. + * + * Highly experimental, not recommended, disabled by default. + * + * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc + * + * If you want even older systems, such as Ubuntu Hardy, + * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. + */ + +#if defined (__cplusplus) +extern "C" { +#endif + + +__attribute__((__weak__)) long int __fdelt_chk(long int d); + +#include +#include + +__attribute__((__weak__)) int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen); + +#include + +__attribute__((__weak__)) size_t __pthread_get_minstack(const pthread_attr_t * attr); + +#include +#include + +__attribute__((__weak__)) int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid); + +#if defined (__cplusplus) +} +#endif From dd8348421ceac19af30279af73eb30ddc5aee600 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 08:43:43 +0300 Subject: [PATCH 29/84] Simplification [#CLICKHOUSE-3275]. 
--- .../glibc-compatibility.c | 16 +++++++- .../glibc-compatibility.h | 37 ------------------- 2 files changed, 15 insertions(+), 38 deletions(-) delete mode 100644 libs/libglibc-compatibility/glibc-compatibility.h diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c index 5524017b61f..cb0dd617bdf 100644 --- a/libs/libglibc-compatibility/glibc-compatibility.c +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -1,4 +1,13 @@ -#include "glibc-compatibility.h" +/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, + * such as Ubuntu Lucid or CentOS 6. + * + * Highly experimental, not recommended, disabled by default. + * + * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc + * + * If you want even older systems, such as Ubuntu Hardy, + * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. + */ #if defined (__cplusplus) extern "C" { @@ -14,6 +23,8 @@ long int __fdelt_chk(long int d) return d / __NFDBITS; } +#include +#include int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen) { @@ -22,12 +33,15 @@ int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen) return poll(fds, nfds, timeout); } +#include size_t __pthread_get_minstack(const pthread_attr_t * attr) { return 1048576; /// This is a guess. Don't sure it is correct. } +#include +#include #include #include diff --git a/libs/libglibc-compatibility/glibc-compatibility.h b/libs/libglibc-compatibility/glibc-compatibility.h deleted file mode 100644 index aeea2fc4adb..00000000000 --- a/libs/libglibc-compatibility/glibc-compatibility.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, - * such as Ubuntu Lucid or CentOS 6. - * - * Highly experimental, not recommended, disabled by default. 
- * - * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc - * - * If you want even older systems, such as Ubuntu Hardy, - * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. - */ - -#if defined (__cplusplus) -extern "C" { -#endif - - -__attribute__((__weak__)) long int __fdelt_chk(long int d); - -#include -#include - -__attribute__((__weak__)) int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen); - -#include - -__attribute__((__weak__)) size_t __pthread_get_minstack(const pthread_attr_t * attr); - -#include -#include - -__attribute__((__weak__)) int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid); - -#if defined (__cplusplus) -} -#endif From db431bf9496f8e9f74087207ff6db91a09c297bc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 10:24:53 +0300 Subject: [PATCH 30/84] Better GLIBC_COMPATIBILITY option [#CLICKHOUSE-3275]. --- CMakeLists.txt | 2 +- libs/libglibc-compatibility/CMakeLists.txt | 3 +- .../glibc-compatibility.c | 92 ++++-- libs/libglibc-compatibility/musl/COPYRIGHT | 163 +++++++++++ libs/libglibc-compatibility/musl/README | 6 + libs/libglibc-compatibility/musl/fallocate.c | 10 + libs/libglibc-compatibility/musl/lgamma.c | 268 ++++++++++++++++++ libs/libglibc-compatibility/musl/longjmp.s | 22 ++ libs/libglibc-compatibility/musl/pipe2.c | 24 ++ libs/libglibc-compatibility/musl/vasprintf.c | 15 + 10 files changed, 580 insertions(+), 25 deletions(-) create mode 100644 libs/libglibc-compatibility/musl/COPYRIGHT create mode 100644 libs/libglibc-compatibility/musl/README create mode 100644 libs/libglibc-compatibility/musl/fallocate.c create mode 100644 libs/libglibc-compatibility/musl/lgamma.c create mode 100644 libs/libglibc-compatibility/musl/longjmp.s create mode 100644 libs/libglibc-compatibility/musl/pipe2.c create mode 100644 libs/libglibc-compatibility/musl/vasprintf.c diff --git a/CMakeLists.txt b/CMakeLists.txt index a671fe1b775..98d8fcd578b 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,7 +192,7 @@ if (UNBUNDLED) else () set(NOT_UNBUNDLED 1) endif () -# Using system libs can cause lot of warnings in includes. +# Using system libs can cause lot of warnings in includes. if (UNBUNDLED OR NOT (CMAKE_SYSTEM MATCHES "Linux" OR APPLE) OR ARCH_32) option (NO_WERROR "Disable -Werror compiler option" ON) endif () diff --git a/libs/libglibc-compatibility/CMakeLists.txt b/libs/libglibc-compatibility/CMakeLists.txt index e8c5a4ba757..9521dba9eab 100644 --- a/libs/libglibc-compatibility/CMakeLists.txt +++ b/libs/libglibc-compatibility/CMakeLists.txt @@ -1 +1,2 @@ -add_library (glibc-compatibility glibc-compatibility.c) +enable_language (ASM) +add_library (glibc-compatibility glibc-compatibility.c musl/pipe2.c musl/fallocate.c musl/longjmp.s musl/vasprintf.c musl/lgamma.c) diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c index cb0dd617bdf..e06bac8e290 100644 --- a/libs/libglibc-compatibility/glibc-compatibility.c +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -1,18 +1,46 @@ -/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, +/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.4, * such as Ubuntu Lucid or CentOS 6. * * Highly experimental, not recommended, disabled by default. * * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc - * - * If you want even older systems, such as Ubuntu Hardy, - * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. */ #if defined (__cplusplus) extern "C" { #endif +#include + +size_t __pthread_get_minstack(const pthread_attr_t * attr) +{ + return 1048576; /// This is a guess. Don't sure it is correct. +} + +#include +#include +#include +#include + +long int syscall(long int __sysno, ...) 
__THROW; + +int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid) +{ + siginfo_t info; + + memset(&info, 0, sizeof(siginfo_t)); + info.si_signo = sig; + info.si_code = SI_ASYNCNL; + info.si_pid = caller_pid; + info.si_uid = getuid(); + info.si_value = val; + + return syscall(__NR_rt_sigqueueinfo, info.si_pid, sig, &info); +} + + +/// NOTE This disables some of FORTIFY_SOURCE functionality. + #include #include @@ -33,34 +61,52 @@ int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen) return poll(fds, nfds, timeout); } -#include +#include -size_t __pthread_get_minstack(const pthread_attr_t * attr) +void longjmp(jmp_buf env, int val); + +void __longjmp_chk(jmp_buf env, int val) { - return 1048576; /// This is a guess. Don't sure it is correct. + return longjmp(env, val); } -#include -#include -#include -#include +#include -extern long int syscall (long int __sysno, ...) __THROW; +int vasprintf(char **s, const char *fmt, va_list ap); -int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid) +int __vasprintf_chk(char **s, const char *fmt, va_list ap) { - siginfo_t info; - - memset(&info, 0, sizeof(siginfo_t)); - info.si_signo = sig; - info.si_code = SI_ASYNCNL; - info.si_pid = caller_pid; - info.si_uid = getuid(); - info.si_value = val; - - return syscall(__NR_rt_sigqueueinfo, info.si_pid, sig, &info); + return vasprintf(s, fmt, ap); } +size_t __fread_chk(void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + return fread(ptr, size, nmemb, stream); +} + + +#include + +int vsscanf(const char *str, const char *format, va_list ap); + +int __isoc99_vsscanf(const char *str, const char *format, va_list ap) +{ + return vsscanf(str, format, ap); +} + +int sscanf(const char *restrict s, const char *restrict fmt, ...) +{ + int ret; + va_list ap; + va_start(ap, fmt); + ret = vsscanf(s, fmt, ap); + va_end(ap); + return ret; +} + +int __isoc99_sscanf(const char *str, const char *format, ...) 
__attribute__((weak, alias("sscanf"))); + + #if defined (__cplusplus) } #endif diff --git a/libs/libglibc-compatibility/musl/COPYRIGHT b/libs/libglibc-compatibility/musl/COPYRIGHT new file mode 100644 index 00000000000..f0ee3b78d87 --- /dev/null +++ b/libs/libglibc-compatibility/musl/COPYRIGHT @@ -0,0 +1,163 @@ +musl as a whole is licensed under the following standard MIT license: + +---------------------------------------------------------------------- +Copyright © 2005-2014 Rich Felker, et al. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +---------------------------------------------------------------------- + +Authors/contributors include: + +Alex Dowad +Alexander Monakov +Anthony G. 
Basile +Arvid Picciani +Bobby Bingham +Boris Brezillon +Brent Cook +Chris Spiegel +Clément Vasseur +Daniel Micay +Denys Vlasenko +Emil Renner Berthing +Felix Fietkau +Felix Janda +Gianluca Anzolin +Hauke Mehrtens +Hiltjo Posthuma +Isaac Dunham +Jaydeep Patil +Jens Gustedt +Jeremy Huntwork +Jo-Philipp Wich +Joakim Sindholt +John Spencer +Josiah Worcester +Justin Cormack +Khem Raj +Kylie McClain +Luca Barbato +Luka Perkov +M Farkas-Dyck (Strake) +Mahesh Bodapati +Michael Forney +Natanael Copa +Nicholas J. Kain +orc +Pascal Cuoq +Petr Hosek +Pierre Carrier +Rich Felker +Richard Pennington +Shiz +sin +Solar Designer +Stefan Kristiansson +Szabolcs Nagy +Timo Teräs +Trutz Behn +Valentin Ochs +William Haddon + +Portions of this software are derived from third-party works licensed +under terms compatible with the above MIT license: + +The TRE regular expression implementation (src/regex/reg* and +src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed +under a 2-clause BSD license (license text in the source files). The +included version has been heavily modified by Rich Felker in 2012, in +the interests of size, simplicity, and namespace cleanliness. + +Much of the math library code (src/math/* and src/complex/*) is +Copyright © 1993,2004 Sun Microsystems or +Copyright © 2003-2011 David Schultz or +Copyright © 2003-2009 Steven G. Kargl or +Copyright © 2003-2009 Bruce D. Evans or +Copyright © 2008 Stephen L. Moshier +and labelled as such in comments in the individual source files. All +have been licensed under extremely permissive terms. + +The ARM memcpy code (src/string/arm/memcpy_el.S) is Copyright © 2008 +The Android Open Source Project and is licensed under a two-clause BSD +license. It was taken from Bionic libc, used on Android. + +The implementation of DES for crypt (src/crypt/crypt_des.c) is +Copyright © 1994 David Burren. It is licensed under a BSD license. 
+ +The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was +originally written by Solar Designer and placed into the public +domain. The code also comes with a fallback permissive license for use +in jurisdictions that may not recognize the public domain. + +The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 +Valentin Ochs and is licensed under an MIT-style license. + +The BSD PRNG implementation (src/prng/random.c) and XSI search API +(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and +licensed under following terms: "Permission to use, copy, modify, +and/or distribute this code for any purpose with or without fee is +hereby granted. There is no warranty." + +The x86_64 port was written by Nicholas J. Kain and is licensed under +the standard MIT terms. + +The mips and microblaze ports were originally written by Richard +Pennington for use in the ellcc project. The original code was adapted +by Rich Felker for build system and code conventions during upstream +integration. It is licensed under the standard MIT terms. + +The mips64 port was contributed by Imagination Technologies and is +licensed under the standard MIT terms. + +The powerpc port was also originally written by Richard Pennington, +and later supplemented and integrated by John Spencer. It is licensed +under the standard MIT terms. + +All other files which have no copyright comments are original works +produced specifically for use as part of this library, written either +by Rich Felker, the main author of the library, or by one or more +contibutors listed above. Details on authorship of individual files +can be found in the git version control history of the project. The +omission of copyright and license comments in each file is in the +interest of source tree size. 
+ +In addition, permission is hereby granted for all public header files +(include/* and arch/*/bits/*) and crt files intended to be linked into +applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit +the copyright notice and permission notice otherwise required by the +license, and to use these files without any requirement of +attribution. These files include substantial contributions from: + +Bobby Bingham +John Spencer +Nicholas J. Kain +Rich Felker +Richard Pennington +Stefan Kristiansson +Szabolcs Nagy + +all of whom have explicitly granted such permission. + +This file previously contained text expressing a belief that most of +the files covered by the above exception were sufficiently trivial not +to be subject to copyright, resulting in confusion over whether it +negated the permissions granted in the license. In the spirit of +permissive licensing, and of not having licensing issues being an +obstacle to adoption, that text has been removed. diff --git a/libs/libglibc-compatibility/musl/README b/libs/libglibc-compatibility/musl/README new file mode 100644 index 00000000000..11f6caa2d7e --- /dev/null +++ b/libs/libglibc-compatibility/musl/README @@ -0,0 +1,6 @@ +Tiny pieces extracted from MUSL library. + +git://git.musl-libc.org/musl +c10bc61508dc52b8315084e628f36a6c3c2dabb1 + +NOTE: Files were edited. diff --git a/libs/libglibc-compatibility/musl/fallocate.c b/libs/libglibc-compatibility/musl/fallocate.c new file mode 100644 index 00000000000..31e63822c94 --- /dev/null +++ b/libs/libglibc-compatibility/musl/fallocate.c @@ -0,0 +1,10 @@ +#define _GNU_SOURCE +#include +#include + +extern long int syscall (long int __sysno, ...) 
__THROW; + +int fallocate(int fd, int mode, off_t base, off_t len) +{ + return syscall(SYS_fallocate, fd, mode, base, len); +} diff --git a/libs/libglibc-compatibility/musl/lgamma.c b/libs/libglibc-compatibility/musl/lgamma.c new file mode 100644 index 00000000000..55a1f9e46a4 --- /dev/null +++ b/libs/libglibc-compatibility/musl/lgamma.c @@ -0,0 +1,268 @@ +/* origin: FreeBSD /usr/src/lib/msun/src/e_lgamma_r.c */ +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + * + */ +/* lgamma_r(x, signgamp) + * Reentrant version of the logarithm of the Gamma function + * with user provide pointer for the sign of Gamma(x). + * + * Method: + * 1. Argument Reduction for 0 < x <= 8 + * Since gamma(1+s)=s*gamma(s), for x in [0,8], we may + * reduce x to a number in [1.5,2.5] by + * lgamma(1+s) = log(s) + lgamma(s) + * for example, + * lgamma(7.3) = log(6.3) + lgamma(6.3) + * = log(6.3*5.3) + lgamma(5.3) + * = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3) + * 2. Polynomial approximation of lgamma around its + * minimun ymin=1.461632144968362245 to maintain monotonicity. + * On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use + * Let z = x-ymin; + * lgamma(x) = -1.214862905358496078218 + z^2*poly(z) + * where + * poly(z) is a 14 degree polynomial. + * 2. Rational approximation in the primary interval [2,3] + * We use the following approximation: + * s = x-2.0; + * lgamma(x) = 0.5*s + s*P(s)/Q(s) + * with accuracy + * |P/Q - (lgamma(x)-0.5s)| < 2**-61.71 + * Our algorithms are based on the following observation + * + * zeta(2)-1 2 zeta(3)-1 3 + * lgamma(2+s) = s*(1-Euler) + --------- * s - --------- * s + ... + * 2 3 + * + * where Euler = 0.5771... 
is the Euler constant, which is very + * close to 0.5. + * + * 3. For x>=8, we have + * lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+.... + * (better formula: + * lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...) + * Let z = 1/x, then we approximation + * f(z) = lgamma(x) - (x-0.5)(log(x)-1) + * by + * 3 5 11 + * w = w0 + w1*z + w2*z + w3*z + ... + w6*z + * where + * |w - f(z)| < 2**-58.74 + * + * 4. For negative x, since (G is gamma function) + * -x*G(-x)*G(x) = pi/sin(pi*x), + * we have + * G(x) = pi/(sin(pi*x)*(-x)*G(-x)) + * since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0 + * Hence, for x<0, signgam = sign(sin(pi*x)) and + * lgamma(x) = log(|Gamma(x)|) + * = log(pi/(|x*sin(pi*x)|)) - lgamma(-x); + * Note: one should avoid compute pi*(-x) directly in the + * computation of sin(pi*(-x)). + * + * 5. Special Cases + * lgamma(2+s) ~ s*(1-Euler) for tiny s + * lgamma(1) = lgamma(2) = 0 + * lgamma(x) ~ -log(|x|) for tiny x + * lgamma(0) = lgamma(neg.integer) = inf and raise divide-by-zero + * lgamma(inf) = inf + * lgamma(-inf) = inf (bug for bug compatible with C99!?) 
+ * + */ + +static const double +pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */ +a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */ +a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */ +a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */ +a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */ +a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */ +a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */ +a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */ +a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */ +a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */ +a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */ +a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */ +a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */ +tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */ +tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */ +/* tt = -(tail of tf) */ +tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */ +t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */ +t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */ +t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */ +t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */ +t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */ +t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */ +t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */ +t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */ +t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */ +t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */ +t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */ +t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */ +t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */ +t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */ +t14 = 3.35529192635519073543e-04, 
/* 0x3F35FD3E, 0xE8C2D3F4 */ +u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */ +u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */ +u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */ +u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */ +u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */ +u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */ +v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */ +v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */ +v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */ +v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */ +v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */ +s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */ +s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */ +s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */ +s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */ +s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */ +s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */ +s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */ +r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */ +r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */ +r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */ +r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */ +r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */ +r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */ +w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */ +w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */ +w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */ +w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */ +w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */ +w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */ +w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */ + +#include 
+#include + +double lgamma_r(double x, int *signgamp) +{ + union {double f; uint64_t i;} u = {x}; + double_t t,y,z,nadj,p,p1,p2,p3,q,r,w; + uint32_t ix; + int sign,i; + + /* purge off +-inf, NaN, +-0, tiny and negative arguments */ + *signgamp = 1; + sign = u.i>>63; + ix = u.i>>32 & 0x7fffffff; + if (ix >= 0x7ff00000) + return x*x; + if (ix < (0x3ff-70)<<20) { /* |x|<2**-70, return -log(|x|) */ + if(sign) { + x = -x; + *signgamp = -1; + } + return -log(x); + } + if (sign) { + x = -x; + t = sin(pi * x); + if (t == 0.0) /* -integer */ + return 1.0/(x-x); + if (t > 0.0) + *signgamp = -1; + else + t = -t; + nadj = log(pi/(t*x)); + } + + /* purge off 1 and 2 */ + if ((ix == 0x3ff00000 || ix == 0x40000000) && (uint32_t)u.i == 0) + r = 0; + /* for x < 2.0 */ + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + r = -log(x); + if (ix >= 0x3FE76944) { + y = 1.0 - x; + i = 0; + } else if (ix >= 0x3FCDA661) { + y = x - (tc-1.0); + i = 1; + } else { + y = x; + i = 2; + } + } else { + r = 0.0; + if (ix >= 0x3FFBB4C3) { /* [1.7316,2] */ + y = 2.0 - x; + i = 0; + } else if(ix >= 0x3FF3B4C4) { /* [1.23,1.73] */ + y = x - tc; + i = 1; + } else { + y = x - 1.0; + i = 2; + } + } + switch (i) { + case 0: + z = y*y; + p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10)))); + p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11))))); + p = y*p1+p2; + r += (p-0.5*y); + break; + case 1: + z = y*y; + w = z*y; + p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */ + p2 = t1+w*(t4+w*(t7+w*(t10+w*t13))); + p3 = t2+w*(t5+w*(t8+w*(t11+w*t14))); + p = z*p1-(tt-w*(p2+y*p3)); + r += tf + p; + break; + case 2: + p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5))))); + p2 = 1.0+y*(v1+y*(v2+y*(v3+y*(v4+y*v5)))); + r += -0.5*y + p1/p2; + } + } else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int)x; + y = x - (double)i; + p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6)))))); + q = 1.0+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6))))); + r = 0.5*y+p/q; + z = 1.0; /* lgamma(1+s) = log(s) + lgamma(s) */ + 
switch (i) { + case 7: z *= y + 6.0; /* FALLTHRU */ + case 6: z *= y + 5.0; /* FALLTHRU */ + case 5: z *= y + 4.0; /* FALLTHRU */ + case 4: z *= y + 3.0; /* FALLTHRU */ + case 3: z *= y + 2.0; /* FALLTHRU */ + r += log(z); + break; + } + } else if (ix < 0x43900000) { /* 8.0 <= x < 2**58 */ + t = log(x); + z = 1.0/x; + y = z*z; + w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6))))); + r = (x-0.5)*(t-1.0)+w; + } else /* 2**58 <= x <= inf */ + r = x*(log(x)-1.0); + if (sign) + r = nadj - r; + return r; +} + + +int signgam; + +double lgamma(double x) +{ + return lgamma_r(x, &signgam); +} diff --git a/libs/libglibc-compatibility/musl/longjmp.s b/libs/libglibc-compatibility/musl/longjmp.s new file mode 100644 index 00000000000..e175a4b9606 --- /dev/null +++ b/libs/libglibc-compatibility/musl/longjmp.s @@ -0,0 +1,22 @@ +/* Copyright 2011-2012 Nicholas J. Kain, licensed under standard MIT license */ +.global _longjmp +.global longjmp +.type _longjmp,@function +.type longjmp,@function +_longjmp: +longjmp: + mov %rsi,%rax /* val will be longjmp return */ + test %rax,%rax + jnz 1f + inc %rax /* if val==0, val=1 per longjmp semantics */ +1: + mov (%rdi),%rbx /* rdi is the jmp_buf, restore regs from it */ + mov 8(%rdi),%rbp + mov 16(%rdi),%r12 + mov 24(%rdi),%r13 + mov 32(%rdi),%r14 + mov 40(%rdi),%r15 + mov 48(%rdi),%rdx /* this ends up being the stack pointer */ + mov %rdx,%rsp + mov 56(%rdi),%rdx /* this is the instruction pointer */ + jmp *%rdx /* goto saved address without altering rsp */ diff --git a/libs/libglibc-compatibility/musl/pipe2.c b/libs/libglibc-compatibility/musl/pipe2.c new file mode 100644 index 00000000000..061f2e07313 --- /dev/null +++ b/libs/libglibc-compatibility/musl/pipe2.c @@ -0,0 +1,24 @@ +#include +#include +#include +#include + +extern long int syscall (long int __sysno, ...) 
__THROW; + +int pipe2(int fd[2], int flag) +{ + if (!flag) return pipe(fd); + int ret = syscall(SYS_pipe2, fd, flag); + if (ret == 0 || errno != ENOSYS) return ret; /* glibc syscall() returns -1 and sets errno (not -errno); emulate below only when the kernel lacks pipe2 */ + ret = pipe(fd); + if (ret) return ret; + if (flag & O_CLOEXEC) { + syscall(SYS_fcntl, fd[0], F_SETFD, FD_CLOEXEC); + syscall(SYS_fcntl, fd[1], F_SETFD, FD_CLOEXEC); + } + if (flag & O_NONBLOCK) { + syscall(SYS_fcntl, fd[0], F_SETFL, O_NONBLOCK); + syscall(SYS_fcntl, fd[1], F_SETFL, O_NONBLOCK); + } + return 0; +} diff --git a/libs/libglibc-compatibility/musl/vasprintf.c b/libs/libglibc-compatibility/musl/vasprintf.c new file mode 100644 index 00000000000..08251bc20ec --- /dev/null +++ b/libs/libglibc-compatibility/musl/vasprintf.c @@ -0,0 +1,15 @@ +#define _GNU_SOURCE +#include +#include +#include + +int vasprintf(char **s, const char *fmt, va_list ap) +{ + va_list ap2; + va_copy(ap2, ap); + int l = vsnprintf(0, 0, fmt, ap2); + va_end(ap2); + + if (l<0 || !(*s=malloc(l+1U))) return -1; + return vsnprintf(*s, l+1U, fmt, ap); +} From 705c159eaf44cc4fe51028aa0b851b49de3352af Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 10:29:03 +0300 Subject: [PATCH 31/84] Better GLIBC_COMPATIBILITY option [#CLICKHOUSE-3275]. 
--- libs/libglibc-compatibility/glibc-compatibility.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c index e06bac8e290..6aed44a3efd 100644 --- a/libs/libglibc-compatibility/glibc-compatibility.c +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -70,7 +70,7 @@ void __longjmp_chk(jmp_buf env, int val) return longjmp(env, val); } -#include +#include int vasprintf(char **s, const char *fmt, va_list ap); @@ -79,14 +79,13 @@ int __vasprintf_chk(char **s, const char *fmt, va_list ap) return vasprintf(s, fmt, ap); } -size_t __fread_chk(void *ptr, size_t size, size_t nmemb, FILE *stream) +size_t fread(void *ptr, size_t size, size_t nmemb, void *stream); + +size_t __fread_chk(void *ptr, size_t size, size_t nmemb, void *stream) { return fread(ptr, size, nmemb, stream); } - -#include - int vsscanf(const char *str, const char *format, va_list ap); int __isoc99_vsscanf(const char *str, const char *format, va_list ap) From 4f4c972019259cd7ffd4e6a3f7ad3b9092f5cde9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 10:32:27 +0300 Subject: [PATCH 32/84] Better GLIBC_COMPATIBILITY option [#CLICKHOUSE-3275]. 
--- libs/libglibc-compatibility/musl/lgamma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libglibc-compatibility/musl/lgamma.c b/libs/libglibc-compatibility/musl/lgamma.c index 55a1f9e46a4..b0e4f3aa537 100644 --- a/libs/libglibc-compatibility/musl/lgamma.c +++ b/libs/libglibc-compatibility/musl/lgamma.c @@ -149,7 +149,7 @@ w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */ double lgamma_r(double x, int *signgamp) { union {double f; uint64_t i;} u = {x}; - double_t t,y,z,nadj,p,p1,p2,p3,q,r,w; + double_t t,y,z,nadj=0,p,p1,p2,p3,q,r,w; uint32_t ix; int sign,i; From c4b8f0067aef3ac12c4aa738df74eaedfedf0122 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 10:51:38 +0300 Subject: [PATCH 33/84] Better GLIBC_COMPATIBILITY option [#CLICKHOUSE-3275]. --- libs/libglibc-compatibility/glibc-compatibility.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c index 6aed44a3efd..367bd2de765 100644 --- a/libs/libglibc-compatibility/glibc-compatibility.c +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -105,6 +105,13 @@ int sscanf(const char *restrict s, const char *restrict fmt, ...) int __isoc99_sscanf(const char *str, const char *format, ...) __attribute__((weak, alias("sscanf"))); +int open(const char *path, int oflag); + +int __open_2(const char *path, int oflag) +{ + return open(path, oflag); +} + #if defined (__cplusplus) } From 4b3e2cd61ef6eaef87dcfc7c821dada577df2d7d Mon Sep 17 00:00:00 2001 From: Date: Fri, 15 Sep 2017 12:00:53 +0300 Subject: [PATCH 34/84] `force_index_by_date` and `force_primary_key` are documented. 
--- docs/ru/operations/settings/settings.rst | 24 ++++++++++++++++++++++++ docs/ru/table_engines/mergetree.rst | 4 ++++ 2 files changed, 28 insertions(+) diff --git a/docs/ru/operations/settings/settings.rst b/docs/ru/operations/settings/settings.rst index 2555bc56cc3..740403cfc44 100644 --- a/docs/ru/operations/settings/settings.rst +++ b/docs/ru/operations/settings/settings.rst @@ -43,6 +43,30 @@ fallback_to_stale_replicas_for_distributed_queries По умолчанию - 1 (включена). +.. _settings-settings-force_index_by_date: + +force_index_by_date +------------------- + +Запрещает выполнение запросов, если использовать индекс по дате невозможно. + +Работает с таблицами семейства MergeTree. + +При ``force_index_by_date=1`` ClickHouse проверяет, есть ли в запросе условие на ключ даты, которое может использоваться для отсечения диапазонов данных. Если подходящего условия нет - кидается исключение. Обратите внимание на то, что условие ``Date != '2000-01-01'`` подходит даже в том случае, когда соответствует всем данным в таблице (т.е. для выполнения запроса требуется full scan). Подробнее про диапазоны данных в таблицах MergeTree читайте в разделе :ref:`table_engines-mergetree`. + + +.. _settings-settings-force_primary_key: + +force_primary_key +----------------- + +Запрещает выполнение запросов, если использовать индекс по первичному ключу невозможно. + +Работает с таблицами семейства MergeTree. + +При ``force_primary_key=1`` ClickHouse проверяет, есть ли в запросе условие на первичный ключ, которое может использоваться для отсечения диапазонов данных. Если подходящего условия нет - кидается исключение. Подробнее про диапазоны данных в таблицах MergeTree читайте в разделе :ref:`table_engines-mergetree`. + + input_format_allow_errors_num ----------------------------- Устанавливает максимальное количество допустимых ошибок при чтении из текстовых форматов (CSV, TSV и т.п.). 
diff --git a/docs/ru/table_engines/mergetree.rst b/docs/ru/table_engines/mergetree.rst index c7d9a2ccebd..ca01a7defb1 100644 --- a/docs/ru/table_engines/mergetree.rst +++ b/docs/ru/table_engines/mergetree.rst @@ -57,8 +57,12 @@ MergeTree SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%' +Чтобы проверить, сможет ли ClickHouse использовать индекс при выполнении запроса, используйте настройки :ref:`settings-settings-force_index_by_date` и :ref:`settings-settings-force_primary_key`. + Индекс по дате обеспечивает чтение только кусков, содержащих даты из нужного диапазона. При этом, кусок данных может содержать данные за многие даты (до целого месяца), а в пределах одного куска, данные лежат упорядоченными по первичному ключу, который может не содержать дату в качестве первого столбца. В связи с этим, при использовании запроса с указанием условия только на дату, но не на префикс первичного ключа, будет читаться данных больше, чем за одну дату. + + Для конкуррентного доступа к таблице, используется мульти-версионность. То есть, при одновременном чтении и обновлении таблицы, данные будут читаться из набора кусочков, актуального на момент запроса. Длинных блокировок нет. Вставки никак не мешают чтениям. Чтения из таблицы автоматически распараллеливаются. From 84dad4538a1f148096eb4072486070e0daaae02e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 12:18:52 +0300 Subject: [PATCH 35/84] Fixed incompatibility of debian packages with older systems [#CLICKHOUSE-3275]. 
--- debian/clickhouse-client.postinst | 3 +++ debian/rules | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/debian/clickhouse-client.postinst b/debian/clickhouse-client.postinst index 2e84aad51ed..06b4b8654b0 100644 --- a/debian/clickhouse-client.postinst +++ b/debian/clickhouse-client.postinst @@ -1,2 +1,5 @@ +#!/bin/sh +set -e + mkdir -p /etc/clickhouse-client/conf.d chown -R clickhouse: /etc/clickhouse-client diff --git a/debian/rules b/debian/rules index c74140a2c51..4738dde4fb6 100755 --- a/debian/rules +++ b/debian/rules @@ -95,4 +95,7 @@ endif dh_install --list-missing --sourcedir=$(DESTDIR) override_dh_shlibdeps: - dh_shlibdeps -Xdebian/clickhouse-server-base/usr/share/clickhouse/bin/* + true # We depend only on libc and dh_shlibdeps gives us wrong (too strict) dependency. + +override_dh_builddeb: + dh_builddeb -- -Z gzip # Older systems don't have "xz", so use "gzip" instead. From ea3a36e15af786f54105031f581192ee9a3cbcd3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 12:49:50 +0300 Subject: [PATCH 36/84] Fixed readme [#CLICKHOUSE-3275]. --- libs/libmemcpy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/README.md index d3dd14ac55b..e253f6bf5dd 100644 --- a/libs/libmemcpy/README.md +++ b/libs/libmemcpy/README.md @@ -5,7 +5,7 @@ It has the following advantages over `libc`-supplied implementation: - it is linked statically, so the function can have position-dependent code; - your binaries will not depend on `glibc`'s memcpy, that forces dependency on specific symbol version like `memcpy@@GLIBC_2.14` and consequently on specific version of `glibc` library; - you can include `memcpy.h` directly and the function has the chance to be inlined, which is beneficial for small but unknown at compile time sizes of memory regions; -- this version of `memcpy` pretend to be faster (in our benchmarks, the difference is withing few percents). 
+- this version of `memcpy` pretend to be faster (in our benchmarks, the difference is within few percents). Currently it uses the implementation from **Linwei** (skywind3000@163.com). Look at https://www.zhihu.com/question/35172305 for discussion. From 7dc2cd91a78a2798e204cea2f010a0569e1023b1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 05:58:17 +0300 Subject: [PATCH 37/84] Imported https://github.com/skywind3000/FastMemcpy at a74a33a1fb6d400252ab73e417d12f622dd8fe61 [#CLICKHOUSE-2]. --- libs/libmemcpy/FastMemcpy.h | 692 ++++++++++++++++++++++++++++++++++++ libs/libmemcpy/LICENSE | 22 ++ libs/libmemcpy/README.md | 97 +++++ 3 files changed, 811 insertions(+) create mode 100644 libs/libmemcpy/FastMemcpy.h create mode 100644 libs/libmemcpy/LICENSE create mode 100644 libs/libmemcpy/README.md diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/FastMemcpy.h new file mode 100644 index 00000000000..65c249dabc1 --- /dev/null +++ b/libs/libmemcpy/FastMemcpy.h @@ -0,0 +1,692 @@ +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. 
vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== +#ifndef __FAST_MEMCPY_H__ +#define __FAST_MEMCPY_H__ + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + + + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_sse2_16(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); +} + +static INLINE void memcpy_sse2_32(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); +} + +static INLINE void memcpy_sse2_64(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); + __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); + _mm_storeu_si128(((__m128i*)dst) + 2, m2); + _mm_storeu_si128(((__m128i*)dst) + 3, m3); +} + +static INLINE void memcpy_sse2_128(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const 
__m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); + __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); + __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); + __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); + __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); + __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); + _mm_storeu_si128(((__m128i*)dst) + 2, m2); + _mm_storeu_si128(((__m128i*)dst) + 3, m3); + _mm_storeu_si128(((__m128i*)dst) + 4, m4); + _mm_storeu_si128(((__m128i*)dst) + 5, m5); + _mm_storeu_si128(((__m128i*)dst) + 6, m6); + _mm_storeu_si128(((__m128i*)dst) + 7, m7); +} + + +//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { + unsigned char *dd = ((unsigned char*)dst) + size; + const unsigned char *ss = ((const unsigned char*)src) + size; + + switch (size) { + case 64: + memcpy_sse2_64(dd - 64, ss - 64); + case 0: + break; + + case 65: + memcpy_sse2_64(dd - 65, ss - 65); + case 1: + dd[-1] = ss[-1]; + break; + + case 66: + memcpy_sse2_64(dd - 66, ss - 66); + case 2: + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 67: + memcpy_sse2_64(dd - 67, ss - 67); + case 3: + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 68: + memcpy_sse2_64(dd - 68, ss - 68); + case 4: + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 69: + memcpy_sse2_64(dd - 69, ss - 69); + case 5: + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 70: + memcpy_sse2_64(dd - 70, ss - 70); + case 6: + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + 
*((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 71: + memcpy_sse2_64(dd - 71, ss - 71); + case 7: + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 72: + memcpy_sse2_64(dd - 72, ss - 72); + case 8: + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 73: + memcpy_sse2_64(dd - 73, ss - 73); + case 9: + *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 74: + memcpy_sse2_64(dd - 74, ss - 74); + case 10: + *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 75: + memcpy_sse2_64(dd - 75, ss - 75); + case 11: + *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 76: + memcpy_sse2_64(dd - 76, ss - 76); + case 12: + *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 77: + memcpy_sse2_64(dd - 77, ss - 77); + case 13: + *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 78: + memcpy_sse2_64(dd - 78, ss - 78); + case 14: + *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 79: + memcpy_sse2_64(dd - 79, ss - 79); + case 15: + *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 80: + memcpy_sse2_64(dd - 80, ss - 80); + case 16: + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 81: + memcpy_sse2_64(dd - 81, ss - 81); + case 17: + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 82: + memcpy_sse2_64(dd - 82, ss - 82); + case 18: + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 83: + memcpy_sse2_64(dd - 83, ss - 83); + case 19: + 
memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 84: + memcpy_sse2_64(dd - 84, ss - 84); + case 20: + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 85: + memcpy_sse2_64(dd - 85, ss - 85); + case 21: + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 86: + memcpy_sse2_64(dd - 86, ss - 86); + case 22: + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 87: + memcpy_sse2_64(dd - 87, ss - 87); + case 23: + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 88: + memcpy_sse2_64(dd - 88, ss - 88); + case 24: + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 89: + memcpy_sse2_64(dd - 89, ss - 89); + case 25: + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 90: + memcpy_sse2_64(dd - 90, ss - 90); + case 26: + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 91: + memcpy_sse2_64(dd - 91, ss - 91); + case 27: + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 92: + memcpy_sse2_64(dd - 92, ss - 92); + case 28: + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 93: + memcpy_sse2_64(dd - 93, ss - 93); + case 29: + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 94: + memcpy_sse2_64(dd - 94, ss - 94); + case 30: + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 95: + memcpy_sse2_64(dd - 95, ss - 95); + case 31: + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 96: 
+ memcpy_sse2_64(dd - 96, ss - 96); + case 32: + memcpy_sse2_32(dd - 32, ss - 32); + break; + + case 97: + memcpy_sse2_64(dd - 97, ss - 97); + case 33: + memcpy_sse2_32(dd - 33, ss - 33); + dd[-1] = ss[-1]; + break; + + case 98: + memcpy_sse2_64(dd - 98, ss - 98); + case 34: + memcpy_sse2_32(dd - 34, ss - 34); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 99: + memcpy_sse2_64(dd - 99, ss - 99); + case 35: + memcpy_sse2_32(dd - 35, ss - 35); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 100: + memcpy_sse2_64(dd - 100, ss - 100); + case 36: + memcpy_sse2_32(dd - 36, ss - 36); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 101: + memcpy_sse2_64(dd - 101, ss - 101); + case 37: + memcpy_sse2_32(dd - 37, ss - 37); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 102: + memcpy_sse2_64(dd - 102, ss - 102); + case 38: + memcpy_sse2_32(dd - 38, ss - 38); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 103: + memcpy_sse2_64(dd - 103, ss - 103); + case 39: + memcpy_sse2_32(dd - 39, ss - 39); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 104: + memcpy_sse2_64(dd - 104, ss - 104); + case 40: + memcpy_sse2_32(dd - 40, ss - 40); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 105: + memcpy_sse2_64(dd - 105, ss - 105); + case 41: + memcpy_sse2_32(dd - 41, ss - 41); + *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 106: + memcpy_sse2_64(dd - 106, ss - 106); + case 42: + memcpy_sse2_32(dd - 42, ss - 42); + *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 107: + memcpy_sse2_64(dd - 107, ss - 107); + case 43: + memcpy_sse2_32(dd - 43, ss - 43); + *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 
11)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 108: + memcpy_sse2_64(dd - 108, ss - 108); + case 44: + memcpy_sse2_32(dd - 44, ss - 44); + *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 109: + memcpy_sse2_64(dd - 109, ss - 109); + case 45: + memcpy_sse2_32(dd - 45, ss - 45); + *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 110: + memcpy_sse2_64(dd - 110, ss - 110); + case 46: + memcpy_sse2_32(dd - 46, ss - 46); + *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 111: + memcpy_sse2_64(dd - 111, ss - 111); + case 47: + memcpy_sse2_32(dd - 47, ss - 47); + *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 112: + memcpy_sse2_64(dd - 112, ss - 112); + case 48: + memcpy_sse2_32(dd - 48, ss - 48); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 113: + memcpy_sse2_64(dd - 113, ss - 113); + case 49: + memcpy_sse2_32(dd - 49, ss - 49); + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 114: + memcpy_sse2_64(dd - 114, ss - 114); + case 50: + memcpy_sse2_32(dd - 50, ss - 50); + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 115: + memcpy_sse2_64(dd - 115, ss - 115); + case 51: + memcpy_sse2_32(dd - 51, ss - 51); + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 116: + memcpy_sse2_64(dd - 116, ss - 116); + case 52: + memcpy_sse2_32(dd - 52, ss - 52); + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 117: + memcpy_sse2_64(dd - 117, ss - 117); + case 53: + memcpy_sse2_32(dd - 53, ss - 53); + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_t*)(dd - 
5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 118: + memcpy_sse2_64(dd - 118, ss - 118); + case 54: + memcpy_sse2_32(dd - 54, ss - 54); + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 119: + memcpy_sse2_64(dd - 119, ss - 119); + case 55: + memcpy_sse2_32(dd - 55, ss - 55); + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 120: + memcpy_sse2_64(dd - 120, ss - 120); + case 56: + memcpy_sse2_32(dd - 56, ss - 56); + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 121: + memcpy_sse2_64(dd - 121, ss - 121); + case 57: + memcpy_sse2_32(dd - 57, ss - 57); + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 122: + memcpy_sse2_64(dd - 122, ss - 122); + case 58: + memcpy_sse2_32(dd - 58, ss - 58); + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 123: + memcpy_sse2_64(dd - 123, ss - 123); + case 59: + memcpy_sse2_32(dd - 59, ss - 59); + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 124: + memcpy_sse2_64(dd - 124, ss - 124); + case 60: + memcpy_sse2_32(dd - 60, ss - 60); + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 125: + memcpy_sse2_64(dd - 125, ss - 125); + case 61: + memcpy_sse2_32(dd - 61, ss - 61); + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 126: + memcpy_sse2_64(dd - 126, ss - 126); + case 62: + memcpy_sse2_32(dd - 62, ss - 62); + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 127: + memcpy_sse2_64(dd - 127, ss - 127); + case 63: + memcpy_sse2_32(dd - 63, ss - 63); + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 
128: + memcpy_sse2_128(dd - 128, ss - 128); + break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +static void* memcpy_fast(void *destination, const void *source, size_t size) +{ + unsigned char *dst = (unsigned char*)destination; + const unsigned char *src = (const unsigned char*)source; + static size_t cachesize = 0x200000; // L2-cache size + size_t padding; + + // small memory copy + if (size <= 128) { + return memcpy_tiny(dst, src, size); + } + + // align destination to 16 bytes boundary + padding = (16 - (((size_t)dst) & 15)) & 15; + + if (padding > 0) { + __m128i head = _mm_loadu_si128((const __m128i*)src); + _mm_storeu_si128((__m128i*)dst, head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + if (size <= cachesize) { + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128(((const __m128i*)src) + 0); + c1 = _mm_loadu_si128(((const __m128i*)src) + 1); + c2 = _mm_loadu_si128(((const __m128i*)src) + 2); + c3 = _mm_loadu_si128(((const __m128i*)src) + 3); + c4 = _mm_loadu_si128(((const __m128i*)src) + 4); + c5 = _mm_loadu_si128(((const __m128i*)src) + 5); + c6 = _mm_loadu_si128(((const __m128i*)src) + 6); + c7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_store_si128((((__m128i*)dst) + 0), c0); + _mm_store_si128((((__m128i*)dst) + 1), c1); + _mm_store_si128((((__m128i*)dst) + 2), c2); + _mm_store_si128((((__m128i*)dst) + 3), c3); + _mm_store_si128((((__m128i*)dst) + 4), c4); + _mm_store_si128((((__m128i*)dst) + 5), c5); + _mm_store_si128((((__m128i*)dst) + 6), c6); + _mm_store_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + else { // big memory copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + _mm_prefetch((const char*)(src), _MM_HINT_NTA); + + if 
((((size_t)src) & 15) == 0) { // source aligned + for (; size >= 128; size -= 128) { + c0 = _mm_load_si128(((const __m128i*)src) + 0); + c1 = _mm_load_si128(((const __m128i*)src) + 1); + c2 = _mm_load_si128(((const __m128i*)src) + 2); + c3 = _mm_load_si128(((const __m128i*)src) + 3); + c4 = _mm_load_si128(((const __m128i*)src) + 4); + c5 = _mm_load_si128(((const __m128i*)src) + 5); + c6 = _mm_load_si128(((const __m128i*)src) + 6); + c7 = _mm_load_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128((((__m128i*)dst) + 0), c0); + _mm_stream_si128((((__m128i*)dst) + 1), c1); + _mm_stream_si128((((__m128i*)dst) + 2), c2); + _mm_stream_si128((((__m128i*)dst) + 3), c3); + _mm_stream_si128((((__m128i*)dst) + 4), c4); + _mm_stream_si128((((__m128i*)dst) + 5), c5); + _mm_stream_si128((((__m128i*)dst) + 6), c6); + _mm_stream_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + else { // source unaligned + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128(((const __m128i*)src) + 0); + c1 = _mm_loadu_si128(((const __m128i*)src) + 1); + c2 = _mm_loadu_si128(((const __m128i*)src) + 2); + c3 = _mm_loadu_si128(((const __m128i*)src) + 3); + c4 = _mm_loadu_si128(((const __m128i*)src) + 4); + c5 = _mm_loadu_si128(((const __m128i*)src) + 5); + c6 = _mm_loadu_si128(((const __m128i*)src) + 6); + c7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128((((__m128i*)dst) + 0), c0); + _mm_stream_si128((((__m128i*)dst) + 1), c1); + _mm_stream_si128((((__m128i*)dst) + 2), c2); + _mm_stream_si128((((__m128i*)dst) + 3), c3); + _mm_stream_si128((((__m128i*)dst) + 4), c4); + _mm_stream_si128((((__m128i*)dst) + 5), c5); + _mm_stream_si128((((__m128i*)dst) + 6), c6); + _mm_stream_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + _mm_sfence(); + } + + memcpy_tiny(dst, src, size); + + return destination; +} + + +#endif + + + diff 
--git a/libs/libmemcpy/LICENSE b/libs/libmemcpy/LICENSE new file mode 100644 index 00000000000..c449da6aa8a --- /dev/null +++ b/libs/libmemcpy/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Linwei + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/README.md new file mode 100644 index 00000000000..2c63dc21a81 --- /dev/null +++ b/libs/libmemcpy/README.md @@ -0,0 +1,97 @@ +Build +===== + +with gcc: +> gcc -O3 -msse2 FastMemcpy.c -o FastMemcpy + +with msvc: +> cl -nologo -O2 FastMemcpy.c + +Features +======== + +* 50% speedup in avg. 
vs traditional memcpy in msvc 2012 or gcc 4.9 +* small size copy optimized with jump table +* medium size copy optimized with sse2 vector copy +* huge size copy optimized with cache prefetch & movntdq + +Reference +========= + +[Using Block Prefetch for Optimized Memory Performance](http://files.rsdn.ru/23380/AMD_block_prefetch_paper.pdf) + +The artical only focused on aligned huge memory copy. You need handle other conditions by your self. + + +Results +======= + +``` +result: gcc4.9 (msvc 2012 got a similar result): + +benchmark(size=32 bytes, times=16777216): +result(dst aligned, src aligned): memcpy_fast=81ms memcpy=281 ms +result(dst aligned, src unalign): memcpy_fast=88ms memcpy=254 ms +result(dst unalign, src aligned): memcpy_fast=87ms memcpy=245 ms +result(dst unalign, src unalign): memcpy_fast=81ms memcpy=258 ms + +benchmark(size=64 bytes, times=16777216): +result(dst aligned, src aligned): memcpy_fast=91ms memcpy=364 ms +result(dst aligned, src unalign): memcpy_fast=95ms memcpy=336 ms +result(dst unalign, src aligned): memcpy_fast=96ms memcpy=353 ms +result(dst unalign, src unalign): memcpy_fast=99ms memcpy=346 ms + +benchmark(size=512 bytes, times=8388608): +result(dst aligned, src aligned): memcpy_fast=124ms memcpy=242 ms +result(dst aligned, src unalign): memcpy_fast=166ms memcpy=555 ms +result(dst unalign, src aligned): memcpy_fast=168ms memcpy=602 ms +result(dst unalign, src unalign): memcpy_fast=174ms memcpy=614 ms + +benchmark(size=1024 bytes, times=4194304): +result(dst aligned, src aligned): memcpy_fast=119ms memcpy=171 ms +result(dst aligned, src unalign): memcpy_fast=182ms memcpy=442 ms +result(dst unalign, src aligned): memcpy_fast=163ms memcpy=466 ms +result(dst unalign, src unalign): memcpy_fast=168ms memcpy=472 ms + +benchmark(size=4096 bytes, times=524288): +result(dst aligned, src aligned): memcpy_fast=68ms memcpy=82 ms +result(dst aligned, src unalign): memcpy_fast=94ms memcpy=226 ms +result(dst unalign, src aligned): memcpy_fast=134ms 
memcpy=216 ms +result(dst unalign, src unalign): memcpy_fast=84ms memcpy=188 ms + +benchmark(size=8192 bytes, times=262144): +result(dst aligned, src aligned): memcpy_fast=55ms memcpy=70 ms +result(dst aligned, src unalign): memcpy_fast=75ms memcpy=192 ms +result(dst unalign, src aligned): memcpy_fast=79ms memcpy=223 ms +result(dst unalign, src unalign): memcpy_fast=91ms memcpy=219 ms + +benchmark(size=1048576 bytes, times=2048): +result(dst aligned, src aligned): memcpy_fast=181ms memcpy=165 ms +result(dst aligned, src unalign): memcpy_fast=192ms memcpy=303 ms +result(dst unalign, src aligned): memcpy_fast=218ms memcpy=310 ms +result(dst unalign, src unalign): memcpy_fast=183ms memcpy=307 ms + +benchmark(size=4194304 bytes, times=512): +result(dst aligned, src aligned): memcpy_fast=263ms memcpy=398 ms +result(dst aligned, src unalign): memcpy_fast=269ms memcpy=433 ms +result(dst unalign, src aligned): memcpy_fast=306ms memcpy=497 ms +result(dst unalign, src unalign): memcpy_fast=285ms memcpy=417 ms + +benchmark(size=8388608 bytes, times=256): +result(dst aligned, src aligned): memcpy_fast=287ms memcpy=421 ms +result(dst aligned, src unalign): memcpy_fast=288ms memcpy=430 ms +result(dst unalign, src aligned): memcpy_fast=285ms memcpy=510 ms +result(dst unalign, src unalign): memcpy_fast=291ms memcpy=440 ms + +benchmark random access: +memcpy_fast=487ms memcpy=1000ms + +``` + + +About +===== + +skywind + +http://www.skywind.me From d4d346b8ef61368f63cfe0c2c313fcd24a3138a7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:00:06 +0300 Subject: [PATCH 38/84] Modified whitespaces [#CLICKHOUSE-2]. 
--- libs/libmemcpy/FastMemcpy.h | 1384 +++++++++++++++++------------------ libs/libmemcpy/README.md | 16 +- 2 files changed, 700 insertions(+), 700 deletions(-) diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/FastMemcpy.h index 65c249dabc1..306bd19ac58 100644 --- a/libs/libmemcpy/FastMemcpy.h +++ b/libs/libmemcpy/FastMemcpy.h @@ -1,692 +1,692 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - - - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_sse2_16(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -} - -static INLINE void memcpy_sse2_32(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); -} - -static INLINE void memcpy_sse2_64(void *dst, const void *src) { - 
__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); -} - -static INLINE void memcpy_sse2_128(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); - __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); - __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); - __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); - _mm_storeu_si128(((__m128i*)dst) + 4, m4); - _mm_storeu_si128(((__m128i*)dst) + 5, m5); - _mm_storeu_si128(((__m128i*)dst) + 6, m6); - _mm_storeu_si128(((__m128i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 64: - memcpy_sse2_64(dd - 64, ss - 64); - case 0: - break; - - case 65: - memcpy_sse2_64(dd - 65, ss - 65); - case 1: - dd[-1] = ss[-1]; - break; - - case 66: - memcpy_sse2_64(dd - 66, ss - 66); - case 2: - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - 
break; - - case 67: - memcpy_sse2_64(dd - 67, ss - 67); - case 3: - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 68: - memcpy_sse2_64(dd - 68, ss - 68); - case 4: - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 69: - memcpy_sse2_64(dd - 69, ss - 69); - case 5: - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 70: - memcpy_sse2_64(dd - 70, ss - 70); - case 6: - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 71: - memcpy_sse2_64(dd - 71, ss - 71); - case 7: - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 72: - memcpy_sse2_64(dd - 72, ss - 72); - case 8: - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 73: - memcpy_sse2_64(dd - 73, ss - 73); - case 9: - *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 74: - memcpy_sse2_64(dd - 74, ss - 74); - case 10: - *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 75: - memcpy_sse2_64(dd - 75, ss - 75); - case 11: - *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 76: - memcpy_sse2_64(dd - 76, ss - 76); - case 12: - *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 77: - memcpy_sse2_64(dd - 77, ss - 77); - case 13: - *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 78: - memcpy_sse2_64(dd - 78, ss - 78); - case 14: - *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 79: - memcpy_sse2_64(dd - 79, ss - 79); - case 15: - *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); - 
*((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 80: - memcpy_sse2_64(dd - 80, ss - 80); - case 16: - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 81: - memcpy_sse2_64(dd - 81, ss - 81); - case 17: - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 82: - memcpy_sse2_64(dd - 82, ss - 82); - case 18: - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 83: - memcpy_sse2_64(dd - 83, ss - 83); - case 19: - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 84: - memcpy_sse2_64(dd - 84, ss - 84); - case 20: - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 85: - memcpy_sse2_64(dd - 85, ss - 85); - case 21: - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 86: - memcpy_sse2_64(dd - 86, ss - 86); - case 22: - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 87: - memcpy_sse2_64(dd - 87, ss - 87); - case 23: - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 88: - memcpy_sse2_64(dd - 88, ss - 88); - case 24: - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 89: - memcpy_sse2_64(dd - 89, ss - 89); - case 25: - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 90: - memcpy_sse2_64(dd - 90, ss - 90); - case 26: - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 91: - memcpy_sse2_64(dd - 91, ss - 91); - case 27: - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 92: - memcpy_sse2_64(dd - 92, ss - 92); - case 28: - 
memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 93: - memcpy_sse2_64(dd - 93, ss - 93); - case 29: - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 94: - memcpy_sse2_64(dd - 94, ss - 94); - case 30: - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 95: - memcpy_sse2_64(dd - 95, ss - 95); - case 31: - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 96: - memcpy_sse2_64(dd - 96, ss - 96); - case 32: - memcpy_sse2_32(dd - 32, ss - 32); - break; - - case 97: - memcpy_sse2_64(dd - 97, ss - 97); - case 33: - memcpy_sse2_32(dd - 33, ss - 33); - dd[-1] = ss[-1]; - break; - - case 98: - memcpy_sse2_64(dd - 98, ss - 98); - case 34: - memcpy_sse2_32(dd - 34, ss - 34); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 99: - memcpy_sse2_64(dd - 99, ss - 99); - case 35: - memcpy_sse2_32(dd - 35, ss - 35); - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 100: - memcpy_sse2_64(dd - 100, ss - 100); - case 36: - memcpy_sse2_32(dd - 36, ss - 36); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 101: - memcpy_sse2_64(dd - 101, ss - 101); - case 37: - memcpy_sse2_32(dd - 37, ss - 37); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 102: - memcpy_sse2_64(dd - 102, ss - 102); - case 38: - memcpy_sse2_32(dd - 38, ss - 38); - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 103: - memcpy_sse2_64(dd - 103, ss - 103); - case 39: - memcpy_sse2_32(dd - 39, ss - 39); - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 104: - memcpy_sse2_64(dd - 104, ss - 104); - case 40: - memcpy_sse2_32(dd - 40, ss - 40); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 105: - 
memcpy_sse2_64(dd - 105, ss - 105); - case 41: - memcpy_sse2_32(dd - 41, ss - 41); - *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 106: - memcpy_sse2_64(dd - 106, ss - 106); - case 42: - memcpy_sse2_32(dd - 42, ss - 42); - *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 107: - memcpy_sse2_64(dd - 107, ss - 107); - case 43: - memcpy_sse2_32(dd - 43, ss - 43); - *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 108: - memcpy_sse2_64(dd - 108, ss - 108); - case 44: - memcpy_sse2_32(dd - 44, ss - 44); - *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 109: - memcpy_sse2_64(dd - 109, ss - 109); - case 45: - memcpy_sse2_32(dd - 45, ss - 45); - *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 110: - memcpy_sse2_64(dd - 110, ss - 110); - case 46: - memcpy_sse2_32(dd - 46, ss - 46); - *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 111: - memcpy_sse2_64(dd - 111, ss - 111); - case 47: - memcpy_sse2_32(dd - 47, ss - 47); - *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); - break; - - case 112: - memcpy_sse2_64(dd - 112, ss - 112); - case 48: - memcpy_sse2_32(dd - 48, ss - 48); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 113: - memcpy_sse2_64(dd - 113, ss - 113); - case 49: - memcpy_sse2_32(dd - 49, ss - 49); - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 114: - memcpy_sse2_64(dd - 114, ss - 114); - case 50: - memcpy_sse2_32(dd - 50, ss - 50); - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 115: - memcpy_sse2_64(dd - 115, ss 
- 115); - case 51: - memcpy_sse2_32(dd - 51, ss - 51); - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 116: - memcpy_sse2_64(dd - 116, ss - 116); - case 52: - memcpy_sse2_32(dd - 52, ss - 52); - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 117: - memcpy_sse2_64(dd - 117, ss - 117); - case 53: - memcpy_sse2_32(dd - 53, ss - 53); - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 118: - memcpy_sse2_64(dd - 118, ss - 118); - case 54: - memcpy_sse2_32(dd - 54, ss - 54); - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); - break; - - case 119: - memcpy_sse2_64(dd - 119, ss - 119); - case 55: - memcpy_sse2_32(dd - 55, ss - 55); - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); - break; - - case 120: - memcpy_sse2_64(dd - 120, ss - 120); - case 56: - memcpy_sse2_32(dd - 56, ss - 56); - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 121: - memcpy_sse2_64(dd - 121, ss - 121); - case 57: - memcpy_sse2_32(dd - 57, ss - 57); - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 122: - memcpy_sse2_64(dd - 122, ss - 122); - case 58: - memcpy_sse2_32(dd - 58, ss - 58); - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 123: - memcpy_sse2_64(dd - 123, ss - 123); - case 59: - memcpy_sse2_32(dd - 59, ss - 59); - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 124: - memcpy_sse2_64(dd - 124, ss - 124); - case 60: - memcpy_sse2_32(dd - 60, ss - 60); - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 125: - 
memcpy_sse2_64(dd - 125, ss - 125); - case 61: - memcpy_sse2_32(dd - 61, ss - 61); - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 126: - memcpy_sse2_64(dd - 126, ss - 126); - case 62: - memcpy_sse2_32(dd - 62, ss - 62); - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 127: - memcpy_sse2_64(dd - 127, ss - 127); - case 63: - memcpy_sse2_32(dd - 63, ss - 63); - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 128: - memcpy_sse2_128(dd - 128, ss - 128); - break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L2-cache size - size_t padding; - - // small memory copy - if (size <= 128) { - return memcpy_tiny(dst, src, size); - } - - // align destination to 16 bytes boundary - padding = (16 - (((size_t)dst) & 15)) & 15; - - if (padding > 0) { - __m128i head = _mm_loadu_si128((const __m128i*)src); - _mm_storeu_si128((__m128i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } - - // medium size copy - if (size <= cachesize) { - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - 
src += 128; - _mm_store_si128((((__m128i*)dst) + 0), c0); - _mm_store_si128((((__m128i*)dst) + 1), c1); - _mm_store_si128((((__m128i*)dst) + 2), c2); - _mm_store_si128((((__m128i*)dst) + 3), c3); - _mm_store_si128((((__m128i*)dst) + 4), c4); - _mm_store_si128((((__m128i*)dst) + 5), c5); - _mm_store_si128((((__m128i*)dst) + 6), c6); - _mm_store_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // big memory copy - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 15) == 0) { // source aligned - for (; size >= 128; size -= 128) { - c0 = _mm_load_si128(((const __m128i*)src) + 0); - c1 = _mm_load_si128(((const __m128i*)src) + 1); - c2 = _mm_load_si128(((const __m128i*)src) + 2); - c3 = _mm_load_si128(((const __m128i*)src) + 3); - c4 = _mm_load_si128(((const __m128i*)src) + 4); - c5 = _mm_load_si128(((const __m128i*)src) + 5); - c6 = _mm_load_si128(((const __m128i*)src) + 6); - c7 = _mm_load_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // source unaligned - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), 
_MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - - return destination; -} - - -#endif - - - +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== +#ifndef __FAST_MEMCPY_H__ +#define __FAST_MEMCPY_H__ + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + + + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_sse2_16(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); +} + +static INLINE void memcpy_sse2_32(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); 
+ _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); +} + +static INLINE void memcpy_sse2_64(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); + __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); + _mm_storeu_si128(((__m128i*)dst) + 2, m2); + _mm_storeu_si128(((__m128i*)dst) + 3, m3); +} + +static INLINE void memcpy_sse2_128(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); + __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); + __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); + __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); + __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); + __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); + __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); + _mm_storeu_si128(((__m128i*)dst) + 1, m1); + _mm_storeu_si128(((__m128i*)dst) + 2, m2); + _mm_storeu_si128(((__m128i*)dst) + 3, m3); + _mm_storeu_si128(((__m128i*)dst) + 4, m4); + _mm_storeu_si128(((__m128i*)dst) + 5, m5); + _mm_storeu_si128(((__m128i*)dst) + 6, m6); + _mm_storeu_si128(((__m128i*)dst) + 7, m7); +} + + +//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { + unsigned char *dd = ((unsigned char*)dst) + size; + const unsigned char *ss = ((const unsigned char*)src) + size; + + switch (size) { + case 64: + memcpy_sse2_64(dd - 64, ss - 64); + case 0: + break; + + case 65: + memcpy_sse2_64(dd - 65, ss 
- 65); + case 1: + dd[-1] = ss[-1]; + break; + + case 66: + memcpy_sse2_64(dd - 66, ss - 66); + case 2: + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 67: + memcpy_sse2_64(dd - 67, ss - 67); + case 3: + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 68: + memcpy_sse2_64(dd - 68, ss - 68); + case 4: + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 69: + memcpy_sse2_64(dd - 69, ss - 69); + case 5: + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 70: + memcpy_sse2_64(dd - 70, ss - 70); + case 6: + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 71: + memcpy_sse2_64(dd - 71, ss - 71); + case 7: + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 72: + memcpy_sse2_64(dd - 72, ss - 72); + case 8: + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 73: + memcpy_sse2_64(dd - 73, ss - 73); + case 9: + *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 74: + memcpy_sse2_64(dd - 74, ss - 74); + case 10: + *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 75: + memcpy_sse2_64(dd - 75, ss - 75); + case 11: + *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 76: + memcpy_sse2_64(dd - 76, ss - 76); + case 12: + *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 77: + memcpy_sse2_64(dd - 77, ss - 77); + case 13: + *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 78: + memcpy_sse2_64(dd - 78, ss - 78); + case 14: + *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); + *((uint64_t*)(dd 
- 8)) = *((uint64_t*)(ss - 8)); + break; + + case 79: + memcpy_sse2_64(dd - 79, ss - 79); + case 15: + *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 80: + memcpy_sse2_64(dd - 80, ss - 80); + case 16: + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 81: + memcpy_sse2_64(dd - 81, ss - 81); + case 17: + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 82: + memcpy_sse2_64(dd - 82, ss - 82); + case 18: + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 83: + memcpy_sse2_64(dd - 83, ss - 83); + case 19: + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 84: + memcpy_sse2_64(dd - 84, ss - 84); + case 20: + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 85: + memcpy_sse2_64(dd - 85, ss - 85); + case 21: + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 86: + memcpy_sse2_64(dd - 86, ss - 86); + case 22: + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 87: + memcpy_sse2_64(dd - 87, ss - 87); + case 23: + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 88: + memcpy_sse2_64(dd - 88, ss - 88); + case 24: + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 89: + memcpy_sse2_64(dd - 89, ss - 89); + case 25: + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 90: + memcpy_sse2_64(dd - 90, ss - 90); + case 26: + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 91: + memcpy_sse2_64(dd - 91, ss - 91); + case 
27: + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 92: + memcpy_sse2_64(dd - 92, ss - 92); + case 28: + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 93: + memcpy_sse2_64(dd - 93, ss - 93); + case 29: + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 94: + memcpy_sse2_64(dd - 94, ss - 94); + case 30: + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 95: + memcpy_sse2_64(dd - 95, ss - 95); + case 31: + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 96: + memcpy_sse2_64(dd - 96, ss - 96); + case 32: + memcpy_sse2_32(dd - 32, ss - 32); + break; + + case 97: + memcpy_sse2_64(dd - 97, ss - 97); + case 33: + memcpy_sse2_32(dd - 33, ss - 33); + dd[-1] = ss[-1]; + break; + + case 98: + memcpy_sse2_64(dd - 98, ss - 98); + case 34: + memcpy_sse2_32(dd - 34, ss - 34); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 99: + memcpy_sse2_64(dd - 99, ss - 99); + case 35: + memcpy_sse2_32(dd - 35, ss - 35); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 100: + memcpy_sse2_64(dd - 100, ss - 100); + case 36: + memcpy_sse2_32(dd - 36, ss - 36); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 101: + memcpy_sse2_64(dd - 101, ss - 101); + case 37: + memcpy_sse2_32(dd - 37, ss - 37); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 102: + memcpy_sse2_64(dd - 102, ss - 102); + case 38: + memcpy_sse2_32(dd - 38, ss - 38); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 103: + memcpy_sse2_64(dd - 103, ss - 103); + case 39: + memcpy_sse2_32(dd - 39, ss - 39); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 104: + memcpy_sse2_64(dd 
- 104, ss - 104); + case 40: + memcpy_sse2_32(dd - 40, ss - 40); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 105: + memcpy_sse2_64(dd - 105, ss - 105); + case 41: + memcpy_sse2_32(dd - 41, ss - 41); + *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 106: + memcpy_sse2_64(dd - 106, ss - 106); + case 42: + memcpy_sse2_32(dd - 42, ss - 42); + *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 107: + memcpy_sse2_64(dd - 107, ss - 107); + case 43: + memcpy_sse2_32(dd - 43, ss - 43); + *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 108: + memcpy_sse2_64(dd - 108, ss - 108); + case 44: + memcpy_sse2_32(dd - 44, ss - 44); + *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 109: + memcpy_sse2_64(dd - 109, ss - 109); + case 45: + memcpy_sse2_32(dd - 45, ss - 45); + *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 110: + memcpy_sse2_64(dd - 110, ss - 110); + case 46: + memcpy_sse2_32(dd - 46, ss - 46); + *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 111: + memcpy_sse2_64(dd - 111, ss - 111); + case 47: + memcpy_sse2_32(dd - 47, ss - 47); + *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); + *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + break; + + case 112: + memcpy_sse2_64(dd - 112, ss - 112); + case 48: + memcpy_sse2_32(dd - 48, ss - 48); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 113: + memcpy_sse2_64(dd - 113, ss - 113); + case 49: + memcpy_sse2_32(dd - 49, ss - 49); + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 114: + memcpy_sse2_64(dd - 114, ss - 114); + case 50: + memcpy_sse2_32(dd - 50, ss - 50); 
+ memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 115: + memcpy_sse2_64(dd - 115, ss - 115); + case 51: + memcpy_sse2_32(dd - 51, ss - 51); + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 116: + memcpy_sse2_64(dd - 116, ss - 116); + case 52: + memcpy_sse2_32(dd - 52, ss - 52); + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 117: + memcpy_sse2_64(dd - 117, ss - 117); + case 53: + memcpy_sse2_32(dd - 53, ss - 53); + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 118: + memcpy_sse2_64(dd - 118, ss - 118); + case 54: + memcpy_sse2_32(dd - 54, ss - 54); + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); + *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + break; + + case 119: + memcpy_sse2_64(dd - 119, ss - 119); + case 55: + memcpy_sse2_32(dd - 55, ss - 55); + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); + *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + break; + + case 120: + memcpy_sse2_64(dd - 120, ss - 120); + case 56: + memcpy_sse2_32(dd - 56, ss - 56); + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 121: + memcpy_sse2_64(dd - 121, ss - 121); + case 57: + memcpy_sse2_32(dd - 57, ss - 57); + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 122: + memcpy_sse2_64(dd - 122, ss - 122); + case 58: + memcpy_sse2_32(dd - 58, ss - 58); + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 123: + memcpy_sse2_64(dd - 123, ss - 123); + case 59: + memcpy_sse2_32(dd - 59, ss - 59); + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 124: + memcpy_sse2_64(dd - 124, ss - 124); + case 60: + 
memcpy_sse2_32(dd - 60, ss - 60); + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 125: + memcpy_sse2_64(dd - 125, ss - 125); + case 61: + memcpy_sse2_32(dd - 61, ss - 61); + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 126: + memcpy_sse2_64(dd - 126, ss - 126); + case 62: + memcpy_sse2_32(dd - 62, ss - 62); + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 127: + memcpy_sse2_64(dd - 127, ss - 127); + case 63: + memcpy_sse2_32(dd - 63, ss - 63); + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 128: + memcpy_sse2_128(dd - 128, ss - 128); + break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +static void* memcpy_fast(void *destination, const void *source, size_t size) +{ + unsigned char *dst = (unsigned char*)destination; + const unsigned char *src = (const unsigned char*)source; + static size_t cachesize = 0x200000; // L2-cache size + size_t padding; + + // small memory copy + if (size <= 128) { + return memcpy_tiny(dst, src, size); + } + + // align destination to 16 bytes boundary + padding = (16 - (((size_t)dst) & 15)) & 15; + + if (padding > 0) { + __m128i head = _mm_loadu_si128((const __m128i*)src); + _mm_storeu_si128((__m128i*)dst, head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + if (size <= cachesize) { + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128(((const __m128i*)src) + 0); + c1 = _mm_loadu_si128(((const __m128i*)src) + 1); + c2 = _mm_loadu_si128(((const __m128i*)src) + 2); + c3 = _mm_loadu_si128(((const __m128i*)src) + 3); + c4 = _mm_loadu_si128(((const __m128i*)src) + 4); + c5 = _mm_loadu_si128(((const __m128i*)src) + 5); + c6 = _mm_loadu_si128(((const 
__m128i*)src) + 6); + c7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_store_si128((((__m128i*)dst) + 0), c0); + _mm_store_si128((((__m128i*)dst) + 1), c1); + _mm_store_si128((((__m128i*)dst) + 2), c2); + _mm_store_si128((((__m128i*)dst) + 3), c3); + _mm_store_si128((((__m128i*)dst) + 4), c4); + _mm_store_si128((((__m128i*)dst) + 5), c5); + _mm_store_si128((((__m128i*)dst) + 6), c6); + _mm_store_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + else { // big memory copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + _mm_prefetch((const char*)(src), _MM_HINT_NTA); + + if ((((size_t)src) & 15) == 0) { // source aligned + for (; size >= 128; size -= 128) { + c0 = _mm_load_si128(((const __m128i*)src) + 0); + c1 = _mm_load_si128(((const __m128i*)src) + 1); + c2 = _mm_load_si128(((const __m128i*)src) + 2); + c3 = _mm_load_si128(((const __m128i*)src) + 3); + c4 = _mm_load_si128(((const __m128i*)src) + 4); + c5 = _mm_load_si128(((const __m128i*)src) + 5); + c6 = _mm_load_si128(((const __m128i*)src) + 6); + c7 = _mm_load_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128((((__m128i*)dst) + 0), c0); + _mm_stream_si128((((__m128i*)dst) + 1), c1); + _mm_stream_si128((((__m128i*)dst) + 2), c2); + _mm_stream_si128((((__m128i*)dst) + 3), c3); + _mm_stream_si128((((__m128i*)dst) + 4), c4); + _mm_stream_si128((((__m128i*)dst) + 5), c5); + _mm_stream_si128((((__m128i*)dst) + 6), c6); + _mm_stream_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + else { // source unaligned + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128(((const __m128i*)src) + 0); + c1 = _mm_loadu_si128(((const __m128i*)src) + 1); + c2 = _mm_loadu_si128(((const __m128i*)src) + 2); + c3 = _mm_loadu_si128(((const __m128i*)src) + 3); + c4 = _mm_loadu_si128(((const __m128i*)src) + 4); + c5 = _mm_loadu_si128(((const __m128i*)src) + 5); + c6 = 
_mm_loadu_si128(((const __m128i*)src) + 6); + c7 = _mm_loadu_si128(((const __m128i*)src) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128((((__m128i*)dst) + 0), c0); + _mm_stream_si128((((__m128i*)dst) + 1), c1); + _mm_stream_si128((((__m128i*)dst) + 2), c2); + _mm_stream_si128((((__m128i*)dst) + 3), c3); + _mm_stream_si128((((__m128i*)dst) + 4), c4); + _mm_stream_si128((((__m128i*)dst) + 5), c5); + _mm_stream_si128((((__m128i*)dst) + 6), c6); + _mm_stream_si128((((__m128i*)dst) + 7), c7); + dst += 128; + } + } + _mm_sfence(); + } + + memcpy_tiny(dst, src, size); + + return destination; +} + + +#endif + + + diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/README.md index 2c63dc21a81..91e01d4a5a0 100644 --- a/libs/libmemcpy/README.md +++ b/libs/libmemcpy/README.md @@ -1,8 +1,8 @@ Build ===== - -with gcc: -> gcc -O3 -msse2 FastMemcpy.c -o FastMemcpy + +with gcc: +> gcc -O3 -msse2 FastMemcpy.c -o FastMemcpy with msvc: > cl -nologo -O2 FastMemcpy.c @@ -10,10 +10,10 @@ with msvc: Features ======== -* 50% speedup in avg. vs traditional memcpy in msvc 2012 or gcc 4.9 -* small size copy optimized with jump table -* medium size copy optimized with sse2 vector copy -* huge size copy optimized with cache prefetch & movntdq +* 50% speedup in avg. vs traditional memcpy in msvc 2012 or gcc 4.9 +* small size copy optimized with jump table +* medium size copy optimized with sse2 vector copy +* huge size copy optimized with cache prefetch & movntdq Reference ========= @@ -28,7 +28,7 @@ Results ``` result: gcc4.9 (msvc 2012 got a similar result): - + benchmark(size=32 bytes, times=16777216): result(dst aligned, src aligned): memcpy_fast=81ms memcpy=281 ms result(dst aligned, src unalign): memcpy_fast=88ms memcpy=254 ms From 85dfdafaeb190ada99a87eb51ada937ec0203fb4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:01:16 +0300 Subject: [PATCH 39/84] Fixed comment [#CLICKHOUSE-2]. 
--- libs/libmemcpy/FastMemcpy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/FastMemcpy.h index 306bd19ac58..9cb8bb2d019 100644 --- a/libs/libmemcpy/FastMemcpy.h +++ b/libs/libmemcpy/FastMemcpy.h @@ -581,7 +581,7 @@ static void* memcpy_fast(void *destination, const void *source, size_t size) { unsigned char *dst = (unsigned char*)destination; const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L2-cache size + static size_t cachesize = 0x200000; // something around half of LL-cache size size_t padding; // small memory copy From ab8ac9c143a721d68e92e884f3a1bcb8de75459c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:26:40 +0300 Subject: [PATCH 40/84] Preparation [#CLICKHOUSE-2]. --- libs/libmemcpy/FastMemcpy.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/FastMemcpy.h index 9cb8bb2d019..f6c79eaba60 100644 --- a/libs/libmemcpy/FastMemcpy.h +++ b/libs/libmemcpy/FastMemcpy.h @@ -577,11 +577,10 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { //--------------------------------------------------------------------- // main routine //--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) +static INLINE void* memcpy_fast(void *destination, const void *source, size_t size) { unsigned char *dst = (unsigned char*)destination; const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // something around half of LL-cache size size_t padding; // small memory copy @@ -601,7 +600,8 @@ static void* memcpy_fast(void *destination, const void *source, size_t size) } // medium size copy - if (size <= cachesize) { + if (size <= 0x200000) // something around half of LL-cache size + { __m128i c0, c1, c2, c3, c4, c5, c6, c7; for 
(; size >= 128; size -= 128) { From effbbfd035251b7732ef5a4a2083eab7d0f9a9d9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:27:37 +0300 Subject: [PATCH 41/84] Added wrapper [#CLICKHOUSE-2]. --- libs/CMakeLists.txt | 1 + libs/libmemcpy/CMakeLists.txt | 1 + libs/libmemcpy/memcpy.c | 3 +++ libs/libmemcpy/memcpy.h | 17 +++++++++++++++++ 4 files changed, 22 insertions(+) create mode 100644 libs/libmemcpy/CMakeLists.txt create mode 100644 libs/libmemcpy/memcpy.c create mode 100644 libs/libmemcpy/memcpy.h diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt index e5887d0813f..3420ba2ebc8 100644 --- a/libs/CMakeLists.txt +++ b/libs/CMakeLists.txt @@ -6,6 +6,7 @@ endif () add_subdirectory (libcommon) add_subdirectory (libpocoext) add_subdirectory (libdaemon) +add_subdirectory (libmemcpy) if (USE_MYSQL) add_subdirectory (libmysqlxx) diff --git a/libs/libmemcpy/CMakeLists.txt b/libs/libmemcpy/CMakeLists.txt new file mode 100644 index 00000000000..c06085aacca --- /dev/null +++ b/libs/libmemcpy/CMakeLists.txt @@ -0,0 +1 @@ +add_library (memcpy memcpy.c) diff --git a/libs/libmemcpy/memcpy.c b/libs/libmemcpy/memcpy.c new file mode 100644 index 00000000000..9e1b175bc57 --- /dev/null +++ b/libs/libmemcpy/memcpy.c @@ -0,0 +1,3 @@ +#include "memcpy.h" + +/// This is needed to generate an object file for linking. 
diff --git a/libs/libmemcpy/memcpy.h b/libs/libmemcpy/memcpy.h new file mode 100644 index 00000000000..902133ed3ae --- /dev/null +++ b/libs/libmemcpy/memcpy.h @@ -0,0 +1,17 @@ +#pragma once + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "FastMemcpy.h" + +void * __attribute__((__weak__)) memcpy(void * __restrict destination, const void * __restrict source, size_t size) +{ + return memcpy_fast(destination, source, size); +} + +#ifdef __cplusplus +} +#endif From 50cf5623a2b86c37dd4678f854a50e3434353d84 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 06:28:57 +0300 Subject: [PATCH 42/84] Moved third-party sources deeper [#CLICKHOUSE-2]. --- libs/libmemcpy/{ => impl}/FastMemcpy.h | 0 libs/libmemcpy/{ => impl}/LICENSE | 0 libs/libmemcpy/{ => impl}/README.md | 0 libs/libmemcpy/memcpy.h | 2 +- 4 files changed, 1 insertion(+), 1 deletion(-) rename libs/libmemcpy/{ => impl}/FastMemcpy.h (100%) rename libs/libmemcpy/{ => impl}/LICENSE (100%) rename libs/libmemcpy/{ => impl}/README.md (100%) diff --git a/libs/libmemcpy/FastMemcpy.h b/libs/libmemcpy/impl/FastMemcpy.h similarity index 100% rename from libs/libmemcpy/FastMemcpy.h rename to libs/libmemcpy/impl/FastMemcpy.h diff --git a/libs/libmemcpy/LICENSE b/libs/libmemcpy/impl/LICENSE similarity index 100% rename from libs/libmemcpy/LICENSE rename to libs/libmemcpy/impl/LICENSE diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/impl/README.md similarity index 100% rename from libs/libmemcpy/README.md rename to libs/libmemcpy/impl/README.md diff --git a/libs/libmemcpy/memcpy.h b/libs/libmemcpy/memcpy.h index 902133ed3ae..1f898d4aff6 100644 --- a/libs/libmemcpy/memcpy.h +++ b/libs/libmemcpy/memcpy.h @@ -5,7 +5,7 @@ extern "C" { #endif -#include "FastMemcpy.h" +#include "impl/FastMemcpy.h" void * __attribute__((__weak__)) memcpy(void * __restrict destination, const void * __restrict source, size_t size) { From c20822056698890c7dd115fd91e7894d52602dc4 Mon Sep 17 00:00:00 2001 From: Alexey 
Milovidov Date: Fri, 15 Sep 2017 06:44:34 +0300 Subject: [PATCH 43/84] Removed build of shared library and test executables for libzlib-ng [#CLICKHOUSE-2]. --- contrib/libzlib-ng/CMakeLists.txt | 57 ++----------------------------- 1 file changed, 2 insertions(+), 55 deletions(-) diff --git a/contrib/libzlib-ng/CMakeLists.txt b/contrib/libzlib-ng/CMakeLists.txt index 7054ab81115..cd2eeb97c55 100644 --- a/contrib/libzlib-ng/CMakeLists.txt +++ b/contrib/libzlib-ng/CMakeLists.txt @@ -489,39 +489,15 @@ if(MINGW OR MSYS) set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) endif(MINGW OR MSYS) -add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_GZFILE_SRCS} ${ZLIB_ARCH_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_GZFILE_SRCS} ${ZLIB_ARCH_SRCS} ${ZLIB_ASMS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) -set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) -set_target_properties(zlib PROPERTIES SOVERSION 1) - -if(NOT CYGWIN) - # This property causes shared libraries on Linux to have the full version - # encoded into their final filename. We disable this on Cygwin because - # it causes cygz-${ZLIB_FULL_VERSION}.dll to be created when cygz.dll - # seems to be the default. 
- # - # This has no effect with MSVC, on that platform the version info for - # the DLL comes from the resource file win32/zlib1.rc - set_target_properties(zlib PROPERTIES VERSION ${ZLIB_FULL_VERSION}) -endif() - if(UNIX) # On unix-like platforms the library is almost always called libz - set_target_properties(zlib zlibstatic PROPERTIES OUTPUT_NAME z) - if(NOT APPLE) - set_target_properties(zlib PROPERTIES LINK_FLAGS "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib.map\"") - endif() -elseif(MSYS) - # Suppress version number from shared library name - set(CMAKE_SHARED_LIBRARY_NAME_WITH_VERSION 0) -elseif(BUILD_SHARED_LIBS AND WIN32) - # Creates zlib1.dll when building shared library version - set_target_properties(zlib PROPERTIES SUFFIX "1.dll") + set_target_properties(zlibstatic PROPERTIES OUTPUT_NAME z) endif() if(NOT SKIP_INSTALL_LIBRARIES AND NOT SKIP_INSTALL_ALL ) - install(TARGETS zlib zlibstatic + install(TARGETS zlibstatic RUNTIME DESTINATION "${INSTALL_BIN_DIR}" ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" LIBRARY DESTINATION "${INSTALL_LIB_DIR}" ) @@ -529,35 +505,6 @@ endif() if(NOT SKIP_INSTALL_HEADERS AND NOT SKIP_INSTALL_ALL ) install(FILES ${ZLIB_PUBLIC_HDRS} DESTINATION "${INSTALL_INC_DIR}") endif() -if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL ) - install(FILES zlib.3 DESTINATION "${INSTALL_MAN_DIR}/man3") -endif() if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL ) install(FILES ${ZLIB_PC} DESTINATION "${INSTALL_PKGCONFIG_DIR}") endif() - -#============================================================================ -# Example binaries -#============================================================================ - -if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set (CMAKE_EXE_LINKER_FLAGS "") -endif () - -add_executable(example test/example.c) -target_link_libraries(example zlib) -add_test(example example) - -add_executable(minigzip test/minigzip.c) -target_link_libraries(minigzip zlib) - -if(HAVE_OFF64_T) - add_executable(example64 
test/example.c) - target_link_libraries(example64 zlib) - set_target_properties(example64 PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") - add_test(example64 example64) - - add_executable(minigzip64 test/minigzip.c) - target_link_libraries(minigzip64 zlib) - set_target_properties(minigzip64 PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") -endif() From cd9c387ed48caa0dc0d7a14a6145faae68f6fe34 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 08:06:52 +0300 Subject: [PATCH 44/84] Using internal memcpy [#CLICKHOUSE-2]. --- libs/libcommon/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index a3a4ff14326..3b6489d0bc7 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -90,7 +90,8 @@ target_link_libraries ( ${Boost_FILESYSTEM_LIBRARY} ${MALLOC_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} - ${RT_LIBRARIES}) + ${RT_LIBRARIES} + memcpy) if (ENABLE_TESTS) add_subdirectory (src/tests) From 165518d4b065c1a0ed556e96bb62c88c7057cf47 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 08:07:42 +0300 Subject: [PATCH 45/84] Added README [#CLICKHOUSE-2]. --- libs/libmemcpy/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 libs/libmemcpy/README.md diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/README.md new file mode 100644 index 00000000000..d3dd14ac55b --- /dev/null +++ b/libs/libmemcpy/README.md @@ -0,0 +1,20 @@ +Internal implementation of `memcpy` function. 
+ +It has the following advantages over the `libc`-supplied implementation: +- it is linked statically, so the function is called directly, not through the `PLT` (procedure linkage table of a shared library); +- it is linked statically, so the function can have position-dependent code; +- your binaries will not depend on `glibc`'s memcpy, which forces a dependency on a specific symbol version like `memcpy@@GLIBC_2.14` and consequently on a specific version of the `glibc` library; +- you can include `memcpy.h` directly and the function has a chance to be inlined, which is beneficial for memory regions whose sizes are small but unknown at compile time; +- this version of `memcpy` claims to be faster (in our benchmarks, the difference is within a few percent). + +Currently it uses the implementation from **Linwei** (skywind3000@163.com). +Look at https://www.zhihu.com/question/35172305 for discussion. + +Drawbacks: +- it only uses SSE 2 and doesn't use wider (AVX, AVX 512) vector registers when available; +- no CPU dispatching; doesn't take into account actual cache size. + +Also worth looking at: +- simple implementation from Facebook: https://github.com/facebook/folly/blob/master/folly/memcpy.S +- implementation from Agner Fog: http://www.agner.org/optimize/ +- glibc source code. From 860c19bad3685d5dd1bc5f7d173fe80d3985135f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 08:27:09 +0300 Subject: [PATCH 46/84] Refined "GLIBC_COMPATIBILITY" option [#CLICKHOUSE-3275]. 
--- CMakeLists.txt | 6 +- libs/CMakeLists.txt | 9 ++- libs/libcommon/CMakeLists.txt | 11 ++- .../include/common/glibc_compatibility.h | 79 ------------------- libs/libglibc-compatibility/CMakeLists.txt | 1 + .../glibc-compatibility.c | 52 ++++++++++++ .../glibc-compatibility.h | 37 +++++++++ 7 files changed, 111 insertions(+), 84 deletions(-) delete mode 100644 libs/libcommon/include/common/glibc_compatibility.h create mode 100644 libs/libglibc-compatibility/CMakeLists.txt create mode 100644 libs/libglibc-compatibility/glibc-compatibility.c create mode 100644 libs/libglibc-compatibility/glibc-compatibility.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e0fe38e84a..a671fe1b775 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,11 +71,11 @@ if (USE_STATIC_LIBRARIES) list(REVERSE CMAKE_FIND_LIBRARY_SUFFIXES) endif () -option (GLIBC_COMPATIBILITY "Set to TRUE to enable compatibility with older glibc libraries. Note that it is not compatible with ASan." OFF) +option (USE_INTERNAL_MEMCPY "Use internal implementation of 'memcpy' function instead of provided by libc. Only for x86_64." ON) +option (GLIBC_COMPATIBILITY "Set to TRUE to enable compatibility with older glibc libraries. Implies USE_INTERNAL_MEMCPY." 
OFF) if (GLIBC_COMPATIBILITY) - set (GLIBC_COMPATIBILITY_COMPILE_FLAGS "-include ${ClickHouse_SOURCE_DIR}/libs/libcommon/include/common/glibc_compatibility.h") - set (GLIBC_COMPATIBILITY_LINK_FLAGS "-Wl,--wrap=memcpy") + set (USE_INTERNAL_MEMCPY ON) endif () if (CXX11_ABI STREQUAL ENABLE) diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt index 3420ba2ebc8..970d2be15b4 100644 --- a/libs/CMakeLists.txt +++ b/libs/CMakeLists.txt @@ -6,7 +6,14 @@ endif () add_subdirectory (libcommon) add_subdirectory (libpocoext) add_subdirectory (libdaemon) -add_subdirectory (libmemcpy) + +if (USE_INTERNAL_MEMCPY) + add_subdirectory (libmemcpy) +endif() + +if (GLIBC_COMPATIBILITY) + add_subdirectory (libglibc-compatibility) +endif () if (USE_MYSQL) add_subdirectory (libmysqlxx) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 3b6489d0bc7..3ba10c85636 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -75,6 +75,14 @@ else () message (STATUS "Disabling libtcmalloc for valgrind better analysis") endif () +if (GLIBC_COMPATIBILITY) + set (GLIBC_COMPATIBILITY_LIBRARIES glibc-compatibility) +endif () + +if (USE_INTERNAL_MEMCPY) + set (MEMCPY_LIBRARIES memcpy) +endif () + find_package (Threads) target_include_directories (common BEFORE PRIVATE ${CCTZ_INCLUDE_DIR}) @@ -91,7 +99,8 @@ target_link_libraries ( ${MALLOC_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${RT_LIBRARIES} - memcpy) + ${GLIBC_COMPATIBILITY_LIBRARIES} + ${MEMCPY_LIBRARIES}) if (ENABLE_TESTS) add_subdirectory (src/tests) diff --git a/libs/libcommon/include/common/glibc_compatibility.h b/libs/libcommon/include/common/glibc_compatibility.h deleted file mode 100644 index b2a13176f27..00000000000 --- a/libs/libcommon/include/common/glibc_compatibility.h +++ /dev/null @@ -1,79 +0,0 @@ -/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, - * such as Ubuntu Lucid or CentOS 6. 
- * - * Highly experimental, not recommended, disabled by default. - * - * To use, include this file with -include compiler parameter. - * And add -Wl,--wrap=memcpy for linking. - * - * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc - * - * If you want even older systems, such as Ubuntu Hardy, - * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. - */ - -#if defined (__cplusplus) -extern "C" { -#endif - - -#include -#include - -__attribute__((__weak__)) long int __fdelt_chk(long int d) -{ - if (d < 0 || d >= FD_SETSIZE) - abort(); - return d / __NFDBITS; -} - -#include - -__attribute__((__weak__)) int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, __SIZE_TYPE__ fdslen) -{ - if (fdslen / sizeof(*fds) < nfds) - abort(); - return poll(fds, nfds, timeout); -} - - -__attribute__((__weak__)) void * __memcpy_glibc_2_2_5(void *, const void *, size_t); - -__asm__(".symver __memcpy_glibc_2_2_5, memcpy@GLIBC_2.2.5"); - -__attribute__((__weak__)) void * __wrap_memcpy(void * dest, const void * src, size_t n) -{ - return __memcpy_glibc_2_2_5(dest, src, n); -} - - -__attribute__((__weak__)) size_t __pthread_get_minstack(const pthread_attr_t * attr) -{ - return 1048576; /// This is a guess. Don't sure it is correct. -} - -#include -#include -#include -#include - -extern long int syscall (long int __sysno, ...) 
__THROW; - -__attribute__((__weak__)) int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid) -{ - siginfo_t info; - - memset(&info, 0, sizeof(siginfo_t)); - info.si_signo = sig; - info.si_code = SI_ASYNCNL; - info.si_pid = caller_pid; - info.si_uid = getuid(); - info.si_value = val; - - return syscall(__NR_rt_sigqueueinfo, info.si_pid, sig, &info); -} - - -#if defined (__cplusplus) -} -#endif diff --git a/libs/libglibc-compatibility/CMakeLists.txt b/libs/libglibc-compatibility/CMakeLists.txt new file mode 100644 index 00000000000..e8c5a4ba757 --- /dev/null +++ b/libs/libglibc-compatibility/CMakeLists.txt @@ -0,0 +1 @@ +add_library (glibc-compatibility glibc-compatibility.c) diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c new file mode 100644 index 00000000000..5524017b61f --- /dev/null +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -0,0 +1,52 @@ +#include "glibc-compatibility.h" + +#if defined (__cplusplus) +extern "C" { +#endif + +#include +#include + +long int __fdelt_chk(long int d) +{ + if (d < 0 || d >= FD_SETSIZE) + abort(); + return d / __NFDBITS; +} + + +int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen) +{ + if (fdslen / sizeof(*fds) < nfds) + abort(); + return poll(fds, nfds, timeout); +} + + +size_t __pthread_get_minstack(const pthread_attr_t * attr) +{ + return 1048576; /// This is a guess. Don't sure it is correct. +} + +#include +#include + +extern long int syscall (long int __sysno, ...) 
__THROW; + +int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid) +{ + siginfo_t info; + + memset(&info, 0, sizeof(siginfo_t)); + info.si_signo = sig; + info.si_code = SI_ASYNCNL; + info.si_pid = caller_pid; + info.si_uid = getuid(); + info.si_value = val; + + return syscall(__NR_rt_sigqueueinfo, info.si_pid, sig, &info); +} + +#if defined (__cplusplus) +} +#endif diff --git a/libs/libglibc-compatibility/glibc-compatibility.h b/libs/libglibc-compatibility/glibc-compatibility.h new file mode 100644 index 00000000000..aeea2fc4adb --- /dev/null +++ b/libs/libglibc-compatibility/glibc-compatibility.h @@ -0,0 +1,37 @@ +#pragma once + +/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, + * such as Ubuntu Lucid or CentOS 6. + * + * Highly experimental, not recommended, disabled by default. + * + * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc + * + * If you want even older systems, such as Ubuntu Hardy, + * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. + */ + +#if defined (__cplusplus) +extern "C" { +#endif + + +__attribute__((__weak__)) long int __fdelt_chk(long int d); + +#include +#include + +__attribute__((__weak__)) int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen); + +#include + +__attribute__((__weak__)) size_t __pthread_get_minstack(const pthread_attr_t * attr); + +#include +#include + +__attribute__((__weak__)) int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid); + +#if defined (__cplusplus) +} +#endif From 8254bbe7b87263c5e911ba5e0b9bc3f967c39871 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 08:43:43 +0300 Subject: [PATCH 47/84] Simplification [#CLICKHOUSE-3275]. 
--- .../glibc-compatibility.c | 16 +++++++- .../glibc-compatibility.h | 37 ------------------- 2 files changed, 15 insertions(+), 38 deletions(-) delete mode 100644 libs/libglibc-compatibility/glibc-compatibility.h diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c index 5524017b61f..cb0dd617bdf 100644 --- a/libs/libglibc-compatibility/glibc-compatibility.c +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -1,4 +1,13 @@ -#include "glibc-compatibility.h" +/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, + * such as Ubuntu Lucid or CentOS 6. + * + * Highly experimental, not recommended, disabled by default. + * + * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc + * + * If you want even older systems, such as Ubuntu Hardy, + * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. + */ #if defined (__cplusplus) extern "C" { @@ -14,6 +23,8 @@ long int __fdelt_chk(long int d) return d / __NFDBITS; } +#include +#include int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen) { @@ -22,12 +33,15 @@ int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen) return poll(fds, nfds, timeout); } +#include size_t __pthread_get_minstack(const pthread_attr_t * attr) { return 1048576; /// This is a guess. Don't sure it is correct. } +#include +#include #include #include diff --git a/libs/libglibc-compatibility/glibc-compatibility.h b/libs/libglibc-compatibility/glibc-compatibility.h deleted file mode 100644 index aeea2fc4adb..00000000000 --- a/libs/libglibc-compatibility/glibc-compatibility.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, - * such as Ubuntu Lucid or CentOS 6. - * - * Highly experimental, not recommended, disabled by default. 
- * - * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc - * - * If you want even older systems, such as Ubuntu Hardy, - * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. - */ - -#if defined (__cplusplus) -extern "C" { -#endif - - -__attribute__((__weak__)) long int __fdelt_chk(long int d); - -#include -#include - -__attribute__((__weak__)) int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen); - -#include - -__attribute__((__weak__)) size_t __pthread_get_minstack(const pthread_attr_t * attr); - -#include -#include - -__attribute__((__weak__)) int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid); - -#if defined (__cplusplus) -} -#endif From 660926c2150651c05edf77d189a6ae7cf15f423f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 10:24:53 +0300 Subject: [PATCH 48/84] Better GLIBC_COMPATIBILITY option [#CLICKHOUSE-3275]. --- CMakeLists.txt | 2 +- libs/libglibc-compatibility/CMakeLists.txt | 3 +- .../glibc-compatibility.c | 92 ++++-- libs/libglibc-compatibility/musl/COPYRIGHT | 163 +++++++++++ libs/libglibc-compatibility/musl/README | 6 + libs/libglibc-compatibility/musl/fallocate.c | 10 + libs/libglibc-compatibility/musl/lgamma.c | 268 ++++++++++++++++++ libs/libglibc-compatibility/musl/longjmp.s | 22 ++ libs/libglibc-compatibility/musl/pipe2.c | 24 ++ libs/libglibc-compatibility/musl/vasprintf.c | 15 + 10 files changed, 580 insertions(+), 25 deletions(-) create mode 100644 libs/libglibc-compatibility/musl/COPYRIGHT create mode 100644 libs/libglibc-compatibility/musl/README create mode 100644 libs/libglibc-compatibility/musl/fallocate.c create mode 100644 libs/libglibc-compatibility/musl/lgamma.c create mode 100644 libs/libglibc-compatibility/musl/longjmp.s create mode 100644 libs/libglibc-compatibility/musl/pipe2.c create mode 100644 libs/libglibc-compatibility/musl/vasprintf.c diff --git a/CMakeLists.txt b/CMakeLists.txt index a671fe1b775..98d8fcd578b 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,7 +192,7 @@ if (UNBUNDLED) else () set(NOT_UNBUNDLED 1) endif () -# Using system libs can cause lot of warnings in includes. +# Using system libs can cause lot of warnings in includes. if (UNBUNDLED OR NOT (CMAKE_SYSTEM MATCHES "Linux" OR APPLE) OR ARCH_32) option (NO_WERROR "Disable -Werror compiler option" ON) endif () diff --git a/libs/libglibc-compatibility/CMakeLists.txt b/libs/libglibc-compatibility/CMakeLists.txt index e8c5a4ba757..9521dba9eab 100644 --- a/libs/libglibc-compatibility/CMakeLists.txt +++ b/libs/libglibc-compatibility/CMakeLists.txt @@ -1 +1,2 @@ -add_library (glibc-compatibility glibc-compatibility.c) +enable_language (ASM) +add_library (glibc-compatibility glibc-compatibility.c musl/pipe2.c musl/fallocate.c musl/longjmp.s musl/vasprintf.c musl/lgamma.c) diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c index cb0dd617bdf..e06bac8e290 100644 --- a/libs/libglibc-compatibility/glibc-compatibility.c +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -1,18 +1,46 @@ -/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.11, +/** Allows to build programs with libc 2.18 and run on systems with at least libc 2.4, * such as Ubuntu Lucid or CentOS 6. * * Highly experimental, not recommended, disabled by default. * * Also look at http://www.lightofdawn.org/wiki/wiki.cgi/NewAppsOnOldGlibc - * - * If you want even older systems, such as Ubuntu Hardy, - * add fallocate, pipe2, __longjmp_chk, __vasprintf_chk. */ #if defined (__cplusplus) extern "C" { #endif +#include + +size_t __pthread_get_minstack(const pthread_attr_t * attr) +{ + return 1048576; /// This is a guess. Don't sure it is correct. +} + +#include +#include +#include +#include + +long int syscall(long int __sysno, ...) 
__THROW; + +int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid) +{ + siginfo_t info; + + memset(&info, 0, sizeof(siginfo_t)); + info.si_signo = sig; + info.si_code = SI_ASYNCNL; + info.si_pid = caller_pid; + info.si_uid = getuid(); + info.si_value = val; + + return syscall(__NR_rt_sigqueueinfo, info.si_pid, sig, &info); +} + + +/// NOTE This disables some of FORTIFY_SOURCE functionality. + #include #include @@ -33,34 +61,52 @@ int __poll_chk(struct pollfd * fds, nfds_t nfds, int timeout, size_t fdslen) return poll(fds, nfds, timeout); } -#include +#include -size_t __pthread_get_minstack(const pthread_attr_t * attr) +void longjmp(jmp_buf env, int val); + +void __longjmp_chk(jmp_buf env, int val) { - return 1048576; /// This is a guess. Don't sure it is correct. + return longjmp(env, val); } -#include -#include -#include -#include +#include -extern long int syscall (long int __sysno, ...) __THROW; +int vasprintf(char **s, const char *fmt, va_list ap); -int __gai_sigqueue(int sig, const union sigval val, pid_t caller_pid) +int __vasprintf_chk(char **s, int flag, const char *fmt, va_list ap) /* glibc ABI: the second argument is the fortify flag; it is ignored here */ { - siginfo_t info; - - memset(&info, 0, sizeof(siginfo_t)); - info.si_signo = sig; - info.si_code = SI_ASYNCNL; - info.si_pid = caller_pid; - info.si_uid = getuid(); - info.si_value = val; - - return syscall(__NR_rt_sigqueueinfo, info.si_pid, sig, &info); + return vasprintf(s, fmt, ap); } +size_t __fread_chk(void *ptr, size_t ptrlen, size_t size, size_t nmemb, FILE *stream) /* glibc ABI: ptrlen is the destination buffer size; it is not checked here */ +{ + return fread(ptr, size, nmemb, stream); +} + + +#include + +int vsscanf(const char *str, const char *format, va_list ap); + +int __isoc99_vsscanf(const char *str, const char *format, va_list ap) +{ + return vsscanf(str, format, ap); +} + +int sscanf(const char *restrict s, const char *restrict fmt, ...) +{ + int ret; + va_list ap; + va_start(ap, fmt); + ret = vsscanf(s, fmt, ap); + va_end(ap); + return ret; +} + +int __isoc99_sscanf(const char *str, const char *format, ...) 
__attribute__((weak, alias("sscanf"))); + + #if defined (__cplusplus) } #endif diff --git a/libs/libglibc-compatibility/musl/COPYRIGHT b/libs/libglibc-compatibility/musl/COPYRIGHT new file mode 100644 index 00000000000..f0ee3b78d87 --- /dev/null +++ b/libs/libglibc-compatibility/musl/COPYRIGHT @@ -0,0 +1,163 @@ +musl as a whole is licensed under the following standard MIT license: + +---------------------------------------------------------------------- +Copyright © 2005-2014 Rich Felker, et al. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +---------------------------------------------------------------------- + +Authors/contributors include: + +Alex Dowad +Alexander Monakov +Anthony G. 
Basile +Arvid Picciani +Bobby Bingham +Boris Brezillon +Brent Cook +Chris Spiegel +Clément Vasseur +Daniel Micay +Denys Vlasenko +Emil Renner Berthing +Felix Fietkau +Felix Janda +Gianluca Anzolin +Hauke Mehrtens +Hiltjo Posthuma +Isaac Dunham +Jaydeep Patil +Jens Gustedt +Jeremy Huntwork +Jo-Philipp Wich +Joakim Sindholt +John Spencer +Josiah Worcester +Justin Cormack +Khem Raj +Kylie McClain +Luca Barbato +Luka Perkov +M Farkas-Dyck (Strake) +Mahesh Bodapati +Michael Forney +Natanael Copa +Nicholas J. Kain +orc +Pascal Cuoq +Petr Hosek +Pierre Carrier +Rich Felker +Richard Pennington +Shiz +sin +Solar Designer +Stefan Kristiansson +Szabolcs Nagy +Timo Teräs +Trutz Behn +Valentin Ochs +William Haddon + +Portions of this software are derived from third-party works licensed +under terms compatible with the above MIT license: + +The TRE regular expression implementation (src/regex/reg* and +src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed +under a 2-clause BSD license (license text in the source files). The +included version has been heavily modified by Rich Felker in 2012, in +the interests of size, simplicity, and namespace cleanliness. + +Much of the math library code (src/math/* and src/complex/*) is +Copyright © 1993,2004 Sun Microsystems or +Copyright © 2003-2011 David Schultz or +Copyright © 2003-2009 Steven G. Kargl or +Copyright © 2003-2009 Bruce D. Evans or +Copyright © 2008 Stephen L. Moshier +and labelled as such in comments in the individual source files. All +have been licensed under extremely permissive terms. + +The ARM memcpy code (src/string/arm/memcpy_el.S) is Copyright © 2008 +The Android Open Source Project and is licensed under a two-clause BSD +license. It was taken from Bionic libc, used on Android. + +The implementation of DES for crypt (src/crypt/crypt_des.c) is +Copyright © 1994 David Burren. It is licensed under a BSD license. 
+ +The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was +originally written by Solar Designer and placed into the public +domain. The code also comes with a fallback permissive license for use +in jurisdictions that may not recognize the public domain. + +The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 +Valentin Ochs and is licensed under an MIT-style license. + +The BSD PRNG implementation (src/prng/random.c) and XSI search API +(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and +licensed under following terms: "Permission to use, copy, modify, +and/or distribute this code for any purpose with or without fee is +hereby granted. There is no warranty." + +The x86_64 port was written by Nicholas J. Kain and is licensed under +the standard MIT terms. + +The mips and microblaze ports were originally written by Richard +Pennington for use in the ellcc project. The original code was adapted +by Rich Felker for build system and code conventions during upstream +integration. It is licensed under the standard MIT terms. + +The mips64 port was contributed by Imagination Technologies and is +licensed under the standard MIT terms. + +The powerpc port was also originally written by Richard Pennington, +and later supplemented and integrated by John Spencer. It is licensed +under the standard MIT terms. + +All other files which have no copyright comments are original works +produced specifically for use as part of this library, written either +by Rich Felker, the main author of the library, or by one or more +contibutors listed above. Details on authorship of individual files +can be found in the git version control history of the project. The +omission of copyright and license comments in each file is in the +interest of source tree size. 
+ +In addition, permission is hereby granted for all public header files +(include/* and arch/*/bits/*) and crt files intended to be linked into +applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit +the copyright notice and permission notice otherwise required by the +license, and to use these files without any requirement of +attribution. These files include substantial contributions from: + +Bobby Bingham +John Spencer +Nicholas J. Kain +Rich Felker +Richard Pennington +Stefan Kristiansson +Szabolcs Nagy + +all of whom have explicitly granted such permission. + +This file previously contained text expressing a belief that most of +the files covered by the above exception were sufficiently trivial not +to be subject to copyright, resulting in confusion over whether it +negated the permissions granted in the license. In the spirit of +permissive licensing, and of not having licensing issues being an +obstacle to adoption, that text has been removed. diff --git a/libs/libglibc-compatibility/musl/README b/libs/libglibc-compatibility/musl/README new file mode 100644 index 00000000000..11f6caa2d7e --- /dev/null +++ b/libs/libglibc-compatibility/musl/README @@ -0,0 +1,6 @@ +Tiny pieces extracted from the musl library. + +git://git.musl-libc.org/musl +c10bc61508dc52b8315084e628f36a6c3c2dabb1 + +NOTE: The files were edited. diff --git a/libs/libglibc-compatibility/musl/fallocate.c b/libs/libglibc-compatibility/musl/fallocate.c new file mode 100644 index 00000000000..31e63822c94 --- /dev/null +++ b/libs/libglibc-compatibility/musl/fallocate.c @@ -0,0 +1,10 @@ +#define _GNU_SOURCE +#include +#include + +extern long int syscall (long int __sysno, ...) 
__THROW; + +int fallocate(int fd, int mode, off_t base, off_t len) +{ + return syscall(SYS_fallocate, fd, mode, base, len); +} diff --git a/libs/libglibc-compatibility/musl/lgamma.c b/libs/libglibc-compatibility/musl/lgamma.c new file mode 100644 index 00000000000..55a1f9e46a4 --- /dev/null +++ b/libs/libglibc-compatibility/musl/lgamma.c @@ -0,0 +1,268 @@ +/* origin: FreeBSD /usr/src/lib/msun/src/e_lgamma_r.c */ +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + * + */ +/* lgamma_r(x, signgamp) + * Reentrant version of the logarithm of the Gamma function + * with user provide pointer for the sign of Gamma(x). + * + * Method: + * 1. Argument Reduction for 0 < x <= 8 + * Since gamma(1+s)=s*gamma(s), for x in [0,8], we may + * reduce x to a number in [1.5,2.5] by + * lgamma(1+s) = log(s) + lgamma(s) + * for example, + * lgamma(7.3) = log(6.3) + lgamma(6.3) + * = log(6.3*5.3) + lgamma(5.3) + * = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3) + * 2. Polynomial approximation of lgamma around its + * minimun ymin=1.461632144968362245 to maintain monotonicity. + * On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use + * Let z = x-ymin; + * lgamma(x) = -1.214862905358496078218 + z^2*poly(z) + * where + * poly(z) is a 14 degree polynomial. + * 2. Rational approximation in the primary interval [2,3] + * We use the following approximation: + * s = x-2.0; + * lgamma(x) = 0.5*s + s*P(s)/Q(s) + * with accuracy + * |P/Q - (lgamma(x)-0.5s)| < 2**-61.71 + * Our algorithms are based on the following observation + * + * zeta(2)-1 2 zeta(3)-1 3 + * lgamma(2+s) = s*(1-Euler) + --------- * s - --------- * s + ... + * 2 3 + * + * where Euler = 0.5771... 
is the Euler constant, which is very + * close to 0.5. + * + * 3. For x>=8, we have + * lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+.... + * (better formula: + * lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...) + * Let z = 1/x, then we approximation + * f(z) = lgamma(x) - (x-0.5)(log(x)-1) + * by + * 3 5 11 + * w = w0 + w1*z + w2*z + w3*z + ... + w6*z + * where + * |w - f(z)| < 2**-58.74 + * + * 4. For negative x, since (G is gamma function) + * -x*G(-x)*G(x) = pi/sin(pi*x), + * we have + * G(x) = pi/(sin(pi*x)*(-x)*G(-x)) + * since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0 + * Hence, for x<0, signgam = sign(sin(pi*x)) and + * lgamma(x) = log(|Gamma(x)|) + * = log(pi/(|x*sin(pi*x)|)) - lgamma(-x); + * Note: one should avoid compute pi*(-x) directly in the + * computation of sin(pi*(-x)). + * + * 5. Special Cases + * lgamma(2+s) ~ s*(1-Euler) for tiny s + * lgamma(1) = lgamma(2) = 0 + * lgamma(x) ~ -log(|x|) for tiny x + * lgamma(0) = lgamma(neg.integer) = inf and raise divide-by-zero + * lgamma(inf) = inf + * lgamma(-inf) = inf (bug for bug compatible with C99!?) 
+ * + */ + +static const double +pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */ +a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */ +a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */ +a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */ +a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */ +a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */ +a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */ +a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */ +a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */ +a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */ +a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */ +a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */ +a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */ +tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */ +tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */ +/* tt = -(tail of tf) */ +tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */ +t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */ +t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */ +t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */ +t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */ +t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */ +t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */ +t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */ +t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */ +t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */ +t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */ +t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */ +t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */ +t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */ +t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */ +t14 = 3.35529192635519073543e-04, 
/* 0x3F35FD3E, 0xE8C2D3F4 */ +u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */ +u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */ +u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */ +u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */ +u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */ +u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */ +v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */ +v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */ +v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */ +v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */ +v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */ +s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */ +s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */ +s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */ +s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */ +s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */ +s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */ +s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */ +r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */ +r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */ +r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */ +r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */ +r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */ +r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */ +w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */ +w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */ +w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */ +w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */ +w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */ +w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */ +w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */ + +#include 
+#include + +double lgamma_r(double x, int *signgamp) +{ + union {double f; uint64_t i;} u = {x}; + double_t t,y,z,nadj,p,p1,p2,p3,q,r,w; + uint32_t ix; + int sign,i; + + /* purge off +-inf, NaN, +-0, tiny and negative arguments */ + *signgamp = 1; + sign = u.i>>63; + ix = u.i>>32 & 0x7fffffff; + if (ix >= 0x7ff00000) + return x*x; + if (ix < (0x3ff-70)<<20) { /* |x|<2**-70, return -log(|x|) */ + if(sign) { + x = -x; + *signgamp = -1; + } + return -log(x); + } + if (sign) { + x = -x; + t = sin(pi * x); + if (t == 0.0) /* -integer */ + return 1.0/(x-x); + if (t > 0.0) + *signgamp = -1; + else + t = -t; + nadj = log(pi/(t*x)); + } + + /* purge off 1 and 2 */ + if ((ix == 0x3ff00000 || ix == 0x40000000) && (uint32_t)u.i == 0) + r = 0; + /* for x < 2.0 */ + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + r = -log(x); + if (ix >= 0x3FE76944) { + y = 1.0 - x; + i = 0; + } else if (ix >= 0x3FCDA661) { + y = x - (tc-1.0); + i = 1; + } else { + y = x; + i = 2; + } + } else { + r = 0.0; + if (ix >= 0x3FFBB4C3) { /* [1.7316,2] */ + y = 2.0 - x; + i = 0; + } else if(ix >= 0x3FF3B4C4) { /* [1.23,1.73] */ + y = x - tc; + i = 1; + } else { + y = x - 1.0; + i = 2; + } + } + switch (i) { + case 0: + z = y*y; + p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10)))); + p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11))))); + p = y*p1+p2; + r += (p-0.5*y); + break; + case 1: + z = y*y; + w = z*y; + p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */ + p2 = t1+w*(t4+w*(t7+w*(t10+w*t13))); + p3 = t2+w*(t5+w*(t8+w*(t11+w*t14))); + p = z*p1-(tt-w*(p2+y*p3)); + r += tf + p; + break; + case 2: + p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5))))); + p2 = 1.0+y*(v1+y*(v2+y*(v3+y*(v4+y*v5)))); + r += -0.5*y + p1/p2; + } + } else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int)x; + y = x - (double)i; + p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6)))))); + q = 1.0+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6))))); + r = 0.5*y+p/q; + z = 1.0; /* lgamma(1+s) = log(s) + lgamma(s) */ + 
switch (i) { + case 7: z *= y + 6.0; /* FALLTHRU */ + case 6: z *= y + 5.0; /* FALLTHRU */ + case 5: z *= y + 4.0; /* FALLTHRU */ + case 4: z *= y + 3.0; /* FALLTHRU */ + case 3: z *= y + 2.0; /* FALLTHRU */ + r += log(z); + break; + } + } else if (ix < 0x43900000) { /* 8.0 <= x < 2**58 */ + t = log(x); + z = 1.0/x; + y = z*z; + w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6))))); + r = (x-0.5)*(t-1.0)+w; + } else /* 2**58 <= x <= inf */ + r = x*(log(x)-1.0); + if (sign) + r = nadj - r; + return r; +} + + +int signgam; + +double lgamma(double x) +{ + return lgamma_r(x, &signgam); +} diff --git a/libs/libglibc-compatibility/musl/longjmp.s b/libs/libglibc-compatibility/musl/longjmp.s new file mode 100644 index 00000000000..e175a4b9606 --- /dev/null +++ b/libs/libglibc-compatibility/musl/longjmp.s @@ -0,0 +1,22 @@ +/* Copyright 2011-2012 Nicholas J. Kain, licensed under standard MIT license */ +.global _longjmp +.global longjmp +.type _longjmp,@function +.type longjmp,@function +_longjmp: +longjmp: + mov %rsi,%rax /* val will be longjmp return */ + test %rax,%rax + jnz 1f + inc %rax /* if val==0, val=1 per longjmp semantics */ +1: + mov (%rdi),%rbx /* rdi is the jmp_buf, restore regs from it */ + mov 8(%rdi),%rbp + mov 16(%rdi),%r12 + mov 24(%rdi),%r13 + mov 32(%rdi),%r14 + mov 40(%rdi),%r15 + mov 48(%rdi),%rdx /* this ends up being the stack pointer */ + mov %rdx,%rsp + mov 56(%rdi),%rdx /* this is the instruction pointer */ + jmp *%rdx /* goto saved address without altering rsp */ diff --git a/libs/libglibc-compatibility/musl/pipe2.c b/libs/libglibc-compatibility/musl/pipe2.c new file mode 100644 index 00000000000..061f2e07313 --- /dev/null +++ b/libs/libglibc-compatibility/musl/pipe2.c @@ -0,0 +1,24 @@ +#include +#include +#include +#include + +extern long int syscall (long int __sysno, ...) 
__THROW; + +int pipe2(int fd[2], int flag) +{ + if (!flag) return pipe(fd); + int ret = syscall(SYS_pipe2, fd, flag); + if (ret == 0 || errno != ENOSYS) return ret; /* glibc syscall() returns -1 and sets errno (not -errno); fall through to emulation only on ENOSYS */ + ret = pipe(fd); + if (ret) return ret; + if (flag & O_CLOEXEC) { + syscall(SYS_fcntl, fd[0], F_SETFD, FD_CLOEXEC); + syscall(SYS_fcntl, fd[1], F_SETFD, FD_CLOEXEC); + } + if (flag & O_NONBLOCK) { + syscall(SYS_fcntl, fd[0], F_SETFL, O_NONBLOCK); + syscall(SYS_fcntl, fd[1], F_SETFL, O_NONBLOCK); + } + return 0; +} diff --git a/libs/libglibc-compatibility/musl/vasprintf.c b/libs/libglibc-compatibility/musl/vasprintf.c new file mode 100644 index 00000000000..08251bc20ec --- /dev/null +++ b/libs/libglibc-compatibility/musl/vasprintf.c @@ -0,0 +1,15 @@ +#define _GNU_SOURCE +#include +#include +#include + +int vasprintf(char **s, const char *fmt, va_list ap) +{ + va_list ap2; + va_copy(ap2, ap); + int l = vsnprintf(0, 0, fmt, ap2); + va_end(ap2); + + if (l<0 || !(*s=malloc(l+1U))) return -1; + return vsnprintf(*s, l+1U, fmt, ap); +} From 148421d10f24202a8e8f68ab6a55f342b18afa9d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 10:29:03 +0300 Subject: [PATCH 49/84] Better GLIBC_COMPATIBILITY option [#CLICKHOUSE-3275]. 
--- libs/libglibc-compatibility/glibc-compatibility.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c index e06bac8e290..6aed44a3efd 100644 --- a/libs/libglibc-compatibility/glibc-compatibility.c +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -70,7 +70,7 @@ void __longjmp_chk(jmp_buf env, int val) return longjmp(env, val); } -#include +#include int vasprintf(char **s, const char *fmt, va_list ap); @@ -79,14 +79,13 @@ int __vasprintf_chk(char **s, const char *fmt, va_list ap) return vasprintf(s, fmt, ap); } -size_t __fread_chk(void *ptr, size_t size, size_t nmemb, FILE *stream) +size_t fread(void *ptr, size_t size, size_t nmemb, void *stream); + +size_t __fread_chk(void *ptr, size_t ptrlen, size_t size, size_t nmemb, void *stream) /* glibc fortify ABI passes the destination buffer size as the 2nd argument; it is intentionally unused here */ { return fread(ptr, size, nmemb, stream); } - -#include - int vsscanf(const char *str, const char *format, va_list ap); int __isoc99_vsscanf(const char *str, const char *format, va_list ap) From a3f3ed12ab95ea4c972ffbb3a1b91c7f8a0c5e96 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 10:32:27 +0300 Subject: [PATCH 50/84] Better GLIBC_COMPATIBILITY option [#CLICKHOUSE-3275]. 
--- libs/libglibc-compatibility/musl/lgamma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libglibc-compatibility/musl/lgamma.c b/libs/libglibc-compatibility/musl/lgamma.c index 55a1f9e46a4..b0e4f3aa537 100644 --- a/libs/libglibc-compatibility/musl/lgamma.c +++ b/libs/libglibc-compatibility/musl/lgamma.c @@ -149,7 +149,7 @@ w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */ double lgamma_r(double x, int *signgamp) { union {double f; uint64_t i;} u = {x}; - double_t t,y,z,nadj,p,p1,p2,p3,q,r,w; + double_t t,y,z,nadj=0,p,p1,p2,p3,q,r,w; uint32_t ix; int sign,i; From 1544187f7b8a1a7d48ff6f7621139ee3009ba480 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 10:51:38 +0300 Subject: [PATCH 51/84] Better GLIBC_COMPATIBILITY option [#CLICKHOUSE-3275]. --- libs/libglibc-compatibility/glibc-compatibility.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/libs/libglibc-compatibility/glibc-compatibility.c b/libs/libglibc-compatibility/glibc-compatibility.c index 6aed44a3efd..367bd2de765 100644 --- a/libs/libglibc-compatibility/glibc-compatibility.c +++ b/libs/libglibc-compatibility/glibc-compatibility.c @@ -105,6 +105,13 @@ int sscanf(const char *restrict s, const char *restrict fmt, ...) int __isoc99_sscanf(const char *str, const char *format, ...) __attribute__((weak, alias("sscanf"))); +int open(const char *path, int oflag); + +int __open_2(const char *path, int oflag) +{ + return open(path, oflag); +} + #if defined (__cplusplus) } From 9b8722b0b505d8a985c085ed45605f7e837ac5b3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 12:18:52 +0300 Subject: [PATCH 52/84] Fixed incompatibility of debian packages with older systems [#CLICKHOUSE-3275]. 
--- debian/clickhouse-client.postinst | 3 +++ debian/rules | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/debian/clickhouse-client.postinst b/debian/clickhouse-client.postinst index 2e84aad51ed..06b4b8654b0 100644 --- a/debian/clickhouse-client.postinst +++ b/debian/clickhouse-client.postinst @@ -1,2 +1,5 @@ +#!/bin/sh +set -e + mkdir -p /etc/clickhouse-client/conf.d chown -R clickhouse: /etc/clickhouse-client diff --git a/debian/rules b/debian/rules index c74140a2c51..4738dde4fb6 100755 --- a/debian/rules +++ b/debian/rules @@ -95,4 +95,7 @@ endif dh_install --list-missing --sourcedir=$(DESTDIR) override_dh_shlibdeps: - dh_shlibdeps -Xdebian/clickhouse-server-base/usr/share/clickhouse/bin/* + true # We depend only on libc and dh_shlibdeps gives us wrong (too strict) dependency. + +override_dh_builddeb: + dh_builddeb -- -Z gzip # Older systems don't have "xz", so use "gzip" instead. From 5d038312b79f4555153e530abd28474f1cdeaddc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 12:49:50 +0300 Subject: [PATCH 53/84] Fixed readme [#CLICKHOUSE-3275]. --- libs/libmemcpy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libmemcpy/README.md b/libs/libmemcpy/README.md index d3dd14ac55b..e253f6bf5dd 100644 --- a/libs/libmemcpy/README.md +++ b/libs/libmemcpy/README.md @@ -5,7 +5,7 @@ It has the following advantages over `libc`-supplied implementation: - it is linked statically, so the function can have position-dependent code; - your binaries will not depend on `glibc`'s memcpy, that forces dependency on specific symbol version like `memcpy@@GLIBC_2.14` and consequently on specific version of `glibc` library; - you can include `memcpy.h` directly and the function has the chance to be inlined, which is beneficial for small but unknown at compile time sizes of memory regions; -- this version of `memcpy` pretend to be faster (in our benchmarks, the difference is withing few percents). 
+- this version of `memcpy` pretend to be faster (in our benchmarks, the difference is within few percents). Currently it uses the implementation from **Linwei** (skywind3000@163.com). Look at https://www.zhihu.com/question/35172305 for discussion. From 3523059be9bbde604ec9da33375dadf0a9eb9641 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 13:12:50 +0300 Subject: [PATCH 54/84] Increased inline depth for clang (it must inline as good as gcc) [#CLICKHOUSE-2]. --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 69b20377459..b65f625142f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,11 @@ if (USE_LIBCXX) set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_DEBUG=1") # More checks in debug build. endif () +# Special options for better optimized code with clang +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -mllvm -inline-threshold=10000") +endif () + set (CMAKE_BUILD_COLOR_MAKEFILE ON) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS} -std=gnu++1z ${PLATFORM_EXTRA_CXX_FLAG} -fno-omit-frame-pointer ${COMMON_WARNING_FLAGS} ${CXX_WARNING_FLAGS} ${GLIBC_COMPATIBILITY_COMPILE_FLAGS}") #set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") From cfd4120f4a587660f2d223590cb648cc37a34e5e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 14:08:39 +0300 Subject: [PATCH 55/84] Fixed build with increased inline-depth in clang [#CLICKHOUSE-2]. 
--- dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp index 37cfc759e6c..489557f8a85 100644 --- a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp +++ b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.cpp @@ -466,6 +466,17 @@ StringRef ComplexKeyCacheDictionary::placeKeysInPool( return { place, sum_keys_size }; } +/// Explicit instantiations. + +template StringRef ComplexKeyCacheDictionary::placeKeysInPool( + const size_t row, const Columns & key_columns, StringRefs & keys, + const std::vector & key_attributes, Arena & pool); + +template StringRef ComplexKeyCacheDictionary::placeKeysInPool( + const size_t row, const Columns & key_columns, StringRefs & keys, + const std::vector & key_attributes, ArenaWithFreeLists & pool); + + StringRef ComplexKeyCacheDictionary::placeKeysInFixedSizePool( const size_t row, const Columns & key_columns) const { From 23d94797c348956187645c55d4aed955b781bf72 Mon Sep 17 00:00:00 2001 From: Amy Krishnevsky Date: Fri, 15 Sep 2017 14:16:02 +0300 Subject: [PATCH 56/84] Translate update to changelog.md --- CHANGELOG.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a7a389bac9..30a1b4b2558 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,41 @@ +# ClickHouse release 1.1.54289 + +## New features: +* `SYSTEM` requests for server administration: `SYSTEM RELOAD DICTIONARY`, `SYSTEM RELOAD DICTIONARIES`, `SYSTEM DROP DNS CACHE`, `SYSTEM SHUTDOWN`, `SYSTEM KILL`. +* Added functions for working with arrays: `concat`, `arraySlice`, `arrayPushBack`, `arrayPushFront`, `arrayPopBack`, `arrayPopFront`. +* Added the `root` and `identity` parameters for the ZooKeeper configuration. This allows you to isolate individual users on the same ZooKeeper cluster. 
+* Added the aggregate functions `groupBitAnd`, `groupBitOr`, and `groupBitXor` (for compatibility, they can also be accessed with the names `BIT_AND`, `BIT_OR`, and `BIT_XOR`). +* External dictionaries can be loaded from MySQL by specifying a socket in the file system. +* External dictionaries can be loaded from MySQL over SSL (the `ssl_cert`, `ssl_key`, and `ssl_ca` parameters). +* Added the `max_network_bandwidth_for_user` setting to restrict the overall bandwidth use for requests per user. +* Support for `DROP TABLE` for temporary tables. +* Support for reading `DateTime` values in Unix timestamp format from the `CSV` and `JSONEachRow` formats. +* Lagging replicas in distributed queries are now excluded by default (the default threshold is 5 minutes). +* FIFO blocking is used during ALTER: an ALTER query isn't blocked indefinitely for continuously running queries. +* Option to set `umask` in the config file. +* Improved performance for queries with `DISTINCT`. + +## Bug fixes: +* Improved the process for deleting old nodes in ZooKeeper. Previously, old nodes sometimes didn't get deleted if there were frequent inserts, which caused the server to be slow to shut down, among other things. +* Fixed randomization when choosing hosts for the connection to ZooKeeper. +* Fixed the exclusion of lagging replicas in distributed queries if the replica is localhost. +* Fixed an error where a data part in a `ReplicatedMergeTree` table could be broken after running `ALTER MODIFY` on an element in a `Nested` structure. +* Fixed an error that could cause SELECT queries to "hang". +* Improvements to distributed DDL queries. +* Fixed the query `CREATE TABLE ... AS <materialized view>`. +* Resolved the deadlock in the `ALTER ... CLEAR COLUMN IN PARTITION` query for `Buffer` tables. +* Fixed the invalid default value for `Enum`s (0 instead of the minimum) when using the `JSONEachRow` and `TSKV` formats. 
+* Resolved the appearance of zombie processes when using a dictionary with an `executable` source. +* Fixed the segfault for the HEAD query. + +## Improvements to development workflow and ClickHouse assembly: +* You can use `pbuilder` to build ClickHouse. +* You can use `libc++` instead of `libstdc++` for builds on Linux. +* Added instructions for using static code analysis tools: `Coverity`, `clang-tidy`, and `cppcheck`. + +## Please note when upgrading: +* There is now a higher default value for the MergeTree setting `max_bytes_to_merge_at_max_space_in_pool` (the maximum total size of data parts to merge, in bytes): it has increased from 100 GiB to 150 GiB. This might result in large merges running after the server upgrade, which could cause an increased load on the disk subsystem. If the free space available on the server is less than twice the total amount of the merges that are running, this will cause all other merges to stop running, including merges of small data parts. As a result, INSERT requests will fail with the message "Merges are processing significantly slower than inserts." Use the `SELECT * FROM system.merges` request to monitor the situation. You can also check the `DiskSpaceReservedForMerge` metric in the `system.metrics` table, or in Graphite. You don't need to do anything to fix this, since the issue will resolve itself once the large merges finish. If you find this unacceptable, you can restore the previous value for the `max_bytes_to_merge_at_max_space_in_pool` setting (to do this, go to the `<merge_tree>` section in config.xml, set `<max_bytes_to_merge_at_max_space_in_pool>107374182400</max_bytes_to_merge_at_max_space_in_pool>` and restart the server). + # ClickHouse release 1.1.54284 * This is bugfix release for previous 1.1.54282 release. It fixes ZooKeeper nodes leak in `parts/` directory. 
From 2bba5b2f02b63665d8f9755f2c50f960c1dcf668 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Fri, 15 Sep 2017 14:33:45 +0300 Subject: [PATCH 57/84] Update CHANGELOG.md --- CHANGELOG.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30a1b4b2558..a926e5cdc4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,22 +1,22 @@ # ClickHouse release 1.1.54289 ## New features: -* `SYSTEM` requests for server administration: `SYSTEM RELOAD DICTIONARY`, `SYSTEM RELOAD DICTIONARIES`, `SYSTEM DROP DNS CACHE`, `SYSTEM SHUTDOWN`, `SYSTEM KILL`. +* `SYSTEM` queries for server administration: `SYSTEM RELOAD DICTIONARY`, `SYSTEM RELOAD DICTIONARIES`, `SYSTEM DROP DNS CACHE`, `SYSTEM SHUTDOWN`, `SYSTEM KILL`. * Added functions for working with arrays: `concat`, `arraySlice`, `arrayPushBack`, `arrayPushFront`, `arrayPopBack`, `arrayPopFront`. * Added the `root` and `identity` parameters for the ZooKeeper configuration. This allows you to isolate individual users on the same ZooKeeper cluster. * Added the aggregate functions `groupBitAnd`, `groupBitOr`, and `groupBitXor` (for compatibility, they can also be accessed with the names `BIT_AND`, `BIT_OR`, and `BIT_XOR`). -* External dictionaries can be loaded from MySQL by specifying a socket in the file system. +* External dictionaries can be loaded from MySQL by specifying a socket in the filesystem. * External dictionaries can be loaded from MySQL over SSL (the `ssl_cert`, `ssl_key`, and `ssl_ca` parameters). -* Added the `max_network_bandwidth_for_user` setting to restrict the overall bandwidth use for requests per user. +* Added the `max_network_bandwidth_for_user` setting to restrict the overall bandwidth use for queries per user. * Support for `DROP TABLE` for temporary tables. * Support for reading `DateTime` values in Unix timestamp format from the `CSV` and `JSONEachRow` formats. 
* Lagging replicas in distributed queries are now excluded by default (the default threshold is 5 minutes). -* FIFO blocking is used during ALTER: an ALTER query isn't blocked indefinitely for continuously running queries. +* FIFO locking is used during ALTER: an ALTER query isn't blocked indefinitely for continuously running queries. * Option to set `umask` in the config file. * Improved performance for queries with `DISTINCT`. ## Bug fixes: -* Improved the process for deleting old nodes in ZooKeeper. Previously, old nodes sometimes didn't get deleted if there were frequent inserts, which caused the server to be slow to shut down, among other things. +* Improved the process for deleting old nodes in ZooKeeper. Previously, old nodes sometimes didn't get deleted if there were very frequent inserts, which caused the server to be slow to shut down, among other things. * Fixed randomization when choosing hosts for the connection to ZooKeeper. * Fixed the exclusion of lagging replicas in distributed queries if the replica is localhost. * Fixed an error where a data part in a `ReplicatedMergeTree` table could be broken after running `ALTER MODIFY` on an element in a `Nested` structure. @@ -26,9 +26,9 @@ * Resolved the deadlock in the `ALTER ... CLEAR COLUMN IN PARTITION` query for `Buffer` tables. * Fixed the invalid default value for `Enum`s (0 instead of the minimum) when using the `JSONEachRow` and `TSKV` formats. * Resolved the appearance of zombie processes when using a dictionary with an `executable` source. -* Fixed the segfault for the HEAD query. +* Fixed segfault for the HEAD query. -## Improvements to development workflow and ClickHouse assembly: +## Improvements to development workflow and ClickHouse build: * You can use `pbuilder` to build ClickHouse. * You can use `libc++` instead of `libstdc++` for builds on Linux. * Added instructions for using static code analysis tools: `Coverity`, `clang-tidy`, and `cppcheck`. 
From 86c46ad1bdd0fc90903a569d169a9f8ae9195ac2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 15 Sep 2017 15:16:12 +0300 Subject: [PATCH 58/84] Unification [#CLICKHOUSE-2]. --- .../AggregateFunctionGroupArray.cpp | 2 +- .../AggregateFunctionStatistics.h | 30 ++-- dbms/src/AggregateFunctions/Helpers.h | 4 +- .../src/AggregateFunctions/ReservoirSampler.h | 6 +- .../src/Common/CombinedCardinalityEstimator.h | 4 +- dbms/src/Common/ConcurrentBoundedQueue.h | 8 +- dbms/src/Common/HyperLogLogCounter.h | 20 +-- dbms/src/Common/PoolWithFailoverBase.h | 10 +- dbms/src/Common/UInt128.h | 2 +- dbms/src/Common/ZooKeeper/ZooKeeperHolder.h | 4 +- dbms/src/Common/tests/AvalancheTest.h | 6 +- .../tests/hopscotch-map/src/hopscotch_hash.h | 156 +++++++++--------- .../tests/hopscotch-map/src/hopscotch_map.h | 68 ++++---- .../hopscotch-map/src/hopscotch_sc_map.h | 68 ++++---- .../hopscotch-map/src/hopscotch_sc_set.h | 40 ++--- .../tests/hopscotch-map/src/hopscotch_set.h | 40 ++--- dbms/src/Core/AccurateComparison.h | 48 +++--- dbms/src/Core/QualifiedTableName.h | 2 +- .../AggregatingSortedBlockInputStream.cpp | 4 +- .../AggregatingSortedBlockInputStream.h | 4 +- .../CollapsingSortedBlockInputStream.cpp | 2 +- .../CollapsingSortedBlockInputStream.h | 2 +- dbms/src/DataStreams/ColumnGathererStream.h | 4 +- .../GraphiteRollupSortedBlockInputStream.cpp | 2 +- .../GraphiteRollupSortedBlockInputStream.h | 2 +- .../MergingSortedBlockInputStream.h | 6 +- .../ReplacingSortedBlockInputStream.cpp | 2 +- .../ReplacingSortedBlockInputStream.h | 2 +- .../SummingSortedBlockInputStream.cpp | 8 +- .../SummingSortedBlockInputStream.h | 8 +- dbms/src/DataStreams/UnionBlockInputStream.h | 4 +- .../Dictionaries/DictionaryBlockInputStream.h | 60 +++---- .../RangeDictionaryBlockInputStream.h | 30 ++-- dbms/src/Functions/FunctionsComparison.h | 2 +- dbms/src/Functions/FunctionsConversion.h | 8 +- dbms/src/Functions/FunctionsDateTime.h | 2 +- dbms/src/Functions/FunctionsFindCluster.h | 6 +- 
dbms/src/Functions/FunctionsHigherOrder.h | 2 +- dbms/src/Functions/FunctionsLogical.h | 8 +- dbms/src/Functions/FunctionsReinterpret.h | 6 +- dbms/src/Functions/FunctionsRound.h | 86 +++++----- dbms/src/Functions/FunctionsVisitParam.h | 4 +- .../tests/logical_functions_performance.cpp | 6 +- dbms/src/IO/HashingWriteBuffer.cpp | 2 +- dbms/src/IO/HashingWriteBuffer.h | 2 +- dbms/src/IO/ReadHelpers.cpp | 2 +- dbms/src/IO/ReadHelpers.h | 10 +- dbms/src/Parsers/ParserCreateQuery.h | 8 +- dbms/src/Storages/ColumnDefault.h | 2 +- dbms/src/Storages/MarkCache.h | 2 +- dbms/src/Storages/StorageDictionary.h | 2 +- 51 files changed, 409 insertions(+), 409 deletions(-) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp index 6019cb3bf06..ead604e30ee 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp @@ -8,7 +8,7 @@ namespace DB namespace { -template