diff --git a/CHANGELOG.md b/CHANGELOG.md
index 59343cd9936..7fa13dacdf9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,109 @@
+## ClickHouse release 19.5.2.6, 2019-04-15
+
+### New Features
+
+* [Hyperscan](https://github.com/intel/hyperscan) multiple regular expression matching was added (functions `multiMatchAny`, `multiMatchAnyIndex`, `multiFuzzyMatchAny`, `multiFuzzyMatchAnyIndex`). [#4780](https://github.com/yandex/ClickHouse/pull/4780), [#4841](https://github.com/yandex/ClickHouse/pull/4841) ([Danila Kutenin](https://github.com/danlark1))
+* `multiSearchFirstPosition` function was added. [#4780](https://github.com/yandex/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1))
+* Implement a predefined per-row expression filter for tables. [#4792](https://github.com/yandex/ClickHouse/pull/4792) ([Ivan](https://github.com/abyss7))
+* Added a new type of data skipping index based on bloom filters (can be used with the `equal`, `in` and `like` functions). [#4499](https://github.com/yandex/ClickHouse/pull/4499) ([Nikita Vasilev](https://github.com/nikvas0))
+* Added `ASOF JOIN`, which allows running queries that join to the most recent known value. [#4774](https://github.com/yandex/ClickHouse/pull/4774) [#4867](https://github.com/yandex/ClickHouse/pull/4867) [#4863](https://github.com/yandex/ClickHouse/pull/4863) [#4875](https://github.com/yandex/ClickHouse/pull/4875) ([Martijn Bakker](https://github.com/Gladdy), [Artem Zuikov](https://github.com/4ertus2))
+* Rewrite multiple `COMMA JOIN`s to `CROSS JOIN`s, then rewrite them to `INNER JOIN` if possible. [#4661](https://github.com/yandex/ClickHouse/pull/4661) ([Artem Zuikov](https://github.com/4ertus2))
+
+### Improvements
+
+* `topK` and `topKWeighted` now support a custom `loadFactor` (fixes issue [#4252](https://github.com/yandex/ClickHouse/issues/4252)). [#4634](https://github.com/yandex/ClickHouse/pull/4634) ([Kirill Danshin](https://github.com/kirillDanshin))
+* Allow using `parallel_replicas_count > 1` even for tables without sampling (the setting is simply ignored for them). In previous versions it led to an exception. [#4637](https://github.com/yandex/ClickHouse/pull/4637) ([Alexey Elymanov](https://github.com/digitalist))
+* Support for `CREATE OR REPLACE VIEW`. Allows creating a view or setting a new definition in a single statement. [#4654](https://github.com/yandex/ClickHouse/pull/4654) ([Boris Granveaud](https://github.com/bgranvea))
+* `Buffer` table engine now supports `PREWHERE`. [#4671](https://github.com/yandex/ClickHouse/pull/4671) ([Yangkuan Liu](https://github.com/LiuYangkuan))
+* Add the ability to start a replicated table without metadata in ZooKeeper in `readonly` mode. [#4691](https://github.com/yandex/ClickHouse/pull/4691) ([alesapin](https://github.com/alesapin))
+* Fixed flicker of progress bar in clickhouse-client. The issue was most noticeable when using `FORMAT Null` with streaming queries. [#4811](https://github.com/yandex/ClickHouse/pull/4811) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Allow disabling functions that use the `hyperscan` library on a per-user basis to limit potentially excessive and uncontrolled resource usage. [#4816](https://github.com/yandex/ClickHouse/pull/4816) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Add version number logging in all errors. [#4824](https://github.com/yandex/ClickHouse/pull/4824) ([proller](https://github.com/proller))
+* Added a restriction to the `multiMatch` functions which requires the string size to fit into `unsigned int`. Also added a limit on the number of arguments to the `multiSearch` functions. [#4834](https://github.com/yandex/ClickHouse/pull/4834) ([Danila Kutenin](https://github.com/danlark1))
+* Improved usage of scratch space and error handling in Hyperscan. [#4866](https://github.com/yandex/ClickHouse/pull/4866) ([Danila Kutenin](https://github.com/danlark1))
+* Fill `system.graphite_retentions` from the table config of `*GraphiteMergeTree` engine tables. [#4584](https://github.com/yandex/ClickHouse/pull/4584) ([Mikhail f. Shiryaev](https://github.com/Felixoid))
+* Rename `trigramDistance` function to `ngramDistance` and add more functions with `CaseInsensitive` and `UTF` variants. [#4602](https://github.com/yandex/ClickHouse/pull/4602) ([Danila Kutenin](https://github.com/danlark1))
+* Improved data skipping indices calculation. [#4640](https://github.com/yandex/ClickHouse/pull/4640) ([Nikita Vasilev](https://github.com/nikvas0))
+
+### Bug Fixes
+
+* Avoid `std::terminate` in case of memory allocation failure. Now `std::bad_alloc` exception is thrown as expected. [#4665](https://github.com/yandex/ClickHouse/pull/4665) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix capnproto reading from buffer. Sometimes files weren't loaded successfully over HTTP. [#4674](https://github.com/yandex/ClickHouse/pull/4674) ([Vladislav](https://github.com/smirnov-vs))
+* Fix error `Unknown log entry type: 0` after `OPTIMIZE TABLE FINAL` query. [#4683](https://github.com/yandex/ClickHouse/pull/4683) ([Amos Bird](https://github.com/amosbird))
+* Wrong arguments to the `hasAny` or `hasAll` functions could lead to a segfault. [#4698](https://github.com/yandex/ClickHouse/pull/4698) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* A deadlock could happen while executing a `DROP DATABASE dictionary` query. [#4701](https://github.com/yandex/ClickHouse/pull/4701) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix undefined behavior in `median` and `quantile` functions. [#4702](https://github.com/yandex/ClickHouse/pull/4702) ([hcz](https://github.com/hczhcz))
+* Fix compression level detection when `network_compression_method` is in lowercase. Broken in v19.1. [#4706](https://github.com/yandex/ClickHouse/pull/4706) ([proller](https://github.com/proller))
+* Keep ordinary, `DEFAULT`, `MATERIALIZED` and `ALIAS` columns in a single list (fixes issue [#2867](https://github.com/yandex/ClickHouse/issues/2867)). [#4707](https://github.com/yandex/ClickHouse/pull/4707) ([Alex Zatelepin](https://github.com/ztlpn))
+* Fixed the `UTC` setting being ignored (fixes issue [#4658](https://github.com/yandex/ClickHouse/issues/4658)). [#4718](https://github.com/yandex/ClickHouse/pull/4718) ([proller](https://github.com/proller))
+* Fix `histogram` function behaviour with `Distributed` tables. [#4741](https://github.com/yandex/ClickHouse/pull/4741) ([olegkv](https://github.com/olegkv))
+* Fixed TSan report `destroy of a locked mutex`. [#4742](https://github.com/yandex/ClickHouse/pull/4742) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fixed TSan report on shutdown due to race condition in system logs usage. Fixed potential use-after-free on shutdown when part_log is enabled. [#4758](https://github.com/yandex/ClickHouse/pull/4758) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix rechecking parts in `ReplicatedMergeTreeAlterThread` in case of error. [#4772](https://github.com/yandex/ClickHouse/pull/4772) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
+* Arithmetic operations on intermediate aggregate function states were not working for constant arguments (such as subquery results). [#4776](https://github.com/yandex/ClickHouse/pull/4776) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Always backquote column names in metadata. Otherwise it's impossible to create a table with a column named `index` (the server won't restart due to a malformed `ATTACH` query in metadata). [#4782](https://github.com/yandex/ClickHouse/pull/4782) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix crash in `ALTER ... MODIFY ORDER BY` on `Distributed` table. [#4790](https://github.com/yandex/ClickHouse/pull/4790) ([TCeason](https://github.com/TCeason))
+* Fix segfault in `JOIN ON` with enabled `enable_optimize_predicate_expression`. [#4794](https://github.com/yandex/ClickHouse/pull/4794) ([Winter Zhang](https://github.com/zhang2014))
+* Fix bug with adding an extraneous row after consuming a protobuf message from Kafka. [#4808](https://github.com/yandex/ClickHouse/pull/4808) ([Vitaly Baranov](https://github.com/vitlibar))
+* Fix crash of `JOIN` on a not-nullable vs. nullable column. Fix `NULLs` in right keys in `ANY JOIN` + `join_use_nulls`. [#4815](https://github.com/yandex/ClickHouse/pull/4815) ([Artem Zuikov](https://github.com/4ertus2))
+* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/yandex/ClickHouse/pull/4835) ([proller](https://github.com/proller))
+* Fixed race condition in `SELECT` from `system.tables` if the table is renamed or altered concurrently. [#4836](https://github.com/yandex/ClickHouse/pull/4836) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fixed data race when fetching data part that is already obsolete. [#4839](https://github.com/yandex/ClickHouse/pull/4839) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fixed rare data race that can happen during `RENAME` table of MergeTree family. [#4844](https://github.com/yandex/ClickHouse/pull/4844) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fixed segmentation fault in function `arrayIntersect`. Segmentation fault could happen if function was called with mixed constant and ordinary arguments. [#4847](https://github.com/yandex/ClickHouse/pull/4847) ([Lixiang Qian](https://github.com/fancyqlx))
+* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/yandex/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
+* Fix crash in `FULL/RIGHT JOIN` when joining on nullable vs. not-nullable columns. [#4855](https://github.com/yandex/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2))
+* Fix `No message received` exception while fetching parts between replicas. [#4856](https://github.com/yandex/ClickHouse/pull/4856) ([alesapin](https://github.com/alesapin))
+* Fixed `arrayIntersect` function wrong result in case of several repeated values in single array. [#4871](https://github.com/yandex/ClickHouse/pull/4871) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
+* Fix a race condition during concurrent `ALTER COLUMN` queries that could lead to a server crash (fixes issue [#3421](https://github.com/yandex/ClickHouse/issues/3421)). [#4592](https://github.com/yandex/ClickHouse/pull/4592) ([Alex Zatelepin](https://github.com/ztlpn))
+* Fix incorrect result in `FULL/RIGHT JOIN` with const column. [#4723](https://github.com/yandex/ClickHouse/pull/4723) ([Artem Zuikov](https://github.com/4ertus2))
+* Fix duplicates in `GLOBAL JOIN` with asterisk. [#4705](https://github.com/yandex/ClickHouse/pull/4705) ([Artem Zuikov](https://github.com/4ertus2))
+* Fix parameter deduction in `ALTER MODIFY` of column `CODEC` when column type is not specified. [#4883](https://github.com/yandex/ClickHouse/pull/4883) ([alesapin](https://github.com/alesapin))
+* Functions `cutQueryStringAndFragment()` and `queryStringAndFragment()` now work correctly when the URL contains a fragment and no query. [#4894](https://github.com/yandex/ClickHouse/pull/4894) ([Vitaly Baranov](https://github.com/vitlibar))
+* Fix a rare bug when the setting `min_bytes_to_use_direct_io` is greater than zero, which occurs when a thread has to seek backward in a column file. [#4897](https://github.com/yandex/ClickHouse/pull/4897) ([alesapin](https://github.com/alesapin))
+* Fix wrong argument types for aggregate functions with `LowCardinality` arguments (fixes issue [#4919](https://github.com/yandex/ClickHouse/issues/4919)). [#4922](https://github.com/yandex/ClickHouse/pull/4922) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
+* Fix wrong name qualification in `GLOBAL JOIN`. [#4969](https://github.com/yandex/ClickHouse/pull/4969) ([Artem Zuikov](https://github.com/4ertus2))
+* Fixed `toISOWeek` function result for the year 1970. [#4988](https://github.com/yandex/ClickHouse/pull/4988) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix duplication of `DROP`, `TRUNCATE` and `OPTIMIZE` queries when executed `ON CLUSTER` for the `ReplicatedMergeTree*` table family. [#4991](https://github.com/yandex/ClickHouse/pull/4991) ([alesapin](https://github.com/alesapin))
+
+### Backward Incompatible Change
+
+* Renamed setting `insert_sample_with_metadata` to `input_format_defaults_for_omitted_fields`. [#4771](https://github.com/yandex/ClickHouse/pull/4771) ([Artem Zuikov](https://github.com/4ertus2))
+* Added setting `max_partitions_per_insert_block` (with value 100 by default). If an inserted block contains a larger number of partitions, an exception is thrown. Set it to 0 if you want to remove the limit (not recommended). [#4845](https://github.com/yandex/ClickHouse/pull/4845) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Multi-search functions were renamed (`multiPosition` to `multiSearchAllPositions`, `multiSearch` to `multiSearchAny`, `firstMatch` to `multiSearchFirstIndex`). [#4780](https://github.com/yandex/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1))
+
+### Performance Improvement
+
+* Optimize Volnitsky searcher by inlining, giving about 5-10% search improvement for queries with many needles or many similar bigrams. [#4862](https://github.com/yandex/ClickHouse/pull/4862) ([Danila Kutenin](https://github.com/danlark1))
+* Fix a performance issue when the setting `use_uncompressed_cache` is greater than zero, which appeared when all read data was contained in the cache. [#4913](https://github.com/yandex/ClickHouse/pull/4913) ([alesapin](https://github.com/alesapin))
+
+### Build/Testing/Packaging Improvement
+
+* Hardening debug build: more granular memory mappings and ASLR; add memory protection for mark cache and index. This allows finding more memory-stomping bugs in cases where ASan and MSan cannot. [#4632](https://github.com/yandex/ClickHouse/pull/4632) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Add support for the cmake variables `ENABLE_PROTOBUF`, `ENABLE_PARQUET` and `ENABLE_BROTLI`, which allow enabling/disabling those features (same as we can do for librdkafka, mysql, etc). [#4669](https://github.com/yandex/ClickHouse/pull/4669) ([Silviu Caragea](https://github.com/silviucpp))
+* Add the ability to print the process list and stacktraces of all threads if some queries hang after a test run. [#4675](https://github.com/yandex/ClickHouse/pull/4675) ([alesapin](https://github.com/alesapin))
+* Add retries on `Connection loss` error in `clickhouse-test`. [#4682](https://github.com/yandex/ClickHouse/pull/4682) ([alesapin](https://github.com/alesapin))
+* Add a FreeBSD build with Vagrant, and a build with thread sanitizer, to the packager script. [#4712](https://github.com/yandex/ClickHouse/pull/4712) [#4748](https://github.com/yandex/ClickHouse/pull/4748) ([alesapin](https://github.com/alesapin))
+* Now the user is asked for a password for the `'default'` user during installation. [#4725](https://github.com/yandex/ClickHouse/pull/4725) ([proller](https://github.com/proller))
+* Suppress warning in `rdkafka` library. [#4740](https://github.com/yandex/ClickHouse/pull/4740) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Allow building without SSL. [#4750](https://github.com/yandex/ClickHouse/pull/4750) ([proller](https://github.com/proller))
+* Add a way to launch the clickhouse-server image as a custom user. [#4753](https://github.com/yandex/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid))
+* Upgrade contrib boost to 1.69. [#4793](https://github.com/yandex/ClickHouse/pull/4793) ([proller](https://github.com/proller))
+* Disable usage of `mremap` when compiled with Thread Sanitizer. Surprisingly enough, TSan does not intercept `mremap` (though it does intercept `mmap` and `munmap`), which leads to false positives. Fixed TSan report in stateful tests. [#4859](https://github.com/yandex/ClickHouse/pull/4859) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Add a test checking the use of a format schema via the HTTP interface. [#4864](https://github.com/yandex/ClickHouse/pull/4864) ([Vitaly Baranov](https://github.com/vitlibar))
+
+## ClickHouse release 19.4.3.11, 2019-04-02
+
+### Bug Fixes
+
+* Fix crash in `FULL/RIGHT JOIN` when joining on nullable vs. not-nullable columns. [#4855](https://github.com/yandex/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2))
+* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/yandex/ClickHouse/pull/4835) ([proller](https://github.com/proller))
+
+### Build/Testing/Packaging Improvement
+
+* Add a way to launch the clickhouse-server image as a custom user. [#4753](https://github.com/yandex/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid))
+
## ClickHouse release 19.4.2.7, 2019-03-30
### Bug Fixes
@@ -13,11 +119,11 @@
### New Features
* Added full support for `Protobuf` format (input and output, nested data structures). [#4174](https://github.com/yandex/ClickHouse/pull/4174) [#4493](https://github.com/yandex/ClickHouse/pull/4493) ([Vitaly Baranov](https://github.com/vitlibar))
* Added bitmap functions with Roaring Bitmaps. [#4207](https://github.com/yandex/ClickHouse/pull/4207) ([Andy Yang](https://github.com/andyyzh)) [#4568](https://github.com/yandex/ClickHouse/pull/4568) ([Vitaly Baranov](https://github.com/vitlibar))
-* Parquet format support [#4448](https://github.com/yandex/ClickHouse/pull/4448) ([proller](https://github.com/proller))
+* Parquet format support. [#4448](https://github.com/yandex/ClickHouse/pull/4448) ([proller](https://github.com/proller))
* N-gram distance was added for fuzzy string comparison. It is similar to q-gram metrics in R language. [#4466](https://github.com/yandex/ClickHouse/pull/4466) ([Danila Kutenin](https://github.com/danlark1))
* Combine rules for graphite rollup from dedicated aggregation and retention patterns. [#4426](https://github.com/yandex/ClickHouse/pull/4426) ([Mikhail f. Shiryaev](https://github.com/Felixoid))
* Added `max_execution_speed` and `max_execution_speed_bytes` to limit resource usage. Added `min_execution_speed_bytes` setting to complement the `min_execution_speed`. [#4430](https://github.com/yandex/ClickHouse/pull/4430) ([Winter Zhang](https://github.com/zhang2014))
-* Implemented function `flatten` [#4555](https://github.com/yandex/ClickHouse/pull/4555) [#4409](https://github.com/yandex/ClickHouse/pull/4409) ([alexey-milovidov](https://github.com/alexey-milovidov), [kzon](https://github.com/kzon))
+* Implemented function `flatten`. [#4555](https://github.com/yandex/ClickHouse/pull/4555) [#4409](https://github.com/yandex/ClickHouse/pull/4409) ([alexey-milovidov](https://github.com/alexey-milovidov), [kzon](https://github.com/kzon))
* Added functions `arrayEnumerateDenseRanked` and `arrayEnumerateUniqRanked` (it's like `arrayEnumerateUniq` but allows to fine tune array depth to look inside multidimensional arrays). [#4475](https://github.com/yandex/ClickHouse/pull/4475) ([proller](https://github.com/proller)) [#4601](https://github.com/yandex/ClickHouse/pull/4601) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Multiple JOINS with some restrictions: no asterisks, no complex aliases in ON/WHERE/GROUP BY/... [#4462](https://github.com/yandex/ClickHouse/pull/4462) ([Artem Zuikov](https://github.com/4ertus2))
@@ -26,25 +132,25 @@
* Fixed bug in data skipping indices: order of granules after INSERT was incorrect. [#4407](https://github.com/yandex/ClickHouse/pull/4407) ([Nikita Vasilev](https://github.com/nikvas0))
* Fixed `set` index for `Nullable` and `LowCardinality` columns. Before it, `set` index with `Nullable` or `LowCardinality` column led to error `Data type must be deserialized with multiple streams` while selecting. [#4594](https://github.com/yandex/ClickHouse/pull/4594) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
* Correctly set update_time on full `executable` dictionary update. [#4551](https://github.com/yandex/ClickHouse/pull/4551) ([Tema Novikov](https://github.com/temoon))
-* Fix broken progress bar in 19.3 [#4627](https://github.com/yandex/ClickHouse/pull/4627) ([filimonov](https://github.com/filimonov))
+* Fix broken progress bar in 19.3. [#4627](https://github.com/yandex/ClickHouse/pull/4627) ([filimonov](https://github.com/filimonov))
* Fixed inconsistent values of MemoryTracker when memory region was shrinked, in certain cases. [#4619](https://github.com/yandex/ClickHouse/pull/4619) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Fixed undefined behaviour in ThreadPool [#4612](https://github.com/yandex/ClickHouse/pull/4612) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fixed undefined behaviour in ThreadPool. [#4612](https://github.com/yandex/ClickHouse/pull/4612) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Fixed a very rare crash with the message `mutex lock failed: Invalid argument` that could happen when a MergeTree table was dropped concurrently with a SELECT. [#4608](https://github.com/yandex/ClickHouse/pull/4608) ([Alex Zatelepin](https://github.com/ztlpn))
-* ODBC driver compatibility with `LowCardinality` data type [#4381](https://github.com/yandex/ClickHouse/pull/4381) ([proller](https://github.com/proller))
-* FreeBSD: Fixup for `AIOcontextPool: Found io_event with unknown id 0` error [#4438](https://github.com/yandex/ClickHouse/pull/4438) ([urgordeadbeef](https://github.com/urgordeadbeef))
+* ODBC driver compatibility with `LowCardinality` data type. [#4381](https://github.com/yandex/ClickHouse/pull/4381) ([proller](https://github.com/proller))
+* FreeBSD: Fixup for `AIOcontextPool: Found io_event with unknown id 0` error. [#4438](https://github.com/yandex/ClickHouse/pull/4438) ([urgordeadbeef](https://github.com/urgordeadbeef))
* `system.part_log` table was created regardless to configuration. [#4483](https://github.com/yandex/ClickHouse/pull/4483) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Fix undefined behaviour in `dictIsIn` function for cache dictionaries. [#4515](https://github.com/yandex/ClickHouse/pull/4515) ([alesapin](https://github.com/alesapin))
* Fixed a deadlock when a SELECT query locks the same table multiple times (e.g. from different threads or when executing multiple subqueries) and there is a concurrent DDL query. [#4535](https://github.com/yandex/ClickHouse/pull/4535) ([Alex Zatelepin](https://github.com/ztlpn))
* Disable compile_expressions by default until we get own `llvm` contrib and can test it with `clang` and `asan`. [#4579](https://github.com/yandex/ClickHouse/pull/4579) ([alesapin](https://github.com/alesapin))
* Prevent `std::terminate` when `invalidate_query` for `clickhouse` external dictionary source has returned wrong resultset (empty or more than one row or more than one column). Fixed issue when the `invalidate_query` was performed every five seconds regardless to the `lifetime`. [#4583](https://github.com/yandex/ClickHouse/pull/4583) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Avoid deadlock when the `invalidate_query` for a dictionary with `clickhouse` source was involving `system.dictionaries` table or `Dictionaries` database (rare case). [#4599](https://github.com/yandex/ClickHouse/pull/4599) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Fixes for CROSS JOIN with empty WHERE [#4598](https://github.com/yandex/ClickHouse/pull/4598) ([Artem Zuikov](https://github.com/4ertus2))
+* Fixes for CROSS JOIN with empty WHERE. [#4598](https://github.com/yandex/ClickHouse/pull/4598) ([Artem Zuikov](https://github.com/4ertus2))
* Fixed segfault in function "replicate" when constant argument is passed. [#4603](https://github.com/yandex/ClickHouse/pull/4603) ([alexey-milovidov](https://github.com/alexey-milovidov))
* Fix lambda function with predicate optimizer. [#4408](https://github.com/yandex/ClickHouse/pull/4408) ([Winter Zhang](https://github.com/zhang2014))
* Multiple JOINs multiple fixes. [#4595](https://github.com/yandex/ClickHouse/pull/4595) ([Artem Zuikov](https://github.com/4ertus2))
### Improvements
-* Support aliases in JOIN ON section for right table columns [#4412](https://github.com/yandex/ClickHouse/pull/4412) ([Artem Zuikov](https://github.com/4ertus2))
+* Support aliases in JOIN ON section for right table columns. [#4412](https://github.com/yandex/ClickHouse/pull/4412) ([Artem Zuikov](https://github.com/4ertus2))
* Result of multiple JOINs need correct result names to be used in subselects. Replace flat aliases with source names in result. [#4474](https://github.com/yandex/ClickHouse/pull/4474) ([Artem Zuikov](https://github.com/4ertus2))
* Improve push-down logic for joined statements. [#4387](https://github.com/yandex/ClickHouse/pull/4387) ([Ivan](https://github.com/abyss7))
@@ -67,6 +173,18 @@
* Fix compilation on Mac. [#4371](https://github.com/yandex/ClickHouse/pull/4371) ([Vitaly Baranov](https://github.com/vitlibar))
* Build fixes for FreeBSD and various unusual build configurations. [#4444](https://github.com/yandex/ClickHouse/pull/4444) ([proller](https://github.com/proller))
+## ClickHouse release 19.3.9.1, 2019-04-02
+
+### Bug Fixes
+
+* Fix crash in `FULL/RIGHT JOIN` when joining on nullable vs. not-nullable columns. [#4855](https://github.com/yandex/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2))
+* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/yandex/ClickHouse/pull/4835) ([proller](https://github.com/proller))
+* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/yandex/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
+
+### Build/Testing/Packaging Improvement
+
+* Add a way to launch the clickhouse-server image as a custom user. [#4753](https://github.com/yandex/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid))
+
## ClickHouse release 19.3.7, 2019-03-12
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c4802295a7..45ee1dfbb41 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -317,6 +317,7 @@ include (cmake/find_hdfs3.cmake) # uses protobuf
include (cmake/find_consistent-hashing.cmake)
include (cmake/find_base64.cmake)
include (cmake/find_hyperscan.cmake)
+include (cmake/find_lfalloc.cmake)
find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
find_contrib_lib(metrohash)
diff --git a/README.md b/README.md
index 0e9974f763f..02a50be007b 100644
--- a/README.md
+++ b/README.md
@@ -10,3 +10,10 @@ ClickHouse is an open-source column-oriented database management system that all
* [Blog](https://clickhouse.yandex/blog/en/) contains various ClickHouse-related articles, as well as announces and reports about events.
* [Contacts](https://clickhouse.yandex/#contacts) can help to get your questions answered if there are any.
* You can also [fill this form](https://forms.yandex.com/surveys/meet-yandex-clickhouse-team/) to meet Yandex ClickHouse team in person.
+
+## Upcoming Events
+* [ClickHouse Community Meetup in Limassol](https://www.facebook.com/events/386638262181785/) on May 7.
+* ClickHouse at [Percona Live 2019](https://www.percona.com/live/19/other-open-source-databases-track) in Austin on May 28-30.
+* [ClickHouse Community Meetup in Beijing](https://www.huodongxing.com/event/2483759276200) on June 8.
+* [ClickHouse Community Meetup in Shenzhen](https://www.huodongxing.com/event/3483759917300) on October 20.
+* [ClickHouse Community Meetup in Shanghai](https://www.huodongxing.com/event/4483760336000) on October 27.
diff --git a/cmake/find_lfalloc.cmake b/cmake/find_lfalloc.cmake
new file mode 100644
index 00000000000..c9b2ce5d436
--- /dev/null
+++ b/cmake/find_lfalloc.cmake
@@ -0,0 +1,10 @@
+if (NOT SANITIZE AND NOT ARCH_ARM AND NOT ARCH_32 AND NOT ARCH_PPC64LE AND NOT OS_FREEBSD)
+ option (ENABLE_LFALLOC "Set to FALSE to disable LFAlloc" ${NOT_UNBUNDLED})
+endif ()
+
+if (ENABLE_LFALLOC)
+ set (USE_LFALLOC 1)
+ set (USE_LFALLOC_RANDOM_HINT 1)
+ set (LFALLOC_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/lfalloc/src)
+ message (STATUS "Using lfalloc=${USE_LFALLOC}: ${LFALLOC_INCLUDE_DIR}")
+endif ()
diff --git a/cmake/find_poco.cmake b/cmake/find_poco.cmake
index 012f269d48d..07d81c1bbe8 100644
--- a/cmake/find_poco.cmake
+++ b/cmake/find_poco.cmake
@@ -36,6 +36,8 @@ elseif (NOT MISSING_INTERNAL_POCO_LIBRARY)
set (ENABLE_DATA_SQLITE 0 CACHE BOOL "")
set (ENABLE_DATA_MYSQL 0 CACHE BOOL "")
set (ENABLE_DATA_POSTGRESQL 0 CACHE BOOL "")
+ set (ENABLE_ENCODINGS 0 CACHE BOOL "")
+
# new after 2.0.0:
set (POCO_ENABLE_ZIP 0 CACHE BOOL "")
set (POCO_ENABLE_PAGECOMPILER 0 CACHE BOOL "")
diff --git a/contrib/lfalloc/src/lf_allocX64.h b/contrib/lfalloc/src/lf_allocX64.h
new file mode 100644
index 00000000000..2c4cf3f1021
--- /dev/null
+++ b/contrib/lfalloc/src/lf_allocX64.h
@@ -0,0 +1,1813 @@
+#pragma once
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "lfmalloc.h"
+
+#include "util/system/compiler.h"
+#include "util/system/types.h"
+#include <random>
+
+#ifdef _MSC_VER
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+#ifdef _M_X64
+#define _64_
+#endif
+#include <intrin.h>
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#pragma intrinsic(_InterlockedCompareExchange)
+#pragma intrinsic(_InterlockedExchangeAdd)
+
+#include <new>
+#include <assert.h>
+#include <errno.h>
+
+#define PERTHREAD __declspec(thread)
+#define _win_
+#define Y_FORCE_INLINE __forceinline
+
+using TAtomic = volatile long;
+
+static inline long AtomicAdd(TAtomic& a, long b) {
+ return _InterlockedExchangeAdd(&a, b) + b;
+}
+
+static inline long AtomicSub(TAtomic& a, long b) {
+ return AtomicAdd(a, -b);
+}
+
+#define Y_ASSERT_NOBT(x) ((void)0)
+
+#else
+
+#include "util/system/defaults.h"
+#include "util/system/atomic.h"
+#include <pthread.h>
+
+#if !defined(NDEBUG) && !defined(__GCCXML__)
+#define Y_ASSERT_NOBT(a) \
+ do { \
+ if (Y_UNLIKELY(!(a))) { \
+ assert(false && (a)); \
+ } \
+ } while (0)
+#else
+#define Y_ASSERT_NOBT(a) \
+    do { \
+        if (false) { \
+            bool __xxx = static_cast<bool>(a); \
+            Y_UNUSED(__xxx); \
+        } \
+    } while (0)
+#endif
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+#if defined(_linux_)
+#if !defined(MADV_HUGEPAGE)
+#define MADV_HUGEPAGE 14
+#endif
+#if !defined(MAP_HUGETLB)
+#define MAP_HUGETLB 0x40000
+#endif
+#endif
+
+#define PERTHREAD __thread
+
+#endif
+
+#ifndef _darwin_
+
+#ifndef Y_ARRAY_SIZE
+#define Y_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+#endif
+
+#ifndef NDEBUG
+#define DBG_FILL_MEMORY
+static bool FillMemoryOnAllocation = true;
+#endif
+
+static bool TransparentHugePages = false; // force MADV_HUGEPAGE for large allocs
+static bool MapHugeTLB = false; // force MAP_HUGETLB for small allocs
+static bool EnableDefrag = true;
+
+// Buffers that are larger than this size will not be filled with 0xcf
+#ifndef DBG_FILL_MAX_SIZE
+#define DBG_FILL_MAX_SIZE 0x01000000000000ULL
+#endif
+
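+// DoCas: single compare-and-swap on a pointer. Atomically sets *target to exchange
+// if *target equals compare, and returns the value that was actually observed, so
+// the swap succeeded iff the returned pointer equals compare.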
+template <class T>
+inline T* DoCas(T* volatile* target, T* exchange, T* compare) {
+#if defined(_linux_)
+ return __sync_val_compare_and_swap(target, compare, exchange);
+#elif defined(_WIN32)
+#ifdef _64_
+ return (T*)_InterlockedCompareExchange64((__int64*)target, (__int64)exchange, (__int64)compare);
+#else
+ //return (T*)InterlockedCompareExchangePointer(targetVoidP, exchange, compare);
+ return (T*)_InterlockedCompareExchange((LONG*)target, (LONG)exchange, (LONG)compare);
+#endif
+#elif defined(__i386) || defined(__x86_64__)
+ union {
+ T* volatile* NP;
+ void* volatile* VoidP;
+ } gccSucks;
+ gccSucks.NP = target;
+ void* volatile* targetVoidP = gccSucks.VoidP;
+
+ __asm__ __volatile__(
+ "lock\n\t"
+ "cmpxchg %2,%0\n\t"
+ : "+m"(*(targetVoidP)), "+a"(compare)
+ : "r"(exchange)
+ : "cc", "memory");
+ return compare;
+#else
+#error inline_cas not defined for this platform
+#endif
+}
+
+#ifdef _64_
+const uintptr_t N_MAX_WORKSET_SIZE = 0x100000000ll * 200;
+const uintptr_t N_HUGE_AREA_FINISH = 0x700000000000ll;
+#ifndef _freebsd_
+const uintptr_t LINUX_MMAP_AREA_START = 0x100000000ll;
+static uintptr_t volatile linuxAllocPointer = LINUX_MMAP_AREA_START;
+static uintptr_t volatile linuxAllocPointerHuge = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE;
+#endif
+#else
+const uintptr_t N_MAX_WORKSET_SIZE = 0xffffffff;
+#endif
+#define ALLOC_START ((char*)0)
+
+const size_t N_CHUNK_SIZE = 1024 * 1024;
+const size_t N_CHUNKS = N_MAX_WORKSET_SIZE / N_CHUNK_SIZE;
+const size_t N_LARGE_ALLOC_SIZE = N_CHUNK_SIZE * 128;
+
+// map size idx to size in bytes
+#ifdef LFALLOC_YT
+const int N_SIZES = 27;
+#else
+const int N_SIZES = 25;
+#endif
+const int nSizeIdxToSize[N_SIZES] = {
+ -1,
+#if defined(_64_)
+ 16, 16, 32, 32, 48, 64, 96, 128,
+#else
+ 8,
+ 16,
+ 24,
+ 32,
+ 48,
+ 64,
+ 96,
+ 128,
+#endif
+ 192, 256, 384, 512, 768, 1024, 1536, 2048,
+ 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768,
+#ifdef LFALLOC_YT
+ 49152, 65536
+#endif
+};
+#ifdef LFALLOC_YT
+const size_t N_MAX_FAST_SIZE = 65536;
+#else
+const size_t N_MAX_FAST_SIZE = 32768;
+#endif
+const unsigned char size2idxArr1[64 + 1] = {
+ 1,
+#if defined(_64_)
+ 2, 2, 4, 4, // 16, 16, 32, 32
+#else
+ 1, 2, 3, 4, // 8, 16, 24, 32
+#endif
+ 5, 5, 6, 6, // 48, 64
+ 7, 7, 7, 7, 8, 8, 8, 8, // 96, 128
+ 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, // 192, 256
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // 384
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12 // 512
+};
+#ifdef LFALLOC_YT
+const unsigned char size2idxArr2[256] = {
+#else
+const unsigned char size2idxArr2[128] = {
+#endif
+ 12, 12, 13, 14, // 512, 512, 768, 1024
+ 15, 15, 16, 16, // 1536, 2048
+ 17, 17, 17, 17, 18, 18, 18, 18, // 3072, 4096
+ 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, // 6144, 8192
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, // 12288
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, // 16384
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, // 24576
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, // 32768
+#ifdef LFALLOC_YT
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, // 49152
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, // 65536
+#endif
+};
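+
+// Illustrative sketch (not compiled): how the two tables above are typically consulted
+// to map a requested byte size to a size-class index. The real lookup lives further
+// down this file; the function name here is hypothetical.
+#if 0
+static int ExampleSizeToIdx(size_t sz) {
+    if (sz <= 512)
+        return size2idxArr1[(sz + 7) >> 3]; // one entry per 8 bytes, rounded up
+    if (sz <= N_MAX_FAST_SIZE)
+        return size2idxArr2[(sz - 1) >> 8]; // one entry per 256 bytes
+    return -1; // larger requests bypass the size classes (see LargeBlockAlloc below)
+}
+// e.g. sz = 100 -> size2idxArr1[13] = 8 -> nSizeIdxToSize[8] = 128-byte blocks
+#endif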
+
+// map entry number to size idx
+// special size idx's: 0 = not used, -1 = mem locked, but not allocated
+static volatile char chunkSizeIdx[N_CHUNKS];
+const int FREE_CHUNK_ARR_BUF = 0x20000; // this is effectively 128G of free memory (with 1M chunks), should not be exhausted actually
+static volatile uintptr_t freeChunkArr[FREE_CHUNK_ARR_BUF];
+static volatile int freeChunkCount;
+
+static void AddFreeChunk(uintptr_t chunkId) {
+    chunkSizeIdx[chunkId] = -1;
+    if (Y_UNLIKELY(freeChunkCount == FREE_CHUNK_ARR_BUF))
+        NMalloc::AbortFromCorruptedAllocator(); // free chunk array overflowed
+    freeChunkArr[freeChunkCount++] = chunkId;
+}
+
+static bool GetFreeChunk(uintptr_t* res) {
+ if (freeChunkCount == 0) {
+ *res = 0;
+ return false;
+ }
+ *res = freeChunkArr[--freeChunkCount];
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+enum ELFAllocCounter {
+ CT_USER_ALLOC, // accumulated size requested by user code
+ CT_MMAP, // accumulated mmapped size
+ CT_MMAP_CNT, // number of mmapped regions
+ CT_MUNMAP, // accumulated unmmapped size
+ CT_MUNMAP_CNT, // number of munmaped regions
+ CT_SYSTEM_ALLOC, // accumulated allocated size for internal lfalloc needs
+ CT_SYSTEM_FREE, // accumulated deallocated size for internal lfalloc needs
+ CT_SMALL_ALLOC, // accumulated allocated size for fixed-size blocks
+ CT_SMALL_FREE, // accumulated deallocated size for fixed-size blocks
+ CT_LARGE_ALLOC, // accumulated allocated size for large blocks
+ CT_LARGE_FREE, // accumulated deallocated size for large blocks
+ CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations
+ CT_DEGRAGMENT_CNT, // number of memory defragmentations
+ CT_MAX
+};
+
+static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value);
+
+//////////////////////////////////////////////////////////////////////////
+enum EMMapMode {
+ MM_NORMAL, // memory for small allocs
+ MM_HUGE // memory for large allocs
+};
+
+#ifndef _MSC_VER
+inline void VerifyMmapResult(void* result) {
+ if (Y_UNLIKELY(result == MAP_FAILED))
+ NMalloc::AbortFromCorruptedAllocator(); // negative size requested? or just out of mem
+}
+#endif
+
+#if !defined(_MSC_VER) && !defined(_freebsd_) && defined(_64_)
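+// Carves allocations out of two fixed virtual-address ranges by CAS-bumping a shared
+// pointer: MM_NORMAL bumps within [LINUX_MMAP_AREA_START, N_MAX_WORKSET_SIZE) and
+// MM_HUGE within [LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE, N_HUGE_AREA_FINISH),
+// both bounds treated as absolute addresses. On reaching the end the pointer wraps
+// to the area start once; wrapping a second time means the address space is exhausted.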
+static char* AllocWithMMapLinuxImpl(uintptr_t sz, EMMapMode mode) {
+ char* volatile* areaPtr;
+ char* areaStart;
+ uintptr_t areaFinish;
+
+ int mapProt = PROT_READ | PROT_WRITE;
+ int mapFlags = MAP_PRIVATE | MAP_ANON;
+
+ if (mode == MM_HUGE) {
+        areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointerHuge);
+        areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE);
+ areaFinish = N_HUGE_AREA_FINISH;
+ } else {
+        areaPtr = reinterpret_cast<char* volatile*>(&linuxAllocPointer);
+        areaStart = reinterpret_cast<char*>(LINUX_MMAP_AREA_START);
+ areaFinish = N_MAX_WORKSET_SIZE;
+
+ if (MapHugeTLB) {
+ mapFlags |= MAP_HUGETLB;
+ }
+ }
+
+ bool wrapped = false;
+ for (;;) {
+ char* prevAllocPtr = *areaPtr;
+ char* nextAllocPtr = prevAllocPtr + sz;
+ if (uintptr_t(nextAllocPtr - (char*)nullptr) >= areaFinish) {
+ if (Y_UNLIKELY(wrapped)) {
+ // virtual memory is over fragmented
+ NMalloc::AbortFromCorruptedAllocator();
+ }
+ // wrap after all area is used
+ DoCas(areaPtr, areaStart, prevAllocPtr);
+ wrapped = true;
+ continue;
+ }
+
+ if (DoCas(areaPtr, nextAllocPtr, prevAllocPtr) != prevAllocPtr)
+ continue;
+
+ char* largeBlock = (char*)mmap(prevAllocPtr, sz, mapProt, mapFlags, -1, 0);
+ VerifyMmapResult(largeBlock);
+ if (largeBlock == prevAllocPtr)
+ return largeBlock;
+ if (largeBlock)
+ munmap(largeBlock, sz);
+
+ if (sz < 0x80000) {
+ // skip utilized area with big steps
+ DoCas(areaPtr, nextAllocPtr + 0x10 * 0x10000, nextAllocPtr);
+ }
+ }
+}
+#endif
+
+static char* AllocWithMMap(uintptr_t sz, EMMapMode mode) {
+ (void)mode;
+#ifdef _MSC_VER
+ char* largeBlock = (char*)VirtualAlloc(0, sz, MEM_RESERVE, PAGE_READWRITE);
+ if (Y_UNLIKELY(largeBlock == nullptr))
+ NMalloc::AbortFromCorruptedAllocator(); // out of memory
+ if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE))
+ NMalloc::AbortFromCorruptedAllocator(); // out of working set, something has broken
+#else
+#if defined(_freebsd_) || !defined(_64_) || defined(USE_LFALLOC_RANDOM_HINT)
+ uintptr_t areaStart;
+ uintptr_t areaFinish;
+ if (mode == MM_HUGE) {
+ areaStart = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE;
+ areaFinish = N_HUGE_AREA_FINISH;
+ } else {
+ areaStart = LINUX_MMAP_AREA_START;
+ areaFinish = N_MAX_WORKSET_SIZE;
+ }
+#if defined(USE_LFALLOC_RANDOM_HINT)
+ static thread_local std::mt19937_64 generator(std::random_device{}());
+    std::uniform_int_distribution<uintptr_t> distr(areaStart, areaFinish / 2);
+    char* largeBlock = (char*)mmap(reinterpret_cast<void*>(distr(generator)), sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+#else
+ char* largeBlock = (char*)mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+#endif
+ VerifyMmapResult(largeBlock);
+ if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= areaFinish))
+ NMalloc::AbortFromCorruptedAllocator(); // out of working set, something has broken
+#else
+ char* largeBlock = AllocWithMMapLinuxImpl(sz, mode);
+ if (TransparentHugePages) {
+ madvise(largeBlock, sz, MADV_HUGEPAGE);
+ }
+#endif
+#endif
+ Y_ASSERT_NOBT(largeBlock);
+ IncrementCounter(CT_MMAP, sz);
+ IncrementCounter(CT_MMAP_CNT, 1);
+ return largeBlock;
+}
+
+enum class ELarge : ui8 {
+ Free = 0, // block in free cache
+ Alloc = 1, // block is allocated
+ Gone = 2, // block was unmapped
+};
+
+struct TLargeBlk {
+
+ static TLargeBlk* As(void *raw) {
+        return reinterpret_cast<TLargeBlk*>((char*)raw - 4096ll);
+ }
+
+ static const TLargeBlk* As(const void *raw) {
+        return reinterpret_cast<const TLargeBlk*>((const char*)raw - 4096ll);
+ }
+
+ void SetSize(size_t bytes, size_t pages) {
+ Pages = pages;
+ Bytes = bytes;
+ }
+
+ void Mark(ELarge state) {
+ const ui64 marks[] = {
+ 0x8b38aa5ca4953c98, // ELarge::Free
+ 0xf916d33584eb5087, // ELarge::Alloc
+ 0xd33b0eca7651bc3f // ELarge::Gone
+ };
+
+ Token = size_t(marks[ui8(state)]);
+ }
+
+ size_t Pages; // Total pages allocated with mmap like call
+ size_t Bytes; // Actually requested bytes by user
+ size_t Token; // Block state token, see ELarge enum.
+};
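+
+// Illustrative sketch (not compiled): the layout implied by TLargeBlk::As(). A large
+// allocation of `pages` data pages maps pages + 1 pages; the extra leading page holds
+// the TLargeBlk header, and the user pointer starts 4096 bytes into the mapping:
+//
+//   [ TLargeBlk header, padded to 4096 | user data: pages * 4096 bytes ]
+//   ^ p - 4096                           ^ p (returned to the caller)
+#if 0
+size_t bytes = 10000, pages = (bytes + 4095) / 4096;      // illustrative request
+char* raw = AllocWithMMap((pages + 1) * 4096ll, MM_HUGE);  // as LargeBlockAlloc does below
+char* p = raw + 4096ll;                                    // pointer handed to the user
+TLargeBlk::As(p)->SetSize(bytes, pages);                   // header sits in the lead page
+TLargeBlk::As(p)->Mark(ELarge::Alloc);
+#endif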
+
+
+static void LargeBlockUnmap(void* p, size_t pages) {
+ const auto bytes = (pages + 1) * uintptr_t(4096);
+
+ IncrementCounter(CT_MUNMAP, bytes);
+ IncrementCounter(CT_MUNMAP_CNT, 1);
+#ifdef _MSC_VER
+ Y_ASSERT_NOBT(0);
+#else
+ TLargeBlk::As(p)->Mark(ELarge::Gone);
+ munmap((char*)p - 4096ll, bytes);
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+const size_t LB_BUF_SIZE = 250;
+const size_t LB_BUF_HASH = 977;
+static int LB_LIMIT_TOTAL_SIZE = 500 * 1024 * 1024 / 4096; // do not keep more than this much memory total in lbFreePtrs[]
+static void* volatile lbFreePtrs[LB_BUF_HASH][LB_BUF_SIZE];
+static TAtomic lbFreePageCount;
+
+
+static void* LargeBlockAlloc(size_t _nSize, ELFAllocCounter counter) {
+ size_t pgCount = (_nSize + 4095) / 4096;
+#ifdef _MSC_VER
+ char* pRes = (char*)VirtualAlloc(0, (pgCount + 1) * 4096ll, MEM_COMMIT, PAGE_READWRITE);
+ if (Y_UNLIKELY(pRes == 0)) {
+ NMalloc::AbortFromCorruptedAllocator(); // out of memory
+ }
+#else
+
+ IncrementCounter(counter, pgCount * 4096ll);
+ IncrementCounter(CT_SYSTEM_ALLOC, 4096ll);
+
+ int lbHash = pgCount % LB_BUF_HASH;
+ for (int i = 0; i < LB_BUF_SIZE; ++i) {
+ void* p = lbFreePtrs[lbHash][i];
+ if (p == nullptr)
+ continue;
+ if (DoCas(&lbFreePtrs[lbHash][i], (void*)nullptr, p) == p) {
+ size_t realPageCount = TLargeBlk::As(p)->Pages;
+ if (realPageCount == pgCount) {
+ AtomicAdd(lbFreePageCount, -pgCount);
+ TLargeBlk::As(p)->Mark(ELarge::Alloc);
+ return p;
+ } else {
+ if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) != (void*)nullptr) {
+ // block was freed while we were busy
+ AtomicAdd(lbFreePageCount, -realPageCount);
+ LargeBlockUnmap(p, realPageCount);
+ --i;
+ }
+ }
+ }
+ }
+ char* pRes = AllocWithMMap((pgCount + 1) * 4096ll, MM_HUGE);
+#endif
+ pRes += 4096ll;
+ TLargeBlk::As(pRes)->SetSize(_nSize, pgCount);
+ TLargeBlk::As(pRes)->Mark(ELarge::Alloc);
+
+ return pRes;
+}
+
+#ifndef _MSC_VER
+static void FreeAllLargeBlockMem() {
+ for (auto& lbFreePtr : lbFreePtrs) {
+ for (int i = 0; i < LB_BUF_SIZE; ++i) {
+ void* p = lbFreePtr[i];
+ if (p == nullptr)
+ continue;
+ if (DoCas(&lbFreePtr[i], (void*)nullptr, p) == p) {
+ int pgCount = TLargeBlk::As(p)->Pages;
+ AtomicAdd(lbFreePageCount, -pgCount);
+ LargeBlockUnmap(p, pgCount);
+ }
+ }
+ }
+}
+#endif
+
+static void LargeBlockFree(void* p, ELFAllocCounter counter) {
+ if (p == nullptr)
+ return;
+#ifdef _MSC_VER
+ VirtualFree((char*)p - 4096ll, 0, MEM_RELEASE);
+#else
+ size_t pgCount = TLargeBlk::As(p)->Pages;
+
+ TLargeBlk::As(p)->Mark(ELarge::Free);
+ IncrementCounter(counter, pgCount * 4096ll);
+ IncrementCounter(CT_SYSTEM_FREE, 4096ll);
+
+ if (lbFreePageCount > LB_LIMIT_TOTAL_SIZE)
+ FreeAllLargeBlockMem();
+ int lbHash = pgCount % LB_BUF_HASH;
+ for (int i = 0; i < LB_BUF_SIZE; ++i) {
+ if (lbFreePtrs[lbHash][i] == nullptr) {
+ if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) == nullptr) {
+ AtomicAdd(lbFreePageCount, pgCount);
+ return;
+ }
+ }
+ }
+
+ LargeBlockUnmap(p, pgCount);
+#endif
+}
+
+static void* SystemAlloc(size_t _nSize) {
+ //HeapAlloc(GetProcessHeap(), HEAP_GENERATE_EXCEPTIONS, _nSize);
+ return LargeBlockAlloc(_nSize, CT_SYSTEM_ALLOC);
+}
+static void SystemFree(void* p) {
+ //HeapFree(GetProcessHeap(), 0, p);
+ LargeBlockFree(p, CT_SYSTEM_FREE);
+}
+
+//////////////////////////////////////////////////////////////////////////
+static int* volatile nLock = nullptr;
+static int nLockVar;
+inline void RealEnterCriticalDefault(int* volatile* lockPtr) {
+ while (DoCas(lockPtr, &nLockVar, (int*)nullptr) != nullptr)
+ ; //pthread_yield();
+}
+inline void RealLeaveCriticalDefault(int* volatile* lockPtr) {
+ *lockPtr = nullptr;
+}
+static void (*RealEnterCritical)(int* volatile* lockPtr) = RealEnterCriticalDefault;
+static void (*RealLeaveCritical)(int* volatile* lockPtr) = RealLeaveCriticalDefault;
+static void (*BeforeLFAllocGlobalLockAcquired)() = nullptr;
+static void (*AfterLFAllocGlobalLockReleased)() = nullptr;
+class CCriticalSectionLockMMgr {
+public:
+ CCriticalSectionLockMMgr() {
+ if (BeforeLFAllocGlobalLockAcquired) {
+ BeforeLFAllocGlobalLockAcquired();
+ }
+ RealEnterCritical(&nLock);
+ }
+ ~CCriticalSectionLockMMgr() {
+ RealLeaveCritical(&nLock);
+ if (AfterLFAllocGlobalLockReleased) {
+ AfterLFAllocGlobalLockReleased();
+ }
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+class TLFAllocFreeList {
+ struct TNode {
+ TNode* Next;
+ };
+
+ TNode* volatile Head;
+ TNode* volatile Pending;
+ TAtomic PendingToFreeListCounter;
+ TAtomic AllocCount;
+
+ static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) {
+ for (;;) {
+ TNode* volatile prevHead = *headPtr;
+ n->Next = prevHead;
+ if (DoCas(headPtr, n, prevHead) == prevHead)
+ break;
+ }
+ }
+ Y_FORCE_INLINE void* DoAlloc() {
+ TNode* res;
+ for (res = Head; res; res = Head) {
+ TNode* keepNext = res->Next;
+ if (DoCas(&Head, keepNext, res) == res) {
+ //Y_VERIFY(keepNext == res->Next);
+ break;
+ }
+ }
+ return res;
+ }
+ void FreeList(TNode* fl) {
+ if (!fl)
+ return;
+ TNode* flTail = fl;
+ while (flTail->Next)
+ flTail = flTail->Next;
+ for (;;) {
+ TNode* volatile prevHead = Head;
+ flTail->Next = prevHead;
+ if (DoCas(&Head, fl, prevHead) == prevHead)
+ break;
+ }
+ }
+
+public:
+ Y_FORCE_INLINE void Free(void* ptr) {
+ TNode* newFree = (TNode*)ptr;
+ if (AtomicAdd(AllocCount, 0) == 0)
+ Enqueue(&Head, newFree);
+ else
+ Enqueue(&Pending, newFree);
+ }
+ Y_FORCE_INLINE void* Alloc() {
+ TAtomic keepCounter = AtomicAdd(PendingToFreeListCounter, 0);
+ TNode* fl = Pending;
+ if (AtomicAdd(AllocCount, 1) == 1) {
+ // No other allocs in progress.
+ // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads.
+ // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList
+ if (fl && keepCounter == AtomicAdd(PendingToFreeListCounter, 0) && DoCas(&Pending, (TNode*)nullptr, fl) == fl) {
+ // pick first element from Pending and return it
+ void* res = fl;
+ fl = fl->Next;
+ // if there are other elements in Pending list, add them to main free list
+ FreeList(fl);
+ AtomicAdd(PendingToFreeListCounter, 1);
+ AtomicAdd(AllocCount, -1);
+ return res;
+ }
+ }
+ void* res = DoAlloc();
+ AtomicAdd(AllocCount, -1);
+ return res;
+ }
+ void* GetWholeList() {
+ TNode* res;
+ for (res = Head; res; res = Head) {
+ if (DoCas(&Head, (TNode*)nullptr, res) == res)
+ break;
+ }
+ return res;
+ }
+ void ReturnWholeList(void* ptr) {
+ while (AtomicAdd(AllocCount, 0) != 0) // theoretically can run into problems with parallel DoAlloc()
+ ; //ThreadYield();
+ for (;;) {
+ TNode* prevHead = Head;
+ if (DoCas(&Head, (TNode*)ptr, prevHead) == prevHead) {
+ FreeList(prevHead);
+ break;
+ }
+ }
+ }
+};
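+
+// Illustrative sketch (not compiled) of the contract above: Free() never blocks, and
+// Alloc() may return nullptr when the cache is empty. Freed nodes go to Pending while
+// any Alloc() is in flight, so a concurrent DoAlloc() never sees a node re-enter Head
+// under its feet (the PendingToFreeListCounter check guards the hand-over).
+#if 0
+static TLFAllocFreeList exampleList; // hypothetical instance
+void ExampleUse(void* block) {
+    exampleList.Free(block);         // enqueue; goes to Pending if an Alloc is in flight
+    void* p = exampleList.Alloc();   // may first drain Pending into Head
+    if (p == nullptr) {
+        // cache empty: the caller falls back to carving fresh blocks from a chunk
+    }
+}
+#endif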
+
+/////////////////////////////////////////////////////////////////////////
+static TLFAllocFreeList globalFreeLists[N_SIZES];
+static char* volatile globalCurrentPtr[N_SIZES];
+static TLFAllocFreeList blockFreeList;
+
+// globalFreeLists[] contains TFreeListGroup nodes; each group holds pointers to up to 15 free blocks
+const int FL_GROUP_SIZE = 15;
+struct TFreeListGroup {
+ TFreeListGroup* Next;
+ char* Ptrs[FL_GROUP_SIZE];
+};
+#ifdef _64_
+const int FREE_LIST_GROUP_SIZEIDX = 8;
+#else
+const int FREE_LIST_GROUP_SIZEIDX = 6;
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+// find free chunks and reset chunk size so they can be reused by different sized allocations
+// do not look at blockFreeList (TFreeListGroup has same size for any allocations)
+static bool DefragmentMem() {
+ if (!EnableDefrag) {
+ return false;
+ }
+
+ IncrementCounter(CT_DEGRAGMENT_CNT, 1);
+
+ int* nFreeCount = (int*)SystemAlloc(N_CHUNKS * sizeof(int));
+ if (Y_UNLIKELY(!nFreeCount)) {
+ //__debugbreak();
+ NMalloc::AbortFromCorruptedAllocator();
+ }
+ memset(nFreeCount, 0, N_CHUNKS * sizeof(int));
+
+ TFreeListGroup* wholeLists[N_SIZES];
+ for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) {
+ wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList();
+ for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) {
+ for (auto pData : g->Ptrs) {
+ if (pData) {
+ uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE;
+ ++nFreeCount[nChunk];
+ Y_ASSERT_NOBT(chunkSizeIdx[nChunk] == nSizeIdx);
+ }
+ }
+ }
+ }
+
+ bool bRes = false;
+ for (size_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) {
+ int fc = nFreeCount[nChunk];
+ nFreeCount[nChunk] = 0;
+ if (chunkSizeIdx[nChunk] <= 0)
+ continue;
+        int nEntries = N_CHUNK_SIZE / nSizeIdxToSize[static_cast<int>(chunkSizeIdx[nChunk])];
+        Y_ASSERT_NOBT(fc <= nEntries); // can not have more free blocks than the total count
+ if (fc == nEntries) {
+ bRes = true;
+ nFreeCount[nChunk] = 1;
+ }
+ }
+ if (bRes) {
+ for (auto& wholeList : wholeLists) {
+ TFreeListGroup** ppPtr = &wholeList;
+ while (*ppPtr) {
+ TFreeListGroup* g = *ppPtr;
+ int dst = 0;
+ for (auto pData : g->Ptrs) {
+ if (pData) {
+ uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE;
+ if (nFreeCount[nChunk] == 0)
+ g->Ptrs[dst++] = pData; // block is not freed, keep pointer
+ }
+ }
+ if (dst == 0) {
+ // no valid pointers in group, free it
+ *ppPtr = g->Next;
+ blockFreeList.Free(g);
+ } else {
+ // reset invalid pointers to 0
+ for (int i = dst; i < FL_GROUP_SIZE; ++i)
+ g->Ptrs[i] = nullptr;
+ ppPtr = &g->Next;
+ }
+ }
+ }
+ for (uintptr_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) {
+ if (!nFreeCount[nChunk])
+ continue;
+ char* pStart = ALLOC_START + nChunk * N_CHUNK_SIZE;
+#ifdef _win_
+ VirtualFree(pStart, N_CHUNK_SIZE, MEM_DECOMMIT);
+#elif defined(_freebsd_)
+ madvise(pStart, N_CHUNK_SIZE, MADV_FREE);
+#else
+ madvise(pStart, N_CHUNK_SIZE, MADV_DONTNEED);
+#endif
+ AddFreeChunk(nChunk);
+ }
+ }
+
+ for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx)
+ globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]);
+
+ SystemFree(nFreeCount);
+ return bRes;
+}
+
+static Y_FORCE_INLINE void* LFAllocFromCurrentChunk(int nSizeIdx, int blockSize, int count) {
+ char* volatile* pFreeArray = &globalCurrentPtr[nSizeIdx];
+ while (char* newBlock = *pFreeArray) {
+ char* nextFree = newBlock + blockSize * count;
+
+ // check if there is space in chunk
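+        // (e.g. with N_CHUNK_SIZE = 1 MiB, a newBlock at offset 0x123456 from
+        // ALLOC_START is masked down to chunk base 0x100000, so globalEndPtr is
+        // ALLOC_START + 0x200000, the end of that chunk)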
+ char* globalEndPtr = ALLOC_START + ((newBlock - ALLOC_START) & ~((uintptr_t)N_CHUNK_SIZE - 1)) + N_CHUNK_SIZE;
+ if (nextFree >= globalEndPtr) {
+ if (nextFree > globalEndPtr)
+ break;
+ nextFree = nullptr; // it was last block in chunk
+ }
+ if (DoCas(pFreeArray, nextFree, newBlock) == newBlock)
+ return newBlock;
+ }
+ return nullptr;
+}
+
+enum EDefrag {
+ MEM_DEFRAG,
+ NO_MEM_DEFRAG,
+};
+
+static void* SlowLFAlloc(int nSizeIdx, int blockSize, EDefrag defrag) {
+ IncrementCounter(CT_SLOW_ALLOC_CNT, 1);
+
+ CCriticalSectionLockMMgr ls;
+ void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1);
+ if (res)
+ return res; // might happen when other thread allocated new current chunk
+
+ for (;;) {
+ uintptr_t nChunk;
+ if (GetFreeChunk(&nChunk)) {
+ char* newPlace = ALLOC_START + nChunk * N_CHUNK_SIZE;
+#ifdef _MSC_VER
+ void* pTest = VirtualAlloc(newPlace, N_CHUNK_SIZE, MEM_COMMIT, PAGE_READWRITE);
+ Y_ASSERT_NOBT(pTest == newPlace);
+#endif
+ chunkSizeIdx[nChunk] = (char)nSizeIdx;
+ globalCurrentPtr[nSizeIdx] = newPlace + blockSize;
+ return newPlace;
+ }
+
+ // out of luck, try to defrag
+ if (defrag == MEM_DEFRAG && DefragmentMem()) {
+ continue;
+ }
+
+ char* largeBlock = AllocWithMMap(N_LARGE_ALLOC_SIZE, MM_NORMAL);
+ uintptr_t addr = ((largeBlock - ALLOC_START) + N_CHUNK_SIZE - 1) & (~(N_CHUNK_SIZE - 1));
+ uintptr_t endAddr = ((largeBlock - ALLOC_START) + N_LARGE_ALLOC_SIZE) & (~(N_CHUNK_SIZE - 1));
+ for (uintptr_t p = addr; p < endAddr; p += N_CHUNK_SIZE) {
+ uintptr_t chunk = p / N_CHUNK_SIZE;
+ Y_ASSERT_NOBT(chunk * N_CHUNK_SIZE == p);
+ Y_ASSERT_NOBT(chunkSizeIdx[chunk] == 0);
+ AddFreeChunk(chunk);
+ }
+ }
+ return nullptr;
+}
+
+// allocate single block
+static Y_FORCE_INLINE void* LFAllocNoCache(int nSizeIdx, EDefrag defrag) {
+ int blockSize = nSizeIdxToSize[nSizeIdx];
+ void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1);
+ if (res)
+ return res;
+
+ return SlowLFAlloc(nSizeIdx, blockSize, defrag);
+}
+
+// allocate multiple blocks, returns number of blocks allocated (max FL_GROUP_SIZE)
+// buf should have space for at least FL_GROUP_SIZE elems
+static Y_FORCE_INLINE int LFAllocNoCacheMultiple(int nSizeIdx, char** buf) {
+ int blockSize = nSizeIdxToSize[nSizeIdx];
+ void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, FL_GROUP_SIZE);
+ if (res) {
+ char* resPtr = (char*)res;
+ for (int k = 0; k < FL_GROUP_SIZE; ++k) {
+ buf[k] = resPtr;
+ resPtr += blockSize;
+ }
+ return FL_GROUP_SIZE;
+ }
+ buf[0] = (char*)SlowLFAlloc(nSizeIdx, blockSize, MEM_DEFRAG);
+ return 1;
+}
+
+// take several blocks from global free list (max FL_GROUP_SIZE blocks), returns number of blocks taken
+// buf should have space for at least FL_GROUP_SIZE elems
+static Y_FORCE_INLINE int TakeBlocksFromGlobalFreeList(int nSizeIdx, char** buf) {
+ TLFAllocFreeList& fl = globalFreeLists[nSizeIdx];
+ TFreeListGroup* g = (TFreeListGroup*)fl.Alloc();
+ if (g) {
+ int resCount = 0;
+ for (auto& ptr : g->Ptrs) {
+ if (ptr)
+ buf[resCount++] = ptr;
+ else
+ break;
+ }
+ blockFreeList.Free(g);
+ return resCount;
+ }
+ return 0;
+}
+
+// add several blocks to global free list
+static Y_FORCE_INLINE void PutBlocksToGlobalFreeList(ptrdiff_t nSizeIdx, char** buf, int count) {
+ for (int startIdx = 0; startIdx < count;) {
+ TFreeListGroup* g = (TFreeListGroup*)blockFreeList.Alloc();
+ Y_ASSERT_NOBT(sizeof(TFreeListGroup) == nSizeIdxToSize[FREE_LIST_GROUP_SIZEIDX]);
+ if (!g) {
+ g = (TFreeListGroup*)LFAllocNoCache(FREE_LIST_GROUP_SIZEIDX, NO_MEM_DEFRAG);
+ }
+
+ int groupSize = count - startIdx;
+ if (groupSize > FL_GROUP_SIZE)
+ groupSize = FL_GROUP_SIZE;
+ for (int i = 0; i < groupSize; ++i)
+ g->Ptrs[i] = buf[startIdx + i];
+ for (int i = groupSize; i < FL_GROUP_SIZE; ++i)
+ g->Ptrs[i] = nullptr;
+
+ // add free group to the global list
+ TLFAllocFreeList& fl = globalFreeLists[nSizeIdx];
+ fl.Free(g);
+
+ startIdx += groupSize;
+ }
+}
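+
+// Illustrative sketch (not compiled): how the allocation path further down this file
+// is expected to combine the two helpers above when the per-thread cache runs dry
+// (exact control flow is outside this excerpt; local names are illustrative).
+#if 0
+void* ExampleRefill(int nSizeIdx) {
+    char* buf[FL_GROUP_SIZE];
+    int n = TakeBlocksFromGlobalFreeList(nSizeIdx, buf); // try the global cache first
+    if (n == 0)
+        n = LFAllocNoCacheMultiple(nSizeIdx, buf);       // else carve blocks from a chunk
+    // one block goes to the caller; the rest would refill the per-thread cache
+    return buf[--n];
+}
+#endif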
+
+//////////////////////////////////////////////////////////////////////////
+static TAtomic GlobalCounters[CT_MAX];
+const int MAX_LOCAL_UPDATES = 100;
+
+struct TLocalCounter {
+ intptr_t Value;
+ int Updates;
+ TAtomic* Parent;
+
+ Y_FORCE_INLINE void Init(TAtomic* parent) {
+ Parent = parent;
+ Value = 0;
+ Updates = 0;
+ }
+
+ Y_FORCE_INLINE void Increment(size_t value) {
+ Value += value;
+ if (++Updates > MAX_LOCAL_UPDATES) {
+ Flush();
+ }
+ }
+
+ Y_FORCE_INLINE void Flush() {
+ AtomicAdd(*Parent, Value);
+ Value = 0;
+ Updates = 0;
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// DBG stuff
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(LFALLOC_DBG)
+
+struct TPerTagAllocCounter {
+ TAtomic Size;
+ TAtomic Count;
+
+ Y_FORCE_INLINE void Alloc(size_t size) {
+ AtomicAdd(Size, size);
+ AtomicAdd(Count, 1);
+ }
+
+ Y_FORCE_INLINE void Free(size_t size) {
+ AtomicSub(Size, size);
+ AtomicSub(Count, 1);
+ }
+};
+
+struct TLocalPerTagAllocCounter {
+ intptr_t Size;
+ int Count;
+ int Updates;
+
+ Y_FORCE_INLINE void Init() {
+ Size = 0;
+ Count = 0;
+ Updates = 0;
+ }
+
+ Y_FORCE_INLINE void Alloc(TPerTagAllocCounter& parent, size_t size) {
+ Size += size;
+ ++Count;
+ if (++Updates > MAX_LOCAL_UPDATES) {
+ Flush(parent);
+ }
+ }
+
+ Y_FORCE_INLINE void Free(TPerTagAllocCounter& parent, size_t size) {
+ Size -= size;
+ --Count;
+ if (++Updates > MAX_LOCAL_UPDATES) {
+ Flush(parent);
+ }
+ }
+
+ Y_FORCE_INLINE void Flush(TPerTagAllocCounter& parent) {
+ AtomicAdd(parent.Size, Size);
+ Size = 0;
+ AtomicAdd(parent.Count, Count);
+ Count = 0;
+ Updates = 0;
+ }
+};
+
+static const int DBG_ALLOC_MAX_TAG = 1000;
+static const int DBG_ALLOC_NUM_SIZES = 30;
+static TPerTagAllocCounter GlobalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES];
+
+#endif // LFALLOC_DBG
+
+//////////////////////////////////////////////////////////////////////////
+const int THREAD_BUF = 256;
+static int borderSizes[N_SIZES];
+const int MAX_MEM_PER_SIZE_PER_THREAD = 512 * 1024;
+struct TThreadAllocInfo {
+ // FreePtrs - per-size stacks of pointers to free blocks in the per-thread cache
+ // FreePtrIndex - current top of each stack (THREAD_BUF means the stack is empty)
+ char* FreePtrs[N_SIZES][THREAD_BUF];
+ int FreePtrIndex[N_SIZES];
+ TThreadAllocInfo* pNextInfo;
+ TLocalCounter LocalCounters[CT_MAX];
+
+#if defined(LFALLOC_DBG)
+ TLocalPerTagAllocCounter LocalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES];
+#endif
+#ifdef _win_
+ HANDLE hThread;
+#endif
+
+ void Init(TThreadAllocInfo** pHead) {
+ memset(this, 0, sizeof(*this));
+ for (auto& i : FreePtrIndex)
+ i = THREAD_BUF;
+#ifdef _win_
+ BOOL b = DuplicateHandle(
+ GetCurrentProcess(), GetCurrentThread(),
+ GetCurrentProcess(), &hThread,
+ 0, FALSE, DUPLICATE_SAME_ACCESS);
+ Y_ASSERT_NOBT(b);
+#endif
+ pNextInfo = *pHead;
+ *pHead = this;
+ for (int k = 0; k < N_SIZES; ++k) {
+ int maxCount = MAX_MEM_PER_SIZE_PER_THREAD / nSizeIdxToSize[k];
+ if (maxCount > THREAD_BUF)
+ maxCount = THREAD_BUF;
+ borderSizes[k] = THREAD_BUF - maxCount;
+ }
+ for (int i = 0; i < CT_MAX; ++i) {
+ LocalCounters[i].Init(&GlobalCounters[i]);
+ }
+#if defined(LFALLOC_DBG)
+ for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
+ for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
+ auto& local = LocalPerTagAllocCounters[tag][sizeIdx];
+ local.Init();
+ }
+ }
+#endif
+ }
+ void Done() {
+ for (auto sizeIdx : FreePtrIndex) {
+ Y_ASSERT_NOBT(sizeIdx == THREAD_BUF);
+ }
+ for (auto& localCounter : LocalCounters) {
+ localCounter.Flush();
+ }
+#if defined(LFALLOC_DBG)
+ for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
+ for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
+ auto& local = LocalPerTagAllocCounters[tag][sizeIdx];
+ auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
+ local.Flush(global);
+ }
+ }
+#endif
+#ifdef _win_
+ if (hThread)
+ CloseHandle(hThread);
+#endif
+ }
+};
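+// Note: each FreePtrs[k] is used as a stack growing downwards; LFFree() pushes
+// with FreePtrs[k][--idx], LFAlloc() pops with FreePtrs[k][idx++], and
+// borderSizes[k] caps the cached memory per size class at
+// MAX_MEM_PER_SIZE_PER_THREAD bytes.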
+PERTHREAD TThreadAllocInfo* pThreadInfo;
+static TThreadAllocInfo* pThreadInfoList;
+
+static int* volatile nLockThreadInfo = nullptr;
+class TLockThreadListMMgr {
+public:
+ TLockThreadListMMgr() {
+ RealEnterCritical(&nLockThreadInfo);
+ }
+ ~TLockThreadListMMgr() {
+ RealLeaveCritical(&nLockThreadInfo);
+ }
+};
+
+static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value) {
+#ifdef LFALLOC_YT
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (thr) {
+ thr->LocalCounters[counter].Increment(value);
+ } else {
+ AtomicAdd(GlobalCounters[counter], value);
+ }
+#endif
+}
+
+extern "C" i64 GetLFAllocCounterFast(int counter) {
+#ifdef LFALLOC_YT
+ return GlobalCounters[counter];
+#else
+ return 0;
+#endif
+}
+
+extern "C" i64 GetLFAllocCounterFull(int counter) {
+#ifdef LFALLOC_YT
+ i64 ret = GlobalCounters[counter];
+ {
+ TLockThreadListMMgr ll;
+ for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
+ TThreadAllocInfo* pInfo = *p;
+ ret += pInfo->LocalCounters[counter].Value;
+ p = &pInfo->pNextInfo;
+ }
+ }
+ return ret;
+#else
+ return 0;
+#endif
+}
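+
+// Usage sketch (illustrative; the argument is one of the ELFAllocCounter
+// values, e.g. CT_USER_ALLOC):
+//
+// i64 approx = GetLFAllocCounterFast(CT_USER_ALLOC); // global part only, no lock
+// i64 exact = GetLFAllocCounterFull(CT_USER_ALLOC); // + unflushed thread counters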
+
+static void MoveSingleThreadFreeToGlobal(TThreadAllocInfo* pInfo) {
+ for (int sizeIdx = 0; sizeIdx < N_SIZES; ++sizeIdx) {
+ int& freePtrIdx = pInfo->FreePtrIndex[sizeIdx];
+ char** freePtrs = pInfo->FreePtrs[sizeIdx];
+ PutBlocksToGlobalFreeList(sizeIdx, freePtrs + freePtrIdx, THREAD_BUF - freePtrIdx);
+ freePtrIdx = THREAD_BUF;
+ }
+}
+
+#ifdef _win_
+static bool IsDeadThread(TThreadAllocInfo* pInfo) {
+ DWORD dwExit;
+ bool isDead = !GetExitCodeThread(pInfo->hThread, &dwExit) || dwExit != STILL_ACTIVE;
+ return isDead;
+}
+
+static void CleanupAfterDeadThreads() {
+ TLockThreadListMMgr ls;
+ for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
+ TThreadAllocInfo* pInfo = *p;
+ if (IsDeadThread(pInfo)) {
+ MoveSingleThreadFreeToGlobal(pInfo);
+ pInfo->Done();
+ *p = pInfo->pNextInfo;
+ SystemFree(pInfo);
+ } else
+ p = &pInfo->pNextInfo;
+ }
+}
+#endif
+
+#ifndef _win_
+static pthread_key_t ThreadCacheCleaner;
+static void* volatile ThreadCacheCleanerStarted; // 0 = not started, -1 = started, -2 = is starting
+static PERTHREAD bool IsStoppingThread;
+
+static void FreeThreadCache(void*) {
+ TThreadAllocInfo* pToDelete = nullptr;
+ {
+ TLockThreadListMMgr ls;
+ pToDelete = pThreadInfo;
+ if (pToDelete == nullptr)
+ return;
+
+ // remove from the list
+ for (TThreadAllocInfo** p = &pThreadInfoList; *p; p = &(*p)->pNextInfo) {
+ if (*p == pToDelete) {
+ *p = pToDelete->pNextInfo;
+ break;
+ }
+ }
+ IsStoppingThread = true;
+ pThreadInfo = nullptr;
+ }
+
+ // free per thread buf
+ MoveSingleThreadFreeToGlobal(pToDelete);
+ pToDelete->Done();
+ SystemFree(pToDelete);
+}
+#endif
+
+static void AllocThreadInfo() {
+#ifndef _win_
+ if (DoCas(&ThreadCacheCleanerStarted, (void*)-2, (void*)nullptr) == (void*)nullptr) {
+ pthread_key_create(&ThreadCacheCleaner, FreeThreadCache);
+ ThreadCacheCleanerStarted = (void*)-1;
+ }
+ if (ThreadCacheCleanerStarted != (void*)-1)
+ return; // do not use ThreadCacheCleaner until it is constructed
+
+ {
+ if (IsStoppingThread)
+ return;
+ TLockThreadListMMgr ls;
+ if (IsStoppingThread) // better safe than sorry
+ return;
+
+ pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo));
+ pThreadInfo->Init(&pThreadInfoList);
+ }
+ pthread_setspecific(ThreadCacheCleaner, (void*)-1); // the destructor is only called for keys with a non-null value
+#else
+ CleanupAfterDeadThreads();
+ {
+ TLockThreadListMMgr ls;
+ pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo));
+ pThreadInfo->Init(&pThreadInfoList);
+ }
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+// DBG stuff
+//////////////////////////////////////////////////////////////////////////
+
+#if defined(LFALLOC_DBG)
+
+struct TAllocHeader {
+ size_t Size;
+ int Tag;
+ int Cookie;
+};
+
+static inline void* GetAllocPtr(TAllocHeader* p) {
+ return p + 1;
+}
+
+static inline TAllocHeader* GetAllocHeader(void* p) {
+ return ((TAllocHeader*)p) - 1;
+}
+
+PERTHREAD int AllocationTag;
+extern "C" int SetThreadAllocTag(int tag) {
+ int prevTag = AllocationTag;
+ if (tag < DBG_ALLOC_MAX_TAG && tag >= 0) {
+ AllocationTag = tag;
+ }
+ return prevTag;
+}
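+
+// Usage sketch (illustrative; MY_SUBSYSTEM_TAG is a hypothetical constant in
+// [0, DBG_ALLOC_MAX_TAG)): scope a subsystem's allocations under one tag so
+// GetPerTagAllocInfo() below can attribute them:
+//
+// int prevTag = SetThreadAllocTag(MY_SUBSYSTEM_TAG);
+// // ... allocations made here are accounted under MY_SUBSYSTEM_TAG ...
+// SetThreadAllocTag(prevTag); // restore on the way out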
+
+PERTHREAD bool ProfileCurrentThread;
+extern "C" bool SetProfileCurrentThread(bool newVal) {
+ bool prevVal = ProfileCurrentThread;
+ ProfileCurrentThread = newVal;
+ return prevVal;
+}
+
+static volatile bool ProfileAllThreads;
+extern "C" bool SetProfileAllThreads(bool newVal) {
+ bool prevVal = ProfileAllThreads;
+ ProfileAllThreads = newVal;
+ return prevVal;
+}
+
+static volatile bool AllocationSamplingEnabled;
+extern "C" bool SetAllocationSamplingEnabled(bool newVal) {
+ bool prevVal = AllocationSamplingEnabled;
+ AllocationSamplingEnabled = newVal;
+ return prevVal;
+}
+
+static size_t AllocationSampleRate = 1000;
+extern "C" size_t SetAllocationSampleRate(size_t newVal) {
+ size_t prevVal = AllocationSampleRate;
+ AllocationSampleRate = newVal;
+ return prevVal;
+}
+
+static size_t AllocationSampleMaxSize = N_MAX_FAST_SIZE;
+extern "C" size_t SetAllocationSampleMaxSize(size_t newVal) {
+ size_t prevVal = AllocationSampleMaxSize;
+ AllocationSampleMaxSize = newVal;
+ return prevVal;
+}
+
+using TAllocationCallback = int(int tag, size_t size, int sizeIdx);
+static TAllocationCallback* AllocationCallback;
+extern "C" TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) {
+ TAllocationCallback* prevVal = AllocationCallback;
+ AllocationCallback = newVal;
+ return prevVal;
+}
+
+using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx);
+static TDeallocationCallback* DeallocationCallback;
+extern "C" TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) {
+ TDeallocationCallback* prevVal = DeallocationCallback;
+ DeallocationCallback = newVal;
+ return prevVal;
+}
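+
+// Registration sketch (illustrative; MyAllocSample is a hypothetical user
+// function matching TAllocationCallback). The returned cookie is stored in the
+// allocation header and handed back to the deallocation callback on free:
+//
+// static int MyAllocSample(int tag, size_t size, int sizeIdx) {
+// // record a stack trace etc.; return a cookie identifying this sample
+// return 1;
+// }
+// SetAllocationCallback(MyAllocSample);
+// SetAllocationSamplingEnabled(true);
+// SetProfileAllThreads(true); // or SetProfileCurrentThread(true)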
+
+PERTHREAD TAtomic AllocationsCount;
+PERTHREAD bool InAllocationCallback;
+
+static const int DBG_ALLOC_INVALID_COOKIE = -1;
+static inline int SampleAllocation(TAllocHeader* p, int sizeIdx) {
+ int cookie = DBG_ALLOC_INVALID_COOKIE;
+ if (AllocationSamplingEnabled && (ProfileCurrentThread || ProfileAllThreads) && !InAllocationCallback) {
+ if (p->Size > AllocationSampleMaxSize || ++AllocationsCount % AllocationSampleRate == 0) {
+ if (AllocationCallback) {
+ InAllocationCallback = true;
+ cookie = AllocationCallback(p->Tag, p->Size, sizeIdx);
+ InAllocationCallback = false;
+ }
+ }
+ }
+ return cookie;
+}
+
+static inline void SampleDeallocation(TAllocHeader* p, int sizeIdx) {
+ if (p->Cookie != DBG_ALLOC_INVALID_COOKIE && !InAllocationCallback) {
+ if (DeallocationCallback) {
+ InAllocationCallback = true;
+ DeallocationCallback(p->Cookie, p->Tag, p->Size, sizeIdx);
+ InAllocationCallback = false;
+ }
+ }
+}
+
+static inline void TrackPerTagAllocation(TAllocHeader* p, int sizeIdx) {
+ if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) {
+ Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES);
+ auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx];
+
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (thr) {
+ auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx];
+ local.Alloc(global, p->Size);
+ } else {
+ global.Alloc(p->Size);
+ }
+ }
+}
+
+static inline void TrackPerTagDeallocation(TAllocHeader* p, int sizeIdx) {
+ if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) {
+ Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES);
+ auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx];
+
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (thr) {
+ auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx];
+ local.Free(global, p->Size);
+ } else {
+ global.Free(p->Size);
+ }
+ }
+}
+
+static void* TrackAllocation(void* ptr, size_t size, int sizeIdx) {
+ TAllocHeader* p = (TAllocHeader*)ptr;
+ p->Size = size;
+ p->Tag = AllocationTag;
+ p->Cookie = SampleAllocation(p, sizeIdx);
+ TrackPerTagAllocation(p, sizeIdx);
+ return GetAllocPtr(p);
+}
+
+static void TrackDeallocation(void* ptr, int sizeIdx) {
+ TAllocHeader* p = (TAllocHeader*)ptr;
+ SampleDeallocation(p, sizeIdx);
+ TrackPerTagDeallocation(p, sizeIdx);
+}
+
+struct TPerTagAllocInfo {
+ ssize_t Count;
+ ssize_t Size;
+};
+
+extern "C" void GetPerTagAllocInfo(
+ bool flushPerThreadCounters,
+ TPerTagAllocInfo* info,
+ int& maxTag,
+ int& numSizes) {
+ maxTag = DBG_ALLOC_MAX_TAG;
+ numSizes = DBG_ALLOC_NUM_SIZES;
+
+ if (info) {
+ if (flushPerThreadCounters) {
+ TLockThreadListMMgr ll;
+ for (TThreadAllocInfo** p = &pThreadInfoList; *p;) {
+ TThreadAllocInfo* pInfo = *p;
+ for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
+ for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
+ auto& local = pInfo->LocalPerTagAllocCounters[tag][sizeIdx];
+ auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
+ local.Flush(global);
+ }
+ }
+ p = &pInfo->pNextInfo;
+ }
+ }
+
+ for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) {
+ for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) {
+ auto& global = GlobalPerTagAllocCounters[tag][sizeIdx];
+ auto& res = info[tag * DBG_ALLOC_NUM_SIZES + sizeIdx];
+ res.Count = global.Count;
+ res.Size = global.Size;
+ }
+ }
+ }
+}
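+
+// Usage sketch (illustrative): query the dimensions first, then fetch the
+// flattened [tag][sizeIdx] table:
+//
+// int maxTag = 0, numSizes = 0;
+// GetPerTagAllocInfo(false, nullptr, maxTag, numSizes); // dimensions only
+// std::vector<TPerTagAllocInfo> info(maxTag * numSizes);
+// GetPerTagAllocInfo(true, info.data(), maxTag, numSizes); // flush and fill
+// const TPerTagAllocInfo& cell = info[tag * numSizes + sizeIdx];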
+
+#endif // LFALLOC_DBG
+
+//////////////////////////////////////////////////////////////////////////
+static Y_FORCE_INLINE void* LFAllocImpl(size_t _nSize) {
+#if defined(LFALLOC_DBG)
+ size_t size = _nSize;
+ _nSize += sizeof(TAllocHeader);
+#endif
+
+ IncrementCounter(CT_USER_ALLOC, _nSize);
+
+ int nSizeIdx;
+ if (_nSize > 512) {
+ if (_nSize > N_MAX_FAST_SIZE) {
+ void* ptr = LargeBlockAlloc(_nSize, CT_LARGE_ALLOC);
+#if defined(LFALLOC_DBG)
+ ptr = TrackAllocation(ptr, size, N_SIZES);
+#endif
+ return ptr;
+ }
+ nSizeIdx = size2idxArr2[(_nSize - 1) >> 8];
+ } else
+ nSizeIdx = size2idxArr1[1 + (((int)_nSize - 1) >> 3)];
+
+ IncrementCounter(CT_SMALL_ALLOC, nSizeIdxToSize[nSizeIdx]);
+
+ // check per thread buffer
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (!thr) {
+ AllocThreadInfo();
+ thr = pThreadInfo;
+ if (!thr) {
+ void* ptr = LFAllocNoCache(nSizeIdx, MEM_DEFRAG);
+#if defined(LFALLOC_DBG)
+ ptr = TrackAllocation(ptr, size, nSizeIdx);
+#endif
+ return ptr;
+ }
+ }
+ {
+ int& freePtrIdx = thr->FreePtrIndex[nSizeIdx];
+ if (freePtrIdx < THREAD_BUF) {
+ void* ptr = thr->FreePtrs[nSizeIdx][freePtrIdx++];
+#if defined(LFALLOC_DBG)
+ ptr = TrackAllocation(ptr, size, nSizeIdx);
+#endif
+ return ptr;
+ }
+
+ // try to alloc from global free list
+ char* buf[FL_GROUP_SIZE];
+ int count = TakeBlocksFromGlobalFreeList(nSizeIdx, buf);
+ if (count == 0) {
+ count = LFAllocNoCacheMultiple(nSizeIdx, buf);
+ if (count == 0) {
+ NMalloc::AbortFromCorruptedAllocator(); // no way LFAllocNoCacheMultiple() can fail
+ }
+ }
+ char** dstBuf = thr->FreePtrs[nSizeIdx] + freePtrIdx - 1;
+ for (int i = 0; i < count - 1; ++i)
+ dstBuf[-i] = buf[i];
+ freePtrIdx -= count - 1;
+ void* ptr = buf[count - 1];
+#if defined(LFALLOC_DBG)
+ ptr = TrackAllocation(ptr, size, nSizeIdx);
+#endif
+ return ptr;
+ }
+}
+
+static Y_FORCE_INLINE void* LFAlloc(size_t _nSize) {
+ void* res = LFAllocImpl(_nSize);
+#ifdef DBG_FILL_MEMORY
+ if (FillMemoryOnAllocation && res && (_nSize <= DBG_FILL_MAX_SIZE)) {
+ memset(res, 0xcf, _nSize);
+ }
+#endif
+ return res;
+}
+
+static Y_FORCE_INLINE void LFFree(void* p) {
+#if defined(LFALLOC_DBG)
+ if (p == nullptr)
+ return;
+ p = GetAllocHeader(p);
+#endif
+
+ uintptr_t chkOffset = ((char*)p - ALLOC_START) - 1ll;
+ if (chkOffset >= N_MAX_WORKSET_SIZE) {
+ if (p == nullptr)
+ return;
+#if defined(LFALLOC_DBG)
+ TrackDeallocation(p, N_SIZES);
+#endif
+ LargeBlockFree(p, CT_LARGE_FREE);
+ return;
+ }
+
+ uintptr_t chunk = ((char*)p - ALLOC_START) / N_CHUNK_SIZE;
+ ptrdiff_t nSizeIdx = chunkSizeIdx[chunk];
+ if (nSizeIdx <= 0) {
+#if defined(LFALLOC_DBG)
+ TrackDeallocation(p, N_SIZES);
+#endif
+ LargeBlockFree(p, CT_LARGE_FREE);
+ return;
+ }
+
+#if defined(LFALLOC_DBG)
+ TrackDeallocation(p, nSizeIdx);
+#endif
+
+#ifdef DBG_FILL_MEMORY
+ memset(p, 0xfe, nSizeIdxToSize[nSizeIdx]);
+#endif
+
+ IncrementCounter(CT_SMALL_FREE, nSizeIdxToSize[nSizeIdx]);
+
+ // try to store info to per thread buf
+ TThreadAllocInfo* thr = pThreadInfo;
+ if (thr) {
+ int& freePtrIdx = thr->FreePtrIndex[nSizeIdx];
+ if (freePtrIdx > borderSizes[nSizeIdx]) {
+ thr->FreePtrs[nSizeIdx][--freePtrIdx] = (char*)p;
+ return;
+ }
+
+ // move several pointers to global free list
+ int freeCount = FL_GROUP_SIZE;
+ if (freeCount > THREAD_BUF - freePtrIdx)
+ freeCount = THREAD_BUF - freePtrIdx;
+ char** freePtrs = thr->FreePtrs[nSizeIdx];
+ PutBlocksToGlobalFreeList(nSizeIdx, freePtrs + freePtrIdx, freeCount);
+ freePtrIdx += freeCount;
+
+ freePtrs[--freePtrIdx] = (char*)p;
+
+ } else {
+ AllocThreadInfo();
+ PutBlocksToGlobalFreeList(nSizeIdx, (char**)&p, 1);
+ }
+}
+
+static size_t LFGetSize(const void* p) {
+#if defined(LFALLOC_DBG)
+ if (p == nullptr)
+ return 0;
+ return GetAllocHeader(const_cast<void*>(p))->Size;
+#endif
+
+ uintptr_t chkOffset = ((const char*)p - ALLOC_START);
+ if (chkOffset >= N_MAX_WORKSET_SIZE) {
+ if (p == nullptr)
+ return 0;
+ return TLargeBlk::As(p)->Pages * 4096ll;
+ }
+ uintptr_t chunk = ((const char*)p - ALLOC_START) / N_CHUNK_SIZE;
+ ptrdiff_t nSizeIdx = chunkSizeIdx[chunk];
+ if (nSizeIdx <= 0)
+ return TLargeBlk::As(p)->Pages * 4096ll;
+ return nSizeIdxToSize[nSizeIdx];
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Output mem alloc stats
+const int N_PAGE_SIZE = 4096;
+static void DebugTraceMMgr(const char* pszFormat, ...) // __cdecl
+{
+ static char buff[20000];
+ va_list va;
+ //
+ va_start(va, pszFormat);
+ vsnprintf(buff, sizeof(buff), pszFormat, va);
+ va_end(va);
+//
+#ifdef _win_
+ OutputDebugStringA(buff);
+#else
+ fprintf(stderr, "%s", buff);
+#endif
+}
+
+struct TChunkStats {
+ char *Start, *Finish;
+ i64 Size;
+ char* Entries;
+ i64 FreeCount;
+
+ TChunkStats(size_t chunk, i64 size, char* entries)
+ : Size(size)
+ , Entries(entries)
+ , FreeCount(0)
+ {
+ Start = ALLOC_START + chunk * N_CHUNK_SIZE;
+ Finish = Start + N_CHUNK_SIZE;
+ }
+ void CheckBlock(char* pBlock) {
+ if (pBlock && pBlock >= Start && pBlock < Finish) {
+ ++FreeCount;
+ i64 nShift = pBlock - Start;
+ i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1);
+ Entries[nOffsetInStep / Size] = 1;
+ }
+ }
+ void SetGlobalFree(char* ptr) {
+ i64 nShift = ptr - Start;
+ i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1);
+ while (nOffsetInStep + Size <= N_CHUNK_SIZE) {
+ ++FreeCount;
+ Entries[nOffsetInStep / Size] = 1;
+ nOffsetInStep += Size;
+ }
+ }
+};
+
+static void DumpMemoryBlockUtilizationLocked() {
+ TFreeListGroup* wholeLists[N_SIZES];
+ for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) {
+ wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList();
+ }
+ char* bfList = (char*)blockFreeList.GetWholeList();
+
+ DebugTraceMMgr("memory blocks utilisation stats:\n");
+ i64 nTotalAllocated = 0, nTotalFree = 0, nTotalBadPages = 0, nTotalPages = 0, nTotalUsed = 0, nTotalLocked = 0;
+ i64 nTotalGroupBlocks = 0;
+ char* entries;
+ entries = (char*)SystemAlloc((N_CHUNK_SIZE / 4));
+ for (size_t k = 0; k < N_CHUNKS; ++k) {
+ if (chunkSizeIdx[k] <= 0) {
+ if (chunkSizeIdx[k] == -1)
+ nTotalLocked += N_CHUNK_SIZE;
+ continue;
+ }
+ i64 nSizeIdx = chunkSizeIdx[k];
+ i64 nSize = nSizeIdxToSize[nSizeIdx];
+ TChunkStats cs(k, nSize, entries);
+ int nEntriesTotal = N_CHUNK_SIZE / nSize;
+ memset(entries, 0, nEntriesTotal);
+ for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) {
+ for (auto& ptr : g->Ptrs)
+ cs.CheckBlock(ptr);
+ }
+ TChunkStats csGB(k, nSize, entries);
+ if (nSizeIdx == FREE_LIST_GROUP_SIZEIDX) {
+ for (auto g : wholeLists) {
+ for (; g; g = g->Next)
+ csGB.CheckBlock((char*)g);
+ }
+ for (char* blk = bfList; blk; blk = *(char**)blk)
+ csGB.CheckBlock(blk);
+ nTotalGroupBlocks += csGB.FreeCount * nSize;
+ }
+ if (((globalCurrentPtr[nSizeIdx] - ALLOC_START) / N_CHUNK_SIZE) == k)
+ cs.SetGlobalFree(globalCurrentPtr[nSizeIdx]);
+ nTotalUsed += (nEntriesTotal - cs.FreeCount - csGB.FreeCount) * nSize;
+
+ char pages[N_CHUNK_SIZE / N_PAGE_SIZE];
+ memset(pages, 0, sizeof(pages));
+ for (int i = 0, nShift = 0; i < nEntriesTotal; ++i, nShift += nSize) {
+ int nBit = 0;
+ if (entries[i])
+ nBit = 1; // free entry
+ else
+ nBit = 2; // used entry
+ for (i64 nDelta = nSize - 1; nDelta >= 0; nDelta -= N_PAGE_SIZE)
+ pages[(nShift + nDelta) / N_PAGE_SIZE] |= nBit;
+ }
+ i64 nBadPages = 0;
+ for (auto page : pages) {
+ nBadPages += page == 3;
+ nTotalPages += page != 1;
+ }
+ DebugTraceMMgr("entry = %lld; size = %lld; free = %lld; system %lld; utilisation = %g%%, fragmentation = %g%%\n",
+ k, nSize, cs.FreeCount * nSize, csGB.FreeCount * nSize,
+ (N_CHUNK_SIZE - cs.FreeCount * nSize) * 100.0f / N_CHUNK_SIZE, 100.0f * nBadPages / Y_ARRAY_SIZE(pages));
+ nTotalAllocated += N_CHUNK_SIZE;
+ nTotalFree += cs.FreeCount * nSize;
+ nTotalBadPages += nBadPages;
+ }
+ SystemFree(entries);
+ DebugTraceMMgr("Total allocated = %llu, free = %lld, system = %lld, locked for future use %lld, utilisation = %g, fragmentation = %g\n",
+ nTotalAllocated, nTotalFree, nTotalGroupBlocks, nTotalLocked,
+ 100.0f * (nTotalAllocated - nTotalFree) / nTotalAllocated, 100.0f * nTotalBadPages / nTotalPages);
+ DebugTraceMMgr("Total %lld bytes used, %lld bytes in used pages\n", nTotalUsed, nTotalPages * N_PAGE_SIZE);
+
+ for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx)
+ globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]);
+ blockFreeList.ReturnWholeList(bfList);
+}
+
+void FlushThreadFreeList() {
+ if (pThreadInfo)
+ MoveSingleThreadFreeToGlobal(pThreadInfo);
+}
+
+void DumpMemoryBlockUtilization() {
+ // move current thread free to global lists to get better statistics
+ FlushThreadFreeList();
+ {
+ CCriticalSectionLockMMgr ls;
+ DumpMemoryBlockUtilizationLocked();
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// malloc api
+
+static bool LFAlloc_SetParam(const char* param, const char* value) {
+ if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE")) {
+ LB_LIMIT_TOTAL_SIZE = atoi(value);
+ return true;
+ }
+ if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE_BYTES")) {
+ LB_LIMIT_TOTAL_SIZE = (atoi(value) + N_PAGE_SIZE - 1) / N_PAGE_SIZE;
+ return true;
+ }
+#ifdef DBG_FILL_MEMORY
+ if (!strcmp(param, "FillMemoryOnAllocation")) {
+ FillMemoryOnAllocation = !strcmp(value, "true");
+ return true;
+ }
+#endif
+ if (!strcmp(param, "BeforeLFAllocGlobalLockAcquired")) {
+ BeforeLFAllocGlobalLockAcquired = (decltype(BeforeLFAllocGlobalLockAcquired))(value);
+ return true;
+ }
+ if (!strcmp(param, "AfterLFAllocGlobalLockReleased")) {
+ AfterLFAllocGlobalLockReleased = (decltype(AfterLFAllocGlobalLockReleased))(value);
+ return true;
+ }
+ if (!strcmp(param, "EnterCritical")) {
+ assert(value);
+ RealEnterCritical = (decltype(RealEnterCritical))(value);
+ return true;
+ }
+ if (!strcmp(param, "LeaveCritical")) {
+ assert(value);
+ RealLeaveCritical = (decltype(RealLeaveCritical))(value);
+ return true;
+ }
+ if (!strcmp(param, "TransparentHugePages")) {
+ TransparentHugePages = !strcmp(value, "true");
+ return true;
+ }
+ if (!strcmp(param, "MapHugeTLB")) {
+ MapHugeTLB = !strcmp(value, "true");
+ return true;
+ }
+ if (!strcmp(param, "EnableDefrag")) {
+ EnableDefrag = !strcmp(value, "true");
+ return true;
+ }
+ return false;
+};
+
+static const char* LFAlloc_GetParam(const char* param) {
+ struct TParam {
+ const char* Name;
+ const char* Value;
+ };
+
+ static const TParam Params[] = {
+ {"GetLFAllocCounterFast", (const char*)&GetLFAllocCounterFast},
+ {"GetLFAllocCounterFull", (const char*)&GetLFAllocCounterFull},
+#if defined(LFALLOC_DBG)
+ {"SetThreadAllocTag", (const char*)&SetThreadAllocTag},
+ {"SetProfileCurrentThread", (const char*)&SetProfileCurrentThread},
+ {"SetProfileAllThreads", (const char*)&SetProfileAllThreads},
+ {"SetAllocationSamplingEnabled", (const char*)&SetAllocationSamplingEnabled},
+ {"SetAllocationSampleRate", (const char*)&SetAllocationSampleRate},
+ {"SetAllocationSampleMaxSize", (const char*)&SetAllocationSampleMaxSize},
+ {"SetAllocationCallback", (const char*)&SetAllocationCallback},
+ {"SetDeallocationCallback", (const char*)&SetDeallocationCallback},
+ {"GetPerTagAllocInfo", (const char*)&GetPerTagAllocInfo},
+#endif // LFALLOC_DBG
+ };
+
+ for (size_t i = 0; i < Y_ARRAY_SIZE(Params); ++i) {
+ if (strcmp(param, Params[i].Name) == 0) {
+ return Params[i].Value;
+ }
+ }
+ return nullptr;
+}
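+
+// Usage sketch (illustrative): both entry points are string-keyed, which lets
+// generic allocator-info plumbing reach them without compile-time dependencies;
+// GetParam returns function addresses cast to const char*:
+//
+// LFAlloc_SetParam("TransparentHugePages", "true"); // boolean knob
+// using TCounterFn = i64 (*)(int);
+// auto getFull = (TCounterFn)LFAlloc_GetParam("GetLFAllocCounterFull");
+// if (getFull) {
+// i64 allocated = getFull(CT_USER_ALLOC);
+// }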
+
+static Y_FORCE_INLINE void* LFVAlloc(size_t size) {
+ const size_t pg = N_PAGE_SIZE;
+ size_t bigsize = (size + pg - 1) & (~(pg - 1));
+ void* p = LFAlloc(bigsize);
+
+ Y_ASSERT_NOBT((intptr_t)p % N_PAGE_SIZE == 0);
+ return p;
+}
+
+static Y_FORCE_INLINE int LFPosixMemalign(void** memptr, size_t alignment, size_t size) {
+ if (Y_UNLIKELY(alignment > 4096)) {
+#ifdef _win_
+ OutputDebugStringA("Larger alignments are not guaranteed with this implementation\n");
+#else
+ fprintf(stderr, "Larger alignments are not guaranteed with this implementation\n");
+#endif
+ NMalloc::AbortFromCorruptedAllocator();
+ }
+ size_t bigsize = size;
+ if (bigsize <= alignment) {
+ bigsize = alignment;
+ } else if (bigsize < 2 * alignment) {
+ bigsize = 2 * alignment;
+ }
+ *memptr = LFAlloc(bigsize);
+ return 0;
+}
+#endif
diff --git a/contrib/lfalloc/src/lfmalloc.h b/contrib/lfalloc/src/lfmalloc.h
new file mode 100644
index 00000000000..1e6a0d55773
--- /dev/null
+++ b/contrib/lfalloc/src/lfmalloc.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <stddef.h>
+#include <stdlib.h>
+#include "util/system/compiler.h"
+
+namespace NMalloc {
+ volatile inline bool IsAllocatorCorrupted = false;
+
+ static inline void AbortFromCorruptedAllocator() {
+ IsAllocatorCorrupted = true;
+ abort();
+ }
+
+ struct TAllocHeader {
+ void* Block;
+ size_t AllocSize;
+ void Y_FORCE_INLINE Encode(void* block, size_t size, size_t signature) {
+ Block = block;
+ AllocSize = size | signature;
+ }
+ };
+}
diff --git a/contrib/lfalloc/src/util/README.md b/contrib/lfalloc/src/util/README.md
new file mode 100644
index 00000000000..c367cb4b439
--- /dev/null
+++ b/contrib/lfalloc/src/util/README.md
@@ -0,0 +1,33 @@
+The style guide for the util folder is a stricter version of the general style guide (mostly in terms of ambiguity resolution).
+
+ * all {} must be in K&R style
+ * `&` and `*` bind to the type, not to the variable
+ * always use `using` not `typedef`
+ * even a single line block must be in braces {}:
+ ```
+ if (A) {
+ B();
+ }
+ ```
+ * `_` at the end of a private data member of a class - `First_`, `Second_`
+ * every .h file must be accompanied by a corresponding .cpp to avoid leakage and to check that it is self-contained
+ * `printf`-like functions are prohibited
+
+
+Rules from the general style guide that are sometimes missed:
+
+ * `template <`, not `template<`
+ * `noexcept`, not `throw ()` or `throw()`; not required for destructors
+ * indents inside `namespace` same as inside `class`
+
+
+Requirements for new code in util (and for changes to old code that alter behaviour):
+
+ * unit tests are present
+ * comments are in Doxygen style
+ * accessors have no Get prefix (`Length()`, not `GetLength()`)
+
+Unlike the general style guide, this guide is not mandatory.
+Nevertheless, if it is not followed, the next `ya style .` run in the util folder will undeservedly change the recorded authors of some lines of code.
+
+It is therefore recommended to run `ya style .` in the util folder before committing.
diff --git a/contrib/lfalloc/src/util/system/atomic.h b/contrib/lfalloc/src/util/system/atomic.h
new file mode 100644
index 00000000000..9876515a54d
--- /dev/null
+++ b/contrib/lfalloc/src/util/system/atomic.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include "defaults.h"
+
+using TAtomicBase = intptr_t;
+using TAtomic = volatile TAtomicBase;
+
+#if defined(__GNUC__)
+#include "atomic_gcc.h"
+#elif defined(_MSC_VER)
+#include "atomic_win.h"
+#else
+#error unsupported platform
+#endif
+
+#if !defined(ATOMIC_COMPILER_BARRIER)
+#define ATOMIC_COMPILER_BARRIER()
+#endif
+
+static inline TAtomicBase AtomicSub(TAtomic& a, TAtomicBase v) {
+ return AtomicAdd(a, -v);
+}
+
+static inline TAtomicBase AtomicGetAndSub(TAtomic& a, TAtomicBase v) {
+ return AtomicGetAndAdd(a, -v);
+}
+
+#if defined(USE_GENERIC_SETGET)
+static inline TAtomicBase AtomicGet(const TAtomic& a) {
+ return a;
+}
+
+static inline void AtomicSet(TAtomic& a, TAtomicBase v) {
+ a = v;
+}
+#endif
+
+static inline bool AtomicTryLock(TAtomic* a) {
+ return AtomicCas(a, 1, 0);
+}
+
+static inline bool AtomicTryAndTryLock(TAtomic* a) {
+ return (AtomicGet(*a) == 0) && AtomicTryLock(a);
+}
+
+static inline void AtomicUnlock(TAtomic* a) {
+ ATOMIC_COMPILER_BARRIER();
+ AtomicSet(*a, 0);
+}
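+
+// Minimal spinlock sketch on top of these primitives (illustrative only; a
+// production lock would back off or yield instead of spinning hot):
+//
+// static TAtomic lock = 0;
+// while (!AtomicTryAndTryLock(&lock)) {
+// // spin: the plain read in AtomicTryAndTryLock avoids hammering the cache line
+// }
+// // ... critical section ...
+// AtomicUnlock(&lock);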
+
+#include "atomic_ops.h"
diff --git a/contrib/lfalloc/src/util/system/atomic_gcc.h b/contrib/lfalloc/src/util/system/atomic_gcc.h
new file mode 100644
index 00000000000..ed8dc2bdc53
--- /dev/null
+++ b/contrib/lfalloc/src/util/system/atomic_gcc.h
@@ -0,0 +1,90 @@
+#pragma once
+
+#define ATOMIC_COMPILER_BARRIER() __asm__ __volatile__("" \
+ : \
+ : \
+ : "memory")
+
+static inline TAtomicBase AtomicGet(const TAtomic& a) {
+ TAtomicBase tmp;
+#if defined(_arm64_)
+ __asm__ __volatile__(
+ "ldar %x[value], %[ptr] \n\t"
+ : [value] "=r"(tmp)
+ : [ptr] "Q"(a)
+ : "memory");
+#else
+ __atomic_load(&a, &tmp, __ATOMIC_ACQUIRE);
+#endif
+ return tmp;
+}
+
+static inline void AtomicSet(TAtomic& a, TAtomicBase v) {
+#if defined(_arm64_)
+ __asm__ __volatile__(
+ "stlr %x[value], %[ptr] \n\t"
+ : [ptr] "=Q"(a)
+ : [value] "r"(v)
+ : "memory");
+#else
+ __atomic_store(&a, &v, __ATOMIC_RELEASE);
+#endif
+}
+
+static inline intptr_t AtomicIncrement(TAtomic& p) {
+ return __atomic_add_fetch(&p, 1, __ATOMIC_SEQ_CST);
+}
+
+static inline intptr_t AtomicGetAndIncrement(TAtomic& p) {
+ return __atomic_fetch_add(&p, 1, __ATOMIC_SEQ_CST);
+}
+
+static inline intptr_t AtomicDecrement(TAtomic& p) {
+ return __atomic_sub_fetch(&p, 1, __ATOMIC_SEQ_CST);
+}
+
+static inline intptr_t AtomicGetAndDecrement(TAtomic& p) {
+ return __atomic_fetch_sub(&p, 1, __ATOMIC_SEQ_CST);
+}
+
+static inline intptr_t AtomicAdd(TAtomic& p, intptr_t v) {
+ return __atomic_add_fetch(&p, v, __ATOMIC_SEQ_CST);
+}
+
+static inline intptr_t AtomicGetAndAdd(TAtomic& p, intptr_t v) {
+ return __atomic_fetch_add(&p, v, __ATOMIC_SEQ_CST);
+}
+
+static inline intptr_t AtomicSwap(TAtomic* p, intptr_t v) {
+ (void)p; // disable strange 'parameter set but not used' warning on gcc
+ intptr_t ret;
+ __atomic_exchange(p, &v, &ret, __ATOMIC_SEQ_CST);
+ return ret;
+}
+
+static inline bool AtomicCas(TAtomic* a, intptr_t exchange, intptr_t compare) {
+ (void)a; // disable strange 'parameter set but not used' warning on gcc
+ return __atomic_compare_exchange(a, &compare, &exchange, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+}
+
+static inline intptr_t AtomicGetAndCas(TAtomic* a, intptr_t exchange, intptr_t compare) {
+ (void)a; // disable strange 'parameter set but not used' warning on gcc
+ __atomic_compare_exchange(a, &compare, &exchange, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ return compare;
+}
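+
+// Typical CAS-loop sketch (illustrative): compose an operation that has no
+// dedicated fetch-op, e.g. a monotonic maximum:
+//
+// static void AtomicMax(TAtomic* a, intptr_t v) {
+// intptr_t cur = AtomicGet(*a);
+// while (cur < v && !AtomicCas(a, v, cur))
+// cur = AtomicGet(*a); // lost the race: reload and retry
+// }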
+
+static inline intptr_t AtomicOr(TAtomic& a, intptr_t b) {
+ return __atomic_or_fetch(&a, b, __ATOMIC_SEQ_CST);
+}
+
+static inline intptr_t AtomicXor(TAtomic& a, intptr_t b) {
+ return __atomic_xor_fetch(&a, b, __ATOMIC_SEQ_CST);
+}
+
+static inline intptr_t AtomicAnd(TAtomic& a, intptr_t b) {
+ return __atomic_and_fetch(&a, b, __ATOMIC_SEQ_CST);
+}
+
+static inline void AtomicBarrier() {
+ __sync_synchronize();
+}
diff --git a/contrib/lfalloc/src/util/system/atomic_ops.h b/contrib/lfalloc/src/util/system/atomic_ops.h
new file mode 100644
index 00000000000..425b643e14d
--- /dev/null
+++ b/contrib/lfalloc/src/util/system/atomic_ops.h
@@ -0,0 +1,189 @@
+#pragma once
+
+#include <type_traits>
+
+template <typename T>
+inline TAtomic* AsAtomicPtr(T volatile* target) {
+ return reinterpret_cast<TAtomic*>(target);
+}
+
+template <typename T>
+inline const TAtomic* AsAtomicPtr(T const volatile* target) {
+ return reinterpret_cast<const TAtomic*>(target);
+}
+
+// integral types
+
+template <typename T>
+struct TAtomicTraits {
+ enum {
+ Castable = std::is_integral<T>::value && sizeof(T) == sizeof(TAtomicBase) && !std::is_const<T>::value,
+ };
+};
+
+template <typename T, typename TT>
+using TEnableIfCastable = std::enable_if_t<TAtomicTraits<T>::Castable, TT>;
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicGet(T const volatile& target) {
+ return static_cast<T>(AtomicGet(*AsAtomicPtr(&target)));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, void> AtomicSet(T volatile& target, TAtomicBase value) {
+ AtomicSet(*AsAtomicPtr(&target), value);
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicIncrement(T volatile& target) {
+ return static_cast<T>(AtomicIncrement(*AsAtomicPtr(&target)));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicGetAndIncrement(T volatile& target) {
+ return static_cast<T>(AtomicGetAndIncrement(*AsAtomicPtr(&target)));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicDecrement(T volatile& target) {
+ return static_cast<T>(AtomicDecrement(*AsAtomicPtr(&target)));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicGetAndDecrement(T volatile& target) {
+ return static_cast<T>(AtomicGetAndDecrement(*AsAtomicPtr(&target)));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicAdd(T volatile& target, TAtomicBase value) {
+ return static_cast<T>(AtomicAdd(*AsAtomicPtr(&target), value));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicGetAndAdd(T volatile& target, TAtomicBase value) {
+ return static_cast<T>(AtomicGetAndAdd(*AsAtomicPtr(&target), value));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicSub(T volatile& target, TAtomicBase value) {
+ return static_cast<T>(AtomicSub(*AsAtomicPtr(&target), value));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicGetAndSub(T volatile& target, TAtomicBase value) {
+ return static_cast<T>(AtomicGetAndSub(*AsAtomicPtr(&target), value));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicSwap(T volatile* target, TAtomicBase exchange) {
+ return static_cast<T>(AtomicSwap(AsAtomicPtr(target), exchange));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, bool> AtomicCas(T volatile* target, TAtomicBase exchange, TAtomicBase compare) {
+ return AtomicCas(AsAtomicPtr(target), exchange, compare);
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicGetAndCas(T volatile* target, TAtomicBase exchange, TAtomicBase compare) {
+ return static_cast<T>(AtomicGetAndCas(AsAtomicPtr(target), exchange, compare));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, bool> AtomicTryLock(T volatile* target) {
+ return AtomicTryLock(AsAtomicPtr(target));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, bool> AtomicTryAndTryLock(T volatile* target) {
+ return AtomicTryAndTryLock(AsAtomicPtr(target));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, void> AtomicUnlock(T volatile* target) {
+ AtomicUnlock(AsAtomicPtr(target));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicOr(T volatile& target, TAtomicBase value) {
+ return static_cast<T>(AtomicOr(*AsAtomicPtr(&target), value));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicAnd(T volatile& target, TAtomicBase value) {
+ return static_cast<T>(AtomicAnd(*AsAtomicPtr(&target), value));
+}
+
+template <typename T>
+inline TEnableIfCastable<T, T> AtomicXor(T volatile& target, TAtomicBase value) {
+ return static_cast<T>(AtomicXor(*AsAtomicPtr(&target), value));
+}
+
+// pointer types
+
+template <typename T>
+inline T* AtomicGet(T* const volatile& target) {
+ return reinterpret_cast<T*>(AtomicGet(*AsAtomicPtr(&target)));
+}
+
+template <typename T>
+inline void AtomicSet(T* volatile& target, T* value) {
+ AtomicSet(*AsAtomicPtr(&target), reinterpret_cast<TAtomicBase>(value));
+}
+
+using TNullPtr = decltype(nullptr);
+
+template <typename T>
+inline void AtomicSet(T* volatile& target, TNullPtr) {
+ AtomicSet(*AsAtomicPtr(&target), 0);
+}
+
+template <typename T>
+inline T* AtomicSwap(T* volatile* target, T* exchange) {
+ return reinterpret_cast<T*>(AtomicSwap(AsAtomicPtr(target), reinterpret_cast<TAtomicBase>(exchange)));
+}
+
+template <typename T>
+inline T* AtomicSwap(T* volatile* target, TNullPtr) {
+ return reinterpret_cast<T*>(AtomicSwap(AsAtomicPtr(target), 0));
+}
+
+template <typename T>
+inline bool AtomicCas(T* volatile* target, T* exchange, T* compare) {
+ return AtomicCas(AsAtomicPtr(target), reinterpret_cast<TAtomicBase>(exchange), reinterpret_cast<TAtomicBase>(compare));
+}
+
+template <typename T>
+inline T* AtomicGetAndCas(T* volatile* target, T* exchange, T* compare) {
+ return reinterpret_cast<T*>(AtomicGetAndCas(AsAtomicPtr(target), reinterpret_cast<TAtomicBase>(exchange), reinterpret_cast<TAtomicBase>(compare)));
+}
+
+template <typename T>
+inline bool AtomicCas(T* volatile* target, T* exchange, TNullPtr) {
+ return AtomicCas(AsAtomicPtr(target), reinterpret_cast<TAtomicBase>(exchange), 0);
+}
+
+template <typename T>
+inline T* AtomicGetAndCas(T* volatile* target, T* exchange, TNullPtr) {
+ return reinterpret_cast<T*>(AtomicGetAndCas(AsAtomicPtr(target), reinterpret_cast<TAtomicBase>(exchange), 0));
+}
+
+template <typename T>
+inline bool AtomicCas(T* volatile* target, TNullPtr, T* compare) {
+ return AtomicCas(AsAtomicPtr(target), 0, reinterpret_cast<TAtomicBase>(compare));
+}
+
+template <typename T>
+inline T* AtomicGetAndCas(T* volatile* target, TNullPtr, T* compare) {
+ return reinterpret_cast<T*>(AtomicGetAndCas(AsAtomicPtr(target), 0, reinterpret_cast<TAtomicBase>(compare)));
+}
+
+template <typename T>
+inline bool AtomicCas(T* volatile* target, TNullPtr, TNullPtr) {
+ return AtomicCas(AsAtomicPtr(target), 0, 0);
+}
+
+template <typename T>
+inline T* AtomicGetAndCas(T* volatile* target, TNullPtr, TNullPtr) {
+ return reinterpret_cast<T*>(AtomicGetAndCas(AsAtomicPtr(target), 0, 0));
+}
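+
+// Usage sketch (illustrative): the castable overloads above let any integral
+// type of pointer width be used without manual casts:
+//
+// volatile size_t counter = 0;
+// AtomicIncrement(counter); // dispatches via TEnableIfCastable<size_t, size_t>
+// size_t v = AtomicGet(counter);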
diff --git a/contrib/lfalloc/src/util/system/atomic_win.h b/contrib/lfalloc/src/util/system/atomic_win.h
new file mode 100644
index 00000000000..1abebd87b38
--- /dev/null
+++ b/contrib/lfalloc/src/util/system/atomic_win.h
@@ -0,0 +1,114 @@
+#pragma once
+
+#include <intrin.h>
+
+#define USE_GENERIC_SETGET
+
+#if defined(_i386_)
+
+#pragma intrinsic(_InterlockedIncrement)
+#pragma intrinsic(_InterlockedDecrement)
+#pragma intrinsic(_InterlockedExchangeAdd)
+#pragma intrinsic(_InterlockedExchange)
+#pragma intrinsic(_InterlockedCompareExchange)
+
+static inline intptr_t AtomicIncrement(TAtomic& a) {
+ return _InterlockedIncrement((volatile long*)&a);
+}
+
+static inline intptr_t AtomicGetAndIncrement(TAtomic& a) {
+ return _InterlockedIncrement((volatile long*)&a) - 1;
+}
+
+static inline intptr_t AtomicDecrement(TAtomic& a) {
+ return _InterlockedDecrement((volatile long*)&a);
+}
+
+static inline intptr_t AtomicGetAndDecrement(TAtomic& a) {
+ return _InterlockedDecrement((volatile long*)&a) + 1;
+}
+
+static inline intptr_t AtomicAdd(TAtomic& a, intptr_t b) {
+ return _InterlockedExchangeAdd((volatile long*)&a, b) + b;
+}
+
+static inline intptr_t AtomicGetAndAdd(TAtomic& a, intptr_t b) {
+ return _InterlockedExchangeAdd((volatile long*)&a, b);
+}
+
+static inline intptr_t AtomicSwap(TAtomic* a, intptr_t b) {
+ return _InterlockedExchange((volatile long*)a, b);
+}
+
+static inline bool AtomicCas(TAtomic* a, intptr_t exchange, intptr_t compare) {
+ return _InterlockedCompareExchange((volatile long*)a, exchange, compare) == compare;
+}
+
+static inline intptr_t AtomicGetAndCas(TAtomic* a, intptr_t exchange, intptr_t compare) {
+ return _InterlockedCompareExchange((volatile long*)a, exchange, compare);
+}
+
+#else // _x86_64_
+
+#pragma intrinsic(_InterlockedIncrement64)
+#pragma intrinsic(_InterlockedDecrement64)
+#pragma intrinsic(_InterlockedExchangeAdd64)
+#pragma intrinsic(_InterlockedExchange64)
+#pragma intrinsic(_InterlockedCompareExchange64)
+
+static inline intptr_t AtomicIncrement(TAtomic& a) {
+ return _InterlockedIncrement64((volatile __int64*)&a);
+}
+
+static inline intptr_t AtomicGetAndIncrement(TAtomic& a) {
+ return _InterlockedIncrement64((volatile __int64*)&a) - 1;
+}
+
+static inline intptr_t AtomicDecrement(TAtomic& a) {
+ return _InterlockedDecrement64((volatile __int64*)&a);
+}
+
+static inline intptr_t AtomicGetAndDecrement(TAtomic& a) {
+ return _InterlockedDecrement64((volatile __int64*)&a) + 1;
+}
+
+static inline intptr_t AtomicAdd(TAtomic& a, intptr_t b) {
+ return _InterlockedExchangeAdd64((volatile __int64*)&a, b) + b;
+}
+
+static inline intptr_t AtomicGetAndAdd(TAtomic& a, intptr_t b) {
+ return _InterlockedExchangeAdd64((volatile __int64*)&a, b);
+}
+
+static inline intptr_t AtomicSwap(TAtomic* a, intptr_t b) {
+ return _InterlockedExchange64((volatile __int64*)a, b);
+}
+
+static inline bool AtomicCas(TAtomic* a, intptr_t exchange, intptr_t compare) {
+ return _InterlockedCompareExchange64((volatile __int64*)a, exchange, compare) == compare;
+}
+
+static inline intptr_t AtomicGetAndCas(TAtomic* a, intptr_t exchange, intptr_t compare) {
+ return _InterlockedCompareExchange64((volatile __int64*)a, exchange, compare);
+}
+
+static inline intptr_t AtomicOr(TAtomic& a, intptr_t b) {
+ return _InterlockedOr64(&a, b) | b;
+}
+
+static inline intptr_t AtomicAnd(TAtomic& a, intptr_t b) {
+ return _InterlockedAnd64(&a, b) & b;
+}
+
+static inline intptr_t AtomicXor(TAtomic& a, intptr_t b) {
+ return _InterlockedXor64(&a, b) ^ b;
+}
+
+#endif // _x86_
+
+//TODO
+static inline void AtomicBarrier() {
+ TAtomic val = 0;
+
+ AtomicSwap(&val, 0);
+}
diff --git a/contrib/lfalloc/src/util/system/compiler.h b/contrib/lfalloc/src/util/system/compiler.h
new file mode 100644
index 00000000000..b5cec600923
--- /dev/null
+++ b/contrib/lfalloc/src/util/system/compiler.h
@@ -0,0 +1,617 @@
+#pragma once
+
+// useful cross-platform definitions for compilers
+
+/**
+ * @def Y_FUNC_SIGNATURE
+ *
+ * Use this macro to get pretty function name (see example).
+ *
+ * @code
+ * void Hi() {
+ * Cout << Y_FUNC_SIGNATURE << Endl;
+ * }
+
+ * template <typename T>
+ * void Do() {
+ * Cout << Y_FUNC_SIGNATURE << Endl;
+ * }
+
+ * int main() {
+ * Hi(); // void Hi()
+ * Do<int>(); // void Do() [T = int]
+ * Do<TString>(); // void Do() [T = TString]
+ * }
+ * @endcode
+ */
+#if defined(__GNUC__)
+#define Y_FUNC_SIGNATURE __PRETTY_FUNCTION__
+#elif defined(_MSC_VER)
+#define Y_FUNC_SIGNATURE __FUNCSIG__
+#else
+#define Y_FUNC_SIGNATURE ""
+#endif
+
+#ifdef __GNUC__
+#define Y_PRINTF_FORMAT(n, m) __attribute__((__format__(__printf__, n, m)))
+#endif
+
+#ifndef Y_PRINTF_FORMAT
+#define Y_PRINTF_FORMAT(n, m)
+#endif
+
+#if defined(__clang__)
+#define Y_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
+#endif
+
+#if !defined(Y_NO_SANITIZE)
+#define Y_NO_SANITIZE(...)
+#endif
+
+/**
+ * @def Y_DECLARE_UNUSED
+ *
+ * This macro silences compiler warnings about unused entities (e.g. a function or an argument).
+ *
+ * @code
+ * Y_DECLARE_UNUSED int FunctionUsedSolelyForDebugPurposes();
+ * assert(FunctionUsedSolelyForDebugPurposes() == 42);
+ *
+ * void Foo(const int argumentUsedOnlyForDebugPurposes Y_DECLARE_UNUSED) {
+ * assert(argumentUsedOnlyForDebugPurposes == 42);
+ * // however you may as well omit `Y_DECLARE_UNUSED` and use `UNUSED` macro instead
+ * Y_UNUSED(argumentUsedOnlyForDebugPurposes);
+ * }
+ * @endcode
+ */
+#ifdef __GNUC__
+#define Y_DECLARE_UNUSED __attribute__((unused))
+#endif
+
+#ifndef Y_DECLARE_UNUSED
+#define Y_DECLARE_UNUSED
+#endif
+
+#if defined(__GNUC__)
+#define Y_LIKELY(Cond) __builtin_expect(!!(Cond), 1)
+#define Y_UNLIKELY(Cond) __builtin_expect(!!(Cond), 0)
+#define Y_PREFETCH_READ(Pointer, Priority) __builtin_prefetch((const void*)(Pointer), 0, Priority)
+#define Y_PREFETCH_WRITE(Pointer, Priority) __builtin_prefetch((const void*)(Pointer), 1, Priority)
+#endif
+
+/**
+ * @def Y_FORCE_INLINE
+ *
+ * Macro to use in place of 'inline' in function declaration/definition to force
+ * it to be inlined.
+ */
+#if !defined(Y_FORCE_INLINE)
+#if defined(CLANG_COVERAGE)
+#/* excessive __always_inline__ might significantly slow down compilation of an instrumented unit */
+#define Y_FORCE_INLINE inline
+#elif defined(_MSC_VER)
+#define Y_FORCE_INLINE __forceinline
+#elif defined(__GNUC__)
+#/* Clang also defines __GNUC__ (as 4) */
+#define Y_FORCE_INLINE inline __attribute__((__always_inline__))
+#else
+#define Y_FORCE_INLINE inline
+#endif
+#endif
+
+/**
+ * @def Y_NO_INLINE
+ *
+ * Macro to use in place of 'inline' in function declaration/definition to
+ * prevent it from being inlined.
+ */
+#if !defined(Y_NO_INLINE)
+#if defined(_MSC_VER)
+#define Y_NO_INLINE __declspec(noinline)
+#elif defined(__GNUC__) || defined(__INTEL_COMPILER)
+#/* Clang also defines __GNUC__ (as 4) */
+#define Y_NO_INLINE __attribute__((__noinline__))
+#else
+#define Y_NO_INLINE
+#endif
+#endif
+
+//to cheat compiler about strict aliasing or similar problems
+#if defined(__GNUC__)
+#define Y_FAKE_READ(X) \
+ do { \
+ __asm__ __volatile__("" \
+ : \
+ : "m"(X)); \
+ } while (0)
+
+#define Y_FAKE_WRITE(X) \
+ do { \
+ __asm__ __volatile__("" \
+ : "=m"(X)); \
+ } while (0)
+#endif
+
+#if !defined(Y_FAKE_READ)
+#define Y_FAKE_READ(X)
+#endif
+
+#if !defined(Y_FAKE_WRITE)
+#define Y_FAKE_WRITE(X)
+#endif
+
+#ifndef Y_PREFETCH_READ
+#define Y_PREFETCH_READ(Pointer, Priority) (void)(const void*)(Pointer), (void)Priority
+#endif
+
+#ifndef Y_PREFETCH_WRITE
+#define Y_PREFETCH_WRITE(Pointer, Priority) (void)(const void*)(Pointer), (void)Priority
+#endif
+
+#ifndef Y_LIKELY
+#define Y_LIKELY(Cond) (Cond)
+#define Y_UNLIKELY(Cond) (Cond)
+#endif
+
+#ifdef __GNUC__
+#define _packed __attribute__((packed))
+#else
+#define _packed
+#endif
+
+#if defined(__GNUC__)
+#define Y_WARN_UNUSED_RESULT __attribute__((warn_unused_result))
+#endif
+
+#ifndef Y_WARN_UNUSED_RESULT
+#define Y_WARN_UNUSED_RESULT
+#endif
+
+#if defined(__GNUC__)
+#define Y_HIDDEN __attribute__((visibility("hidden")))
+#endif
+
+#if !defined(Y_HIDDEN)
+#define Y_HIDDEN
+#endif
+
+#if defined(__GNUC__)
+#define Y_PUBLIC __attribute__((visibility("default")))
+#endif
+
+#if !defined(Y_PUBLIC)
+#define Y_PUBLIC
+#endif
+
+#if !defined(Y_UNUSED) && !defined(__cplusplus)
+#define Y_UNUSED(var) (void)(var)
+#endif
+#if !defined(Y_UNUSED) && defined(__cplusplus)
+template <class... Types>
+constexpr Y_FORCE_INLINE int Y_UNUSED(Types&&...) {
+ return 0;
+};
+#endif
+
+/**
+ * @def Y_ASSUME
+ *
+ * Macro that tells the compiler that it can generate optimized code
+ * as if the given expression will always evaluate true.
+ * The behavior is undefined if it ever evaluates false.
+ *
+ * @code
+ * // factored into a function so that it's testable
+ * inline int Avg(int x, int y) {
+ * if (x >= 0 && y >= 0) {
+ * return (static_cast<unsigned>(x) + static_cast<unsigned>(y)) >> 1;
+ * } else {
+ * // a slower implementation
+ * }
+ * }
+ *
+ * // we know that xs and ys are non-negative from domain knowledge,
+ * // but we can't change the types of xs and ys because of API constraints
+ * int Foo(const TVector<int>& xs, const TVector<int>& ys) {
+ * TVector<int> avgs;
+ * avgs.resize(xs.size());
+ * for (size_t i = 0; i < xs.size(); ++i) {
+ * auto x = xs[i];
+ * auto y = ys[i];
+ * Y_ASSUME(x >= 0);
+ * Y_ASSUME(y >= 0);
+ * avgs[i] = Avg(x, y);
+ * }
+ * }
+ * @endcode
+ */
+#if defined(__GNUC__)
+#define Y_ASSUME(condition) ((condition) ? (void)0 : __builtin_unreachable())
+#elif defined(_MSC_VER)
+#define Y_ASSUME(condition) __assume(condition)
+#else
+#define Y_ASSUME(condition) Y_UNUSED(condition)
+#endif
+
+#ifdef __cplusplus
+[[noreturn]]
+#endif
+Y_HIDDEN void _YandexAbort();
+
+/**
+ * @def Y_UNREACHABLE
+ *
+ * Macro that marks the rest of the code branch unreachable.
+ * The behavior is undefined if it's ever reached.
+ *
+ * @code
+ * switch (i % 3) {
+ * case 0:
+ * return foo;
+ * case 1:
+ * return bar;
+ * case 2:
+ * return baz;
+ * default:
+ * Y_UNREACHABLE();
+ * }
+ * @endcode
+ */
+#if defined(__GNUC__) || defined(_MSC_VER)
+#define Y_UNREACHABLE() Y_ASSUME(0)
+#else
+#define Y_UNREACHABLE() _YandexAbort()
+#endif
+
+#if defined(undefined_sanitizer_enabled)
+#define _ubsan_enabled_
+#endif
+
+#ifdef __clang__
+
+#if __has_feature(thread_sanitizer)
+#define _tsan_enabled_
+#endif
+#if __has_feature(memory_sanitizer)
+#define _msan_enabled_
+#endif
+#if __has_feature(address_sanitizer)
+#define _asan_enabled_
+#endif
+
+#else
+
+#if defined(thread_sanitizer_enabled) || defined(__SANITIZE_THREAD__)
+#define _tsan_enabled_
+#endif
+#if defined(memory_sanitizer_enabled)
+#define _msan_enabled_
+#endif
+#if defined(address_sanitizer_enabled) || defined(__SANITIZE_ADDRESS__)
+#define _asan_enabled_
+#endif
+
+#endif
+
+#if defined(_asan_enabled_) || defined(_msan_enabled_) || defined(_tsan_enabled_) || defined(_ubsan_enabled_)
+#define _san_enabled_
+#endif
+
+#if defined(_MSC_VER)
+#define __PRETTY_FUNCTION__ __FUNCSIG__
+#endif
+
+#if defined(__GNUC__)
+#define Y_WEAK __attribute__((weak))
+#else
+#define Y_WEAK
+#endif
+
+#if defined(__CUDACC_VER_MAJOR__)
+#define Y_CUDA_AT_LEAST(x, y) (__CUDACC_VER_MAJOR__ > x || (__CUDACC_VER_MAJOR__ == x && __CUDACC_VER_MINOR__ >= y))
+#else
+#define Y_CUDA_AT_LEAST(x, y) 0
+#endif
+
+// NVidia CUDA C++ Compiler did not know about noexcept keyword until version 9.0
+#if !Y_CUDA_AT_LEAST(9, 0)
+#if defined(__CUDACC__) && !defined(noexcept)
+#define noexcept throw ()
+#endif
+#endif
+
+#if defined(__GNUC__)
+#define Y_COLD __attribute__((cold))
+#define Y_LEAF __attribute__((leaf))
+#define Y_WRAPPER __attribute__((artificial))
+#else
+#define Y_COLD
+#define Y_LEAF
+#define Y_WRAPPER
+#endif
+
+/**
+ * @def Y_PRAGMA
+ *
+ * Macro for use in other macros to define compiler pragma
+ * See below for other usage examples
+ *
+ * @code
+ * #if defined(__clang__) || defined(__GNUC__)
+ * #define Y_PRAGMA_NO_WSHADOW \
+ * Y_PRAGMA("GCC diagnostic ignored \"-Wshadow\"")
+ * #elif defined(_MSC_VER)
+ * #define Y_PRAGMA_NO_WSHADOW \
+ * Y_PRAGMA("warning(disable:4456 4457")
+ * #else
+ * #define Y_PRAGMA_NO_WSHADOW
+ * #endif
+ * @endcode
+ */
+#if defined(__clang__) || defined(__GNUC__)
+#define Y_PRAGMA(x) _Pragma(x)
+#elif defined(_MSC_VER)
+#define Y_PRAGMA(x) __pragma(x)
+#else
+#define Y_PRAGMA(x)
+#endif
+
+/**
+ * @def Y_PRAGMA_DIAGNOSTIC_PUSH
+ *
+ * Cross-compiler pragma to save diagnostic settings
+ *
+ * @see
+ * GCC: https://gcc.gnu.org/onlinedocs/gcc/Diagnostic-Pragmas.html
+ * MSVC: https://msdn.microsoft.com/en-us/library/2c8f766e.aspx
+ * Clang: https://clang.llvm.org/docs/UsersManual.html#controlling-diagnostics-via-pragmas
+ *
+ * @code
+ * Y_PRAGMA_DIAGNOSTIC_PUSH
+ * @endcode
+ */
+#if defined(__clang__) || defined(__GNUC__)
+#define Y_PRAGMA_DIAGNOSTIC_PUSH \
+ Y_PRAGMA("GCC diagnostic push")
+#elif defined(_MSC_VER)
+#define Y_PRAGMA_DIAGNOSTIC_PUSH \
+ Y_PRAGMA(warning(push))
+#else
+#define Y_PRAGMA_DIAGNOSTIC_PUSH
+#endif
+
+/**
+ * @def Y_PRAGMA_DIAGNOSTIC_POP
+ *
+ * Cross-compiler pragma to restore diagnostic settings
+ *
+ * @see
+ * GCC: https://gcc.gnu.org/onlinedocs/gcc/Diagnostic-Pragmas.html
+ * MSVC: https://msdn.microsoft.com/en-us/library/2c8f766e.aspx
+ * Clang: https://clang.llvm.org/docs/UsersManual.html#controlling-diagnostics-via-pragmas
+ *
+ * @code
+ * Y_PRAGMA_DIAGNOSTIC_POP
+ * @endcode
+ */
+#if defined(__clang__) || defined(__GNUC__)
+#define Y_PRAGMA_DIAGNOSTIC_POP \
+ Y_PRAGMA("GCC diagnostic pop")
+#elif defined(_MSC_VER)
+#define Y_PRAGMA_DIAGNOSTIC_POP \
+ Y_PRAGMA(warning(pop))
+#else
+#define Y_PRAGMA_DIAGNOSTIC_POP
+#endif
+
+/**
+ * @def Y_PRAGMA_NO_WSHADOW
+ *
+ * Cross-compiler pragma to disable warnings about shadowing variables
+ *
+ * @code
+ * Y_PRAGMA_DIAGNOSTIC_PUSH
+ * Y_PRAGMA_NO_WSHADOW
+ *
+ * // some code which use variable shadowing, e.g.:
+ *
+ * for (int i = 0; i < 100; ++i) {
+ * Use(i);
+ *
+ * for (int i = 42; i < 100500; ++i) { // this i is shadowing previous i
+ * AnotherUse(i);
+ * }
+ * }
+ *
+ * Y_PRAGMA_DIAGNOSTIC_POP
+ * @endcode
+ */
+#if defined(__clang__) || defined(__GNUC__)
+#define Y_PRAGMA_NO_WSHADOW \
+ Y_PRAGMA("GCC diagnostic ignored \"-Wshadow\"")
+#elif defined(_MSC_VER)
+#define Y_PRAGMA_NO_WSHADOW \
+ Y_PRAGMA(warning(disable : 4456 4457))
+#else
+#define Y_PRAGMA_NO_WSHADOW
+#endif
+
+/**
+ * @def Y_PRAGMA_NO_UNUSED_FUNCTION
+ *
+ * Cross-compiler pragma to disable warnings about unused functions
+ *
+ * @see
+ * GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html
+ * Clang: https://clang.llvm.org/docs/DiagnosticsReference.html#wunused-function
+ * MSVC: there is no such warning
+ *
+ * @code
+ * Y_PRAGMA_DIAGNOSTIC_PUSH
+ * Y_PRAGMA_NO_UNUSED_FUNCTION
+ *
+ * // some code which introduces a function which later will not be used, e.g.:
+ *
+ * void Foo() {
+ * }
+ *
+ * int main() {
+ * return 0; // Foo() never called
+ * }
+ *
+ * Y_PRAGMA_DIAGNOSTIC_POP
+ * @endcode
+ */
+#if defined(__clang__) || defined(__GNUC__)
+#define Y_PRAGMA_NO_UNUSED_FUNCTION \
+ Y_PRAGMA("GCC diagnostic ignored \"-Wunused-function\"")
+#else
+#define Y_PRAGMA_NO_UNUSED_FUNCTION
+#endif
+
+/**
+ * @def Y_PRAGMA_NO_UNUSED_PARAMETER
+ *
+ * Cross-compiler pragma to disable warnings about unused function parameters
+ *
+ * @see
+ * GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html
+ * Clang: https://clang.llvm.org/docs/DiagnosticsReference.html#wunused-parameter
+ * MSVC: https://msdn.microsoft.com/en-us/library/26kb9fy0.aspx
+ *
+ * @code
+ * Y_PRAGMA_DIAGNOSTIC_PUSH
+ * Y_PRAGMA_NO_UNUSED_PARAMETER
+ *
+ * // some code which introduces a function with unused parameter, e.g.:
+ *
+ * void foo(int a) {
+ * // a is not referenced
+ * }
+ *
+ * int main() {
+ * foo(1);
+ * return 0;
+ * }
+ *
+ * Y_PRAGMA_DIAGNOSTIC_POP
+ * @endcode
+ */
+#if defined(__clang__) || defined(__GNUC__)
+#define Y_PRAGMA_NO_UNUSED_PARAMETER \
+ Y_PRAGMA("GCC diagnostic ignored \"-Wunused-parameter\"")
+#elif defined(_MSC_VER)
+#define Y_PRAGMA_NO_UNUSED_PARAMETER \
+ Y_PRAGMA(warning(disable : 4100))
+#else
+#define Y_PRAGMA_NO_UNUSED_PARAMETER
+#endif
+
+/**
+ * @def Y_PRAGMA_NO_DEPRECATED
+ *
+ * Cross-compiler pragma to disable warnings and errors about uses of deprecated entities
+ *
+ * @see
+ * GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html
+ * Clang: https://clang.llvm.org/docs/DiagnosticsReference.html#wdeprecated
+ * MSVC: https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4996?view=vs-2017
+ *
+ * @code
+ * Y_PRAGMA_DIAGNOSTIC_PUSH
+ * Y_PRAGMA_NO_DEPRECATED
+ *
+ * [[deprecated]] void foo() {
+ * // ...
+ * }
+ *
+ * int main() {
+ * foo();
+ * return 0;
+ * }
+ *
+ * Y_PRAGMA_DIAGNOSTIC_POP
+ * @endcode
+ */
+#if defined(__clang__) || defined(__GNUC__)
+#define Y_PRAGMA_NO_DEPRECATED \
+ Y_PRAGMA("GCC diagnostic ignored \"-Wdeprecated\"")
+#elif defined(_MSC_VER)
+#define Y_PRAGMA_NO_DEPRECATED \
+ Y_PRAGMA(warning(disable : 4996))
+#else
+#define Y_PRAGMA_NO_DEPRECATED
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+/**
+ * @def Y_CONST_FUNCTION
+ Functions and methods marked with this attribute are promised to:
+ 1. have no side effects
+ 2. not read global memory
+ This allows compilers to optimize such functions aggressively.
+ NOTE: the attribute cannot be used for methods that depend on data pointed to by `this`
+ NOTE: in the general case it cannot be used for functions with pointer arguments
+ NOTE: consequently, there is never a reason to discard the result of such a function
+*/
+#define Y_CONST_FUNCTION [[gnu::const]]
+#endif
+
+#if !defined(Y_CONST_FUNCTION)
+#define Y_CONST_FUNCTION
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+/**
+ * @def Y_PURE_FUNCTION
+ Functions and methods marked with this attribute are promised to:
+ 1. have no side effects
+ 2. return the same result as long as global memory has not changed
+ This allows compilers to optimize such functions aggressively.
+ NOTE: consequently, there is never a reason to discard the result of such a function
+*/
+#define Y_PURE_FUNCTION [[gnu::pure]]
+#endif
+
+#if !defined(Y_PURE_FUNCTION)
+#define Y_PURE_FUNCTION
+#endif
+
+/**
+ * @def Y_HAVE_INT128
+ *
+ * Defined when the compiler supports __int128 extension
+ *
+ * @code
+ *
+ * #if defined(Y_HAVE_INT128)
+ * __int128 myVeryBigInt = 12345678901234567890;
+ * #endif
+ *
+ * @endcode
+ */
+#if defined(__SIZEOF_INT128__)
+#define Y_HAVE_INT128 1
+#endif
+
+/**
+ * The XRAY macro must be passed to the compiler if XRay is enabled.
+ *
+ * Everything XRay-specific is defined as a macro so that it doesn't cause
+ * errors for compilers that don't support XRay.
+ */
+#if defined(XRAY) && defined(__cplusplus)
+#include <xray/xray_interface.h>
+#define Y_XRAY_ALWAYS_INSTRUMENT [[clang::xray_always_instrument]]
+#define Y_XRAY_NEVER_INSTRUMENT [[clang::xray_never_instrument]]
+#define Y_XRAY_CUSTOM_EVENT(__string, __length) \
+ do { \
+ __xray_customevent(__string, __length); \
+ } while (0)
+#else
+#define Y_XRAY_ALWAYS_INSTRUMENT
+#define Y_XRAY_NEVER_INSTRUMENT
+#define Y_XRAY_CUSTOM_EVENT(__string, __length) \
+ do { \
+ } while (0)
+#endif
diff --git a/contrib/lfalloc/src/util/system/defaults.h b/contrib/lfalloc/src/util/system/defaults.h
new file mode 100644
index 00000000000..19196a28b2b
--- /dev/null
+++ b/contrib/lfalloc/src/util/system/defaults.h
@@ -0,0 +1,168 @@
+#pragma once
+
+#include "platform.h"
+
+#if defined _unix_
+#define LOCSLASH_C '/'
+#define LOCSLASH_S "/"
+#else
+#define LOCSLASH_C '\\'
+#define LOCSLASH_S "\\"
+#endif // _unix_
+
+#if defined(__INTEL_COMPILER) && defined(__cplusplus)
+#include <new>
+#endif
+
+// low and high parts of integers
+#if !defined(_win_)
+#include <sys/param.h>
+#endif
+
+#if defined(BSD) || defined(_android_)
+
+#if defined(BSD)
+#include <machine/endian.h>
+#endif
+
+#if defined(_android_)
+#include <sys/endian.h>
+#endif
+
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+#define _little_endian_
+#elif (BYTE_ORDER == BIG_ENDIAN)
+#define _big_endian_
+#else
+#error unknown endian not supported
+#endif
+
+#elif (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(WHATEVER_THAT_HAS_BIG_ENDIAN)
+#define _big_endian_
+#else
+#define _little_endian_
+#endif
+
+// alignment
+#if (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(__alpha__) || defined(__ia64__) || defined(WHATEVER_THAT_NEEDS_ALIGNING_QUADS)
+#define _must_align8_
+#endif
+
+#if (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(__alpha__) || defined(__ia64__) || defined(WHATEVER_THAT_NEEDS_ALIGNING_LONGS)
+#define _must_align4_
+#endif
+
+#if (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(__alpha__) || defined(__ia64__) || defined(WHATEVER_THAT_NEEDS_ALIGNING_SHORTS)
+#define _must_align2_
+#endif
+
+#if defined(__GNUC__)
+#define alias_hack __attribute__((__may_alias__))
+#endif
+
+#ifndef alias_hack
+#define alias_hack
+#endif
+
+#include "types.h"
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+#define PRAGMA(x) _Pragma(#x)
+#define RCSID(idstr) PRAGMA(comment(exestr, idstr))
+#else
+#define RCSID(idstr) static const char rcsid[] = idstr
+#endif
+
+#include "compiler.h"
+
+#ifdef _win_
+#include <malloc.h>
+#elif defined(_sun_)
+#include <alloca.h>
+#endif
+
+#ifdef NDEBUG
+#define Y_IF_DEBUG(X)
+#else
+#define Y_IF_DEBUG(X) X
+#endif
+
+/**
+ * @def Y_ARRAY_SIZE
+ *
+ * This macro is needed to get number of elements in a statically allocated fixed size array. The
+ * expression is a compile-time constant and therefore can be used in compile time computations.
+ *
+ * @code
+ * enum ENumbers {
+ * EN_ONE,
+ * EN_TWO,
+ * EN_SIZE
+ * }
+ *
+ * const char* NAMES[] = {
+ * "one",
+ * "two"
+ * }
+ *
+ * static_assert(Y_ARRAY_SIZE(NAMES) == EN_SIZE, "you should define `NAME` for each enumeration");
+ * @endcode
+ *
+ * This macro also catches type errors. If you see a compiler diagnostic like "warning: division by zero
+ * is undefined" when using `Y_ARRAY_SIZE` then you are probably giving it a pointer.
+ *
+ * Since all of our code is expected to work on 64-bit platforms where pointers are 8 bytes, we may
+ * falsely accept pointers to types whose sizes are divisors of 8 (1, 2, 4 and 8).
+ */
+#if defined(__cplusplus)
+namespace NArraySizePrivate {
+    template <class T>
+    struct TArraySize;
+
+    template <class T, size_t N>
+    struct TArraySize<T[N]> {
+        enum {
+            Result = N
+        };
+    };
+
+    template <class T, size_t N>
+    struct TArraySize<T (&)[N]> {
+        enum {
+            Result = N
+        };
+    };
+}
+
+#define Y_ARRAY_SIZE(arr) ((size_t)::NArraySizePrivate::TArraySize<decltype(arr)>::Result)
+#else
+#undef Y_ARRAY_SIZE
+#define Y_ARRAY_SIZE(arr) \
+    ((sizeof(arr) / sizeof((arr)[0])) / static_cast<size_t>(!(sizeof(arr) % sizeof((arr)[0]))))
+#endif
+
+#undef Y_ARRAY_BEGIN
+#define Y_ARRAY_BEGIN(arr) (arr)
+
+#undef Y_ARRAY_END
+#define Y_ARRAY_END(arr) ((arr) + Y_ARRAY_SIZE(arr))
+
+/**
+ * Concatenates two symbols, even if one of them is itself a macro.
+ */
+#define Y_CAT(X, Y) Y_CAT_I(X, Y)
+#define Y_CAT_I(X, Y) Y_CAT_II(X, Y)
+#define Y_CAT_II(X, Y) X##Y
+
+#define Y_STRINGIZE(X) UTIL_PRIVATE_STRINGIZE_AUX(X)
+#define UTIL_PRIVATE_STRINGIZE_AUX(X) #X
+
+#if defined(__COUNTER__)
+#define Y_GENERATE_UNIQUE_ID(N) Y_CAT(N, __COUNTER__)
+#endif
+
+#if !defined(Y_GENERATE_UNIQUE_ID)
+#define Y_GENERATE_UNIQUE_ID(N) Y_CAT(N, __LINE__)
+#endif
+
+#define NPOS ((size_t)-1)
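
A quick illustration of why `Y_CAT` above goes through two levels of indirection (`BAD_CAT` is a hypothetical counter-example, not from the source): `##` pastes its operands before they are macro-expanded, so a one-level version would paste the literal token `__LINE__`.

```cpp
#define BAD_CAT(X, Y) X##Y // naive one-level version, for contrast

int BAD_CAT(tmp_, __LINE__);      // declares `tmp___LINE__`: __LINE__ is never expanded
int Y_CAT(tmp_, __LINE__);        // declares e.g. `tmp_7`: operands expand first
int Y_GENERATE_UNIQUE_ID(guard_); // `guard_0`, `guard_1`, ... when __COUNTER__ is available
```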
diff --git a/contrib/lfalloc/src/util/system/platform.h b/contrib/lfalloc/src/util/system/platform.h
new file mode 100644
index 00000000000..0687f239a2e
--- /dev/null
+++ b/contrib/lfalloc/src/util/system/platform.h
@@ -0,0 +1,242 @@
+#pragma once
+
+// What OS ?
+// our definition has the form _{osname}_
+
+#if defined(_WIN64)
+#define _win64_
+#define _win32_
+#elif defined(__WIN32__) || defined(_WIN32) // _WIN32 is also defined by the 64-bit compiler for backward compatibility
+#define _win32_
+#else
+#define _unix_
+#if defined(__sun__) || defined(sun) || defined(sparc) || defined(__sparc)
+#define _sun_
+#endif
+#if defined(__hpux__)
+#define _hpux_
+#endif
+#if defined(__linux__)
+#define _linux_
+#endif
+#if defined(__FreeBSD__)
+#define _freebsd_
+#endif
+#if defined(__CYGWIN__)
+#define _cygwin_
+#endif
+#if defined(__APPLE__)
+#define _darwin_
+#endif
+#if defined(__ANDROID__)
+#define _android_
+#endif
+#endif
+
+#if defined(__IOS__)
+#define _ios_
+#endif
+
+#if defined(_linux_)
+#if defined(_musl_)
+//nothing to do
+#elif defined(_android_)
+#define _bionic_
+#else
+#define _glibc_
+#endif
+#endif
+
+#if defined(_darwin_)
+#define unix
+#define __unix__
+#endif
+
+#if defined(_win32_) || defined(_win64_)
+#define _win_
+#endif
+
+#if defined(__arm__) || defined(__ARM__) || defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM)
+#if defined(__arm64) || defined(__arm64__) || defined(__aarch64__)
+#define _arm64_
+#else
+#define _arm32_
+#endif
+#endif
+
+#if defined(_arm64_) || defined(_arm32_)
+#define _arm_
+#endif
+
+/* __ia64__ and __x86_64__ - defined by GNU C.
+ * _M_IA64, _M_X64, _M_AMD64 - defined by Visual Studio.
+ *
+ * Microsoft can define _M_IX86, _M_AMD64 (before Visual Studio 8)
+ * or _M_X64 (starting in Visual Studio 8).
+ */
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
+#define _x86_64_
+#endif
+
+#if defined(__i386__) || defined(_M_IX86)
+#define _i386_
+#endif
+
+#if defined(__ia64__) || defined(_M_IA64)
+#define _ia64_
+#endif
+
+#if defined(__powerpc__)
+#define _ppc_
+#endif
+
+#if defined(__powerpc64__)
+#define _ppc64_
+#endif
+
+#if !defined(sparc) && !defined(__sparc) && !defined(__hpux__) && !defined(__alpha__) && !defined(_ia64_) && !defined(_x86_64_) && !defined(_arm_) && !defined(_i386_) && !defined(_ppc_) && !defined(_ppc64_)
+#error "platform not defined, please, define one"
+#endif
+
+#if defined(_x86_64_) || defined(_i386_)
+#define _x86_
+#endif
+
+#if defined(__MIC__)
+#define _mic_
+#define _k1om_
+#endif
+
+// stdio or MessageBox
+#if defined(__CONSOLE__) || defined(_CONSOLE)
+#define _console_
+#endif
+#if (defined(_win_) && !defined(_console_))
+#define _windows_
+#elif !defined(_console_)
+#define _console_
+#endif
+
+#if defined(__SSE__) || defined(SSE_ENABLED)
+#define _sse_
+#endif
+
+#if defined(__SSE2__) || defined(SSE2_ENABLED)
+#define _sse2_
+#endif
+
+#if defined(__SSE3__) || defined(SSE3_ENABLED)
+#define _sse3_
+#endif
+
+#if defined(__SSSE3__) || defined(SSSE3_ENABLED)
+#define _ssse3_
+#endif
+
+#if defined(POPCNT_ENABLED)
+#define _popcnt_
+#endif
+
+#if defined(__DLL__) || defined(_DLL)
+#define _dll_
+#endif
+
+// 16, 32 or 64
+#if defined(__sparc_v9__) || defined(_x86_64_) || defined(_ia64_) || defined(_arm64_) || defined(_ppc64_)
+#define _64_
+#else
+#define _32_
+#endif
+
+/* All modern 64-bit Unix systems use the LP64 scheme (long and pointers are 64-bit).
+ * Microsoft uses a different scheme: LLP64 (long long and pointers are 64-bit, long stays 32-bit).
+ *
+ * Scheme LP64 LLP64
+ * char 8 8
+ * short 16 16
+ * int 32 32
+ * long 64 32
+ * long long 64 64
+ * pointer 64 64
+ */
+
+#if defined(_32_)
+#define SIZEOF_PTR 4
+#elif defined(_64_)
+#define SIZEOF_PTR 8
+#endif
+
+#define PLATFORM_DATA_ALIGN SIZEOF_PTR
+
+#if !defined(SIZEOF_PTR)
+#error todo
+#endif
+
+#define SIZEOF_CHAR 1
+#define SIZEOF_UNSIGNED_CHAR 1
+#define SIZEOF_SHORT 2
+#define SIZEOF_UNSIGNED_SHORT 2
+#define SIZEOF_INT 4
+#define SIZEOF_UNSIGNED_INT 4
+
+#if defined(_32_)
+#define SIZEOF_LONG 4
+#define SIZEOF_UNSIGNED_LONG 4
+#elif defined(_64_)
+#if defined(_win_)
+#define SIZEOF_LONG 4
+#define SIZEOF_UNSIGNED_LONG 4
+#else
+#define SIZEOF_LONG 8
+#define SIZEOF_UNSIGNED_LONG 8
+#endif // _win_
+#endif // _32_
+
+#if !defined(SIZEOF_LONG)
+#error todo
+#endif
+
+#define SIZEOF_LONG_LONG 8
+#define SIZEOF_UNSIGNED_LONG_LONG 8
+
+#undef SIZEOF_SIZE_T // in case we include which defines it, too
+#define SIZEOF_SIZE_T SIZEOF_PTR
+
+#if defined(__INTEL_COMPILER)
+#pragma warning(disable 1292)
+#pragma warning(disable 1469)
+#pragma warning(disable 193)
+#pragma warning(disable 271)
+#pragma warning(disable 383)
+#pragma warning(disable 424)
+#pragma warning(disable 444)
+#pragma warning(disable 584)
+#pragma warning(disable 593)
+#pragma warning(disable 981)
+#pragma warning(disable 1418)
+#pragma warning(disable 304)
+#pragma warning(disable 810)
+#pragma warning(disable 1029)
+#pragma warning(disable 1419)
+#pragma warning(disable 177)
+#pragma warning(disable 522)
+#pragma warning(disable 858)
+#pragma warning(disable 111)
+#pragma warning(disable 1599)
+#pragma warning(disable 411)
+#pragma warning(disable 304)
+#pragma warning(disable 858)
+#pragma warning(disable 444)
+#pragma warning(disable 913)
+#pragma warning(disable 310)
+#pragma warning(disable 167)
+#pragma warning(disable 180)
+#pragma warning(disable 1572)
+#endif
+
+#if defined(_MSC_VER)
+#undef _WINSOCKAPI_
+#define _WINSOCKAPI_
+#undef NOMINMAX
+#define NOMINMAX
+#endif
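
One way to sanity-check the `SIZEOF_*` constants above (a sketch, not part of the header): they are meant to mirror the compiler's actual type sizes under both the LP64 and LLP64 models.

```cpp
static_assert(sizeof(void *) == SIZEOF_PTR, "pointer width mismatch");
static_assert(sizeof(long) == SIZEOF_LONG, "long is the type that differs between LP64 and LLP64");
static_assert(sizeof(long long) == SIZEOF_LONG_LONG, "long long is 64-bit under both models");
static_assert(sizeof(size_t) == SIZEOF_SIZE_T, "size_t follows pointer width");
```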
diff --git a/contrib/lfalloc/src/util/system/types.h b/contrib/lfalloc/src/util/system/types.h
new file mode 100644
index 00000000000..af4f0adb13d
--- /dev/null
+++ b/contrib/lfalloc/src/util/system/types.h
@@ -0,0 +1,117 @@
+#pragma once
+
+// DO_NOT_STYLE
+
+#include "platform.h"
+
+#include
+
+typedef int8_t i8;
+typedef int16_t i16;
+typedef uint8_t ui8;
+typedef uint16_t ui16;
+
+typedef int yssize_t;
+#define PRIYSZT "d"
+
+#if defined(_darwin_) && defined(_32_)
+typedef unsigned long ui32;
+typedef long i32;
+#else
+typedef uint32_t ui32;
+typedef int32_t i32;
+#endif
+
+#if defined(_darwin_) && defined(_64_)
+typedef unsigned long ui64;
+typedef long i64;
+#else
+typedef uint64_t ui64;
+typedef int64_t i64;
+#endif
+
+#define LL(number) INT64_C(number)
+#define ULL(number) UINT64_C(number)
+
+// Macro for size_t and ptrdiff_t types
+#if defined(_32_)
+# if defined(_darwin_)
+# define PRISZT "lu"
+# undef PRIi32
+# define PRIi32 "li"
+# undef SCNi32
+# define SCNi32 "li"
+# undef PRId32
+# define PRId32 "li"
+# undef SCNd32
+# define SCNd32 "li"
+# undef PRIu32
+# define PRIu32 "lu"
+# undef SCNu32
+# define SCNu32 "lu"
+# undef PRIx32
+# define PRIx32 "lx"
+# undef SCNx32
+# define SCNx32 "lx"
+# elif !defined(_cygwin_)
+# define PRISZT PRIu32
+# else
+# define PRISZT "u"
+# endif
+# define SCNSZT SCNu32
+# define PRIPDT PRIi32
+# define SCNPDT SCNi32
+# define PRITMT PRIi32
+# define SCNTMT SCNi32
+#elif defined(_64_)
+# if defined(_darwin_)
+# define PRISZT "lu"
+# undef PRIu64
+# define PRIu64 PRISZT
+# undef PRIx64
+# define PRIx64 "lx"
+# undef PRIX64
+# define PRIX64 "lX"
+# undef PRId64
+# define PRId64 "ld"
+# undef PRIi64
+# define PRIi64 "li"
+# undef SCNi64
+# define SCNi64 "li"
+# undef SCNu64
+# define SCNu64 "lu"
+# undef SCNx64
+# define SCNx64 "lx"
+# else
+# define PRISZT PRIu64
+# endif
+# define SCNSZT SCNu64
+# define PRIPDT PRIi64
+# define SCNPDT SCNi64
+# define PRITMT PRIi64
+# define SCNTMT SCNi64
+#else
+# error "Unsupported platform"
+#endif
+
+// SUPERLONG
+#if !defined(DONT_USE_SUPERLONG) && !defined(SUPERLONG_MAX)
+#define SUPERLONG_MAX ~LL(0)
+typedef i64 SUPERLONG;
+#endif
+
+// UNICODE
+// UCS-2, native byteorder
+typedef ui16 wchar16;
+// internal symbol type: UTF-16LE
+typedef wchar16 TChar;
+typedef ui32 wchar32;
+
+#if defined(_MSC_VER)
+#include
+typedef SSIZE_T ssize_t;
+#define HAVE_SSIZE_T 1
+#include
+#endif
+
+#include
diff --git a/contrib/libmetrohash/src/platform.h b/contrib/libmetrohash/src/platform.h
index 31291b94b33..bc00e5a286b 100644
--- a/contrib/libmetrohash/src/platform.h
+++ b/contrib/libmetrohash/src/platform.h
@@ -18,6 +18,7 @@
#define METROHASH_PLATFORM_H
#include
+#include
// rotate right idiom recognized by most compilers
inline static uint64_t rotate_right(uint64_t v, unsigned k)
@@ -25,20 +26,28 @@ inline static uint64_t rotate_right(uint64_t v, unsigned k)
return (v >> k) | (v << (64 - k));
}
-// unaligned reads, fast and safe on Nehalem and later microarchitectures
inline static uint64_t read_u64(const void * const ptr)
{
- return static_cast<uint64_t>(*reinterpret_cast<const uint64_t * const>(ptr));
+ uint64_t result;
+ // An assignment like `result = *reinterpret_cast<const uint64_t *>(ptr)` here would mean undefined behaviour (unaligned read),
+ // so we use memcpy(), which is the most portable. clang and gcc usually translate `memcpy()` into a single `load` instruction
+ // when the hardware supports it, so using memcpy() is efficient too.
+ memcpy(&result, ptr, sizeof(result));
+ return result;
}
inline static uint64_t read_u32(const void * const ptr)
{
- return static_cast<uint64_t>(*reinterpret_cast<const uint32_t * const>(ptr));
+ uint32_t result;
+ memcpy(&result, ptr, sizeof(result));
+ return result;
}
inline static uint64_t read_u16(const void * const ptr)
{
- return static_cast<uint64_t>(*reinterpret_cast<const uint16_t * const>(ptr));
+ uint16_t result;
+ memcpy(&result, ptr, sizeof(result));
+ return result;
}
inline static uint64_t read_u8 (const void * const ptr)
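
The same pattern in isolation (a self-contained sketch of the technique used in this hunk): `memcpy` with a compile-time-constant size sidesteps the undefined behaviour of dereferencing a misaligned pointer, and optimizing compilers lower it to a single load on hardware that permits unaligned access.

```cpp
#include <cstdint>
#include <cstring>

inline uint64_t load_unaligned_u64(const void * p)
{
    uint64_t v;
    std::memcpy(&v, p, sizeof(v)); // defined for any alignment; one MOV on x86-64
    return v;
}
```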
diff --git a/contrib/poco b/contrib/poco
index fe5505e56c2..29439cf7fa3 160000
--- a/contrib/poco
+++ b/contrib/poco
@@ -1 +1 @@
-Subproject commit fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f
+Subproject commit 29439cf7fa32c1a2d62d925bb6d6a3f14668a4a2
diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
index 1306039e9c3..2c9bfa48605 100644
--- a/dbms/CMakeLists.txt
+++ b/dbms/CMakeLists.txt
@@ -155,7 +155,6 @@ if (USE_EMBEDDED_COMPILER)
target_include_directories (dbms SYSTEM BEFORE PUBLIC ${LLVM_INCLUDE_DIRS})
endif ()
-
if (CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" OR CMAKE_BUILD_TYPE_UC STREQUAL "MINSIZEREL")
# Won't generate debug info for files with heavy template instantiation to achieve faster linking and lower size.
set_source_files_properties(
@@ -186,8 +185,6 @@ target_link_libraries (clickhouse_common_io
${LINK_LIBRARIES_ONLY_ON_X86_64}
PUBLIC
${DOUBLE_CONVERSION_LIBRARIES}
- PRIVATE
- pocoext
PUBLIC
${Poco_Net_LIBRARY}
${Poco_Util_LIBRARY}
@@ -214,6 +211,10 @@ target_link_libraries (clickhouse_common_io
target_include_directories(clickhouse_common_io SYSTEM BEFORE PUBLIC ${RE2_INCLUDE_DIR})
+if (USE_LFALLOC)
+ target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${LFALLOC_INCLUDE_DIR})
+endif ()
+
if(CPUID_LIBRARY)
target_link_libraries(clickhouse_common_io PRIVATE ${CPUID_LIBRARY})
endif()
@@ -223,8 +224,9 @@ if(CPUINFO_LIBRARY)
endif()
target_link_libraries (dbms
- PRIVATE
+ PUBLIC
clickhouse_compression
+ PRIVATE
clickhouse_parsers
clickhouse_common_config
PUBLIC
@@ -232,7 +234,6 @@ target_link_libraries (dbms
PRIVATE
clickhouse_dictionaries_embedded
PUBLIC
- pocoext
${MYSQLXX_LIBRARY}
PRIVATE
${BTRIE_LIBRARIES}
diff --git a/dbms/programs/benchmark/Benchmark.cpp b/dbms/programs/benchmark/Benchmark.cpp
index b366add0ba5..89b363a2709 100644
--- a/dbms/programs/benchmark/Benchmark.cpp
+++ b/dbms/programs/benchmark/Benchmark.cpp
@@ -439,7 +439,7 @@ int mainEntryClickHouseBenchmark(int argc, char ** argv)
("help", "produce help message")
("concurrency,c", value()->default_value(1), "number of parallel queries")
("delay,d", value()->default_value(1), "delay between intermediate reports in seconds (set 0 to disable reports)")
- ("stage", value()->default_value("complete"), "request query processing up to specified stage")
+ ("stage", value()->default_value("complete"), "request query processing up to specified stage: complete,fetch_columns,with_mergeable_state")
("iterations,i", value()->default_value(0), "amount of queries to be executed")
("timelimit,t", value()->default_value(0.), "stop launch of queries after specified time limit")
("randomize,r", value()->default_value(false), "randomize order of execution")
diff --git a/dbms/programs/odbc-bridge/HandlerFactory.cpp b/dbms/programs/odbc-bridge/HandlerFactory.cpp
index a6422db268c..55c2c8d7637 100644
--- a/dbms/programs/odbc-bridge/HandlerFactory.cpp
+++ b/dbms/programs/odbc-bridge/HandlerFactory.cpp
@@ -2,7 +2,6 @@
#include "PingHandler.h"
#include "ColumnInfoHandler.h"
#include
-#include
#include
#include
diff --git a/dbms/programs/odbc-bridge/MainHandler.cpp b/dbms/programs/odbc-bridge/MainHandler.cpp
index d95f1386c7b..cb5dfa70c9c 100644
--- a/dbms/programs/odbc-bridge/MainHandler.cpp
+++ b/dbms/programs/odbc-bridge/MainHandler.cpp
@@ -11,11 +11,12 @@
#include
#include
#include
-#include
#include
#include
#include
#include
+#include
+#include
namespace DB
{
@@ -31,6 +32,24 @@ namespace
}
}
+using PocoSessionPoolConstructor = std::function<std::shared_ptr<Poco::Data::SessionPool>()>;
+/** Is used to adjust max size of default Poco thread pool. See issue #750
+ * Acquire the lock, resize pool and construct new Session.
+ */
+std::shared_ptr<Poco::Data::SessionPool> createAndCheckResizePocoSessionPool(PocoSessionPoolConstructor pool_constr)
+{
+ static std::mutex mutex;
+
+ Poco::ThreadPool & pool = Poco::ThreadPool::defaultPool();
+
+ /// NOTE: The lock doesn't guarantee that external users of the pool don't change its capacity
+ std::unique_lock lock(mutex);
+
+ if (pool.available() == 0)
+ pool.addCapacity(2 * std::max(pool.capacity(), 1));
+
+ return pool_constr();
+}
ODBCHandler::PoolPtr ODBCHandler::getPool(const std::string & connection_str)
{
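
How a caller might use this helper (a sketch; the connector name and `connection_str` are illustrative): the constructor is passed as a closure so the pool is created only after the default thread pool is known to have spare capacity.

```cpp
auto session_pool = createAndCheckResizePocoSessionPool([&connection_str]
{
    // "ODBC" and connection_str stand in for whatever the caller actually passes.
    return std::make_shared<Poco::Data::SessionPool>("ODBC", connection_str);
});
```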
diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h
index 9ae32c41fd9..486e6b1fd44 100644
--- a/dbms/src/Columns/ColumnString.h
+++ b/dbms/src/Columns/ColumnString.h
@@ -21,6 +21,7 @@ namespace DB
class ColumnString final : public COWPtrHelper<IColumn, ColumnString>
{
public:
+ using Char = UInt8;
using Chars = PaddedPODArray<UInt8>;
private:
diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h
index 9ed79c0b69c..c9afd7c1bfe 100644
--- a/dbms/src/Columns/IColumn.h
+++ b/dbms/src/Columns/IColumn.h
@@ -250,7 +250,7 @@ public:
/// Size of memory, allocated for column.
/// This is greater or equals to byteSize due to memory reservation in containers.
- /// Zero, if could be determined.
+ /// Zero, if could not be determined.
virtual size_t allocatedBytes() const = 0;
/// Make memory region readonly with mprotect if it is large enough.
diff --git a/dbms/src/Common/CurrentThread.cpp b/dbms/src/Common/CurrentThread.cpp
index 3eaf8bfa81d..6c2c77dccd7 100644
--- a/dbms/src/Common/CurrentThread.cpp
+++ b/dbms/src/Common/CurrentThread.cpp
@@ -7,7 +7,7 @@
#include
#include
#include
-#include
+#include
#include
@@ -29,7 +29,7 @@ void CurrentThread::updatePerformanceCounters()
ThreadStatus & CurrentThread::get()
{
if (unlikely(!current_thread))
- throw Exception("Thread #" + std::to_string(Poco::ThreadNumber::get()) + " status was not initialized", ErrorCodes::LOGICAL_ERROR);
+ throw Exception("Thread #" + std::to_string(getThreadNumber()) + " status was not initialized", ErrorCodes::LOGICAL_ERROR);
return *current_thread;
}
diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp
index bc53e39ec92..bfe9049fcb3 100644
--- a/dbms/src/Common/ErrorCodes.cpp
+++ b/dbms/src/Common/ErrorCodes.cpp
@@ -424,6 +424,8 @@ namespace ErrorCodes
extern const int HYPERSCAN_CANNOT_SCAN_TEXT = 447;
extern const int BROTLI_READ_FAILED = 448;
extern const int BROTLI_WRITE_FAILED = 449;
+ extern const int BAD_TTL_EXPRESSION = 450;
+ extern const int BAD_TTL_FILE = 451;
extern const int KEEPER_EXCEPTION = 999;
extern const int POCO_EXCEPTION = 1000;
diff --git a/dbms/src/Common/LFAllocator.cpp b/dbms/src/Common/LFAllocator.cpp
new file mode 100644
index 00000000000..71396d341ab
--- /dev/null
+++ b/dbms/src/Common/LFAllocator.cpp
@@ -0,0 +1,53 @@
+#include
+
+#if USE_LFALLOC
+#include "LFAllocator.h"
+
+#include
+#include
+
+namespace DB
+{
+
+void * LFAllocator::alloc(size_t size, size_t alignment)
+{
+ if (alignment == 0)
+ return LFAlloc(size);
+ else
+ {
+ void * ptr;
+ int res = LFPosixMemalign(&ptr, alignment, size);
+ return res ? nullptr : ptr;
+ }
+}
+
+void LFAllocator::free(void * buf, size_t)
+{
+ LFFree(buf);
+}
+
+void * LFAllocator::realloc(void * old_ptr, size_t, size_t new_size, size_t alignment)
+{
+ if (old_ptr == nullptr)
+ {
+ void * result = LFAllocator::alloc(new_size, alignment);
+ return result;
+ }
+ if (new_size == 0)
+ {
+ LFFree(old_ptr);
+ return nullptr;
+ }
+
+ void * new_ptr = LFAllocator::alloc(new_size, alignment);
+ if (new_ptr == nullptr)
+ return nullptr;
+ size_t old_size = LFGetSize(old_ptr);
+ memcpy(new_ptr, old_ptr, ((old_size < new_size) ? old_size : new_size));
+ LFFree(old_ptr);
+ return new_ptr;
+}
+
+}
+
+#endif
diff --git a/dbms/src/Common/LFAllocator.h b/dbms/src/Common/LFAllocator.h
new file mode 100644
index 00000000000..f2a10cc4508
--- /dev/null
+++ b/dbms/src/Common/LFAllocator.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include
+
+#if !USE_LFALLOC
+#error "do not include this file until USE_LFALLOC is set to 1"
+#endif
+
+#include
+
+namespace DB
+{
+struct LFAllocator
+{
+ static void * alloc(size_t size, size_t alignment = 0);
+
+ static void free(void * buf, size_t);
+
+ static void * realloc(void * buf, size_t, size_t new_size, size_t alignment = 0);
+};
+
+}
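
A minimal sketch of the allocator's contract, assuming a build with `USE_LFALLOC` enabled: `realloc` allocates a fresh block, copies `min(old, new)` bytes and frees the old block, so the returned pointer must always replace the old one.

```cpp
void * buf = DB::LFAllocator::alloc(1024);        // alignment = 0: plain LFAlloc path
buf = DB::LFAllocator::realloc(buf, 1024, 4096);  // grow: copy, then free the old block
void * aligned = DB::LFAllocator::alloc(256, 64); // alignment > 0: LFPosixMemalign path
DB::LFAllocator::free(buf, 4096);
DB::LFAllocator::free(aligned, 256);
```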
diff --git a/dbms/src/Common/RWLock.cpp b/dbms/src/Common/RWLock.cpp
index 9fdb13a009d..aa658e964d7 100644
--- a/dbms/src/Common/RWLock.cpp
+++ b/dbms/src/Common/RWLock.cpp
@@ -1,7 +1,6 @@
#include "RWLock.h"
#include
#include
-#include
#include
#include
diff --git a/dbms/src/Common/RadixSort.h b/dbms/src/Common/RadixSort.h
index 0a5861b30eb..f95c6237d74 100644
--- a/dbms/src/Common/RadixSort.h
+++ b/dbms/src/Common/RadixSort.h
@@ -64,15 +64,15 @@ struct RadixSortFloatTransform
};
-template <typename Float>
+template <typename _Element, typename _Key = _Element>
struct RadixSortFloatTraits
{
- using Element = Float; /// The type of the element. It can be a structure with a key and some other payload. Or just a key.
- using Key = Float; /// The key to sort.
+ using Element = _Element; /// The type of the element. It can be a structure with a key and some other payload. Or just a key.
+ using Key = _Key; /// The key to sort.
using CountType = uint32_t; /// Type for calculating histograms. In the case of a known small number of elements, it can be less than size_t.
/// The type to which the key is transformed to do bit operations. This UInt is the same size as the key.
- using KeyBits = std::conditional_t<sizeof(Float) == 8, uint64_t, uint32_t>;
+ using KeyBits = std::conditional_t<sizeof(_Key) == 8, uint64_t, uint32_t>;
static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array.
@@ -85,7 +85,13 @@ struct RadixSortFloatTraits
using Allocator = RadixSortMallocAllocator;
/// The function to get the key from an array element.
- static Key & extractKey(Element & elem) { return elem; }
+ static Key & extractKey(Element & elem)
+ {
+ if constexpr (std::is_same_v<Element, Key>)
+ return elem;
+ else
+ return *reinterpret_cast<Key *>(&elem);
+ }
};
@@ -109,13 +115,13 @@ struct RadixSortSignedTransform
};
-template <typename UInt>
+template <typename _Element, typename _Key = _Element>
struct RadixSortUIntTraits
{
- using Element = UInt;
- using Key = UInt;
+ using Element = _Element;
+ using Key = _Key;
using CountType = uint32_t;
- using KeyBits = UInt;
+ using KeyBits = _Key;
static constexpr size_t PART_SIZE_BITS = 8;
@@ -123,16 +129,22 @@ struct RadixSortUIntTraits
using Allocator = RadixSortMallocAllocator;
/// The function to get the key from an array element.
- static Key & extractKey(Element & elem) { return elem; }
+ static Key & extractKey(Element & elem)
+ {
+ if constexpr (std::is_same_v<Element, Key>)
+ return elem;
+ else
+ return *reinterpret_cast<Key *>(&elem);
+ }
};
-template <typename Int>
+template <typename _Element, typename _Key = _Element>
struct RadixSortIntTraits
{
- using Element = Int;
- using Key = Int;
+ using Element = _Element;
+ using Key = _Key;
using CountType = uint32_t;
- using KeyBits = std::make_unsigned_t;
+ using KeyBits = std::make_unsigned_t<_Key>;
static constexpr size_t PART_SIZE_BITS = 8;
@@ -140,7 +152,13 @@ struct RadixSortIntTraits
using Allocator = RadixSortMallocAllocator;
/// The function to get the key from an array element.
- static Key & extractKey(Element & elem) { return elem; }
+ static Key & extractKey(Element & elem)
+ {
+ if constexpr (std::is_same_v<Element, Key>)
+ return elem;
+ else
+ return *reinterpret_cast<Key *>(&elem);
+ }
};
@@ -261,3 +279,16 @@ radixSort(T * arr, size_t size)
return RadixSort>::execute(arr, size);
}
+template <typename _Element, typename _Key>
+std::enable_if_t<std::is_unsigned_v<_Key>, void>
+radixSort(_Element * arr, size_t size)
+{
+ return RadixSort<RadixSortUIntTraits<_Element, _Key>>::execute(arr, size);
+}
+
+template <typename _Element, typename _Key>
+std::enable_if_t<std::is_integral_v<_Key> && !std::is_unsigned_v<_Key>, void>
+radixSort(_Element * arr, size_t size)
+{
+ return RadixSort<RadixSortIntTraits<_Element, _Key>>::execute(arr, size);
+}
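
Illustrative call sites for the new overloads (predicated on the signatures as reconstructed above; `Mark` is a made-up element type): the single-argument form keeps working for plain numeric arrays, while the element/key form sorts arbitrary elements by a key reinterpreted from their leading bytes.

```cpp
uint32_t plain[] = {3, 1, 2};
radixSort(plain, 3); // existing single-type numeric overload

struct Mark { uint64_t offset; }; // hypothetical element whose first 8 bytes are the key
Mark marks[16] = {};
radixSort<Mark, uint64_t>(marks, 16); // key is reinterpreted from the element's storage
```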
diff --git a/dbms/src/Common/ThreadStatus.cpp b/dbms/src/Common/ThreadStatus.cpp
index 6ee7518e393..c2e415ab363 100644
--- a/dbms/src/Common/ThreadStatus.cpp
+++ b/dbms/src/Common/ThreadStatus.cpp
@@ -7,7 +7,7 @@
#include
#include
-#include
+#include
namespace DB
@@ -33,7 +33,7 @@ TasksStatsCounters TasksStatsCounters::current()
ThreadStatus::ThreadStatus()
{
- thread_number = Poco::ThreadNumber::get();
+ thread_number = getThreadNumber();
os_thread_id = TaskStatsInfoGetter::getCurrentTID();
last_rusage = std::make_unique();
diff --git a/dbms/src/Common/config.h.in b/dbms/src/Common/config.h.in
index c323afe369e..d6fc6d146f0 100644
--- a/dbms/src/Common/config.h.in
+++ b/dbms/src/Common/config.h.in
@@ -25,6 +25,8 @@
#cmakedefine01 USE_BROTLI
#cmakedefine01 USE_SSL
#cmakedefine01 USE_HYPERSCAN
+#cmakedefine01 USE_LFALLOC
+#cmakedefine01 USE_LFALLOC_RANDOM_HINT
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY
#cmakedefine01 LLVM_HAS_RTTI
diff --git a/dbms/src/Core/AccurateComparison.h b/dbms/src/Core/AccurateComparison.h
index 4fa33c7d099..af1ea285a89 100644
--- a/dbms/src/Core/AccurateComparison.h
+++ b/dbms/src/Core/AccurateComparison.h
@@ -426,6 +426,21 @@ inline bool_if_safe_conversion greaterOrEqualsOp(A a, B b)
return a >= b;
}
+/// Converts a numeric value to an equal value of another type.
+template <typename From, typename To>
+inline bool NO_SANITIZE_UNDEFINED convertNumeric(From value, To & result)
+{
+    /// Note that NaN doesn't compare equal to anything, but it is still in the range of any Float type.
+    if (isNaN(value) && std::is_floating_point_v<To>)
+    {
+        result = value;
+        return true;
+    }
+
+    result = static_cast<To>(value);
+    return equalsOp(value, result);
+}
+
}
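
A possible use of the helper (sketch): the boolean result makes lossy conversions detectable instead of silently truncating.

```cpp
Int8 out;
bool exact = convertNumeric(Float64(300.0), out); // false: 300 can't round-trip through Int8
bool ok = convertNumeric(Float64(42.0), out);     // true: out == 42 and equalsOp(42.0, out) holds
```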
diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h
index f677b8a7079..7b87a537908 100644
--- a/dbms/src/Core/Settings.h
+++ b/dbms/src/Core/Settings.h
@@ -93,9 +93,12 @@ struct Settings
M(SettingBool, optimize_skip_unused_shards, false, "Assumes that data is distributed by sharding_key. Optimization to skip unused shards if SELECT query filters by sharding_key.") \
\
M(SettingUInt64, merge_tree_min_rows_for_concurrent_read, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized.") \
+ M(SettingUInt64, merge_tree_min_bytes_for_concurrent_read, (100 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized.") \
M(SettingUInt64, merge_tree_min_rows_for_seek, 0, "You can skip reading more than that number of rows at the price of one seek per file.") \
+ M(SettingUInt64, merge_tree_min_bytes_for_seek, 0, "You can skip reading more than that number of bytes at the price of one seek per file.") \
M(SettingUInt64, merge_tree_coarse_index_granularity, 8, "If the index segment can contain the required keys, divide it into as many parts and recursively check them.") \
M(SettingUInt64, merge_tree_max_rows_to_use_cache, (1024 * 1024), "The maximum number of rows per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)") \
+ M(SettingUInt64, merge_tree_max_bytes_to_use_cache, (600 * 1024 * 1024), "The maximum number of bytes per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)") \
\
M(SettingBool, merge_tree_uniform_read_distribution, true, "Distribute read from MergeTree over threads evenly, ensuring stable average execution time of each thread within one read operation.") \
\
diff --git a/dbms/src/DataStreams/BlocksListBlockInputStream.h b/dbms/src/DataStreams/BlocksListBlockInputStream.h
index 0947fbb4406..de287c2dc5e 100644
--- a/dbms/src/DataStreams/BlocksListBlockInputStream.h
+++ b/dbms/src/DataStreams/BlocksListBlockInputStream.h
@@ -23,6 +23,8 @@ public:
String getName() const override { return "BlocksList"; }
protected:
+ Block getHeader() const override { return list.empty() ? Block() : *list.begin(); }
+
Block readImpl() override
{
if (it == end)
diff --git a/dbms/src/DataStreams/CollapsingSortedBlockInputStream.cpp b/dbms/src/DataStreams/CollapsingSortedBlockInputStream.cpp
index f630494ae2f..cfc165aa4d2 100644
--- a/dbms/src/DataStreams/CollapsingSortedBlockInputStream.cpp
+++ b/dbms/src/DataStreams/CollapsingSortedBlockInputStream.cpp
@@ -40,7 +40,7 @@ void CollapsingSortedBlockInputStream::reportIncorrectData()
}
-void CollapsingSortedBlockInputStream::insertRows(MutableColumns & merged_columns, size_t & merged_rows)
+void CollapsingSortedBlockInputStream::insertRows(MutableColumns & merged_columns, size_t block_size, MergeStopCondition & condition)
{
if (count_positive == 0 && count_negative == 0)
{
@@ -52,7 +52,7 @@ void CollapsingSortedBlockInputStream::insertRows(MutableColumns & merged_column
{
if (count_positive <= count_negative)
{
- ++merged_rows;
+ condition.addRowWithGranularity(block_size);
for (size_t i = 0; i < num_columns; ++i)
merged_columns[i]->insertFrom(*(*first_negative.columns)[i], first_negative.row_num);
@@ -62,7 +62,7 @@ void CollapsingSortedBlockInputStream::insertRows(MutableColumns & merged_column
if (count_positive >= count_negative)
{
- ++merged_rows;
+ condition.addRowWithGranularity(block_size);
for (size_t i = 0; i < num_columns; ++i)
merged_columns[i]->insertFrom(*(*last_positive.columns)[i], last_positive.row_num);
@@ -106,12 +106,14 @@ Block CollapsingSortedBlockInputStream::readImpl()
void CollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue & queue)
{
- size_t merged_rows = 0;
+ MergeStopCondition stop_condition(average_block_sizes, max_block_size);
+ size_t current_block_granularity;
/// Take rows in correct order and put them into `merged_columns` until the rows no more than `max_block_size`
for (; !queue.empty(); ++current_pos)
{
SortCursor current = queue.top();
+ current_block_granularity = current->rows;
if (current_key.empty())
setPrimaryKeyRef(current_key, current);
@@ -122,7 +124,7 @@ void CollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, st
bool key_differs = next_key != current_key;
/// if there are enough rows and the last one is calculated completely
- if (key_differs && merged_rows >= max_block_size)
+ if (key_differs && stop_condition.checkStop())
{
++blocks_written;
return;
@@ -133,7 +135,7 @@ void CollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, st
if (key_differs)
{
/// We write data for the previous primary key.
- insertRows(merged_columns, merged_rows);
+ insertRows(merged_columns, current_block_granularity, stop_condition);
current_key.swap(next_key);
@@ -167,7 +169,7 @@ void CollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, st
first_negative_pos = current_pos;
}
- if (!blocks_written && !merged_rows)
+ if (!blocks_written && stop_condition.empty())
{
setRowRef(last_negative, current);
last_negative_pos = current_pos;
@@ -193,7 +195,7 @@ void CollapsingSortedBlockInputStream::merge(MutableColumns & merged_columns, st
}
/// Write data for last primary key.
- insertRows(merged_columns, merged_rows);
+ insertRows(merged_columns, /*some_granularity*/ 0, stop_condition);
finished = true;
}
diff --git a/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h b/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h
index cf72df30dbd..4b3b936d703 100644
--- a/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h
+++ b/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h
@@ -26,8 +26,9 @@ class CollapsingSortedBlockInputStream : public MergingSortedBlockInputStream
public:
CollapsingSortedBlockInputStream(
BlockInputStreams inputs_, const SortDescription & description_,
- const String & sign_column, size_t max_block_size_, WriteBuffer * out_row_sources_buf_ = nullptr)
- : MergingSortedBlockInputStream(inputs_, description_, max_block_size_, 0, out_row_sources_buf_)
+ const String & sign_column, size_t max_block_size_,
+ WriteBuffer * out_row_sources_buf_ = nullptr, bool average_block_sizes_ = false)
+ : MergingSortedBlockInputStream(inputs_, description_, max_block_size_, 0, out_row_sources_buf_, false, average_block_sizes_)
{
sign_column_number = header.getPositionByName(sign_column);
}
@@ -75,7 +76,7 @@ private:
void merge(MutableColumns & merged_columns, std::priority_queue & queue);
/// Output to result rows for the current primary key.
- void insertRows(MutableColumns & merged_columns, size_t & merged_rows);
+ void insertRows(MutableColumns & merged_columns, size_t block_size, MergeStopCondition & condition);
void reportIncorrectData();
};
diff --git a/dbms/src/DataStreams/MarkInCompressedFile.h b/dbms/src/DataStreams/MarkInCompressedFile.h
index 3a1d9aa0f19..ff07b2afbe1 100644
--- a/dbms/src/DataStreams/MarkInCompressedFile.h
+++ b/dbms/src/DataStreams/MarkInCompressedFile.h
@@ -6,6 +6,10 @@
#include
#include
+#include
+#if USE_LFALLOC
+#include
+#endif
namespace DB
{
@@ -32,8 +36,16 @@ struct MarkInCompressedFile
{
return "(" + DB::toString(offset_in_compressed_file) + "," + DB::toString(offset_in_decompressed_block) + ")";
}
+
+ String toStringWithRows(size_t rows_num)
+ {
+ return "(" + DB::toString(offset_in_compressed_file) + "," + DB::toString(offset_in_decompressed_block) + "," + DB::toString(rows_num) + ")";
+ }
+
};
-
+#if USE_LFALLOC
+using MarksInCompressedFile = PODArray<MarkInCompressedFile, 4096, LFAllocator>;
+#else
using MarksInCompressedFile = PODArray<MarkInCompressedFile>;
-
+#endif
}
diff --git a/dbms/src/DataStreams/MergingSortedBlockInputStream.cpp b/dbms/src/DataStreams/MergingSortedBlockInputStream.cpp
index d4e7ba0e749..719977854da 100644
--- a/dbms/src/DataStreams/MergingSortedBlockInputStream.cpp
+++ b/dbms/src/DataStreams/MergingSortedBlockInputStream.cpp
@@ -18,9 +18,10 @@ namespace ErrorCodes
MergingSortedBlockInputStream::MergingSortedBlockInputStream(
const BlockInputStreams & inputs_, const SortDescription & description_,
- size_t max_block_size_, UInt64 limit_, WriteBuffer * out_row_sources_buf_, bool quiet_)
+ size_t max_block_size_, UInt64 limit_, WriteBuffer * out_row_sources_buf_, bool quiet_, bool average_block_sizes_)
: description(description_), max_block_size(max_block_size_), limit(limit_), quiet(quiet_)
- , source_blocks(inputs_.size()), cursors(inputs_.size()), out_row_sources_buf(out_row_sources_buf_)
+ , average_block_sizes(average_block_sizes_), source_blocks(inputs_.size())
+ , cursors(inputs_.size()), out_row_sources_buf(out_row_sources_buf_)
{
children.insert(children.end(), inputs_.begin(), inputs_.end());
header = children.at(0)->getHeader();
@@ -116,7 +117,7 @@ Block MergingSortedBlockInputStream::readImpl()
template
void MergingSortedBlockInputStream::fetchNextBlock(const TSortCursor & current, std::priority_queue & queue)
{
- size_t order = current.impl->order;
+ size_t order = current->order;
size_t size = cursors.size();
if (order >= size || &cursors[order] != current.impl)
@@ -132,6 +133,19 @@ void MergingSortedBlockInputStream::fetchNextBlock(const TSortCursor & current,
}
}
+
+bool MergingSortedBlockInputStream::MergeStopCondition::checkStop() const
+{
+ if (!count_average)
+ return sum_rows_count == max_block_size;
+
+ if (sum_rows_count == 0)
+ return false;
+
+ size_t average = sum_blocks_granularity / sum_rows_count;
+ return sum_rows_count >= average;
+}
+
template
void MergingSortedBlockInputStream::fetchNextBlock(const SortCursor & current, std::priority_queue & queue);
@@ -144,10 +158,11 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
{
size_t merged_rows = 0;
+ MergeStopCondition stop_condition(average_block_sizes, max_block_size);
/** Increase row counters.
* Return true if it's time to finish generating the current data block.
*/
- auto count_row_and_check_limit = [&, this]()
+ auto count_row_and_check_limit = [&, this](size_t current_granularity)
{
++total_merged_rows;
if (limit && total_merged_rows == limit)
@@ -159,19 +174,15 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
}
++merged_rows;
- if (merged_rows == max_block_size)
- {
- // std::cerr << "max_block_size reached\n";
- return true;
- }
-
- return false;
+ stop_condition.addRowWithGranularity(current_granularity);
+ return stop_condition.checkStop();
};
/// Take rows in required order and put them into `merged_columns`, while the rows are no more than `max_block_size`
while (!queue.empty())
{
TSortCursor current = queue.top();
+ size_t current_block_granularity = current->rows;
queue.pop();
while (true)
@@ -179,20 +190,20 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
/** And what if the block is totally less or equal than the rest for the current cursor?
* Or is there only one data source left in the queue? Then you can take the entire block on current cursor.
*/
- if (current.impl->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top())))
+ if (current->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top())))
{
// std::cerr << "current block is totally less or equals\n";
/// If there are already data in the current block, we first return it. We'll get here again the next time we call the merge function.
if (merged_rows != 0)
{
- // std::cerr << "merged rows is non-zero\n";
+ //std::cerr << "merged rows is non-zero\n";
queue.push(current);
return;
}
- /// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl)
- size_t source_num = current.impl->order;
+ /// Actually, current->order stores source number (i.e. cursors[current->order] == current)
+ size_t source_num = current->order;
if (source_num >= cursors.size())
throw Exception("Logical error in MergingSortedBlockInputStream", ErrorCodes::LOGICAL_ERROR);
@@ -204,6 +215,7 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
merged_rows = merged_columns.at(0)->size();
+ /// Limit output
if (limit && total_merged_rows + merged_rows > limit)
{
merged_rows = limit - total_merged_rows;
@@ -217,6 +229,8 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
finished = true;
}
+ /// Write the order of rows for the other columns;
+ /// this data will be used in the gather stream.
if (out_row_sources_buf)
{
RowSourcePart row_source(source_num);
@@ -224,7 +238,7 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
out_row_sources_buf->write(row_source.data);
}
- // std::cerr << "fetching next block\n";
+ //std::cerr << "fetching next block\n";
total_merged_rows += merged_rows;
fetchNextBlock(current, queue);
@@ -239,7 +253,7 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
if (out_row_sources_buf)
{
/// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl)
- RowSourcePart row_source(current.impl->order);
+ RowSourcePart row_source(current->order);
out_row_sources_buf->write(row_source.data);
}
@@ -250,7 +264,7 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
if (queue.empty() || !(current.greater(queue.top())))
{
- if (count_row_and_check_limit())
+ if (count_row_and_check_limit(current_block_granularity))
{
// std::cerr << "pushing back to queue\n";
queue.push(current);
@@ -277,7 +291,7 @@ void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::
break;
}
- if (count_row_and_check_limit())
+ if (count_row_and_check_limit(current_block_granularity))
return;
}
diff --git a/dbms/src/DataStreams/MergingSortedBlockInputStream.h b/dbms/src/DataStreams/MergingSortedBlockInputStream.h
index 68ea179d68d..be05783c3ed 100644
--- a/dbms/src/DataStreams/MergingSortedBlockInputStream.h
+++ b/dbms/src/DataStreams/MergingSortedBlockInputStream.h
@@ -68,7 +68,7 @@ public:
*/
MergingSortedBlockInputStream(
const BlockInputStreams & inputs_, const SortDescription & description_, size_t max_block_size_,
- UInt64 limit_ = 0, WriteBuffer * out_row_sources_buf_ = nullptr, bool quiet_ = false);
+ UInt64 limit_ = 0, WriteBuffer * out_row_sources_buf_ = nullptr, bool quiet_ = false, bool average_block_sizes_ = false);
String getName() const override { return "MergingSorted"; }
@@ -116,6 +116,38 @@ protected:
size_t size() const { return empty() ? 0 : columns->size(); }
};
+ /// A simple class which allows checking the stop condition during the merge process.
+ /// In the simple case it just compares the number of merged rows with max_block_size;
+ /// in the `count_average` case it compares the number of merged rows with a linear combination
+ /// of the sizes of the blocks these rows were taken from.
+ struct MergeStopCondition
+ {
+ size_t sum_blocks_granularity = 0;
+ size_t sum_rows_count = 0;
+ bool count_average;
+ size_t max_block_size;
+
+ MergeStopCondition(bool count_average_, size_t max_block_size_)
+ : count_average(count_average_)
+ , max_block_size(max_block_size_)
+ {}
+
+ /// Add a single row taken from a block of size `granularity`
+ void addRowWithGranularity(size_t granularity)
+ {
+ sum_blocks_granularity += granularity;
+ sum_rows_count++;
+ }
+
+ /// Check whether enough rows have been merged
+ bool checkStop() const;
+
+ bool empty() const
+ {
+ return sum_blocks_granularity == 0;
+ }
+ };
+
Block readImpl() override;
@@ -139,6 +171,7 @@ protected:
bool first = true;
bool has_collation = false;
bool quiet = false;
+ bool average_block_sizes = false;
/// May be smaller or equal to max_block_size. To do 'reserve' for columns.
size_t expected_block_size = 0;
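
A worked trace of the two modes of `MergeStopCondition` declared above (numbers are illustrative): without averaging the condition trips exactly at `max_block_size`; with averaging it trips once the merged row count catches up with the mean size of the source blocks, which keeps output granularity close to input granularity.

```cpp
MergeStopCondition simple(false, 8192);
// simple.checkStop() becomes true only when sum_rows_count == 8192.

MergeStopCondition adaptive(true, 8192);
adaptive.addRowWithGranularity(4096); // sum_blocks_granularity = 4096, sum_rows_count = 1
// checkStop(): average = 4096 / 1 = 4096; 1 >= 4096 is false, keep merging.
// Merging rows only from 4096-row blocks, the average stays 4096,
// so the condition first holds after 4096 merged rows.
```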
diff --git a/dbms/src/DataStreams/ReplacingSortedBlockInputStream.cpp b/dbms/src/DataStreams/ReplacingSortedBlockInputStream.cpp
index d0298ac77fe..e2e99815b93 100644
--- a/dbms/src/DataStreams/ReplacingSortedBlockInputStream.cpp
+++ b/dbms/src/DataStreams/ReplacingSortedBlockInputStream.cpp
@@ -12,7 +12,7 @@ namespace ErrorCodes
}
-void ReplacingSortedBlockInputStream::insertRow(MutableColumns & merged_columns, size_t & merged_rows)
+void ReplacingSortedBlockInputStream::insertRow(MutableColumns & merged_columns)
{
if (out_row_sources_buf)
{
@@ -24,7 +24,6 @@ void ReplacingSortedBlockInputStream::insertRow(MutableColumns & merged_columns,
current_row_sources.resize(0);
}
- ++merged_rows;
for (size_t i = 0; i < num_columns; ++i)
merged_columns[i]->insertFrom(*(*selected_row.columns)[i], selected_row.row_num);
}
@@ -51,12 +50,12 @@ Block ReplacingSortedBlockInputStream::readImpl()
void ReplacingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue & queue)
{
- size_t merged_rows = 0;
-
+ MergeStopCondition stop_condition(average_block_sizes, max_block_size);
/// Take the rows in needed order and put them into `merged_columns` until rows no more than `max_block_size`
while (!queue.empty())
{
SortCursor current = queue.top();
+ size_t current_block_granularity = current->rows;
if (current_key.empty())
setPrimaryKeyRef(current_key, current);
@@ -66,7 +65,7 @@ void ReplacingSortedBlockInputStream::merge(MutableColumns & merged_columns, std
bool key_differs = next_key != current_key;
/// if there are enough rows and the last one is calculated completely
- if (key_differs && merged_rows >= max_block_size)
+ if (key_differs && stop_condition.checkStop())
return;
queue.pop();
@@ -74,7 +73,8 @@ void ReplacingSortedBlockInputStream::merge(MutableColumns & merged_columns, std
if (key_differs)
{
/// Write the data for the previous primary key.
- insertRow(merged_columns, merged_rows);
+ insertRow(merged_columns);
+ stop_condition.addRowWithGranularity(current_block_granularity);
selected_row.reset();
current_key.swap(next_key);
}
@@ -110,7 +110,7 @@ void ReplacingSortedBlockInputStream::merge(MutableColumns & merged_columns, std
/// We will write the data for the last primary key.
if (!selected_row.empty())
- insertRow(merged_columns, merged_rows);
+ insertRow(merged_columns);
finished = true;
}
diff --git a/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h b/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h
index cd52c3a8d08..525c8a50754 100644
--- a/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h
+++ b/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h
@@ -18,8 +18,9 @@ class ReplacingSortedBlockInputStream : public MergingSortedBlockInputStream
public:
ReplacingSortedBlockInputStream(
const BlockInputStreams & inputs_, const SortDescription & description_,
- const String & version_column, size_t max_block_size_, WriteBuffer * out_row_sources_buf_ = nullptr)
- : MergingSortedBlockInputStream(inputs_, description_, max_block_size_, 0, out_row_sources_buf_)
+ const String & version_column, size_t max_block_size_, WriteBuffer * out_row_sources_buf_ = nullptr,
+ bool average_block_sizes_ = false)
+ : MergingSortedBlockInputStream(inputs_, description_, max_block_size_, 0, out_row_sources_buf_, false, average_block_sizes_)
{
if (!version_column.empty())
version_column_number = header.getPositionByName(version_column);
@@ -54,7 +55,7 @@ private:
void merge(MutableColumns & merged_columns, std::priority_queue & queue);
/// Output into result the rows for current primary key.
- void insertRow(MutableColumns & merged_columns, size_t & merged_rows);
+ void insertRow(MutableColumns & merged_columns);
};
}
diff --git a/dbms/src/DataStreams/TTLBlockInputStream.cpp b/dbms/src/DataStreams/TTLBlockInputStream.cpp
new file mode 100644
index 00000000000..482a3ff4814
--- /dev/null
+++ b/dbms/src/DataStreams/TTLBlockInputStream.cpp
@@ -0,0 +1,208 @@
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LOGICAL_ERROR;
+}
+
+
+TTLBlockInputStream::TTLBlockInputStream(
+ const BlockInputStreamPtr & input_,
+ const MergeTreeData & storage_,
+ const MergeTreeData::MutableDataPartPtr & data_part_,
+ time_t current_time_)
+ : storage(storage_)
+ , data_part(data_part_)
+ , current_time(current_time_)
+ , old_ttl_infos(data_part->ttl_infos)
+ , log(&Logger::get(storage.getLogName() + " (TTLBlockInputStream)"))
+ , date_lut(DateLUT::instance())
+{
+ children.push_back(input_);
+
+ const auto & column_defaults = storage.getColumns().getDefaults();
+ ASTPtr default_expr_list = std::make_shared();
+ for (const auto & [name, ttl_info] : old_ttl_infos.columns_ttl)
+ {
+ if (ttl_info.min <= current_time)
+ {
+ new_ttl_infos.columns_ttl.emplace(name, MergeTreeDataPart::TTLInfo{});
+ empty_columns.emplace(name);
+
+ auto it = column_defaults.find(name);
+
+ if (it != column_defaults.end())
+ default_expr_list->children.emplace_back(
+ setAlias(it->second.expression, it->first));
+ }
+ else
+ new_ttl_infos.columns_ttl.emplace(name, ttl_info);
+ }
+
+ if (old_ttl_infos.table_ttl.min > current_time)
+ new_ttl_infos.table_ttl = old_ttl_infos.table_ttl;
+
+ if (!default_expr_list->children.empty())
+ {
+ auto syntax_result = SyntaxAnalyzer(storage.global_context).analyze(
+ default_expr_list, storage.getColumns().getAllPhysical());
+ defaults_expression = ExpressionAnalyzer{default_expr_list, syntax_result, storage.global_context}.getActions(true);
+ }
+}
+
+
+Block TTLBlockInputStream::getHeader() const
+{
+ return children.at(0)->getHeader();
+}
+
+Block TTLBlockInputStream::readImpl()
+{
+ Block block = children.at(0)->read();
+ if (!block)
+ return block;
+
+ if (storage.hasTableTTL())
+ {
+ /// Skip all data if the table TTL has expired for this part
+ if (old_ttl_infos.table_ttl.max <= current_time)
+ {
+ rows_removed = data_part->rows_count;
+ return {};
+ }
+
+ if (old_ttl_infos.table_ttl.min <= current_time)
+ removeRowsWithExpiredTableTTL(block);
+ }
+
+ removeValuesWithExpiredColumnTTL(block);
+
+ return block;
+}
+
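
The branch structure above implies a simple per-part decision table (a restatement of the code, with `t_min`/`t_max` standing for the part's stored TTL range):

```cpp
// For a part with table_ttl = {min = t_min, max = t_max} at time `now`:
//   now >= t_max         -> every row is expired: return an empty block
//                           and count the whole part in rows_removed;
//   t_min <= now < t_max -> some rows may be expired: filter row by row;
//   now < t_min          -> nothing has expired yet: pass the block through.
```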
+void TTLBlockInputStream::readSuffixImpl()
+{
+ for (const auto & elem : new_ttl_infos.columns_ttl)
+ new_ttl_infos.updatePartMinTTL(elem.second.min);
+
+ new_ttl_infos.updatePartMinTTL(new_ttl_infos.table_ttl.min);
+
+ data_part->ttl_infos = std::move(new_ttl_infos);
+ data_part->empty_columns = std::move(empty_columns);
+
+ if (rows_removed)
+ LOG_INFO(log, "Removed " << rows_removed << " rows with expired ttl from part " << data_part->name);
+}
+
+void TTLBlockInputStream::removeRowsWithExpiredTableTTL(Block & block)
+{
+ storage.ttl_table_entry.expression->execute(block);
+
+ const auto & current = block.getByName(storage.ttl_table_entry.result_column);
+ const IColumn * ttl_column = current.column.get();
+
+ MutableColumns result_columns;
+ result_columns.reserve(getHeader().columns());
+ for (const auto & name : storage.getColumns().getNamesOfPhysical())
+ {
+ auto & column_with_type = block.getByName(name);
+ const IColumn * values_column = column_with_type.column.get();
+ MutableColumnPtr result_column = values_column->cloneEmpty();
+ result_column->reserve(block.rows());
+
+ for (size_t i = 0; i < block.rows(); ++i)
+ {
+ UInt32 cur_ttl = getTimestampByIndex(ttl_column, i);
+ if (cur_ttl > current_time)
+ {
+ new_ttl_infos.table_ttl.update(cur_ttl);
+ result_column->insertFrom(*values_column, i);
+ }
+ else
+ ++rows_removed;
+ }
+ result_columns.emplace_back(std::move(result_column));
+ }
+
+ block = getHeader().cloneWithColumns(std::move(result_columns));
+}
+
+void TTLBlockInputStream::removeValuesWithExpiredColumnTTL(Block & block)
+{
+ Block block_with_defaults;
+ if (defaults_expression)
+ {
+ block_with_defaults = block;
+ defaults_expression->execute(block_with_defaults);
+ }
+
+ for (const auto & [name, ttl_entry] : storage.ttl_entries_by_name)
+ {
+ const auto & old_ttl_info = old_ttl_infos.columns_ttl[name];
+ auto & new_ttl_info = new_ttl_infos.columns_ttl[name];
+
+ if (old_ttl_info.min > current_time)
+ continue;
+
+ if (old_ttl_info.max <= current_time)
+ continue;
+
+ if (!block.has(ttl_entry.result_column))
+ ttl_entry.expression->execute(block);
+
+ ColumnPtr default_column = nullptr;
+ if (block_with_defaults.has(name))
+ default_column = block_with_defaults.getByName(name).column->convertToFullColumnIfConst();
+
+ auto & column_with_type = block.getByName(name);
+ const IColumn * values_column = column_with_type.column.get();
+ MutableColumnPtr result_column = values_column->cloneEmpty();
+ result_column->reserve(block.rows());
+
+ const auto & current = block.getByName(ttl_entry.result_column);
+ const IColumn * ttl_column = current.column.get();
+
+ for (size_t i = 0; i < block.rows(); ++i)
+ {
+ UInt32 cur_ttl = getTimestampByIndex(ttl_column, i);
+
+ if (cur_ttl <= current_time)
+ {
+ if (default_column)
+ result_column->insertFrom(*default_column, i);
+ else
+ result_column->insertDefault();
+ }
+ else
+ {
+ new_ttl_info.update(cur_ttl);
+ empty_columns.erase(name);
+ result_column->insertFrom(*values_column, i);
+ }
+ }
+ column_with_type.column = std::move(result_column);
+ }
+
+ for (const auto & elem : storage.ttl_entries_by_name)
+ if (block.has(elem.second.result_column))
+ block.erase(elem.second.result_column);
+}
+
+UInt32 TTLBlockInputStream::getTimestampByIndex(const IColumn * column, size_t ind)
+{
+ if (const ColumnUInt16 * column_date = typeid_cast<const ColumnUInt16 *>(column))
+ return date_lut.fromDayNum(DayNum(column_date->getData()[ind]));
+ else if (const ColumnUInt32 * column_date_time = typeid_cast<const ColumnUInt32 *>(column))
+ return column_date_time->getData()[ind];
+ else
+ throw Exception("Unexpected type of result ttl column", ErrorCodes::LOGICAL_ERROR);
+}
+
+}
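
To make the two branches concrete (a sketch assuming a UTC `DateLUT`): `Date` columns store a day number (`UInt16`, days since the Unix epoch) that has to be widened to a timestamp, while `DateTime` columns already hold `UInt32` Unix timestamps.

```cpp
// Day 17897 since the epoch is 2019-01-01, so (in a UTC time zone):
UInt32 from_date = DateLUT::instance().fromDayNum(DayNum(17897)); // 1546300800
UInt32 from_date_time = 1546300800; // DateTime values are stored as-is
```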
diff --git a/dbms/src/DataStreams/TTLBlockInputStream.h b/dbms/src/DataStreams/TTLBlockInputStream.h
new file mode 100644
index 00000000000..a95cd627bc9
--- /dev/null
+++ b/dbms/src/DataStreams/TTLBlockInputStream.h
@@ -0,0 +1,60 @@
+#pragma once
+#include