From 7e7a864fd459d11ad7ea0312b7260f9ee2a04790 Mon Sep 17 00:00:00 2001
From: Andy Yang <yangzhaohui168@gmail.com>
Date: Thu, 31 Jan 2019 21:26:11 +0800
Subject: [PATCH 01/69] Added bitmap function feature with roaring bitmap

---
 contrib/CMakeLists.txt                        |     1 +
 contrib/croaring/CMakeLists.txt               |     6 +
 contrib/croaring/LICENSE                      |   202 +
 contrib/croaring/README.txt                   |     2 +
 contrib/croaring/roaring.c                    | 11093 ++++++++++++++++
 contrib/croaring/roaring.h                    |  7166 ++++++++++
 contrib/croaring/roaring.hh                   |  1732 +++
 dbms/CMakeLists.txt                           |     1 +
 .../AggregateFunctionGroupBitmap.cpp          |    40 +
 .../AggregateFunctionGroupBitmap.h            |    53 +
 .../AggregateFunctionGroupBitmapData.h        |   492 +
 .../registerAggregateFunctions.cpp            |     2 +
 dbms/src/Functions/FunctionsBitmap.cpp        |    25 +
 dbms/src/Functions/FunctionsBitmap.h          |   499 +
 dbms/src/Functions/registerFunctions.cpp      |     2 +
 .../00834_bitmap_function.reference           |    15 +
 .../0_stateless/00834_bitmap_function.sql     |    56 +
 .../query_language/agg_functions/reference.md |    42 +
 .../functions/bitmap_functions.md             |   277 +
 docs/redirects.txt                            |     1 +
 docs/toc_en.yml                               |     1 +
 21 files changed, 21708 insertions(+)
 create mode 100644 contrib/croaring/CMakeLists.txt
 create mode 100644 contrib/croaring/LICENSE
 create mode 100644 contrib/croaring/README.txt
 create mode 100644 contrib/croaring/roaring.c
 create mode 100644 contrib/croaring/roaring.h
 create mode 100644 contrib/croaring/roaring.hh
 create mode 100644 dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.cpp
 create mode 100644 dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.h
 create mode 100644 dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
 create mode 100644 dbms/src/Functions/FunctionsBitmap.cpp
 create mode 100644 dbms/src/Functions/FunctionsBitmap.h
 create mode 100644 dbms/tests/queries/0_stateless/00834_bitmap_function.reference
 create mode 100644 dbms/tests/queries/0_stateless/00834_bitmap_function.sql
 create mode 100644 docs/en/query_language/functions/bitmap_functions.md

diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index fcc2cc75817..ccfb6bea4fb 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -44,6 +44,7 @@ if (USE_INTERNAL_METROHASH_LIBRARY)
 endif ()
 
 add_subdirectory (murmurhash)
+add_subdirectory (croaring)
 
 if (USE_INTERNAL_BTRIE_LIBRARY)
     add_subdirectory (libbtrie)
diff --git a/contrib/croaring/CMakeLists.txt b/contrib/croaring/CMakeLists.txt
new file mode 100644
index 00000000000..c79f66a05d7
--- /dev/null
+++ b/contrib/croaring/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_library(roaring
+    roaring.c
+    roaring.h
+    roaring.hh)
+# PUBLIC include dir: dependents can include roaring.h / roaring.hh directly.
+target_include_directories (roaring PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
diff --git a/contrib/croaring/LICENSE b/contrib/croaring/LICENSE
new file mode 100644
index 00000000000..3265476ea81
--- /dev/null
+++ b/contrib/croaring/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2016 The CRoaring authors 
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/contrib/croaring/README.txt b/contrib/croaring/README.txt
new file mode 100644
index 00000000000..3daa1c43ed8
--- /dev/null
+++ b/contrib/croaring/README.txt
@@ -0,0 +1,2 @@
+Download https://github.com/RoaringBitmap/CRoaring/archive/v0.2.57.tar.gz
+and run ./amalgamation.sh to generate the sources in this directory.
diff --git a/contrib/croaring/roaring.c b/contrib/croaring/roaring.c
new file mode 100644
index 00000000000..acf8ed9fee9
--- /dev/null
+++ b/contrib/croaring/roaring.c
@@ -0,0 +1,11093 @@
+/* auto-generated on Tue Dec 18 09:42:59 CST 2018. Do not edit! */
+#include "roaring.h"
+
+/* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
+#ifdef DMALLOC
+#include "dmalloc.h"
+#endif
+
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/array_util.c */
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+extern inline int32_t binarySearch(const uint16_t *array, int32_t lenarray,
+                                   uint16_t ikey);
+
+#ifdef USESSE4
+// used by intersect_vector16
+ALIGNED(0x1000)
+static const uint8_t shuffle_mask16[] = {
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 10,   11,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    10,   11,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    10,   11,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    10,   11,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    10,   11,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    10,   11,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    10,   11,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
+    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,
+    0xFF, 0xFF, 0xFF, 0xFF, 12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    8,    9,    12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    8,    9,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    8,    9,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    8,    9,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    10,   11,   12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    10,   11,   12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    10,   11,   12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    10,   11,   12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    10,   11,   12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
+    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
+    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    8,    9,    10,   11,   12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    10,   11,
+    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    10,   11,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    10,   11,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    10,   11,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    10,   11,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    10,   11,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    8,    9,    10,   11,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    8,    9,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    8,    9,
+    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    8,    9,    10,   11,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    8,    9,
+    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    6,    7,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
+    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    6,    7,    8,    9,    12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    6,    7,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    6,    7,    8,    9,    12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
+    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 10,   11,   12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    4,    5,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    10,   11,   12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    6,    7,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    10,   11,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    4,    5,    6,    7,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    10,   11,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    4,    5,    6,    7,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
+    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    10,   11,   12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    2,    3,    8,    9,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    10,   11,   12,   13,
+    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
+    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
+    2,    3,    4,    5,    8,    9,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 6,    7,    8,    9,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0,    1,    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    10,   11,
+    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
+    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
+    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,
+    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
+    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 2,    3,    4,    5,
+    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
+    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,
+    12,   13,   14,   15};
+
+/**
+ * Writes the values found in both A (s_a entries) and B (s_b entries) to C and returns how many; inputs are expected sorted.
+ * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions. Optimized by D. Lemire on May 3rd 2013.
+ */
+int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
+                           const uint16_t *__restrict__ B, size_t s_b,
+                           uint16_t *C) {
+    size_t count = 0;
+    size_t i_a = 0, i_b = 0;
+    const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);  // 8 values per 128-bit vector
+    const size_t st_a = (s_a / vectorlength) * vectorlength;  // s_a rounded down to whole vectors
+    const size_t st_b = (s_b / vectorlength) * vectorlength;  // s_b rounded down to whole vectors
+    __m128i v_a, v_b;
+    if ((i_a < st_a) && (i_b < st_b)) {
+        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+        while ((A[i_a] == 0) || (B[i_b] == 0)) {  // explicit-length compare as long as a block begins with 0
+            const __m128i res_v = _mm_cmpestrm(
+                v_b, vectorlength, v_a, vectorlength,
+                _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+            const int r = _mm_extract_epi32(res_v, 0);  // bit k set: lane k of v_a also occurs in v_b
+            __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 + r);  // byte shuffle that packs the flagged lanes first
+            __m128i p = _mm_shuffle_epi8(v_a, sm16);
+            _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow: a full vector is stored, only popcnt(r) values are kept
+            count += _mm_popcnt_u32(r);
+            const uint16_t a_max = A[i_a + vectorlength - 1];
+            const uint16_t b_max = B[i_b + vectorlength - 1];
+            if (a_max <= b_max) {  // the current block of A is used up
+                i_a += vectorlength;
+                if (i_a == st_a) break;
+                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+            }
+            if (b_max <= a_max) {  // the current block of B is used up
+                i_b += vectorlength;
+                if (i_b == st_b) break;
+                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+            }
+        }
+        if ((i_a < st_a) && (i_b < st_b))  // neither block begins with 0 here: same loop with the implicit-length compare
+            while (true) {
+                const __m128i res_v = _mm_cmpistrm(
+                    v_b, v_a,
+                    _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+                const int r = _mm_extract_epi32(res_v, 0);  // bit k set: lane k of v_a also occurs in v_b
+                __m128i sm16 =
+                    _mm_load_si128((const __m128i *)shuffle_mask16 + r);
+                __m128i p = _mm_shuffle_epi8(v_a, sm16);
+                _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow: a full vector is stored, only popcnt(r) values are kept
+                count += _mm_popcnt_u32(r);
+                const uint16_t a_max = A[i_a + vectorlength - 1];
+                const uint16_t b_max = B[i_b + vectorlength - 1];
+                if (a_max <= b_max) {  // the current block of A is used up
+                    i_a += vectorlength;
+                    if (i_a == st_a) break;
+                    v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+                }
+                if (b_max <= a_max) {  // the current block of B is used up
+                    i_b += vectorlength;
+                    if (i_b == st_b) break;
+                    v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+                }
+            }
+    }
+    // intersect the tail using scalar intersection (also the only path when an input has fewer than 8 values)
+    while (i_a < s_a && i_b < s_b) {
+        uint16_t a = A[i_a];
+        uint16_t b = B[i_b];
+        if (a < b) {
+            i_a++;
+        } else if (b < a) {
+            i_b++;
+        } else {
+            C[count] = a;  //==b;
+            count++;
+            i_a++;
+            i_b++;
+        }
+    }
+    return (int32_t)count;
+}
+
+int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A,
+                                       size_t s_a,
+                                       const uint16_t *__restrict__ B,
+                                       size_t s_b) {  // returns how many values A and B have in common; nothing is stored
+    size_t count = 0;
+    size_t i_a = 0, i_b = 0;
+    const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);  // 8 values per 128-bit vector
+    const size_t st_a = (s_a / vectorlength) * vectorlength;  // s_a rounded down to whole vectors
+    const size_t st_b = (s_b / vectorlength) * vectorlength;  // s_b rounded down to whole vectors
+    __m128i v_a, v_b;
+    if ((i_a < st_a) && (i_b < st_b)) {
+        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+        while ((A[i_a] == 0) || (B[i_b] == 0)) {  // explicit-length compare as long as a block begins with 0
+            const __m128i res_v = _mm_cmpestrm(
+                v_b, vectorlength, v_a, vectorlength,
+                _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+            const int r = _mm_extract_epi32(res_v, 0);  // bit k set: lane k of v_a also occurs in v_b
+            count += _mm_popcnt_u32(r);
+            const uint16_t a_max = A[i_a + vectorlength - 1];
+            const uint16_t b_max = B[i_b + vectorlength - 1];
+            if (a_max <= b_max) {  // the current block of A is used up
+                i_a += vectorlength;
+                if (i_a == st_a) break;
+                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+            }
+            if (b_max <= a_max) {  // the current block of B is used up
+                i_b += vectorlength;
+                if (i_b == st_b) break;
+                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+            }
+        }
+        if ((i_a < st_a) && (i_b < st_b))  // neither block begins with 0 here: same loop with the implicit-length compare
+            while (true) {
+                const __m128i res_v = _mm_cmpistrm(
+                    v_b, v_a,
+                    _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+                const int r = _mm_extract_epi32(res_v, 0);  // bit k set: lane k of v_a also occurs in v_b
+                count += _mm_popcnt_u32(r);
+                const uint16_t a_max = A[i_a + vectorlength - 1];
+                const uint16_t b_max = B[i_b + vectorlength - 1];
+                if (a_max <= b_max) {  // the current block of A is used up
+                    i_a += vectorlength;
+                    if (i_a == st_a) break;
+                    v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+                }
+                if (b_max <= a_max) {  // the current block of B is used up
+                    i_b += vectorlength;
+                    if (i_b == st_b) break;
+                    v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+                }
+            }
+    }
+    // intersect the tail using scalar intersection (also the only path when an input has fewer than 8 values)
+    while (i_a < s_a && i_b < s_b) {
+        uint16_t a = A[i_a];
+        uint16_t b = B[i_b];
+        if (a < b) {
+            i_a++;
+        } else if (b < a) {
+            i_b++;
+        } else {
+            count++;
+            i_a++;
+            i_b++;
+        }
+    }
+    return (int32_t)count;
+}
+
+int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
+                            const uint16_t *__restrict__ B, size_t s_b,
+                            uint16_t *C) {
+    // Stores the values of A (s_a entries) that do not occur in B (s_b entries) into C and returns how many; degenerate cases first
+    if (s_a == 0) return 0;
+    if (s_b == 0) {
+        if (A != C) memcpy(C, A, sizeof(uint16_t) * s_a);
+        return (int32_t)s_a;
+    }
+    // handle the leading zeroes, it is messy but it allows us to use the fast
+    // _mm_cmpistrm intrinsic safely
+    int32_t count = 0;
+    if ((A[0] == 0) || (B[0] == 0)) {
+        if ((A[0] == 0) && (B[0] == 0)) {
+            A++;
+            s_a--;
+            B++;
+            s_b--;
+        } else if (A[0] == 0) {
+            C[count++] = 0;
+            A++;
+            s_a--;
+        } else {
+            B++;
+            s_b--;
+        }
+    }
+    // at this point, A and B (either may have become empty) are expected
+    // to hold non-zero increasing values.
+    size_t i_a = 0, i_b = 0;
+    const size_t vectorlength = sizeof(__m128i) / sizeof(uint16_t);
+    const size_t st_a = (s_a / vectorlength) * vectorlength;
+    const size_t st_b = (s_b / vectorlength) * vectorlength;
+    if ((i_a < st_a) && (i_b < st_b)) {  // this is the vectorized code path
+        __m128i v_a, v_b;                //, v_bmax;
+        // we load a vector from A and a vector from B
+        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+        // we have a runningmask which indicates which values from A have been
+        // spotted in B, these don't get written out.
+        __m128i runningmask_a_found_in_b = _mm_setzero_si128();
+        /****
+        * start of the main vectorized loop
+        *****/
+        while (true) {
+            // a_found_in_b will contain a mask indicating, for each entry
+            // of the current vector of A, whether it is seen
+            // in the current vector of B
+            const __m128i a_found_in_b =
+                _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY |
+                                           _SIDD_BIT_MASK);
+            runningmask_a_found_in_b =
+                _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
+            // we always compare the last values of A and B
+            const uint16_t a_max = A[i_a + vectorlength - 1];
+            const uint16_t b_max = B[i_b + vectorlength - 1];
+            if (a_max <= b_max) {
+                // Ok. In this code path, we are ready to write our v_a
+                // because there is no need to read more from B, they will
+                // all be large values.
+                const int bitmask_belongs_to_difference =
+                    _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;  // flip the 8 lane bits: keep what was NOT found
+                /*** next few lines are probably expensive *****/
+                __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
+                                              bitmask_belongs_to_difference);
+                __m128i p = _mm_shuffle_epi8(v_a, sm16);
+                _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow: a full vector is stored, only popcnt values are kept
+                count += _mm_popcnt_u32(bitmask_belongs_to_difference);
+                // we advance a
+                i_a += vectorlength;
+                if (i_a == st_a)  // no more
+                    break;
+                runningmask_a_found_in_b = _mm_setzero_si128();
+                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+            }
+            if (b_max <= a_max) {
+                // in this code path, the current v_b has become useless
+                i_b += vectorlength;
+                if (i_b == st_b) break;
+                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+            }
+        }
+        // at this point, either we have i_a == st_a, which is the end of the
+        // vectorized processing,
+        // or we have i_b == st_b,  and we are not done processing the vector...
+        // so we need to finish it off.
+        if (i_a < st_a) {        // we have unfinished business...
+            uint16_t buffer[8];  // buffer to do a masked load
+            memset(buffer, 0, 8 * sizeof(uint16_t));  // lanes not covered by the copy below stay 0
+            memcpy(buffer, B + i_b, (s_b - i_b) * sizeof(uint16_t));  // i_b == st_b here, so fewer than 8 values are copied
+            v_b = _mm_lddqu_si128((__m128i *)buffer);
+            const __m128i a_found_in_b =
+                _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY |
+                                           _SIDD_BIT_MASK);
+            runningmask_a_found_in_b =
+                _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
+            const int bitmask_belongs_to_difference =
+                _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;  // flip the 8 lane bits: keep what was NOT found
+            __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
+                                          bitmask_belongs_to_difference);
+            __m128i p = _mm_shuffle_epi8(v_a, sm16);
+            _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow: a full vector is stored, only popcnt values are kept
+            count += _mm_popcnt_u32(bitmask_belongs_to_difference);
+            i_a += vectorlength;
+        }
+        // at this point i_a == st_a or i_b == st_b (a pending vector of A, if any, was flushed just above)
+    }
+    // do the tail using scalar code
+    while (i_a < s_a && i_b < s_b) {
+        uint16_t a = A[i_a];
+        uint16_t b = B[i_b];
+        if (b < a) {
+            i_b++;
+        } else if (a < b) {
+            C[count] = a;
+            count++;
+            i_a++;
+        } else {  //==
+            i_a++;
+            i_b++;
+        }
+    }
+    if (i_a < s_a) {  // B is used up: whatever is left of A belongs to the difference
+        memmove(C + count, A + i_a, sizeof(uint16_t) * (s_a - i_a));
+        count += (int32_t)(s_a - i_a);
+    }
+    return count;
+}
+
+#endif  // USESSE4
+
+
+
+#ifdef USE_OLD_SKEW_INTERSECT
+// TODO: given enough experience with the new skew intersect, drop the old one from the code base.
+
+
+/* Computes the intersection between one small and one large set of uint16_t.
+ * Stores the result into buffer and returns the number of elements. */
+int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s,
+                                const uint16_t *large, size_t size_l,
+                                uint16_t *buffer) {
+    size_t pos = 0, idx_l = 0, idx_s = 0;
+    // small[0] and large[0] are read right below: bail out if either is empty
+    if (0 == size_s || 0 == size_l) {
+        return 0;
+    }
+
+    uint16_t val_l = large[idx_l], val_s = small[idx_s];
+    // NOTE(review): advanceUntil is defined elsewhere; presumably it returns the next index whose value is >= val_s, or size_l
+    while (true) {
+        if (val_l < val_s) {
+            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
+            if (idx_l == size_l) break;  // large is used up
+            val_l = large[idx_l];
+        } else if (val_s < val_l) {
+            idx_s++;
+            if (idx_s == size_s) break;  // small is used up
+            val_s = small[idx_s];
+        } else {
+            buffer[pos++] = val_s;  // value present in both sets
+            idx_s++;
+            if (idx_s == size_s) break;
+            val_s = small[idx_s];
+            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
+            if (idx_l == size_l) break;
+            val_l = large[idx_l];
+        }
+    }
+
+    return (int32_t)pos;
+}
+#else // USE_OLD_SKEW_INTERSECT
+
+
+/**
+* Branchless binary search going after 4 values at once.
+* Assumes that array (n entries) is sorted in increasing order.
+* On return array[*index1] >= target1, array[*index2] >= target2, ...
+* except when *index1 == n, in which case all values in array are
+* smaller than target1, and so forth (an empty array yields 0 == n).
+* It has logarithmic complexity.
+*/
+static void binarySearch4(const uint16_t *array, int32_t n, uint16_t target1,
+                   uint16_t target2, uint16_t target3, uint16_t target4,
+                   int32_t *index1, int32_t *index2, int32_t *index3,
+                   int32_t *index4) {
+  const uint16_t *base1 = array, *base2 = array;
+  const uint16_t *base3 = array, *base4 = array;
+  if (n == 0) {  // nothing may be dereferenced: report "all smaller" as index n == 0
+    *index1 = *index2 = *index3 = *index4 = 0;
+    return;
+  }
+  while (n > 1) {  // each round halves the window kept for every target
+    int32_t half = n >> 1;
+    base1 = (base1[half] < target1) ? &base1[half] : base1;
+    base2 = (base2[half] < target2) ? &base2[half] : base2;
+    base3 = (base3[half] < target3) ? &base3[half] : base3;
+    base4 = (base4[half] < target4) ? &base4[half] : base4;
+    n -= half;
+  }
+  *index1 = (int32_t)((*base1 < target1) + base1 - array);
+  *index2 = (int32_t)((*base2 < target2) + base2 - array);
+  *index3 = (int32_t)((*base3 < target3) + base3 - array);
+  *index4 = (int32_t)((*base4 < target4) + base4 - array);
+}
+
+/**
+* Branchless binary search going after 2 values at once.
+* Assumes that array (n entries) is sorted in increasing order.
+* On return array[*index1] >= target1 and array[*index2] >= target2,
+* except when *index1 == n, in which case all values in array are smaller
+* than target1, and so forth (an empty array yields 0 == n). Logarithmic time.
+*/
+static void binarySearch2(const uint16_t *array, int32_t n, uint16_t target1,
+                   uint16_t target2, int32_t *index1, int32_t *index2) {
+  const uint16_t *base1 = array, *base2 = array;
+  if (n == 0) {  // nothing may be dereferenced: report "all smaller" as index n == 0
+    *index1 = *index2 = 0;
+    return;
+  }
+  while (n > 1) {  // each round halves the window kept for every target
+    int32_t half = n >> 1;
+    base1 = (base1[half] < target1) ? &base1[half] : base1;
+    base2 = (base2[half] < target2) ? &base2[half] : base2;
+    n -= half;
+  }
+  *index1 = (int32_t)((*base1 < target1) + base1 - array);
+  *index2 = (int32_t)((*base2 < target2) + base2 - array);
+}
+
+/* Intersects one small and one large increasing set of uint16_t: the common
+ * values are stored into buffer and their number is returned.
+ * The small set is consumed four values at a time (binarySearch4), then two
+ * (binarySearch2), then a final single value (binarySearch). This approach
+ * can be slightly superior to a conventional galloping search.
+ */
+int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s,
+                                         const uint16_t *large, size_t size_l,
+                                         uint16_t *buffer) {
+  size_t n_out = 0, at_l = 0, at_s = 0;
+  int32_t hit1 = 0, hit2 = 0, hit3 = 0, hit4 = 0;
+
+  if (size_s == 0) return 0;
+
+  // Quads: look up four consecutive small values in the part of large
+  // that starts at at_l.
+  for (; (at_s + 4 <= size_s) && (at_l < size_l); at_s += 4) {
+    const uint16_t want1 = small[at_s];
+    const uint16_t want2 = small[at_s + 1];
+    const uint16_t want3 = small[at_s + 2];
+    const uint16_t want4 = small[at_s + 3];
+    binarySearch4(large + at_l, (int32_t)(size_l - at_l), want1, want2,
+                  want3, want4, &hit1, &hit2, &hit3, &hit4);
+    // A hit is an offset from at_l; it counts as a match only when it lies
+    // inside large and the value stored there equals the wanted one.
+    if ((hit1 + at_l < size_l) && (large[at_l + hit1] == want1))
+      buffer[n_out++] = want1;
+    if ((hit2 + at_l < size_l) && (large[at_l + hit2] == want2))
+      buffer[n_out++] = want2;
+    if ((hit3 + at_l < size_l) && (large[at_l + hit3] == want3))
+      buffer[n_out++] = want3;
+    if ((hit4 + at_l < size_l) && (large[at_l + hit4] == want4))
+      buffer[n_out++] = want4;
+    at_l += hit1;  // the next search restarts at the first value's offset
+  }
+
+  // Pair: same scheme when at least two small values are left.
+  if ((at_s + 2 <= size_s) && (at_l < size_l)) {
+    const uint16_t want1 = small[at_s];
+    const uint16_t want2 = small[at_s + 1];
+    binarySearch2(large + at_l, (int32_t)(size_l - at_l), want1, want2,
+                  &hit1, &hit2);
+    if ((hit1 + at_l < size_l) && (large[at_l + hit1] == want1))
+      buffer[n_out++] = want1;
+    if ((hit2 + at_l < size_l) && (large[at_l + hit2] == want2))
+      buffer[n_out++] = want2;
+    at_s += 2;
+    at_l += hit1;
+  }
+
+  // Single leftover value: a binarySearch result >= 0 is taken as a match.
+  if ((at_s < size_s) && (at_l < size_l)) {
+    const uint16_t last = small[at_s];
+    if (binarySearch(large + at_l, (int32_t)(size_l - at_l), last) >= 0)
+      buffer[n_out++] = last;
+  }
+
+  return (int32_t)n_out;
+}
+
+
+#endif //USE_OLD_SKEW_INTERSECT
+
+
+/* Counts the values shared by one small and one large set of uint16_t; nothing is stored. */
+// TODO: this could be accelerated, possibly, by using binarySearch4 as above.
+int32_t intersect_skewed_uint16_cardinality(const uint16_t *small,
+                                            size_t size_s,
+                                            const uint16_t *large,
+                                            size_t size_l) {
+    size_t pos = 0, idx_l = 0, idx_s = 0;
+    // small[0] and large[0] are read right below: bail out if either is empty
+    if (0 == size_s || 0 == size_l) {
+        return 0;
+    }
+
+    uint16_t val_l = large[idx_l], val_s = small[idx_s];
+    while (true) {
+        if (val_l < val_s) {
+            // NOTE(review): advanceUntil is defined elsewhere; presumably the next index whose value is >= val_s, or size_l
+            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
+            if (idx_l == size_l) break;  // large is used up
+            val_l = large[idx_l];
+        } else if (val_s < val_l) {
+            idx_s++;
+            if (idx_s == size_s) break;  // small is used up
+            val_s = small[idx_s];
+        } else {
+            pos++;  // value present in both sets
+            idx_s++;
+            if (idx_s == size_s) break;
+            val_s = small[idx_s];
+            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
+            if (idx_l == size_l) break;
+            val_l = large[idx_l]; }
+    }
+
+    return (int32_t)pos;
+}
+
+/* Tells whether one small and one large set of uint16_t share at least one value. */
+bool intersect_skewed_uint16_nonempty(const uint16_t *small, size_t size_s,
+                                const uint16_t *large, size_t size_l) {
+    size_t idx_l = 0, idx_s = 0;
+    // small[0] and large[0] are read right below: bail out if either is empty
+    if (0 == size_s || 0 == size_l) {
+        return false;
+    }
+
+    uint16_t val_l = large[idx_l], val_s = small[idx_s];
+    while (true) {
+        if (val_l < val_s) {
+            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);  // NOTE(review): defined elsewhere
+            if (idx_l == size_l) break;  // large is used up
+            val_l = large[idx_l];
+        } else if (val_s < val_l) {
+            idx_s++;
+            if (idx_s == size_s) break;  // small is used up
+            val_s = small[idx_s];
+        } else {
+            return true;  // first common value found
+        }
+    }
+
+    return false;
+}
+
+/**
+ * Generic scalar intersection of two increasing uint16_t sequences.
+ * The values common to A (lenA entries) and B (lenB entries) are written
+ * to out; the return value is how many were written.
+ */
+int32_t intersect_uint16(const uint16_t *A, const size_t lenA,
+                         const uint16_t *B, const size_t lenB, uint16_t *out) {
+    const uint16_t *const stopA = A + lenA;
+    const uint16_t *const stopB = B + lenB;
+    uint16_t *cursor = out;
+    // Merge walk: the side with the smaller head value moves forward; on a
+    // tie the value is emitted and both sides move. Ends when a side runs out.
+    while (A != stopA && B != stopB) {
+        const uint16_t headA = *A;
+        const uint16_t headB = *B;
+        if (headA < headB) {
+            ++A;
+        } else if (headB < headA) {
+            ++B;
+        } else {
+            *cursor++ = headA;
+            ++A;
+            ++B;
+        }
+    }
+    return (int32_t)(cursor - out);
+}
+
+/* Counts the values that A (lenA entries) and B (lenB entries), both in
+ * increasing order, have in common. Nothing is written anywhere. */
+int32_t intersect_uint16_cardinality(const uint16_t *A, const size_t lenA,
+                                     const uint16_t *B, const size_t lenB) {
+    const uint16_t *const stopA = A + lenA;
+    const uint16_t *const stopB = B + lenB;
+    int32_t matches = 0;
+    // Merge walk: advance the side with the smaller head value; a tie is
+    // one common value and advances both sides.
+    while (A != stopA && B != stopB) {
+        const uint16_t headA = *A;
+        const uint16_t headB = *B;
+        if (headA < headB) {
+            ++A;
+        } else if (headB < headA) {
+            ++B;
+        } else {
+            ++matches;
+            ++A;
+            ++B;
+        }
+    }
+    return matches;
+}
+
+
+/* Tells whether A (lenA entries) and B (lenB entries), both in increasing
+ * order, share at least one value; stops at the first common value. */
+bool intersect_uint16_nonempty(const uint16_t *A, const size_t lenA,
+                         const uint16_t *B, const size_t lenB) {
+    const uint16_t *const stopA = A + lenA;
+    const uint16_t *const stopB = B + lenB;
+    // Merge walk: advance whichever side has the smaller head value until
+    // the heads agree or one input runs out.
+    while (A != stopA && B != stopB) {
+        const uint16_t headA = *A;
+        const uint16_t headB = *B;
+        if (headA == headB) {
+            return true;
+        }
+        if (headA < headB) {
+            ++A;
+        } else {
+            ++B;
+        }
+    }
+    return false;
+}
+
+
+
+/**
+ * Generic scalar intersection of two increasing uint32_t sequences.
+ * The values common to A (lenA entries) and B (lenB entries) are written
+ * to out; the return value is how many were written.
+ */
+size_t intersection_uint32(const uint32_t *A, const size_t lenA,
+                           const uint32_t *B, const size_t lenB,
+                           uint32_t *out) {
+    const uint32_t *const stopA = A + lenA;
+    const uint32_t *const stopB = B + lenB;
+    size_t written = 0;
+    // Merge walk: the side with the smaller head value moves forward; a
+    // tie is emitted and moves both sides forward.
+    while (A != stopA && B != stopB) {
+        const uint32_t headA = *A;
+        const uint32_t headB = *B;
+        if (headA < headB) {
+            ++A;
+        } else if (headB < headA) {
+            ++B;
+        } else {
+            out[written++] = headA;
+            ++A;
+            ++B;
+        }
+    }
+    return written;
+}
+
+/**
+ * Size of the intersection of two uint32_t arrays (assumed sorted ascending),
+ * computed merge-style without storing the common values.
+ */
+size_t intersection_uint32_card(const uint32_t *A, const size_t lenA,
+                                const uint32_t *B, const size_t lenB) {
+    if (lenA == 0 || lenB == 0) return 0;
+    const uint32_t *const stopA = A + lenA;
+    const uint32_t *const stopB = B + lenB;
+    size_t common = 0;
+
+    while (A != stopA && B != stopB) {
+        if (*A < *B) {
+            ++A;
+        } else if (*A > *B) {
+            ++B;
+        } else {
+            ++common;
+            ++A;
+            ++B;
+        }
+    }
+    return common;
+}
+
+// can one vectorize the computation of the union? (Update: Yes! See
+// union_vector16).
+
+/**
+ * Merges two uint16_t arrays (assumed sorted ascending) into buffer, writing
+ * a value present in both only once. Returns the number of values written.
+ */
+size_t union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
+                    size_t size_2, uint16_t *buffer) {
+    if (size_2 == 0) {
+        memmove(buffer, set_1, size_1 * sizeof(uint16_t));
+        return size_1;
+    }
+    if (size_1 == 0) {
+        memmove(buffer, set_2, size_2 * sizeof(uint16_t));
+        return size_2;
+    }
+
+    size_t i = 0, j = 0, written = 0;
+    uint16_t lhs = set_1[0], rhs = set_2[0];
+    for (;;) {
+        if (lhs == rhs) {
+            buffer[written++] = lhs;
+            ++i;
+            ++j;
+            if (i >= size_1 || j >= size_2) break;
+            lhs = set_1[i];
+            rhs = set_2[j];
+        } else if (lhs < rhs) {
+            buffer[written++] = lhs;
+            if (++i >= size_1) break;
+            lhs = set_1[i];
+        } else {
+            buffer[written++] = rhs;
+            if (++j >= size_2) break;
+            rhs = set_2[j];
+        }
+    }
+
+    // At most one side still has values; append them unchanged.
+    if (i < size_1) {
+        const size_t rest = size_1 - i;
+        memmove(buffer + written, set_1 + i, rest * sizeof(uint16_t));
+        written += rest;
+    } else if (j < size_2) {
+        const size_t rest = size_2 - j;
+        memmove(buffer + written, set_2 + j, rest * sizeof(uint16_t));
+        written += rest;
+    }
+    return written;
+}
+
+/**
+ * Writes to a_out the values of a1 that are absent from a2 (both scanned
+ * merge-style, so assumed sorted ascending) and returns how many it wrote.
+ */
+int difference_uint16(const uint16_t *a1, int length1, const uint16_t *a2,
+                      int length2, uint16_t *a_out) {
+    int out_card = 0, k1 = 0, k2 = 0;
+    if (length1 == 0) return 0;
+    if (length2 == 0) {
+        // memmove: a_out may overlap a1, matching the tail copies below
+        if (a1 != a_out) memmove(a_out, a1, sizeof(uint16_t) * length1);
+        return length1;
+    }
+    uint16_t s1 = a1[k1];
+    uint16_t s2 = a2[k2];
+    while (true) {
+        if (s1 < s2) {
+            a_out[out_card++] = s1;  // s1 is absent from a2: keep it
+            ++k1;
+            if (k1 >= length1) break;
+            s1 = a1[k1];
+        } else if (s1 == s2) {
+            ++k1;  // present in both: drop it
+            ++k2;
+            if (k1 >= length1) break;
+            if (k2 >= length2) {  // a2 exhausted: the rest of a1 survives
+                memmove(a_out + out_card, a1 + k1,
+                        sizeof(uint16_t) * (length1 - k1));
+                return out_card + length1 - k1;
+            }
+            s1 = a1[k1];
+            s2 = a2[k2];
+        } else {  // s1 > s2: skip this a2 value
+            ++k2;
+            if (k2 >= length2) {  // a2 exhausted: the rest of a1 survives
+                memmove(a_out + out_card, a1 + k1,
+                        sizeof(uint16_t) * (length1 - k1));
+                return out_card + length1 - k1;
+            }
+            s2 = a2[k2];
+        }
+    }
+    return out_card;
+}
+
+// Symmetric difference: writes to out the values found in exactly one of the
+// two arrays (merge-style scan, so both are assumed sorted ascending) and
+// returns how many values were written.
+int32_t xor_uint16(const uint16_t *array_1, int32_t card_1,
+                   const uint16_t *array_2, int32_t card_2, uint16_t *out) {
+    int32_t i = 0, j = 0, written = 0;
+    while (i < card_1 && j < card_2) {
+        const uint16_t lhs = array_1[i], rhs = array_2[j];
+        if (lhs < rhs) {
+            out[written++] = lhs;
+            ++i;
+        } else if (rhs < lhs) {
+            out[written++] = rhs;
+            ++j;
+        } else {  // shared value: emit nothing
+            ++i;
+            ++j;
+        }
+    }
+    if (i < card_1) {
+        const size_t rest = card_1 - i;
+        memcpy(out + written, array_1 + i, rest * sizeof(uint16_t));
+        written += (int32_t)rest;
+    } else if (j < card_2) {
+        const size_t rest = card_2 - j;
+        memcpy(out + written, array_2 + j, rest * sizeof(uint16_t));
+        written += (int32_t)rest;
+    }
+    return written;
+}
+
+#ifdef USESSE4
+
+/***
+ * start of the SIMD 16-bit union code
+ *
+ */
+
+// Merges two sorted vectors of eight unsigned 16-bit lanes each.
+//
+// Standard SIMD merge, developed originally for merge sort. See, e.g., Inoue
+// and Taura, "SIMD- and Cache-Friendly Algorithm for Sorting an Array of
+// Structures".
+//
+//   vInput1, vInput2  the two sorted input vectors
+//   vecMin            receives the lower part of the merged output
+//   vecMax            receives the upper part of the merged output
+//
+// Taken together, the output is sorted going from *vecMin all the way to
+// *vecMax. Seven rotate-and-compare rounds plus one final rotation are used,
+// all on SSE registers.
+static inline void sse_merge(const __m128i *vInput1,
+                             const __m128i *vInput2,              // input 1 & 2
+                             __m128i *vecMin, __m128i *vecMax) {  // output
+    // Lane-wise minima and maxima of the two inputs. Both inputs are read
+    // here, before any store, so an output may be the same object as an input.
+    __m128i low = _mm_min_epu16(*vInput1, *vInput2);
+    __m128i high = _mm_max_epu16(*vInput1, *vInput2);
+
+    // Each round rotates the low vector by one 16-bit lane (2 bytes) and then
+    // compare-exchanges it lane-wise against the high vector.
+    for (int step = 0; step < 7; ++step) {
+        const __m128i rotated = _mm_alignr_epi8(low, low, 2);
+        low = _mm_min_epu16(rotated, high);
+        high = _mm_max_epu16(rotated, high);
+    }
+
+    // One more rotation brings the total to eight lanes, i.e. a full turn of
+    // the 128-bit register.
+    *vecMin = _mm_alignr_epi8(low, low, 2);
+    *vecMax = high;
+}
+
+// 256 16-byte shuffle masks used by store_unique, generated by simdunion.py
+static uint8_t uniqshuf[] = {
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xc,  0xd,  0xe,  0xf,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xc,  0xd,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xc,  0xd,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xe,  0xf,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xe,  0xf,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
+    0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xa,  0xb,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xc,  0xd,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xc,  0xd,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
+    0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
+    0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
+    0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
+    0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
+    0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x2,  0x3,  0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0x0,  0x1,  0x2,  0x3,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF};
+
+// Writes the lanes of newval that differ from the lane just before them (lane 0
+// is compared against the last lane of old, the previously written vector).
+static inline int store_unique(__m128i old, __m128i newval, uint16_t *output) {
+    // predecessors = { old[7], newval[0], ..., newval[6] }
+    const __m128i predecessors = _mm_alignr_epi8(newval, old, 16 - 2);
+    const __m128i repeats = _mm_cmpeq_epi16(predecessors, newval);
+    // narrow each 16-bit compare result to one byte so the mask has one bit per lane
+    const int dupmask =
+        _mm_movemask_epi8(_mm_packs_epi16(repeats, _mm_setzero_si128()));
+    const __m128i shuffle = _mm_lddqu_si128((const __m128i *)uniqshuf + dupmask);
+    _mm_storeu_si128((__m128i *)output, _mm_shuffle_epi8(newval, shuffle));
+    return 8 - _mm_popcnt_u32(dupmask);  // lanes kept; the store itself is always 16 bytes
+}
+
+// In-place removal of adjacent duplicates from out[0..len): keeps the first of
+// each run and returns the new length (0 when len is 0; out is then untouched).
+static inline uint32_t unique(uint16_t *out, uint32_t len) {
+    if (len == 0) return 0;  // nothing to keep; do not report a phantom element
+    uint32_t pos = 1;        // out[0] always survives
+    for (uint32_t i = 1; i < len; ++i) {
+        // pos never exceeds i, so the slots read here are still the original values
+        if (out[i] != out[i - 1]) out[pos++] = out[i];
+    }
+    return pos;
+}
+
+// qsort comparator ordering uint16_t values ascending (difference of the promoted ints)
+static int uint16_compare(const void *a, const void *b) {
+    return (int)*(const uint16_t *)a - (int)*(const uint16_t *)b;
+}
+
+// One-pass SSE union of array1 and array2 into output; returns the total count: values written by the vector loop plus what union_uint16 reports for the tail.
+uint32_t union_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
+                        const uint16_t *__restrict__ array2, uint32_t length2,
+                        uint16_t *__restrict__ output) {
+    if ((length1 < 8) || (length2 < 8)) {  // one side has no complete 8-value vector: scalar path only
+        return (uint32_t)union_uint16(array1, length1, array2, length2, output);
+    }
+    __m128i vA, vB, V, vecMin, vecMax;
+    __m128i laststore;  // the vector most recently handed to store_unique
+    uint16_t *initoutput = output;  // kept to compute how many values the vector part wrote
+    uint32_t len1 = length1 / 8;  // complete 8-value vectors in array1
+    uint32_t len2 = length2 / 8;  // complete 8-value vectors in array2
+    uint32_t pos1 = 0;  // next unread vector of array1
+    uint32_t pos2 = 0;  // next unread vector of array2
+    // prime the pipeline with the first vector of each input
+    vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
+    pos1++;
+    vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
+    pos2++;
+    sse_merge(&vA, &vB, &vecMin, &vecMax);  // presumably: 8 smallest -> vecMin, 8 largest -> vecMax (confirm in sse_merge)
+    laststore = _mm_set1_epi16(-1);  // every lane 0xFFFF: stands in for a "previous" vector on the first store
+    output += store_unique(laststore, vecMin, output);
+    laststore = vecMin;
+    if ((pos1 < len1) && (pos2 < len2)) {  // both inputs still have a complete vector
+        uint16_t curA, curB;
+        curA = array1[8 * pos1];  // first value of array1's next vector
+        curB = array2[8 * pos2];  // first value of array2's next vector
+        while (true) {
+            if (curA <= curB) {  // consume from the input whose next vector starts lower
+                V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
+                pos1++;
+                if (pos1 < len1) {
+                    curA = array1[8 * pos1];
+                } else {
+                    break;  // array1 has no complete vector left; V is merged after the loop
+                }
+            } else {
+                V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
+                pos2++;
+                if (pos2 < len2) {
+                    curB = array2[8 * pos2];
+                } else {
+                    break;  // array2 has no complete vector left; V is merged after the loop
+                }
+            }
+            sse_merge(&V, &vecMax, &vecMin, &vecMax);
+            output += store_unique(laststore, vecMin, output);
+            laststore = vecMin;
+        }
+        sse_merge(&V, &vecMax, &vecMin, &vecMax);  // merge the vector that was loaded just before the break
+        output += store_unique(laststore, vecMin, output);
+        laststore = vecMin;
+    }
+    // The remainder is finished with scalar code (could be improved?):
+    // the values still held in vecMax and the short tail of the exhausted
+    // input are gathered in a small buffer, sorted and deduplicated, then
+    // merged with everything left of the other input.
+    uint32_t len = (uint32_t)(output - initoutput);
+    uint16_t buffer[16];  // up to 8 values from vecMax plus up to 7 tail values
+    uint32_t leftoversize = store_unique(laststore, vecMax, buffer);
+    if (pos1 == len1) {  // array1 ran out of complete vectors
+        memcpy(buffer + leftoversize, array1 + 8 * pos1,
+               (length1 - 8 * len1) * sizeof(uint16_t));  // array1's tail (fewer than 8 values)
+        leftoversize += length1 - 8 * len1;
+        qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
+
+        leftoversize = unique(buffer, leftoversize);
+        len += (uint32_t)union_uint16(buffer, leftoversize, array2 + 8 * pos2,
+                                      length2 - 8 * pos2, output);  // merge with all of array2 not yet loaded
+    } else {  // otherwise array2 is the exhausted side
+        memcpy(buffer + leftoversize, array2 + 8 * pos2,
+               (length2 - 8 * len2) * sizeof(uint16_t));  // array2's tail (fewer than 8 values)
+        leftoversize += length2 - 8 * len2;
+        qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
+        leftoversize = unique(buffer, leftoversize);
+        len += (uint32_t)union_uint16(buffer, leftoversize, array1 + 8 * pos1,
+                                      length1 - 8 * pos1, output);  // merge with all of array1 not yet loaded
+    }
+    return len;
+}
+
+/**
+ * End of the SIMD 16-bit union code
+ *
+ */
+
+/**
+ * Start of SIMD 16-bit XOR code
+ */
+
+// XOR counterpart of store_unique: considers the sequence { old[7], newval[0..6] } and
+// writes only the values that differ from both their left and their right neighbour.
+static inline int store_unique_xor(__m128i old, __m128i newval,
+                                   uint16_t *output) {
+    __m128i vecTmp1 = _mm_alignr_epi8(newval, old, 16 - 4);  // { old[6], old[7], newval[0..5] }
+    __m128i vecTmp2 = _mm_alignr_epi8(newval, old, 16 - 2);  // { old[7], newval[0..6] }: the candidates
+    __m128i equalleft = _mm_cmpeq_epi16(vecTmp2, vecTmp1);  // candidate equals the value before it
+    __m128i equalright = _mm_cmpeq_epi16(vecTmp2, newval);  // candidate equals the value after it
+    __m128i equalleftoright = _mm_or_si128(equalleft, equalright);
+    int M = _mm_movemask_epi8(
+        _mm_packs_epi16(equalleftoright, _mm_setzero_si128()));  // one mask bit per dropped candidate
+    int numberofnewvalues = 8 - _mm_popcnt_u32(M);
+    __m128i key = _mm_lddqu_si128((const __m128i *)uniqshuf + M);  // shuffle pattern selected by mask M
+    __m128i val = _mm_shuffle_epi8(vecTmp2, key);
+    _mm_storeu_si128((__m128i *)output, val);  // always stores 16 bytes, whatever the count
+    return numberofnewvalues;
+}
+
+// In-place XOR-style compaction: a value equal to the one right before it is
+// not kept and also retracts the last kept value. Warning: assumes len > 0
+static inline uint32_t unique_xor(uint16_t *out, uint32_t len) {
+    uint32_t kept = 1;
+    for (uint32_t idx = 1; idx < len; ++idx) {
+        if (out[idx] == out[idx - 1])
+            --kept;  // the pair cancels out
+        else
+            out[kept++] = out[idx];
+    }
+    return kept;
+}
+
+// One-pass SSE xor of array1 and array2 into output (equal neighbouring values cancel each other); returns the output length.
+uint32_t xor_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
+                      const uint16_t *__restrict__ array2, uint32_t length2,
+                      uint16_t *__restrict__ output) {
+    if ((length1 < 8) || (length2 < 8)) {  // one side has no complete 8-value vector: scalar path only
+        return xor_uint16(array1, length1, array2, length2, output);
+    }
+    __m128i vA, vB, V, vecMin, vecMax;
+    __m128i laststore;  // the vector most recently handed to store_unique_xor
+    uint16_t *initoutput = output;  // kept to compute how many values the vector part wrote
+    uint32_t len1 = length1 / 8;  // complete 8-value vectors in array1
+    uint32_t len2 = length2 / 8;  // complete 8-value vectors in array2
+    uint32_t pos1 = 0;  // next unread vector of array1
+    uint32_t pos2 = 0;  // next unread vector of array2
+    // prime the pipeline with the first vector of each input
+    vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
+    pos1++;
+    vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
+    pos2++;
+    sse_merge(&vA, &vB, &vecMin, &vecMax);  // presumably: 8 smallest -> vecMin, 8 largest -> vecMax (confirm in sse_merge)
+    laststore = _mm_set1_epi16(-1);  // every lane 0xFFFF: stands in for a "previous" vector on the first store
+    uint16_t buffer[17];  // scratch for the scalar tail: 8 stored lanes + 1 extra lane + up to 7 tail values
+    output += store_unique_xor(laststore, vecMin, output);
+
+    laststore = vecMin;
+    if ((pos1 < len1) && (pos2 < len2)) {  // both inputs still have a complete vector
+        uint16_t curA, curB;
+        curA = array1[8 * pos1];  // first value of array1's next vector
+        curB = array2[8 * pos2];  // first value of array2's next vector
+        while (true) {
+            if (curA <= curB) {  // consume from the input whose next vector starts lower
+                V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
+                pos1++;
+                if (pos1 < len1) {
+                    curA = array1[8 * pos1];
+                } else {
+                    break;  // array1 has no complete vector left; V is merged after the loop
+                }
+            } else {
+                V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
+                pos2++;
+                if (pos2 < len2) {
+                    curB = array2[8 * pos2];
+                } else {
+                    break;  // array2 has no complete vector left; V is merged after the loop
+                }
+            }
+            sse_merge(&V, &vecMax, &vecMin, &vecMax);
+            // store_unique_xor conditionally emits the last value of
+            // laststore and all but the last value of vecMin (the last one
+            // is judged on the next call, once its right neighbour is known)
+            output += store_unique_xor(laststore, vecMin, output);
+            laststore = vecMin;
+        }
+        sse_merge(&V, &vecMax, &vecMin, &vecMax);  // merge the vector that was loaded just before the break
+        // as in the loop: conditionally emits the last value of laststore
+        // and all but the last value of vecMin; that last value is carried
+        // over through laststore
+        output += store_unique_xor(laststore, vecMin, output);
+        laststore = vecMin;
+    }
+    uint32_t len = (uint32_t)(output - initoutput);
+
+    // The remainder is finished with scalar code (could be improved?).
+    // store_unique_xor conditionally emits the last value of laststore and
+    // all but the last value of vecMax; this time the destination is
+    // "buffer" rather than output, because these values still have to be
+    // combined with the unread tails of the inputs.
+    int leftoversize = store_unique_xor(laststore, vecMax, buffer);
+    uint16_t vec7 = _mm_extract_epi16(vecMax, 7);  // last lane of vecMax, not a candidate in the call above
+    uint16_t vec6 = _mm_extract_epi16(vecMax, 6);
+    if (vec7 != vec6) buffer[leftoversize++] = vec7;  // keep the last lane unless it repeats lane 6
+    if (pos1 == len1) {  // array1 ran out of complete vectors
+        memcpy(buffer + leftoversize, array1 + 8 * pos1,
+               (length1 - 8 * len1) * sizeof(uint16_t));  // array1's tail (fewer than 8 values)
+        leftoversize += length1 - 8 * len1;
+        if (leftoversize == 0) {  // trivial case: nothing pending, the rest of array2 is copied as is
+            memcpy(output, array2 + 8 * pos2,
+                   (length2 - 8 * pos2) * sizeof(uint16_t));
+            len += (length2 - 8 * pos2);
+        } else {
+            qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
+            leftoversize = unique_xor(buffer, leftoversize);  // equal neighbours in the sorted buffer cancel
+            len += xor_uint16(buffer, leftoversize, array2 + 8 * pos2,
+                              length2 - 8 * pos2, output);  // xor with all of array2 not yet loaded
+        }
+    } else {  // otherwise array2 is the exhausted side
+        memcpy(buffer + leftoversize, array2 + 8 * pos2,
+               (length2 - 8 * len2) * sizeof(uint16_t));  // array2's tail (fewer than 8 values)
+        leftoversize += length2 - 8 * len2;
+        if (leftoversize == 0) {  // trivial case: nothing pending, the rest of array1 is copied as is
+            memcpy(output, array1 + 8 * pos1,
+                   (length1 - 8 * pos1) * sizeof(uint16_t));
+            len += (length1 - 8 * pos1);
+        } else {
+            qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
+            leftoversize = unique_xor(buffer, leftoversize);  // equal neighbours in the sorted buffer cancel
+            len += xor_uint16(buffer, leftoversize, array1 + 8 * pos1,
+                              length1 - 8 * pos1, output);  // xor with all of array1 not yet loaded
+        }
+    }
+    return len;
+}
+
+/**
+ * End of SIMD 16-bit XOR code
+ */
+
+#endif  // USESSE4
+
+/* Two-way merge of set_1 and set_2 (expected in ascending order) into buffer.
+ * A value found at the head of both inputs is written once. Returns the
+ * number of values written; buffer must hold up to size_1 + size_2 values. */
+size_t union_uint32(const uint32_t *set_1, size_t size_1, const uint32_t *set_2,
+                    size_t size_2, uint32_t *buffer) {
+    /* an empty side makes the result a plain copy of the other one */
+    if (size_2 == 0) {
+        memmove(buffer, set_1, size_1 * sizeof(uint32_t));
+        return size_1;
+    }
+    if (size_1 == 0) {
+        memmove(buffer, set_2, size_2 * sizeof(uint32_t));
+        return size_2;
+    }
+
+    size_t i = 0, j = 0, written = 0;
+    uint32_t head_1 = set_1[0];
+    uint32_t head_2 = set_2[0];
+    /* each head is refilled only right after it has been consumed */
+    for (;;) {
+        if (head_1 == head_2) {
+            buffer[written++] = head_1;
+            ++i;
+            ++j;
+            if (i >= size_1 || j >= size_2) break;
+            head_1 = set_1[i];
+            head_2 = set_2[j];
+        } else if (head_1 < head_2) {
+            buffer[written++] = head_1;
+            if (++i >= size_1) break;
+            head_1 = set_1[i];
+        } else {
+            buffer[written++] = head_2;
+            if (++j >= size_2) break;
+            head_2 = set_2[j];
+        }
+    }
+
+    /* at most one input still has values; append them untouched */
+    if (i < size_1) {
+        memmove(buffer + written, set_1 + i, (size_1 - i) * sizeof(uint32_t));
+        written += size_1 - i;
+    } else if (j < size_2) {
+        memmove(buffer + written, set_2 + j, (size_2 - j) * sizeof(uint32_t));
+        written += size_2 - j;
+    }
+    return written;
+}
+
+/**
+ * Cardinality-only variant of union_uint32.
+ *
+ * Walks set_1 and set_2 (expected in ascending order) like a two-way merge
+ * but stores nothing.
+ *
+ * @param set_1, size_1  first input and its length
+ * @param set_2, size_2  second input and its length
+ * @return how many values the merged output would contain
+ */
+size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
+                         const uint32_t *set_2, size_t size_2) {
+    /* with an empty side the answer is the size of the other side */
+    if (size_2 == 0) return size_1;
+    if (size_1 == 0) return size_2;
+
+    size_t i = 0;     /* cursor in set_1 */
+    size_t j = 0;     /* cursor in set_2 */
+    size_t count = 0; /* values the merge has produced so far */
+
+    while (i < size_1 && j < size_2) {
+        const uint32_t head_1 = set_1[i];
+        const uint32_t head_2 = set_2[j];
+        ++count; /* every step yields exactly one output value */
+        if (head_1 < head_2) {
+            ++i;
+        } else if (head_2 < head_1) {
+            ++j;
+        } else {
+            /* shared value: consumed from both inputs, counted once */
+            ++i;
+            ++j;
+        }
+    }
+
+    /* whatever is left on the unfinished side is counted as is */
+    if (i < size_1) {
+        count += size_1 - i;
+    } else if (j < size_2) {
+        count += size_2 - j;
+    }
+    return count;
+}
+
+
+
+/* Union of two uint16_t arrays into buffer, passing the shorter input first
+ * (set_2 first on equal sizes) to union_vector16 when vector operations are
+ * enabled and to union_uint16 otherwise. Returns the callee's result. */
+size_t fast_union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
+                    size_t size_2, uint16_t *buffer) {
+    const uint16_t *shorter = set_2;
+    const uint16_t *longer = set_1;
+    size_t shorter_size = size_2;
+    size_t longer_size = size_1;
+    if (size_1 < size_2) {
+        shorter = set_1;
+        longer = set_2;
+        shorter_size = size_1;
+        longer_size = size_2;
+    }
+#ifdef ROARING_VECTOR_OPERATIONS_ENABLED
+    return union_vector16(shorter, (uint32_t)shorter_size, longer,
+                          (uint32_t)longer_size, buffer);
+#else
+    return union_uint16(shorter, shorter_size, longer, longer_size, buffer);
+#endif
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/array_util.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/bitset_util.c */
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#ifdef IS_X64
+static uint8_t lengthTable[256] = {  // lengthTable[b] is the number of 1 bits in the byte value b
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
+    2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
+    2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+    4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
+    3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+    4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+#endif  // IS_X64
+
+#ifdef USEAVX
+ALIGNED(32)
+static uint32_t vecDecodeTable[256][8] = {
+    {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
+    {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
+    {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
+    {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
+    {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
+    {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
+    {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
+    {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
+    {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
+    {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
+    {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
+    {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
+    {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
+    {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
+    {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
+    {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
+    {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
+    {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
+    {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
+    {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
+    {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
+    {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
+    {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
+    {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
+    {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
+    {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
+    {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
+    {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
+    {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
+    {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
+    {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
+    {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
+    {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
+    {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
+    {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
+    {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
+    {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
+    {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
+    {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
+    {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
+    {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
+    {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
+    {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
+    {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
+    {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
+    {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
+    {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
+    {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
+    {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
+    {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
+    {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
+    {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
+    {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
+    {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
+    {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
+    {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
+    {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
+    {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
+    {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
+    {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
+    {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
+    {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
+    {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
+    {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
+    {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
+    {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
+    {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
+    {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
+    {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
+    {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
+    {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
+    {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
+    {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
+    {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
+    {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
+    {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
+    {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
+    {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
+    {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
+    {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
+    {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
+    {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
+    {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
+    {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
+    {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
+    {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
+    {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
+    {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
+    {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
+    {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
+    {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
+    {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
+    {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
+    {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
+    {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
+    {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
+    {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
+    {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
+    {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
+    {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
+    {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
+    {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
+    {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
+    {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
+    {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
+    {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
+    {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
+    {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
+    {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
+    {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
+    {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
+    {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
+    {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
+    {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
+    {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
+    {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
+    {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
+    {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
+    {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
+    {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
+    {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
+    {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
+    {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
+    {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
+    {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
+    {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
+    {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
+    {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
+    {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
+    {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
+    {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
+    {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
+    {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
+    {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
+    {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
+    {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
+    {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
+    {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
+    {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
+    {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
+    {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
+    {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
+    {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
+    {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
+    {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
+    {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
+    {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
+    {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
+    {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
+    {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
+    {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
+    {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
+    {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
+    {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
+    {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
+    {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
+    {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
+    {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
+    {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
+    {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
+    {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
+    {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
+    {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
+    {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
+    {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
+    {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
+    {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
+    {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
+    {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
+    {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
+    {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
+    {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
+    {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
+    {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
+    {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
+    {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
+    {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
+    {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
+    {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
+    {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
+    {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
+    {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
+    {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
+    {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
+    {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
+    {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
+    {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
+    {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
+    {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
+    {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
+    {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
+    {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
+    {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
+    {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
+    {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
+    {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
+    {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
+    {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
+    {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
+    {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
+    {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
+    {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
+    {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
+    {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
+    {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
+    {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
+    {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
+    {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
+    {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
+    {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
+    {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
+    {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
+    {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
+    {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
+    {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
+    {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
+    {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
+    {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
+    {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
+    {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
+    {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
+    {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
+    {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
+    {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
+    {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
+    {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
+    {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
+    {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
+    {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
+    {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
+    {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
+    {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
+    {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
+    {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
+    {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
+    {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
+    {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
+    {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
+    {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
+    {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
+    {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
+    {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
+    {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
+    {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
+    {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
+    {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
+    {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
+    {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
+    {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
+    {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
+    {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
+    {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
+    {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
+    {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
+    {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
+    {1, 2, 3, 4, 5, 6, 7, 8}  /* 0xFF (11111111) */
+};
+
+#endif  // #ifdef USEAVX
+
+#ifdef IS_X64
+// 16-bit version of vecDecodeTable: row b lists the 1-based positions of the set bits of byte b, lowest first, zero-padded to 8 entries
+ALIGNED(32)
+static uint16_t vecDecodeTable_uint16[256][8] = {
+    {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
+    {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
+    {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
+    {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
+    {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
+    {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
+    {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
+    {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
+    {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
+    {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
+    {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
+    {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
+    {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
+    {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
+    {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
+    {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
+    {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
+    {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
+    {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
+    {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
+    {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
+    {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
+    {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
+    {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
+    {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
+    {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
+    {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
+    {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
+    {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
+    {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
+    {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
+    {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
+    {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
+    {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
+    {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
+    {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
+    {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
+    {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
+    {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
+    {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
+    {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
+    {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
+    {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
+    {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
+    {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
+    {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
+    {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
+    {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
+    {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
+    {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
+    {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
+    {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
+    {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
+    {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
+    {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
+    {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
+    {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
+    {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
+    {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
+    {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
+    {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
+    {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
+    {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
+    {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
+    {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
+    {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
+    {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
+    {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
+    {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
+    {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
+    {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
+    {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
+    {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
+    {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
+    {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
+    {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
+    {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
+    {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
+    {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
+    {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
+    {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
+    {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
+    {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
+    {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
+    {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
+    {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
+    {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
+    {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
+    {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
+    {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
+    {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
+    {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
+    {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
+    {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
+    {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
+    {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
+    {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
+    {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
+    {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
+    {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
+    {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
+    {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
+    {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
+    {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
+    {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
+    {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
+    {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
+    {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
+    {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
+    {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
+    {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
+    {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
+    {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
+    {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
+    {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
+    {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
+    {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
+    {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
+    {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
+    {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
+    {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
+    {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
+    {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
+    {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
+    {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
+    {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
+    {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
+    {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
+    {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
+    {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
+    {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
+    {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
+    {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
+    {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
+    {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
+    {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
+    {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
+    {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
+    {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
+    {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
+    {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
+    {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
+    {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
+    {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
+    {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
+    {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
+    {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
+    {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
+    {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
+    {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
+    {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
+    {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
+    {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
+    {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
+    {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
+    {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
+    {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
+    {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
+    {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
+    {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
+    {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
+    {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
+    {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
+    {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
+    {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
+    {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
+    {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
+    {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
+    {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
+    {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
+    {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
+    {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
+    {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
+    {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
+    {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
+    {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
+    {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
+    {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
+    {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
+    {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
+    {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
+    {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
+    {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
+    {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
+    {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
+    {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
+    {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
+    {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
+    {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
+    {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
+    {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
+    {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
+    {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
+    {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
+    {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
+    {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
+    {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
+    {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
+    {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
+    {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
+    {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
+    {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
+    {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
+    {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
+    {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
+    {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
+    {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
+    {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
+    {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
+    {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
+    {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
+    {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
+    {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
+    {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
+    {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
+    {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
+    {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
+    {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
+    {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
+    {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
+    {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
+    {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
+    {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
+    {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
+    {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
+    {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
+    {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
+    {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
+    {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
+    {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
+    {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
+    {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
+    {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
+    {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
+    {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
+    {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
+    {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
+    {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
+    {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
+    {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
+    {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
+    {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
+    {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
+    {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
+    {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
+    {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
+    {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
+    {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
+    {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
+    {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
+    {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
+    {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
+    {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
+    {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
+    {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
+    {1, 2, 3, 4, 5, 6, 7, 8}  /* 0xFF (11111111) */
+};
+
+#endif
+
+#ifdef USEAVX
+
+size_t bitset_extract_setbits_avx2(uint64_t *array, size_t length, void *vout,
+                                   size_t outcapacity, uint32_t base) {
+    uint32_t *out = (uint32_t *)vout;
+    uint32_t *initout = out;  // start of the output, used for the returned count
+    __m256i baseVec = _mm256_set1_epi32(base - 1);  // NOTE(review): presumably offsets 1-based vecDecodeTable entries; confirm
+    __m256i incVec = _mm256_set1_epi32(64);  // added when a whole word is zero
+    __m256i add8 = _mm256_set1_epi32(8);  // added after each decoded byte
+    uint32_t *safeout = out + outcapacity;  // one past the last usable slot
+    size_t i = 0;
+    for (; (i < length) && (out + 64 <= safeout); ++i) {  // vector path needs room for a full word
+        uint64_t w = array[i];
+        if (w == 0) {
+            baseVec = _mm256_add_epi32(baseVec, incVec);
+        } else {
+            for (int k = 0; k < 4; ++k) {  // two bytes per pass, 4 passes per word
+                uint8_t byteA = (uint8_t)w;
+                uint8_t byteB = (uint8_t)(w >> 8);
+                w >>= 16;
+                __m256i vecA =
+                    _mm256_load_si256((const __m256i *)vecDecodeTable[byteA]);
+                __m256i vecB =
+                    _mm256_load_si256((const __m256i *)vecDecodeTable[byteB]);
+                uint8_t advanceA = lengthTable[byteA];  // presumably the number of set bits in byteA; confirm
+                uint8_t advanceB = lengthTable[byteB];
+                vecA = _mm256_add_epi32(baseVec, vecA);
+                baseVec = _mm256_add_epi32(baseVec, add8);
+                vecB = _mm256_add_epi32(baseVec, vecB);
+                baseVec = _mm256_add_epi32(baseVec, add8);
+                _mm256_storeu_si256((__m256i *)out, vecA);  // always stores 8 lanes
+                out += advanceA;  // only advanceA lanes are kept; the next store overwrites the rest
+                _mm256_storeu_si256((__m256i *)out, vecB);
+                out += advanceB;
+            }
+        }
+    }
+    base += i * 64;  // the scalar tail resumes after the i words handled above
+    for (; (i < length) && (out < safeout); ++i) {  // bounds-checked scalar tail
+        uint64_t w = array[i];
+        while ((w != 0) && (out < safeout)) {
+            uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
+            int r = __builtin_ctzll(w); // on x64, should compile to TZCNT
+            uint32_t val = r + base;
+            memcpy(out, &val,
+                   sizeof(uint32_t));  // should be compiled as a MOV on x64
+            out++;
+            w ^= t;  // drop the bit that was just emitted
+        }
+        base += 64;
+    }
+    return out - initout;  // number of values written
+}
+#endif  // USEAVX
+
+/* Scalar decoder: for each set bit of the "length" words in "bitset", stores
+ * base + bit position into "vout" as a uint32_t. "vout" must be large enough
+ * for all set bits (no capacity check). Returns the number of values stored. */
+size_t bitset_extract_setbits(uint64_t *bitset, size_t length, void *vout,
+                              uint32_t base) {
+    uint32_t *dst = (uint32_t *)vout;
+    int written = 0;
+    for (size_t idx = 0; idx < length; ++idx, base += 64) {
+        uint64_t pending = bitset[idx];
+        // peel off the lowest set bit until the word is exhausted
+        for (; pending != 0; pending &= pending - 1) {
+            const uint32_t value = (uint32_t)__builtin_ctzll(pending) + base;
+            // should be compiled as a MOV on x64
+            memcpy(dst + written, &value, sizeof(uint32_t));
+            written++;
+        }
+    }
+    return written;
+}
+
+/* Like bitset_extract_setbits_uint16, applied to (bitset1 AND bitset2). */
+size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__ bitset1,
+                                                  const uint64_t * __restrict__ bitset2,
+                                                  size_t length, uint16_t *out,
+                                                  uint16_t base) {
+    int count = 0;
+    for (size_t idx = 0; idx < length; ++idx, base += 64) {
+        // only bits present in both inputs are decoded
+        uint64_t common = bitset1[idx] & bitset2[idx];
+        for (; common != 0; common &= common - 1) {
+            const int bit = __builtin_ctzll(common);
+            out[count] = bit + base;
+            count++;
+        }
+    }
+    return count;
+}
+
+#ifdef IS_X64
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "out" as 16-bit integers, values start at "base" (can
+ * be set to zero).
+ *
+ * At most "outcapacity" values are stored: the vector loop only runs while a
+ * whole word (up to 64 values) still fits, the scalar tail checks every store.
+ *
+ * Returns how many values were actually decoded.
+ *
+ * This function uses SSE decoding.
+ */
+size_t bitset_extract_setbits_sse_uint16(const uint64_t *bitset, size_t length,
+                                         uint16_t *out, size_t outcapacity,
+                                         uint16_t base) {
+    uint16_t *initout = out;
+    __m128i baseVec = _mm_set1_epi16(base - 1);  // NOTE(review): presumably offsets 1-based table entries; confirm
+    __m128i incVec = _mm_set1_epi16(64);  // added when a whole word is zero
+    __m128i add8 = _mm_set1_epi16(8);  // added after each decoded byte
+    uint16_t *safeout = out + outcapacity;
+    const int maxperword = 64;  // one pass of the loop below can emit up to 64 values
+    size_t i = 0;
+    for (; (i < length) && (safeout - out >= maxperword); ++i) {
+        uint64_t w = bitset[i];
+        if (w == 0) {
+            baseVec = _mm_add_epi16(baseVec, incVec);
+        } else {
+            for (int k = 0; k < 4; ++k) {  // two bytes per pass, 4 passes per word
+                uint8_t byteA = (uint8_t)w;
+                uint8_t byteB = (uint8_t)(w >> 8);
+                w >>= 16;
+                __m128i vecA = _mm_load_si128(
+                    (const __m128i *)vecDecodeTable_uint16[byteA]);
+                __m128i vecB = _mm_load_si128(
+                    (const __m128i *)vecDecodeTable_uint16[byteB]);
+                uint8_t advanceA = lengthTable[byteA];
+                uint8_t advanceB = lengthTable[byteB];
+                vecA = _mm_add_epi16(baseVec, vecA);
+                baseVec = _mm_add_epi16(baseVec, add8);
+                vecB = _mm_add_epi16(baseVec, vecB);
+                baseVec = _mm_add_epi16(baseVec, add8);
+                _mm_storeu_si128((__m128i *)out, vecA);  // always stores 8 lanes
+                out += advanceA;  // only advanceA lanes are kept
+                _mm_storeu_si128((__m128i *)out, vecB);
+                out += advanceB;
+            }
+        }
+    }
+    base += (uint16_t)(i * 64);  // the scalar tail resumes after the i words above
+    for (; (i < length) && (out < safeout); ++i) {
+        uint64_t w = bitset[i];
+        while ((w != 0) && (out < safeout)) {
+            uint64_t t = w & (~w + 1);  // isolate the lowest set bit
+            int r = __builtin_ctzll(w);
+            *out = r + base;
+            out++;
+            w ^= t;
+        }
+        base += 64;
+    }
+    return out - initout;
+}
+#endif
+
+/*
+ * Scalar decoder: for each of the "length" 64-bit words in "bitset", store
+ * base + (bit position) into "out" as a 16-bit integer for every set bit,
+ * lowest bit first ("base" can be set to zero).
+ *
+ * No capacity is passed in: "out" must have room for every set bit.
+ *
+ * Returns how many values were written.
+ */
+size_t bitset_extract_setbits_uint16(const uint64_t *bitset, size_t length,
+                                     uint16_t *out, uint16_t base) {
+    int count = 0;
+    // "base" moves forward by 64 for each input word
+    for (size_t idx = 0; idx < length; ++idx, base += 64) {
+        uint64_t pending = bitset[idx];
+        for (; pending != 0; pending &= pending - 1) {
+            // the lowest remaining set bit yields the next value
+            const int bit = __builtin_ctzll(pending);
+            out[count] = bit + base;
+            count++;
+        }
+    }
+    return count;
+}
+
+#if defined(ASMBITMANIPOPTIMIZATION)
+
+/* Set every bit listed in list[0..length); returns card plus the bits newly set. */
+uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
+                                  const uint16_t *list, uint64_t length) {
+    uint64_t offset, load, pos, shift = 6;
+    const uint16_t *end = list + length;
+    if (!length) return card;  // the asm loop below always runs at least once
+    // bts is not available as an intrinsic in GCC (TODO: could unroll, see bitset_set_list)
+    __asm volatile(
+        "1:\n"
+        "movzwq (%[list]), %[pos]\n"             /* pos = *list, zero-extended */
+        "shrx %[shift], %[pos], %[offset]\n"     /* offset = pos >> 6 (word index) */
+        "mov (%[bitset],%[offset],8), %[load]\n"
+        "bts %[pos], %[load]\n"                  /* set the bit; CF = previous value */
+        "mov %[load], (%[bitset],%[offset],8)\n"
+        "sbb $-1, %[card]\n"                     /* card = card + 1 - CF */
+        "add $2, %[list]\n"
+        "cmp %[list], %[end]\n"
+        "jnz 1b"
+        : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
+          [pos] "=&r"(pos), [offset] "=&r"(offset)
+        : [end] "r"(end), [bitset] "r"(bitset), [shift] "r"(shift)
+        : /* the asm stores through bitset, as in bitset_clear_list */ "memory");
+    return card;
+}
+
+/* Set every bit listed in list[0..length) in bitset; cardinality is not tracked. */
+void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) {
+    uint64_t pos, offset, load;
+    uint64_t shift = 6;  // pos >> 6 is the index of the 64-bit word
+    const uint16_t *end = list + length;
+    // Each asm statement stores through "bitset"; the "memory" clobber makes that
+    // visible to the compiler, as in bitset_clear_list. Unrolled four times.
+    for (; list + 3 < end; list += 4) {
+        pos = list[0];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[bitset],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[bitset],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos) : "memory");
+        pos = list[1];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[bitset],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[bitset],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos) : "memory");
+        pos = list[2];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[bitset],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[bitset],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos) : "memory");
+        pos = list[3];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[bitset],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[bitset],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos) : "memory");
+    }
+    // leftover entries (fewer than four)
+    while (list != end) {
+        pos = list[0];
+        __asm volatile(
+            "shrx %[shift], %[pos], %[offset]\n"
+            "mov (%[bitset],%[offset],8), %[load]\n"
+            "bts %[pos], %[load]\n"
+            "mov %[load], (%[bitset],%[offset],8)"
+            : [load] "=&r"(load), [offset] "=&r"(offset)
+            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos) : "memory");
+        list++;
+    }
+}
+
+uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
+                           uint64_t length) {
+    uint64_t offset, load, pos;
+    uint64_t shift = 6;  // pos >> 6 is the index of the 64-bit word
+    const uint16_t *end = list + length;
+    if (!length) return card;  // the asm loop below always runs at least once
+    // btr is not available as an intrinsic in GCC
+    __asm volatile(
+        "1:\n"
+        "movzwq (%[list]), %[pos]\n"             /* pos = *list, zero-extended */
+        "shrx %[shift], %[pos], %[offset]\n"     /* offset = pos >> 6 */
+        "mov (%[bitset],%[offset],8), %[load]\n"
+        "btr %[pos], %[load]\n"                  /* clear the bit; CF = previous value */
+        "mov %[load], (%[bitset],%[offset],8)\n"
+        "sbb $0, %[card]\n"                      /* card = card - CF */
+        "add $2, %[list]\n"                      /* next 16-bit entry */
+        "cmp %[list], %[end]\n"
+        "jnz 1b"
+        : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
+          [pos] "=&r"(pos), [offset] "=&r"(offset)
+        : [end] "r"(end), [bitset] "r"(bitset), [shift] "r"(shift)
+        :
+        /* clobbers: the asm stores through bitset */ "memory");
+    return card;  // card minus the number of bits that were actually set
+}
+
+#else
+/* Portable fallback: clears the listed bits, decrementing card for each hit. */
+uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
+                           uint64_t length) {
+    uint64_t *words = (uint64_t *)bitset;
+    for (uint64_t k = 0; k < length; ++k) {
+        const uint64_t value = list[k];
+        uint64_t *slot = &words[value >> 6];  // word holding the bit
+        const uint64_t bitpos = value & 63;   // position inside that word
+        const uint64_t before = *slot;
+        const uint64_t after = before & ~(UINT64_C(1) << bitpos);
+        // (before ^ after) holds at most the target bit: 1 after the shift if cleared
+        card -= (before ^ after) >> bitpos;
+        *slot = after;
+    }
+    return card;
+}
+
+/* Portable fallback: sets the listed bits, incrementing card for each new bit. */
+uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
+                                  const uint16_t *list, uint64_t length) {
+    uint64_t *words = (uint64_t *)bitset;
+    for (uint64_t k = 0; k < length; ++k) {
+        const uint64_t value = list[k];
+        uint64_t *slot = &words[value >> 6];  // word holding the bit
+        const uint64_t bitpos = value & 63;   // position inside that word
+        const uint64_t before = *slot;
+        const uint64_t after = before | (UINT64_C(1) << bitpos);
+        // (before ^ after) holds at most the target bit: 1 after the shift if newly set
+        card += (before ^ after) >> bitpos;
+        *slot = after;
+    }
+    return card;
+}
+
+/*
+ * Portable fallback: sets, in "bitset", every bit whose index is listed in
+ * list[0..length). Cardinality is not tracked.
+ */
+void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) {
+    uint64_t *words = (uint64_t *)bitset;
+    const uint16_t *stop = list + length;
+    for (const uint16_t *it = list; it != stop; ++it) {
+        const uint64_t value = *it;
+        // upper bits pick the 64-bit word, the low six bits pick the bit in it
+        words[value >> 6] |= UINT64_C(1) << (value & 63);
+    }
+}
+
+#endif
+
+/* flip specified bits */
+/* TODO: consider whether worthwhile to make an asm version */
+
+/*
+ * Toggle every bit listed in list[0..length) and keep "card" in step:
+ * +1 when a bit goes from 0 to 1, -1 when it goes from 1 to 0. Returns card.
+ */
+uint64_t bitset_flip_list_withcard(void *bitset, uint64_t card,
+                                   const uint16_t *list, uint64_t length) {
+    uint64_t *words = (uint64_t *)bitset;
+    for (uint64_t k = 0; k < length; ++k) {
+        const uint64_t value = list[k];
+        const uint64_t bitpos = value & 63;
+        uint64_t *slot = &words[value >> 6];
+        const uint64_t was_set = (*slot >> bitpos) & 1;
+        // todo: is a branch here all that bad?
+        card += 1 - 2 * was_set;  // +1 or -1 (via unsigned wrap-around)
+        *slot ^= UINT64_C(1) << bitpos;
+    }
+    return card;
+}
+
+/*
+ * Toggle every bit listed in list[0..length); an index that appears twice is
+ * flipped twice. Cardinality is not tracked.
+ */
+void bitset_flip_list(void *bitset, const uint16_t *list, uint64_t length) {
+    uint64_t *words = (uint64_t *)bitset;
+    const uint16_t *stop = list + length;
+    for (const uint16_t *it = list; it != stop; ++it) {
+        const uint64_t value = *it;
+        // upper bits pick the 64-bit word, the low six bits pick the bit in it
+        words[value >> 6] ^= UINT64_C(1) << (value & 63);
+    }
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/bitset_util.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/array.c */
+/*
+ * array.c
+ *
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+extern inline uint16_t array_container_minimum(const array_container_t *arr);
+extern inline uint16_t array_container_maximum(const array_container_t *arr);
+extern inline int array_container_index_equalorlarger(const array_container_t *arr, uint16_t x);
+
+extern inline int array_container_rank(const array_container_t *arr,
+                                       uint16_t x);
+extern inline bool array_container_contains(const array_container_t *arr,
+                                            uint16_t pos);
+extern int array_container_cardinality(const array_container_t *array);
+extern bool array_container_nonzero_cardinality(const array_container_t *array);
+extern void array_container_clear(array_container_t *array);
+extern int32_t array_container_serialized_size_in_bytes(int32_t card);
+extern bool array_container_empty(const array_container_t *array);
+extern bool array_container_full(const array_container_t *array);
+
+/* Create an empty array with room for "size" values. NULL if allocation fails. */
+array_container_t *array_container_create_given_capacity(int32_t size) {
+    array_container_t *result =
+        (array_container_t *)malloc(sizeof(array_container_t));
+    if (result == NULL) {
+        return NULL;
+    }
+
+    uint16_t *buffer = NULL;  // we don't want to rely on malloc(0)
+    if (size > 0) {
+        buffer = (uint16_t *)malloc(sizeof(uint16_t) * size);
+        if (buffer == NULL) {
+            free(result);  // release the header so nothing leaks
+            return NULL;
+        }
+    }
+
+    result->array = buffer;
+    result->capacity = size;  // stored as given, even when size <= 0
+    result->cardinality = 0;
+    return result;
+}
+
+/* Create a new array (default capacity). Return NULL in case of failure. */
+array_container_t *array_container_create(void) {
+    return array_container_create_given_capacity(ARRAY_DEFAULT_INIT_SIZE);
+}
+
+/* Create a new array containing all values in [min,max), in increasing order. */
+array_container_t * array_container_create_range(uint32_t min, uint32_t max) {
+    array_container_t * result = array_container_create_given_capacity(max - min + 1);
+    if(result == NULL) return NULL;
+    int32_t filled = 0;
+    // each value is narrowed to the container's 16-bit element type on store
+    for(uint32_t value = min; value < max; value++) result->array[filled++] = value;
+    result->cardinality = filled;
+    return result;
+}
+
+/* Duplicate container: same capacity and contents. NULL if allocation fails. */
+array_container_t *array_container_clone(const array_container_t *src) {
+    array_container_t *newcontainer =
+        array_container_create_given_capacity(src->capacity);
+    if (newcontainer == NULL) return NULL;
+    newcontainer->cardinality = src->cardinality;
+    // An empty container may have a NULL array; handing NULL to memcpy is
+    // undefined behavior even for a zero length, so only copy when non-empty.
+    if (src->cardinality > 0)
+        memcpy(newcontainer->array, src->array,
+               src->cardinality * sizeof(uint16_t));
+    return newcontainer;
+}
+
+/* Release unused capacity. Returns the number of slots given back (0 if none). */
+int array_container_shrink_to_fit(array_container_t *src) {
+    if (src->cardinality == src->capacity) return 0;  // nothing to do
+    uint16_t *shrunk = NULL;  // empty containers keep no buffer (no zero-size realloc)
+    if (src->cardinality == 0) {
+      free(src->array);
+    } else {
+      shrunk = (uint16_t *)realloc(src->array, src->cardinality * sizeof(uint16_t));
+      if (shrunk == NULL) return 0;  // realloc failed: old buffer is still valid, keep it
+    }
+    const int savings = src->capacity - src->cardinality;
+    src->array = shrunk;
+    src->capacity = src->cardinality;
+    return savings;
+}
+
+/* Free memory: the value buffer first (if any), then the container itself. */
+void array_container_free(array_container_t *arr) {
+    uint16_t *values = arr->array;
+    arr->array = NULL; // pedantic
+    // Jon Strabala reports that some tools complain without the NULL check
+    if(values != NULL) free(values);
+    free(arr);
+}
+
+static inline int32_t grow_capacity(int32_t capacity) {
+    if (capacity <= 0) return ARRAY_DEFAULT_INIT_SIZE;  // nothing allocated yet
+    if (capacity < 64) return capacity * 2;             // small: double
+    if (capacity < 1024) return capacity * 3 / 2;       // medium: grow by half
+    return capacity * 5 / 4;                            // large: grow by a quarter
+}
+
+static inline int32_t clamp(int32_t val, int32_t min, int32_t max) {
+    return (val >= min) ? ((val <= max) ? val : max) : min;  // min is tested first
+}
+
+/* Enlarge the buffer, asking for at least "min" slots; data kept only if "preserve". */
+void array_container_grow(array_container_t *container, int32_t min,
+                          bool preserve) {
+    // upper bound for the clamp: DEFAULT_MAX_SIZE unless min already exceeds it
+    const int32_t ceiling = (min > DEFAULT_MAX_SIZE) ? 65536 : DEFAULT_MAX_SIZE;
+    const int32_t wanted = clamp(grow_capacity(container->capacity), min, ceiling);
+    const size_t bytes = wanted * sizeof(uint16_t);
+    uint16_t *previous = container->array;
+
+    container->capacity = wanted;
+    if (!preserve) {
+        // Jon Strabala reports that some tools complain otherwise
+        if (previous != NULL) {
+          free(previous);
+        }
+        container->array = (uint16_t *)malloc(bytes);
+    } else {
+        container->array = (uint16_t *)realloc(previous, bytes);
+        if (container->array == NULL) free(previous);
+    }
+
+    //  handle the case where the allocation fails
+    if (container->array == NULL) {
+      fprintf(stderr, "could not allocate memory\n");
+    }
+    assert(container->array != NULL);
+}
+
+/* Copy one container into another. We assume that they are distinct. */
+void array_container_copy(const array_container_t *src,
+                          array_container_t *dst) {
+    const int32_t cardinality = src->cardinality;
+    if (cardinality > dst->capacity) array_container_grow(dst, cardinality, false);
+    dst->cardinality = cardinality;
+    // empty containers may have a NULL array; memcpy on NULL is undefined even for 0 bytes
+    if (cardinality > 0) {
+        memcpy(dst->array, src->array, cardinality * sizeof(uint16_t));
+    }
+}
+
+void array_container_add_from_range(array_container_t *arr, uint32_t min,
+                                    uint32_t max, uint16_t step) {
+    for (uint32_t next = min; next < max; next += step) {  // min, min + step, ...
+        array_container_append(arr, next);  // a zero step would never terminate
+    }
+}
+
+/* Writes the union of array_1 and array_2 into out, growing out if needed.
+ * out must be distinct from both array_1 and array_2.
+ */
+void array_container_union(const array_container_t *array_1,
+                           const array_container_t *array_2,
+                           array_container_t *out) {
+    const int32_t len_1 = array_1->cardinality;
+    const int32_t len_2 = array_2->cardinality;
+    // upper bound on the result size: no value shared between the inputs
+    const int32_t needed = len_1 + len_2;
+    if (needed > out->capacity) {
+      array_container_grow(out, needed, false);
+    }
+    out->cardinality = (int32_t)fast_union_uint16(array_1->array, len_1,
+                                                  array_2->array, len_2, out->array);
+}
+
+/* Writes the difference array_1 minus array_2 into out, growing out when its
+ * capacity is below array_1's cardinality.
+ * Array out does not need to be distinct from array_1
+ */
+void array_container_andnot(const array_container_t *array_1,
+                            const array_container_t *array_2,
+                            array_container_t *out) {
+    const int32_t card_1 = array_1->cardinality, card_2 = array_2->cardinality;
+    if (card_1 > out->capacity) {
+        array_container_grow(out, card_1, false);
+    }
+#ifdef ROARING_VECTOR_OPERATIONS_ENABLED
+    out->cardinality = difference_vector16(array_1->array, card_1,
+                                           array_2->array, card_2, out->array);
+#else
+    out->cardinality = difference_uint16(array_1->array, card_1,
+                                         array_2->array, card_2, out->array);
+#endif
+}
+
+/* Write the symmetric difference of array_1 and array_2 into out.
+ * out must be distinct from both inputs; it is grown up front to the sum of
+ * the two cardinalities, which is the largest possible result.
+ */
+void array_container_xor(const array_container_t *array_1,
+                         const array_container_t *array_2,
+                         array_container_t *out) {
+    const int32_t len_1 = array_1->cardinality;
+    const int32_t len_2 = array_2->cardinality;
+    const int32_t worst_case = len_1 + len_2;
+    if (worst_case > out->capacity) {
+        array_container_grow(out, worst_case, false);
+    }
+
+    int32_t result_len;
+#ifdef ROARING_VECTOR_OPERATIONS_ENABLED
+    result_len = xor_vector16(array_1->array, array_1->cardinality,
+                              array_2->array, array_2->cardinality,
+                              out->array);
+#else
+    result_len = xor_uint16(array_1->array, array_1->cardinality,
+                            array_2->array, array_2->cardinality,
+                            out->array);
+#endif
+    out->cardinality = result_len;
+}
+
+/* Return the smaller of two signed 32-bit integers. */
+static inline int32_t minimum_int32(int32_t a, int32_t b) {
+    if (b <= a) {
+        return b;
+    }
+    return a;
+}
+
+/* Write the intersection of array1 and array2 into out, which must be
+ * distinct from both inputs.
+ *
+ * When one input is more than `threshold` times larger than the other, the
+ * skewed routine is used with the smaller array passed first; otherwise the
+ * regular (or, under USEAVX, vectorized) routine is used.
+ * */
+void array_container_intersection(const array_container_t *array1,
+                                  const array_container_t *array2,
+                                  array_container_t *out) {
+    const int32_t card_1 = array1->cardinality;
+    const int32_t card_2 = array2->cardinality;
+    const int32_t min_card = minimum_int32(card_1, card_2);
+    const int threshold = 64;  // subject to tuning
+
+    if (out->capacity < min_card) {
+#ifdef USEAVX
+        // request one extra 128-bit register's worth of slots
+        array_container_grow(out, min_card + sizeof(__m128i) / sizeof(uint16_t),
+                             false);
+#else
+        array_container_grow(out, min_card, false);
+#endif
+    }
+
+    if (card_1 * threshold < card_2) {
+        out->cardinality = intersect_skewed_uint16(
+            array1->array, card_1, array2->array, card_2, out->array);
+        return;
+    }
+    if (card_2 * threshold < card_1) {
+        out->cardinality = intersect_skewed_uint16(
+            array2->array, card_2, array1->array, card_1, out->array);
+        return;
+    }
+#ifdef USEAVX
+    out->cardinality = intersect_vector16(
+        array1->array, card_1, array2->array, card_2, out->array);
+#else
+    out->cardinality = intersect_uint16(array1->array, card_1,
+                                        array2->array, card_2, out->array);
+#endif
+}
+
+/* Return the number of elements common to array1 and array2 without
+ * materializing the intersection. The skewed routine is chosen when one
+ * input is more than `threshold` times larger than the other.
+ * */
+int array_container_intersection_cardinality(const array_container_t *array1,
+                                             const array_container_t *array2) {
+    const int32_t card_1 = array1->cardinality;
+    const int32_t card_2 = array2->cardinality;
+    const int threshold = 64;  // subject to tuning
+
+    if (card_1 * threshold < card_2) {
+        return intersect_skewed_uint16_cardinality(array1->array, card_1,
+                                                   array2->array, card_2);
+    }
+    if (card_2 * threshold < card_1) {
+        return intersect_skewed_uint16_cardinality(array2->array, card_2,
+                                                   array1->array, card_1);
+    }
+#ifdef USEAVX
+    return intersect_vector16_cardinality(array1->array, card_1,
+                                          array2->array, card_2);
+#else
+    return intersect_uint16_cardinality(array1->array, card_1,
+                                        array2->array, card_2);
+#endif
+}
+
+/* Return true when array1 and array2 share at least one element. The skewed
+ * routine is chosen when one input is more than `threshold` times larger
+ * than the other. */
+bool array_container_intersect(const array_container_t *array1,
+                                  const array_container_t *array2) {
+    const int32_t card_1 = array1->cardinality;
+    const int32_t card_2 = array2->cardinality;
+    const int threshold = 64;  // subject to tuning
+
+    if (card_1 * threshold < card_2) {
+        return intersect_skewed_uint16_nonempty(
+            array1->array, card_1, array2->array, card_2);
+    }
+    if (card_2 * threshold < card_1) {
+        return intersect_skewed_uint16_nonempty(
+            array2->array, card_2, array1->array, card_1);
+    }
+    // we do not bother vectorizing
+    return intersect_uint16_nonempty(array1->array, card_1,
+                                     array2->array, card_2);
+}
+
+/* Intersect src_1 with src_2 and store the result back into src_1.
+ * src_1's own array is passed as the output buffer in every branch.
+ * */
+void array_container_intersection_inplace(array_container_t *src_1,
+                                          const array_container_t *src_2) {
+    // todo: can any of this be vectorized?
+    const int32_t len_1 = src_1->cardinality;
+    const int32_t len_2 = src_2->cardinality;
+    const int threshold = 64;  // subject to tuning
+
+    if (len_1 * threshold < len_2) {
+        // src_1 is much smaller: it goes first in the skewed routine
+        src_1->cardinality = intersect_skewed_uint16(
+            src_1->array, len_1, src_2->array, len_2, src_1->array);
+        return;
+    }
+    if (len_2 * threshold < len_1) {
+        // src_2 is much smaller: it goes first in the skewed routine
+        src_1->cardinality = intersect_skewed_uint16(
+            src_2->array, len_2, src_1->array, len_1, src_1->array);
+        return;
+    }
+    src_1->cardinality = intersect_uint16(
+        src_1->array, len_1, src_2->array, len_2, src_1->array);
+}
+
+/* Write every element of cont, offset by base, to vout as 32-bit integers.
+ * Returns the number of values written. */
+int array_container_to_uint32_array(void *vout, const array_container_t *cont,
+                                    uint32_t base) {
+    uint32_t *dest = (uint32_t *)vout;
+    int written = 0;
+    while (written < cont->cardinality) {
+        const uint32_t widened = base + cont->array[written];
+        // should be compiled as a MOV on x64
+        memcpy(dest + written, &widened, sizeof(uint32_t));
+        ++written;
+    }
+    return written;
+}
+
+/* Print the container to stdout as "{a,b,c}" ("{}" when it is empty). */
+void array_container_printf(const array_container_t *v) {
+    if (v->cardinality == 0) {
+        printf("{}");
+        return;
+    }
+    const uint16_t *cur = v->array;
+    const uint16_t *const stop = v->array + v->cardinality;
+
+    printf("{");
+    printf("%d", *cur);  // first element carries no leading comma
+    for (++cur; cur < stop; ++cur) {
+        printf(",%d", *cur);
+    }
+    printf("}");
+}
+
+/* Print the elements of v, each offset by base, as a comma-separated list of
+ * unsigned integers. Nothing is printed for an empty container. */
+void array_container_printf_as_uint32_array(const array_container_t *v,
+                                            uint32_t base) {
+    if (v->cardinality == 0) {
+        return;
+    }
+    const uint16_t *cur = v->array;
+    const uint16_t *const stop = v->array + v->cardinality;
+
+    printf("%u", *cur + base);  // first element carries no leading comma
+    for (++cur; cur < stop; ++cur) {
+        printf(",%u", *cur + base);
+    }
+}
+
+/* Count the runs of consecutive values stored in the container: a new run
+ * starts at every element that is not exactly one more than its predecessor. */
+int32_t array_container_number_of_runs(const array_container_t *a) {
+    // Can SIMD work here?
+    int32_t run_count = 0;
+    int32_t last = -2;  // sentinel: last + 1 can never equal a stored value
+    const uint16_t *const stop = a->array + a->cardinality;
+    const uint16_t *cur = a->array;
+    while (cur != stop) {
+        const int32_t value = *cur++;
+        if (value != last + 1) {
+            ++run_count;
+        }
+        last = value;
+    }
+    return run_count;
+}
+
+/* Serialize the container into buf as a 16-bit cardinality followed by the
+ * 16-bit elements. Returns the number of bytes written. */
+int32_t array_container_serialize(const array_container_t *container, char *buf) {
+    const uint16_t header = (uint16_t)container->cardinality;
+    const int32_t header_len = sizeof(header);
+    const int32_t payload_len = sizeof(uint16_t) * container->cardinality;
+
+    memcpy(buf, &header, header_len);
+    if (payload_len) {
+        memcpy(buf + header_len, container->array, payload_len);
+    }
+
+    return header_len + payload_len;
+}
+
+/**
+ * Copies the container's elements (without any header) to buf and returns
+ * array_container_size_in_bytes(container), which is expected to equal the
+ * number of bytes copied.
+ *
+ */
+int32_t array_container_write(const array_container_t *container, char *buf) {
+    const size_t payload_bytes = container->cardinality * sizeof(uint16_t);
+    memcpy(buf, container->array, payload_bytes);
+    return array_container_size_in_bytes(container);
+}
+
+/* Return true when both containers hold the same number of elements and the
+ * elements match position by position. */
+bool array_container_equals(const array_container_t *container1,
+                            const array_container_t *container2) {
+    const int32_t count = container1->cardinality;
+    if (count != container2->cardinality) {
+        return false;
+    }
+    // could be vectorized:
+    int32_t pos = 0;
+    while (pos < count && container1->array[pos] == container2->array[pos]) {
+        ++pos;
+    }
+    return pos >= count;
+}
+
+/* Return true when every element of container1 also appears in container2.
+ * Implemented as a single merge-style pass over both arrays (which assumes
+ * both are stored in increasing order). */
+bool array_container_is_subset(const array_container_t *container1,
+                               const array_container_t *container2) {
+    const int32_t n1 = container1->cardinality;
+    const int32_t n2 = container2->cardinality;
+    if (n1 > n2) {
+        return false;
+    }
+    int pos1 = 0, pos2 = 0;
+    while (pos1 < n1 && pos2 < n2) {
+        const uint16_t v1 = container1->array[pos1];
+        const uint16_t v2 = container2->array[pos2];
+        if (v1 < v2) {
+            return false;  // container2 has already moved past v1
+        }
+        if (v1 == v2) {
+            ++pos1;  // matched; move on to the next element to look for
+        }
+        ++pos2;
+    }
+    return pos1 == n1;
+}
+
+/* Load `cardinality` 16-bit elements from buf into the container, growing it
+ * first if needed. Returns array_container_size_in_bytes(container). */
+int32_t array_container_read(int32_t cardinality, array_container_t *container,
+                             const char *buf) {
+    if (cardinality > container->capacity) {
+        array_container_grow(container, cardinality, false);
+    }
+    memcpy(container->array, buf, cardinality * sizeof(uint16_t));
+    container->cardinality = cardinality;
+
+    return array_container_size_in_bytes(container);
+}
+
+/* Number of bytes array_container_serialize writes for this container: a
+ * 16-bit cardinality header plus one 16-bit slot per element. */
+uint32_t array_container_serialization_len(const array_container_t *container) {
+    /* container->cardinality converted to 16 bit */
+    const size_t header_bytes = sizeof(uint16_t);
+    const size_t payload_bytes = sizeof(uint16_t) * container->cardinality;
+    return header_bytes + payload_bytes;
+}
+
+/* Rebuild an array container from the layout written by
+ * array_container_serialize: a 16-bit cardinality followed by that many
+ * 16-bit values.
+ *
+ * Returns a newly allocated container, or NULL when
+ *   - buf_len is too short to hold the header,
+ *   - buf_len does not match the size implied by the encoded cardinality,
+ *   - the values are not strictly increasing (unsorted or duplicated
+ *     values would break the sorted-set layout the other routines merge
+ *     and search over), or
+ *   - an allocation fails.
+ */
+void *array_container_deserialize(const char *buf, size_t buf_len) {
+    uint16_t cardinality;
+    const size_t header_len = sizeof(cardinality);
+
+    if (buf_len < header_len) /* cardinality is stored on 16 bits */
+        return (NULL);
+    memcpy(&cardinality, buf, header_len);
+
+    const size_t payload_len = sizeof(uint16_t) * (size_t)cardinality;
+    if (payload_len != buf_len - header_len) return (NULL);
+
+    array_container_t *ptr =
+        (array_container_t *)malloc(sizeof(array_container_t));
+    if (ptr == NULL) return (NULL);
+
+    ptr->capacity = ptr->cardinality = (int32_t)cardinality;
+
+    /* malloc(0) is allowed to return NULL; always request at least one slot
+     * so that an empty container is not reported as an allocation failure. */
+    ptr->array =
+        (uint16_t *)malloc(payload_len ? payload_len : sizeof(uint16_t));
+    if (ptr->array == NULL) {
+        free(ptr);
+        return (NULL);
+    }
+
+    if (payload_len) memcpy(ptr->array, buf + header_len, payload_len);
+
+    /* Each value must be strictly greater than its predecessor. */
+    for (int32_t i = 1; i < ptr->cardinality; i++) {
+        if (ptr->array[i] <= ptr->array[i - 1]) {
+            free(ptr->array);
+            free(ptr);
+            return (NULL);
+        }
+    }
+
+    return (ptr);
+}
+
+/* Invoke iterator on every element (offset by base) in storage order.
+ * Stops and returns false as soon as the callback returns false; returns
+ * true once every element has been visited. */
+bool array_container_iterate(const array_container_t *cont, uint32_t base,
+                             roaring_iterator iterator, void *ptr) {
+    int idx = 0;
+    while (idx < cont->cardinality) {
+        if (!iterator(cont->array[idx] + base, ptr)) {
+            return false;
+        }
+        ++idx;
+    }
+    return true;
+}
+
+/* 64-bit variant of array_container_iterate: each element (offset by base)
+ * is OR-ed with high_bits before being handed to the callback. Stops and
+ * returns false as soon as the callback returns false. */
+bool array_container_iterate64(const array_container_t *cont, uint32_t base,
+                               roaring_iterator64 iterator, uint64_t high_bits,
+                               void *ptr) {
+    int idx = 0;
+    while (idx < cont->cardinality) {
+        const uint64_t low_bits = (uint64_t)(cont->array[idx] + base);
+        if (!iterator(high_bits | low_bits, ptr)) {
+            return false;
+        }
+        ++idx;
+    }
+    return true;
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/array.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/bitset.c */
+/*
+ * bitset.c
+ *
+ */
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* External declarations of small bitset helpers that are not defined in this
+ * part of the file.
+ * NOTE(review): these are presumably defined `inline` in the header, in which
+ * case the declarations below are what make this translation unit emit their
+ * out-of-line definitions (C99 inline semantics) -- confirm against the
+ * header before removing or reordering any of them. */
+extern int bitset_container_cardinality(const bitset_container_t *bitset);
+extern bool bitset_container_nonzero_cardinality(bitset_container_t *bitset);
+extern void bitset_container_set(bitset_container_t *bitset, uint16_t pos);
+extern void bitset_container_unset(bitset_container_t *bitset, uint16_t pos);
+extern inline bool bitset_container_get(const bitset_container_t *bitset,
+                                        uint16_t pos);
+extern int32_t bitset_container_serialized_size_in_bytes();
+extern bool bitset_container_add(bitset_container_t *bitset, uint16_t pos);
+extern bool bitset_container_remove(bitset_container_t *bitset, uint16_t pos);
+extern inline bool bitset_container_contains(const bitset_container_t *bitset,
+                                             uint16_t pos);
+
+/* Zero every word of the bitset and reset its cardinality to 0. */
+void bitset_container_clear(bitset_container_t *bitset) {
+    bitset->cardinality = 0;
+    memset(bitset->array, 0, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t));
+}
+
+/* Set every bit of the bitset and record the full cardinality (2^16). */
+void bitset_container_set_all(bitset_container_t *bitset) {
+    const size_t total_bytes = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
+    // memset only uses the low byte of its fill value, so 0xFF sets all bits
+    memset(bitset->array, 0xFF, total_bytes);
+    bitset->cardinality = (1 << 16);
+}
+
+
+
+/* Allocate an empty bitset container whose word array is 32-byte aligned.
+ * Returns NULL if either allocation fails. */
+bitset_container_t *bitset_container_create(void) {
+    bitset_container_t *fresh =
+        (bitset_container_t *)malloc(sizeof(bitset_container_t));
+    if (fresh == NULL) {
+        return NULL;
+    }
+
+    // sizeof(__m256i) == 32
+    fresh->array = (uint64_t *)aligned_malloc(
+        32, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+    if (fresh->array != NULL) {
+        bitset_container_clear(fresh);
+        return fresh;
+    }
+
+    free(fresh);
+    return NULL;
+}
+
+/* Copy the words and the cardinality of source into dest. The two containers
+ * are assumed to be distinct. */
+void bitset_container_copy(const bitset_container_t *source,
+                           bitset_container_t *dest) {
+    const size_t total_bytes = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
+    memcpy(dest->array, source->array, total_bytes);
+    dest->cardinality = source->cardinality;
+}
+
+/* Add min, min + step, min + 2 * step, ... (all values below max) to the
+ * bitset.
+ *
+ * Nothing is done for a zero step or for an empty range (min >= max): in the
+ * fast path below an empty range would otherwise compute (max - 1) / 64,
+ * which underflows for max == 0, and write stray mask words.
+ *
+ * When step divides 64 the same bit pattern repeats in every 64-bit word, so
+ * a single mask is built once and stamped over the affected words. That fast
+ * path assigns the cardinality and (when several words are touched) the
+ * words themselves instead of merging with existing content, so it is only
+ * correct when the affected part of the bitset is empty on entry. */
+void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min,
+                                     uint32_t max, uint16_t step) {
+    if (step == 0) return;   // refuse to crash
+    if (min >= max) return;  // empty range: nothing to add
+    if ((64 % step) == 0) {  // step divides 64
+        uint64_t mask = 0;   // construct the repeated mask
+        for (uint32_t value = (min % step); value < 64; value += step) {
+            mask |= ((uint64_t)1 << value);
+        }
+        uint32_t firstword = min / 64;
+        uint32_t endword = (max - 1) / 64;
+        bitset->cardinality = (max - min + step - 1) / step;
+        if (firstword == endword) {
+            // keep only the mask bits located in [min % 64, max % 64)
+            bitset->array[firstword] |=
+                mask & (((~UINT64_C(0)) << (min % 64)) &
+                        ((~UINT64_C(0)) >> ((~max + 1) % 64)));
+            return;
+        }
+        // first word: drop the bits below min
+        bitset->array[firstword] = mask & ((~UINT64_C(0)) << (min % 64));
+        for (uint32_t i = firstword + 1; i < endword; i++)
+            bitset->array[i] = mask;
+        // last word: drop the bits at or above max ((~max + 1) is -max)
+        bitset->array[endword] = mask & ((~UINT64_C(0)) >> ((~max + 1) % 64));
+    } else {
+        for (uint32_t value = min; value < max; value += step) {
+            bitset_container_add(bitset, value);
+        }
+    }
+}
+
+/* Release the word array (when present) and then the container itself. */
+void bitset_container_free(bitset_container_t *bitset) {
+    uint64_t *words = bitset->array;
+    // Jon Strabala reports that some tools complain without the NULL check
+    if (words != NULL) {
+        bitset->array = NULL;  // pedantic
+        aligned_free(words);
+    }
+    free(bitset);
+}
+
+/* Return a newly allocated copy of src (words and cardinality), or NULL if
+ * an allocation fails. */
+bitset_container_t *bitset_container_clone(const bitset_container_t *src) {
+    const size_t total_bytes = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
+
+    bitset_container_t *twin =
+        (bitset_container_t *)malloc(sizeof(bitset_container_t));
+    if (twin == NULL) {
+        return NULL;
+    }
+
+    // sizeof(__m256i) == 32
+    twin->array = (uint64_t *)aligned_malloc(32, total_bytes);
+    if (twin->array == NULL) {
+        free(twin);
+        return NULL;
+    }
+
+    memcpy(twin->array, src->array, total_bytes);
+    twin->cardinality = src->cardinality;
+    return twin;
+}
+
+/* Apply bitset_set_range(begin, end) to the container's words, then
+ * recompute the cardinality from scratch. */
+void bitset_container_set_range(bitset_container_t *bitset, uint32_t begin,
+                                uint32_t end) {
+    bitset_set_range(bitset->array, begin, end);
+    // could be smarter
+    const int recount = bitset_container_compute_cardinality(bitset);
+    bitset->cardinality = recount;
+}
+
+
+/* Return true when src_1 and src_2 have at least one set bit in common. */
+bool bitset_container_intersect(const bitset_container_t *src_1,
+                                  const bitset_container_t *src_2) {
+    // could vectorize, but this is probably already quite fast in practice
+    const uint64_t * __restrict__ words_1 = src_1->array;
+    const uint64_t * __restrict__ words_2 = src_2->array;
+    int pos = 0;
+    while (pos < BITSET_CONTAINER_SIZE_IN_WORDS) {
+        if ((words_1[pos] & words_2[pos]) != 0) {
+            return true;
+        }
+        ++pos;
+    }
+    return false;
+}
+
+
+#ifdef USEAVX
+#ifndef WORDS_IN_AVX2_REG
+/* Number of 64-bit words in one 256-bit register. The expansion is not
+ * parenthesized, so use sites wrap it as (WORDS_IN_AVX2_REG). */
+#define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
+#endif
+/* Get the number of bits set (force computation).
+ * AVX build: the word array is handed to avx2_harley_seal_popcount256 as a
+ * sequence of 256-bit registers; the stored cardinality field is ignored. */
+int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
+    return (int) avx2_harley_seal_popcount256(
+        (const __m256i *)bitset->array,
+        BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));
+}
+#else
+
+/* Get the number of bits set (force computation).
+ * Scalar build: sums hamming() over every word; the stored cardinality field
+ * is ignored. The loop is unrolled by four and therefore assumes that
+ * BITSET_CONTAINER_SIZE_IN_WORDS is a multiple of 4. */
+int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
+    const uint64_t *array = bitset->array;
+    int32_t sum = 0;
+    for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 4) {
+        sum += hamming(array[i]);
+        sum += hamming(array[i + 1]);
+        sum += hamming(array[i + 2]);
+        sum += hamming(array[i + 3]);
+    }
+    return sum;
+}
+
+#endif
+
+#ifdef USEAVX
+
+/* Helper constants for the AVX build.
+ * NOTE(review): BITSET_CONTAINER_FN_REPEAT and LOOP_SIZE are not referenced
+ * by the code visible in this part of the file (the macro below hard-codes
+ * its own unroll factor of 8) -- confirm whether they are still needed. */
+#define BITSET_CONTAINER_FN_REPEAT 8
+#ifndef WORDS_IN_AVX2_REG
+/* Number of 64-bit words in one 256-bit register; not parenthesized, so use
+ * sites wrap it as (WORDS_IN_AVX2_REG). */
+#define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
+#endif
+#define LOOP_SIZE                    \
+    BITSET_CONTAINER_SIZE_IN_WORDS / \
+        ((WORDS_IN_AVX2_REG)*BITSET_CONTAINER_FN_REPEAT)
+
+/* AVX build of BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic).
+ *
+ * Expands to three functions applying one binary operation (e.g. union) to
+ * the word arrays of src_1 and src_2:
+ *   - bitset_container_<opname>_nocard: writes the result to dst and marks
+ *     dst->cardinality as BITSET_UNKNOWN_CARDINALITY (which it returns);
+ *   - bitset_container_<opname>: writes the result to dst and stores/returns
+ *     the resulting cardinality;
+ *   - bitset_container_<opname>_justcard: returns the cardinality of the
+ *     result without writing it anywhere.
+ *
+ * opsymbol is unused in this variant. The _nocard loop handles eight 256-bit
+ * registers (256 bytes) per iteration using unaligned loads and stores.
+ * The operands are passed to avx_intrinsic as (A2, A1): for
+ * _mm256_andnot_si256, which negates its FIRST operand, this yields
+ * src_1 & ~src_2; the other intrinsics used here are commutative.
+ * NOTE(review): the avx2_harley_seal_* helpers also receive (array_2,
+ * array_1), presumably for the same reason -- confirm against their
+ * definitions. */
+// clang-format off
+#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic)            \
+int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \
+                                       const bitset_container_t *src_2, \
+                                       bitset_container_t *dst) {       \
+    const uint8_t * __restrict__ array_1 = (const uint8_t *)src_1->array; \
+    const uint8_t * __restrict__ array_2 = (const uint8_t *)src_2->array; \
+    /* not using the blocking optimization for some reason*/            \
+    uint8_t *out = (uint8_t*)dst->array;                                \
+    const int innerloop = 8;                                            \
+    for (size_t i = 0;                                                  \
+        i < BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG);       \
+                                                         i+=innerloop) {\
+        __m256i A1, A2, AO;                                             \
+        A1 = _mm256_lddqu_si256((const __m256i *)(array_1));                  \
+        A2 = _mm256_lddqu_si256((const __m256i *)(array_2));                  \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm256_storeu_si256((__m256i *)out, AO);                        \
+        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 32));             \
+        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 32));             \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm256_storeu_si256((__m256i *)(out+32), AO);                   \
+        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 64));             \
+        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 64));             \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm256_storeu_si256((__m256i *)(out+64), AO);                   \
+        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 96));             \
+        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 96));             \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm256_storeu_si256((__m256i *)(out+96), AO);                   \
+        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 128));            \
+        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 128));            \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm256_storeu_si256((__m256i *)(out+128), AO);                  \
+        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 160));            \
+        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 160));            \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm256_storeu_si256((__m256i *)(out+160), AO);                  \
+        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 192));            \
+        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 192));            \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm256_storeu_si256((__m256i *)(out+192), AO);                  \
+        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 224));            \
+        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 224));            \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm256_storeu_si256((__m256i *)(out+224), AO);                  \
+        out+=256;                                                       \
+        array_1 += 256;                                                 \
+        array_2 += 256;                                                 \
+    }                                                                   \
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                      \
+    return dst->cardinality;                                            \
+}                                                                       \
+/* next, a version that updates cardinality*/                           \
+int bitset_container_##opname(const bitset_container_t *src_1,          \
+                              const bitset_container_t *src_2,          \
+                              bitset_container_t *dst) {                \
+    const __m256i * __restrict__ array_1 = (const __m256i *) src_1->array; \
+    const __m256i * __restrict__ array_2 = (const __m256i *) src_2->array; \
+    __m256i *out = (__m256i *) dst->array;                              \
+    dst->cardinality = (int32_t)avx2_harley_seal_popcount256andstore_##opname(array_2,\
+    		array_1, out,BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));\
+    return dst->cardinality;                                            \
+}                                                                       \
+/* next, a version that just computes the cardinality*/                 \
+int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
+                              const bitset_container_t *src_2) {        \
+    const __m256i * __restrict__ data1 = (const __m256i *) src_1->array; \
+    const __m256i * __restrict__ data2 = (const __m256i *) src_2->array; \
+    return (int)avx2_harley_seal_popcount256_##opname(data2,                \
+    		data1, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));\
+}
+
+
+
+#else /* not USEAVX  */
+
+/* Scalar build of BITSET_CONTAINER_FN(opname, opsymbol, avxintrinsic).
+ *
+ * Expands to three functions applying one binary operation to the word
+ * arrays of src_1 and src_2:
+ *   - bitset_container_<opname>: writes the result to dst and stores/returns
+ *     the resulting cardinality (sum of hamming() over the result words);
+ *   - bitset_container_<opname>_nocard: writes the result to dst and marks
+ *     dst->cardinality as BITSET_UNKNOWN_CARDINALITY (which it returns);
+ *   - bitset_container_<opname>_justcard: returns the cardinality of the
+ *     result without writing it anywhere.
+ *
+ * opsymbol is spliced textually between the two parenthesized operands, so a
+ * two-token symbol such as `&~` forms (a) &~ (b). avxintrinsic is unused in
+ * this variant. The counting loops process two words per iteration and
+ * therefore assume BITSET_CONTAINER_SIZE_IN_WORDS is even. */
+#define BITSET_CONTAINER_FN(opname, opsymbol, avxintrinsic)               \
+int bitset_container_##opname(const bitset_container_t *src_1,            \
+                              const bitset_container_t *src_2,            \
+                              bitset_container_t *dst) {                  \
+    const uint64_t * __restrict__ array_1 = src_1->array;                 \
+    const uint64_t * __restrict__ array_2 = src_2->array;                 \
+    uint64_t *out = dst->array;                                           \
+    int32_t sum = 0;                                                      \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {      \
+        const uint64_t word_1 = (array_1[i])opsymbol(array_2[i]),         \
+                       word_2 = (array_1[i + 1])opsymbol(array_2[i + 1]); \
+        out[i] = word_1;                                                  \
+        out[i + 1] = word_2;                                              \
+        sum += hamming(word_1);                                    \
+        sum += hamming(word_2);                                    \
+    }                                                                     \
+    dst->cardinality = sum;                                               \
+    return dst->cardinality;                                              \
+}                                                                         \
+int bitset_container_##opname##_nocard(const bitset_container_t *src_1,   \
+                                       const bitset_container_t *src_2,   \
+                                       bitset_container_t *dst) {         \
+    const uint64_t * __restrict__ array_1 = src_1->array;                 \
+    const uint64_t * __restrict__ array_2 = src_2->array;                 \
+    uint64_t *out = dst->array;                                           \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i++) {         \
+        out[i] = (array_1[i])opsymbol(array_2[i]);                        \
+    }                                                                     \
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                        \
+    return dst->cardinality;                                              \
+}                                                                         \
+int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
+                              const bitset_container_t *src_2) {          \
+    const uint64_t * __restrict__ array_1 = src_1->array;                 \
+    const uint64_t * __restrict__ array_2 = src_2->array;                 \
+    int32_t sum = 0;                                                      \
+    for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {      \
+        const uint64_t word_1 = (array_1[i])opsymbol(array_2[i]),         \
+                       word_2 = (array_1[i + 1])opsymbol(array_2[i + 1]); \
+        sum += hamming(word_1);                                    \
+        sum += hamming(word_2);                                    \
+    }                                                                     \
+    return sum;                                                           \
+}
+
+#endif
+
+// Each BITSET_CONTAINER_FN(op, ...) line below defines three functions:
+// bitset_container_<op>, bitset_container_<op>_nocard and
+// bitset_container_<op>_justcard.
+//
+// "or" and "union" are the same operation under two names: other containers
+// use the "or" term, so providing both keeps the API consistent.
+BITSET_CONTAINER_FN(or, |, _mm256_or_si256)
+BITSET_CONTAINER_FN(union, |, _mm256_or_si256)
+
+// Likewise "and" and "intersection" are the same operation under two names:
+// other containers use the "intersection" term.
+BITSET_CONTAINER_FN(and, &, _mm256_and_si256)
+BITSET_CONTAINER_FN(intersection, &, _mm256_and_si256)
+
+BITSET_CONTAINER_FN(xor, ^, _mm256_xor_si256)
+// andnot computes src_1 & ~src_2
+BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256)
+// clang-format on
+
+
+
+/* Write the position of every set bit, offset by base, to vout as 32-bit
+ * integers and return how many values were written. When
+ * USEAVX2FORDECODING is defined, containers holding at least 8192 values
+ * are decoded with the AVX2 extractor. */
+int bitset_container_to_uint32_array( void *vout, const bitset_container_t *cont, uint32_t base) {
+#ifdef USEAVX2FORDECODING
+    if (cont->cardinality >= 8192) {  // heuristic
+        return (int) bitset_extract_setbits_avx2(
+            cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, vout,
+            cont->cardinality, base);
+    }
+#endif
+    return (int) bitset_extract_setbits(
+        cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, vout, base);
+}
+
+/*
+ * Debug helper: print the positions of the set bits as "{a,b,c}" using
+ * printf.
+ */
+void bitset_container_printf(const bitset_container_t * v) {
+    bool need_comma = false;
+    uint32_t word_base = 0;
+    printf("{");
+    for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i, word_base += 64) {
+        // w &= w - 1 clears the lowest set bit after it has been reported
+        for (uint64_t w = v->array[i]; w != 0; w &= w - 1) {
+            const int r = __builtin_ctzll(w);
+            if (need_comma) {
+                printf(",%u", word_base + r);
+            } else {
+                printf("%u", word_base + r);
+                need_comma = true;
+            }
+        }
+    }
+    printf("}");
+}
+
+
+/*
+ * Print the positions of the set bits, offset by base, as a comma-separated
+ * list of 32-bit integers using printf. Nothing is printed when no bit is set.
+ */
+void bitset_container_printf_as_uint32_array(const bitset_container_t * v, uint32_t base) {
+    bool need_comma = false;
+    for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i, base += 64) {
+        // w &= w - 1 clears the lowest set bit after it has been reported
+        for (uint64_t w = v->array[i]; w != 0; w &= w - 1) {
+            const int r = __builtin_ctzll(w);
+            if (need_comma) {
+                printf(",%u", r + base);
+            } else {
+                printf("%u", r + base);
+                need_comma = true;
+            }
+        }
+    }
+}
+
+
+// TODO: use the fast lower bound, also
+/* Count the runs of consecutive set bits in the container.
+ *
+ * A run is counted where it ends. Within a word, (~word) & (word << 1) has a
+ * bit set at position j exactly when bit j-1 is set and bit j is clear, i.e.
+ * a run ends at j-1. A run reaching bit 63 of a word ends there only if bit 0
+ * of the following word is clear; for the last word it always ends there. */
+int bitset_container_number_of_runs(bitset_container_t *b) {
+  int num_runs = 0;
+  uint64_t next_word = b->array[0];
+
+  for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS-1; ++i) {
+    uint64_t word = next_word;
+    next_word = b->array[i+1];
+    // (word >> 63) is 0 or 1, so the AND only inspects bit 0 of ~next_word
+    num_runs += hamming((~word) & (word << 1)) + ( (word >> 63) & ~next_word);
+  }
+
+  // last word: there is no successor, so a set top bit always closes a run
+  uint64_t word = next_word;
+  num_runs += hamming((~word) & (word << 1));
+  if((word & 0x8000000000000000ULL) != 0)
+    num_runs++;
+  return num_runs;
+}
+
+/* Copy the raw word array to buf and return the number of bytes written. */
+int32_t bitset_container_serialize(const bitset_container_t *container, char *buf) {
+  const int32_t nbytes = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
+  memcpy(buf, container->array, nbytes);
+  return nbytes;
+}
+
+
+
+/* Copy the raw word array to buf and return
+ * bitset_container_size_in_bytes(container). */
+int32_t bitset_container_write(const bitset_container_t *container,
+                                  char *buf) {
+    const size_t total_bytes = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+    memcpy(buf, container->array, total_bytes);
+    return bitset_container_size_in_bytes(container);
+}
+
+
+/* Load the raw word array from buf and record the caller-supplied
+ * cardinality as-is (it is not recomputed). Returns
+ * bitset_container_size_in_bytes(container). */
+int32_t bitset_container_read(int32_t cardinality, bitset_container_t *container,
+		const char *buf)  {
+    const size_t total_bytes = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+    memcpy(container->array, buf, total_bytes);
+    container->cardinality = cardinality;
+    return bitset_container_size_in_bytes(container);
+}
+
+/* Size in bytes of a serialized bitset container: the raw word array. */
+uint32_t bitset_container_serialization_len() {
+  const size_t nbytes = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+  return nbytes;
+}
+
+/* Rebuild a bitset container from a buffer holding exactly
+ * BITSET_CONTAINER_SIZE_IN_WORDS 64-bit words (the layout written by
+ * bitset_container_serialize).
+ *
+ * Returns a newly allocated container whose cardinality is recomputed from
+ * the copied words, or NULL when buf_len is not exactly that size or when an
+ * allocation fails.
+ *
+ * The buffer contains only raw words, so nothing is copied over the
+ * container struct itself: both of its fields are set explicitly below. */
+void* bitset_container_deserialize(const char *buf, size_t buf_len) {
+  const size_t l = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
+
+  if(l != buf_len)
+    return(NULL);
+
+  bitset_container_t *ptr =
+      (bitset_container_t *)malloc(sizeof(bitset_container_t));
+  if(ptr == NULL)
+    return(NULL);
+
+  // sizeof(__m256i) == 32
+  ptr->array = (uint64_t *) aligned_malloc(32, l);
+  if (! ptr->array) {
+      free(ptr);
+      return NULL;
+  }
+  memcpy(ptr->array, buf, l);
+  ptr->cardinality = bitset_container_compute_cardinality(ptr);
+
+  return((void*)ptr);
+}
+
+/* Invoke iterator on the position of every set bit (offset by base), in
+ * increasing order. Stops and returns false as soon as the callback returns
+ * false; returns true once every set bit has been visited. */
+bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base, roaring_iterator iterator, void *ptr) {
+  for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i, base += 64) {
+    // w &= w - 1 clears the lowest set bit once it has been visited
+    for (uint64_t w = cont->array[i]; w != 0; w &= w - 1) {
+      const int r = __builtin_ctzll(w);
+      if(!iterator(r + base, ptr)) return false;
+    }
+  }
+  return true;
+}
+
+/* 64-bit variant of bitset_container_iterate: the position of every set bit
+ * (offset by base) is OR-ed with high_bits before being handed to the
+ * callback. Stops and returns false as soon as the callback returns false. */
+bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base, roaring_iterator64 iterator, uint64_t high_bits, void *ptr) {
+  for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i, base += 64) {
+    // w &= w - 1 clears the lowest set bit once it has been visited
+    for (uint64_t w = cont->array[i]; w != 0; w &= w - 1) {
+      const int r = __builtin_ctzll(w);
+      if(!iterator(high_bits | (uint64_t)(r + base), ptr)) return false;
+    }
+  }
+  return true;
+}
+
+/* Returns true iff both containers have the same bits set. The cached
+ * cardinalities are only consulted when both are known. */
+bool bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
+    const bool both_counts_known = (container1->cardinality != BITSET_UNKNOWN_CARDINALITY) &&
+                                   (container2->cardinality != BITSET_UNKNOWN_CARDINALITY);
+    // Cheap rejection: two known, different cardinalities cannot describe equal sets.
+    if (both_counts_known && container1->cardinality != container2->cardinality) return false;
+    const uint64_t *lhs = container1->array;
+    const uint64_t *rhs = container2->array;
+    for (int32_t idx = 0; idx < BITSET_CONTAINER_SIZE_IN_WORDS; ++idx) {
+        if (lhs[idx] != rhs[idx]) return false;
+    }
+    return true;
+}
+/* Returns true iff every bit set in container1 is also set in container2. */
+bool bitset_container_is_subset(const bitset_container_t *container1,
+                          const bitset_container_t *container2) {
+    const bool both_counts_known = (container1->cardinality != BITSET_UNKNOWN_CARDINALITY) &&
+                                   (container2->cardinality != BITSET_UNKNOWN_CARDINALITY);
+    // Cheap rejection: a set with more elements cannot be contained in a smaller one.
+    if (both_counts_known && container1->cardinality > container2->cardinality) return false;
+    const uint64_t *sub = container1->array;
+    const uint64_t *sup = container2->array;
+    for (int32_t idx = 0; idx < BITSET_CONTAINER_SIZE_IN_WORDS; ++idx) {
+        // any bit of sub that is missing from sup disproves the relation
+        if ((sub[idx] & ~sup[idx]) != 0) return false;
+    }
+    return true;
+}
+/* Looks up the element of rank `rank`; *start_rank is the running count of elements already skipped and is updated in place. */
+bool bitset_container_select(const bitset_container_t *container, uint32_t *start_rank, uint32_t rank, uint32_t *element) {
+    int card = bitset_container_cardinality(container);
+    if(rank >= *start_rank + card) {  // the requested rank lies beyond this container
+        *start_rank += card;  // count all of this container's elements as skipped
+        return false;  // *element is left untouched
+    }
+    const uint64_t *array = container->array;
+    int32_t size;
+    for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 1) {
+        size = hamming(array[i]);  // presumably the number of set bits in word i
+        if(rank <= *start_rank + size) {
+            uint64_t w = container->array[i];
+            uint16_t base = i*64;  // value represented by bit 0 of word i
+            while (w != 0) {
+                uint64_t t = w & (~w + 1);  // isolates the lowest set bit
+                int r = __builtin_ctzll(w);
+                if(*start_rank == rank) {
+                    *element = r+base;  // position of the matching bit within the container
+                    return true;
+                }
+                w ^= t;  // clear that bit and advance the running rank
+                *start_rank += 1;
+            }
+        }
+        else
+            *start_rank += size;  // the whole word precedes the requested rank
+    }
+    assert(false);  // not expected to be reached once the cardinality check above has passed
+    __builtin_unreachable();
+}
+
+
+/* Returns the smallest value (assumes not empty) */
+uint16_t bitset_container_minimum(const bitset_container_t *container) {
+  const uint64_t *words = container->array;
+  int32_t idx = 0;
+  // skip the leading words that have no bit set
+  while (idx < BITSET_CONTAINER_SIZE_IN_WORDS && words[idx] == 0) ++idx;
+  // every word was zero: report the sentinel
+  if (idx == BITSET_CONTAINER_SIZE_IN_WORDS) return UINT16_MAX;
+  // lowest set bit of the first non-zero word
+  return __builtin_ctzll(words[idx]) + idx * 64;
+}
+
+/* Returns the largest value (assumes not empty); returns 0 when no bit is set. */
+uint16_t bitset_container_maximum(const bitset_container_t *container) {
+  for (int32_t i = BITSET_CONTAINER_SIZE_IN_WORDS - 1; i >= 0; --i ) {  // word 0 included: it holds values 0..63
+    uint64_t w = container->array[i];
+    if (w != 0) {
+      int r = __builtin_clzll(w);  // leading zeros, so 63 - r is the highest set bit
+      return i * 64 + 63  - r;
+    }
+  }
+  return 0;
+}
+
+/* Returns the number of values equal or smaller than x */
+int bitset_container_rank(const bitset_container_t *container, uint16_t x) {
+  const uint32_t last_word = x / 64;  // word holding bit x; at most 1023 since x <= 65535
+  int sum = 0;
+  // whole words strictly below the one that contains x
+  for (uint32_t w = 0; w < last_word; ++w) {
+    sum += hamming(container->array[w]);
+  }
+  // Keep bits 0..(x % 64) of the last word. Only the word that contains x is
+  // read: the earlier formulation fetched the following word whenever
+  // x % 64 == 63, which is one word past the end of the array for x == 65535.
+  // The mask is built by unsigned wrap-around, so no shift by 64 is needed.
+  const uint64_t lastpos = UINT64_C(1) << (x % 64);
+  const uint64_t mask = lastpos + lastpos - 1;  // all bits up to and including lastpos
+  sum += hamming(container->array[last_word] & mask);
+  return sum;
+}
+
+/* Returns the position (k * 64 + bit offset) of the first set bit at or after x, or -1 if there is none */
+int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x) {
+  uint32_t x32 = x;
+  uint32_t k = x32 / 64;  // word that contains bit x
+  uint64_t word = container->array[k];
+  const int diff = x32 - k * 64; // in [0,64)
+  word = (word >> diff) << diff; // clears the bits below x; a mask is faster, but we don't care
+  while(word == 0) {  // nothing left in this word: move on to the next one
+    k++;
+    if(k == BITSET_CONTAINER_SIZE_IN_WORDS) return -1;
+    word = container->array[k];
+  }
+  return k * 64 + __builtin_ctzll(word);
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/bitset.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/containers.c */
+
+/* NOTE(review): declarations only; presumably they pair with inline definitions in the container headers (C99 extern inline) - confirm. */
+extern inline const void *container_unwrap_shared(
+    const void *candidate_shared_container, uint8_t *type);
+extern inline void *container_mutable_unwrap_shared(
+    void *candidate_shared_container, uint8_t *type);
+
+extern const char *get_container_name(uint8_t typecode);
+
+extern int container_get_cardinality(const void *container, uint8_t typecode);
+
+extern void *container_iand(void *c1, uint8_t type1, const void *c2,
+                            uint8_t type2, uint8_t *result_type);
+
+extern void *container_ior(void *c1, uint8_t type1, const void *c2,
+                           uint8_t type2, uint8_t *result_type);
+
+extern void *container_ixor(void *c1, uint8_t type1, const void *c2,
+                            uint8_t type2, uint8_t *result_type);
+
+extern void *container_iandnot(void *c1, uint8_t type1, const void *c2,
+                               uint8_t type2, uint8_t *result_type);
+/* Releases `container` through the free routine that matches `typecode`. */
+void container_free(void *container, uint8_t typecode) {
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            bitset_container_free((bitset_container_t *)container);
+            break;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            array_container_free((array_container_t *)container);
+            break;
+        case RUN_CONTAINER_TYPE_CODE:
+            run_container_free((run_container_t *)container);
+            break;
+        case SHARED_CONTAINER_TYPE_CODE:
+            shared_container_free((shared_container_t *)container);  // drops one reference to the wrapper
+            break;
+        default:
+            assert(false);  // unknown typecode
+            __builtin_unreachable();
+    }
+}
+/* Prints a container through its type-specific printer; a shared container is unwrapped first. */
+void container_printf(const void *container, uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);  // typecode may be updated to the wrapped type
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            bitset_container_printf((const bitset_container_t *)container);
+            return;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            array_container_printf((const array_container_t *)container);
+            return;
+        case RUN_CONTAINER_TYPE_CODE:
+            run_container_printf((const run_container_t *)container);
+            return;
+        default:
+            __builtin_unreachable();  // no other typecode is expected after unwrapping
+    }
+}
+/* Prints a container via its type-specific *_printf_as_uint32_array routine, passing `base` through; shared containers are unwrapped first. */
+void container_printf_as_uint32_array(const void *container, uint8_t typecode,
+                                      uint32_t base) {
+    container = container_unwrap_shared(container, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            bitset_container_printf_as_uint32_array(
+                (const bitset_container_t *)container, base);
+            return;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            array_container_printf_as_uint32_array(
+                (const array_container_t *)container, base);
+            return;
+        case RUN_CONTAINER_TYPE_CODE:
+            run_container_printf_as_uint32_array(
+                (const run_container_t *)container, base);
+            return;
+        default:
+            assert(false);  // unknown typecode; same debug check as container_free
+            __builtin_unreachable();
+    }
+}
+/* Serializes a container into buf through its type-specific serializer and returns that routine's result; shared containers are unwrapped first. */
+int32_t container_serialize(const void *container, uint8_t typecode,
+                            char *buf) {
+    container = container_unwrap_shared(container, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return (bitset_container_serialize((const bitset_container_t *)container,
+                                               buf));
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return (
+                array_container_serialize((const array_container_t *)container, buf));
+        case RUN_CONTAINER_TYPE_CODE:
+            return (run_container_serialize((const run_container_t *)container, buf));
+        default:
+            assert(0);  // unknown typecode
+            __builtin_unreachable();
+            return (-1);  // never executed; follows the unreachable marker
+    }
+}
+/* Returns the serialized length reported by the type-specific *_serialization_len routine; shared containers are unwrapped first. */
+uint32_t container_serialization_len(const void *container, uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return bitset_container_serialization_len();  // fixed size, independent of the content
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return array_container_serialization_len(
+                (const array_container_t *)container);
+        case RUN_CONTAINER_TYPE_CODE:
+            return run_container_serialization_len(
+                (const run_container_t *)container);
+        default:
+            assert(0);  // unknown typecode
+            __builtin_unreachable();
+            return (0);  // never executed; follows the unreachable marker
+    }
+}
+/* Rebuilds a container of the given typecode from buf by calling the type-specific deserializer; shared typecodes are not accepted. */
+void *container_deserialize(uint8_t typecode, const char *buf, size_t buf_len) {
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return (bitset_container_deserialize(buf, buf_len));
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return (array_container_deserialize(buf, buf_len));
+        case RUN_CONTAINER_TYPE_CODE:
+            return (run_container_deserialize(buf, buf_len));
+        case SHARED_CONTAINER_TYPE_CODE:
+            printf("this should never happen.\n");  // a shared wrapper has no serialized form here
+            assert(0);
+            __builtin_unreachable();
+            return (NULL);  // never executed; follows the unreachable marker
+        default:
+            assert(0);  // unknown typecode
+            __builtin_unreachable();
+            return (NULL);  // never executed; follows the unreachable marker
+    }
+}
+/* NOTE(review): declarations only; container_free and container_clone are defined in this file, the rest presumably in the headers - confirm. */
+extern bool container_nonzero_cardinality(const void *container,
+                                          uint8_t typecode);
+
+extern void container_free(void *container, uint8_t typecode);
+
+extern int container_to_uint32_array(uint32_t *output, const void *container,
+                                     uint8_t typecode, uint32_t base);
+
+extern void *container_add(void *container, uint16_t val, uint8_t typecode,
+                           uint8_t *new_typecode);
+
+extern inline bool container_contains(const void *container, uint16_t val,
+                                      uint8_t typecode);
+
+extern void *container_clone(const void *container, uint8_t typecode);
+
+extern void *container_and(const void *c1, uint8_t type1, const void *c2,
+                           uint8_t type2, uint8_t *result_type);
+
+extern void *container_or(const void *c1, uint8_t type1, const void *c2,
+                          uint8_t type2, uint8_t *result_type);
+
+extern void *container_xor(const void *c1, uint8_t type1, const void *c2,
+                           uint8_t type2, uint8_t *result_type);
+/* Returns a copy of `container`: with copy_on_write a shared_container_t wrapper (and *typecode becomes the shared code), otherwise a clone of the underlying container. */
+void *get_copy_of_container(void *container, uint8_t *typecode,
+                            bool copy_on_write) {
+    if (copy_on_write) {
+        shared_container_t *shared_container;
+        if (*typecode == SHARED_CONTAINER_TYPE_CODE) {
+            shared_container = (shared_container_t *)container;
+            shared_container->counter += 1;  // one more holder of the existing wrapper
+            return shared_container;
+        }
+        assert(*typecode != SHARED_CONTAINER_TYPE_CODE);  // always true here: the shared case returned above
+
+        if ((shared_container = (shared_container_t *)malloc(
+                 sizeof(shared_container_t))) == NULL) {
+            return NULL;  // allocation failed; *typecode is left unchanged
+        }
+
+        shared_container->container = container;
+        shared_container->typecode = *typecode;  // remember the type of the wrapped container
+
+        shared_container->counter = 2;  // presumably the original holder plus the copy handed out - confirm
+        *typecode = SHARED_CONTAINER_TYPE_CODE;
+
+        return shared_container;
+    }  // copy_on_write
+    // otherwise, no copy on write...
+    const void *actualcontainer =
+        container_unwrap_shared((const void *)container, typecode);
+    assert(*typecode != SHARED_CONTAINER_TYPE_CODE);
+    return container_clone(actualcontainer, *typecode);
+}
+/**
+ * Copies a container (unwrapping a shared container first), requires a typecode.
+ * This allocates new memory, caller is responsible for deallocation.
+ */
+void *container_clone(const void *container, uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return bitset_container_clone((const bitset_container_t *)container);
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return array_container_clone((const array_container_t *)container);
+        case RUN_CONTAINER_TYPE_CODE:
+            return run_container_clone((const run_container_t *)container);
+        case SHARED_CONTAINER_TYPE_CODE:
+            printf("shared containers are not cloneable\n");  // only reachable if unwrapping left a shared typecode
+            assert(false);
+            return NULL;
+        default:
+            assert(false);  // unknown typecode
+            __builtin_unreachable();
+            return NULL;  // never executed; follows the unreachable marker
+    }
+}
+/* Gives up one reference to the shared wrapper and returns a container of the wrapped type, stored in *typecode. */
+void *shared_container_extract_copy(shared_container_t *container,
+                                    uint8_t *typecode) {
+    assert(container->counter > 0);
+    assert(container->typecode != SHARED_CONTAINER_TYPE_CODE);
+    container->counter--;
+    *typecode = container->typecode;
+    void *answer;
+    if (container->counter == 0) {  // last reference: hand over the wrapped container itself
+        answer = container->container;
+        container->container = NULL;  // paranoid
+        free(container);  // only the wrapper is released
+    } else {  // other holders remain: return an independent clone
+        answer = container_clone(container->container, *typecode);
+    }
+    assert(*typecode != SHARED_CONTAINER_TYPE_CODE);
+    return answer;
+}
+/* Drops one reference to the shared wrapper; when the counter reaches zero the wrapped container and the wrapper are both freed. */
+void shared_container_free(shared_container_t *container) {
+    assert(container->counter > 0);
+    container->counter--;
+    if (container->counter == 0) {
+        assert(container->typecode != SHARED_CONTAINER_TYPE_CODE);  // wrappers are not expected to nest
+        container_free(container->container, container->typecode);
+        container->container = NULL;  // paranoid
+        free(container);
+    }
+}
+/* NOTE(review): declarations only; the definitions are presumably inline functions in the container headers - confirm. */
+extern void *container_not(const void *c1, uint8_t type1, uint8_t *result_type);
+
+extern void *container_not_range(const void *c1, uint8_t type1,
+                                 uint32_t range_start, uint32_t range_end,
+                                 uint8_t *result_type);
+
+extern void *container_inot(void *c1, uint8_t type1, uint8_t *result_type);
+
+extern void *container_inot_range(void *c1, uint8_t type1, uint32_t range_start,
+                                  uint32_t range_end, uint8_t *result_type);
+
+extern void *container_range_of_ones(uint32_t range_start, uint32_t range_end,
+                                     uint8_t *result_type);
+
+// where are the corresponding things for union and intersection??
+extern void *container_lazy_xor(const void *c1, uint8_t type1, const void *c2,
+                                uint8_t type2, uint8_t *result_type);
+
+extern void *container_lazy_ixor(void *c1, uint8_t type1, const void *c2,
+                                 uint8_t type2, uint8_t *result_type);
+
+extern void *container_andnot(const void *c1, uint8_t type1, const void *c2,
+                              uint8_t type2, uint8_t *result_type);
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/containers.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/convert.c */
+#include <stdio.h>
+
+
+// file contains grubby stuff that must know impl. details of all container
+// types.
+bitset_container_t *bitset_container_from_array(const array_container_t *a) {
+    // Create a bitset, then set one bit per element of the array container.
+    bitset_container_t *bits = bitset_container_create();
+    for (int pos = 0, count = array_container_cardinality(a); pos < count; ++pos) bitset_container_set(bits, a->array[pos]);
+    return bits;
+}
+/* Creates a bitset, applies bitset_set_lenrange for each run of `arr`, and copies the run container's cardinality. */
+bitset_container_t *bitset_container_from_run(const run_container_t *arr) {
+    const int total = run_container_cardinality(arr);
+    bitset_container_t *bits = bitset_container_create();
+    for (int k = 0; k < arr->n_runs; ++k) {
+        const rle16_t *run = &arr->runs[k];
+        bitset_set_lenrange(bits->array, run->value, run->length);
+    }
+    bits->cardinality = total;
+    return bits;
+}
+/* Expands every run of `arr` into individual values stored in a new array container. */
+array_container_t *array_container_from_run(const run_container_t *arr) {
+    // capacity is sized from the run container's cardinality
+    array_container_t *out =
+        array_container_create_given_capacity(run_container_cardinality(arr));
+    int32_t written = 0;
+    for (int k = 0; k < arr->n_runs; ++k) {
+        const int first = arr->runs[k].value;
+        const int last = first + arr->runs[k].length;  // inclusive upper end of the run
+        // int counters: a run ending at 65535 still terminates
+        for (int v = first; v <= last; ++v) out->array[written++] = (uint16_t)v;
+    }
+    out->cardinality = written;
+    return out;
+}
+/* Builds an array container from `bits`; capacity and cardinality are taken from bits->cardinality, which is assumed to be up to date. */
+array_container_t *array_container_from_bitset(const bitset_container_t *bits) {
+    array_container_t *result =
+        array_container_create_given_capacity(bits->cardinality);
+    result->cardinality = bits->cardinality;
+    //  sse version ends up being slower here
+    // (bitset_extract_setbits_sse_uint16)
+    // because of the sparsity of the data
+    bitset_extract_setbits_uint16(bits->array, BITSET_CONTAINER_SIZE_IN_WORDS,
+                                  result->array, 0);  // presumably writes the set-bit positions, offset by 0 - confirm
+    return result;
+}
+
+/* Appends the run [s,e] (inclusive) as the pair (value = s, length = e - s).
+ * Assumes that the container has adequate space. */
+static void add_run(run_container_t *r, int s, int e) {
+    rle16_t *slot = &r->runs[r->n_runs++];
+    slot->value = s;
+    slot->length = e - s;
+}
+/* Builds a run container from `c` by merging consecutive values into runs; assumes c->array is sorted ascending - TODO confirm. */
+run_container_t *run_container_from_array(const array_container_t *c) {
+    int32_t n_runs = array_container_number_of_runs(c);
+    run_container_t *answer = run_container_create_given_capacity(n_runs);
+    int prev = -2;  // sentinel: no real value equals prev + 1 on the first pass
+    int run_start = -1;  // -1 means no run has been opened yet
+    int32_t card = c->cardinality;
+    if (card == 0) return answer;
+    for (int i = 0; i < card; ++i) {
+        const uint16_t cur_val = c->array[i];
+        if (cur_val != prev + 1) {
+            // new run starts; flush old one, if any
+            if (run_start != -1) add_run(answer, run_start, prev);
+            run_start = cur_val;
+        }
+        prev = c->array[i];
+    }
+    // now prev is the last seen value
+    add_run(answer, run_start, prev);  // flush the run that was still open
+    // assert(run_container_cardinality(answer) == c->cardinality);
+    return answer;
+}
+
+/**
+ * Convert the runcontainer to either a Bitmap or an Array Container, depending
+ * on the cardinality (`card`, supplied by the caller).  Frees the container.
+ * Allocates and returns new container, which caller is responsible for freeing;
+ * *resulttype receives ARRAY_CONTAINER_TYPE_CODE or BITSET_CONTAINER_TYPE_CODE.
+ */
+void *convert_to_bitset_or_array_container(run_container_t *r, int32_t card,
+                                           uint8_t *resulttype) {
+    if (card <= DEFAULT_MAX_SIZE) {
+        array_container_t *answer = array_container_create_given_capacity(card);
+        answer->cardinality = 0;
+        for (int rlepos = 0; rlepos < r->n_runs; ++rlepos) {
+            // int rather than uint16_t: a run ending at 65535 must not wrap and loop forever
+            const int run_start = r->runs[rlepos].value;
+            const int run_end = run_start + r->runs[rlepos].length;
+            for (int run_value = run_start; run_value <= run_end; ++run_value) {
+                answer->array[answer->cardinality++] = (uint16_t)run_value;
+            }
+        }
+        assert(card == answer->cardinality);
+        *resulttype = ARRAY_CONTAINER_TYPE_CODE;
+        run_container_free(r);
+        return answer;
+    }
+    bitset_container_t *answer = bitset_container_create();
+    for (int rlepos = 0; rlepos < r->n_runs; ++rlepos) {
+        uint16_t run_start = r->runs[rlepos].value;
+        bitset_set_lenrange(answer->array, run_start, r->runs[rlepos].length);
+    }
+    answer->cardinality = card;  // trusted from the caller, not recomputed
+    *resulttype = BITSET_CONTAINER_TYPE_CODE;
+    run_container_free(r);
+    return answer;
+}
+
+/* Converts a run container to either an array or a bitset, IF it saves space;
+ * otherwise returns `c` itself and reports RUN_CONTAINER_TYPE_CODE. */
+/* If a conversion occurs, the caller remains responsible for freeing the
+ * original container and also becomes responsible for freeing the newly
+ * allocated one. */
+void *convert_run_to_efficient_container(run_container_t *c,
+                                         uint8_t *typecode_after) {
+    int32_t size_as_run_container =
+        run_container_serialized_size_in_bytes(c->n_runs);
+
+    int32_t size_as_bitset_container =
+        bitset_container_serialized_size_in_bytes();
+    int32_t card = run_container_cardinality(c);
+    int32_t size_as_array_container =
+        array_container_serialized_size_in_bytes(card);
+
+    int32_t min_size_non_run =
+        size_as_bitset_container < size_as_array_container
+            ? size_as_bitset_container
+            : size_as_array_container;
+    if (size_as_run_container <= min_size_non_run) {  // no conversion
+        *typecode_after = RUN_CONTAINER_TYPE_CODE;
+        return c;  // same pointer: callers compare against c to detect a conversion
+    }
+    if (card <= DEFAULT_MAX_SIZE) {
+        // to array
+        array_container_t *answer = array_container_create_given_capacity(card);
+        answer->cardinality = 0;
+        for (int rlepos = 0; rlepos < c->n_runs; ++rlepos) {
+            int run_start = c->runs[rlepos].value;
+            int run_end = run_start + c->runs[rlepos].length;  // inclusive
+
+            for (int run_value = run_start; run_value <= run_end; ++run_value) {
+                answer->array[answer->cardinality++] = (uint16_t)run_value;
+            }
+        }
+        *typecode_after = ARRAY_CONTAINER_TYPE_CODE;
+        return answer;
+    }
+
+    // else to bitset
+    bitset_container_t *answer = bitset_container_create();
+
+    for (int rlepos = 0; rlepos < c->n_runs; ++rlepos) {
+        int start = c->runs[rlepos].value;
+        int end = start + c->runs[rlepos].length;
+        bitset_set_range(answer->array, start, end + 1);  // end + 1: presumably an exclusive upper bound - confirm
+    }
+    answer->cardinality = card;
+    *typecode_after = BITSET_CONTAINER_TYPE_CODE;
+    return answer;
+}
+
+// Same conversion as convert_run_to_efficient_container, but `c` is released whenever a different container comes back.
+void *convert_run_to_efficient_container_and_free(run_container_t *c, uint8_t *typecode_after) {
+    void *converted = convert_run_to_efficient_container(c, typecode_after);
+    if (converted == (void *)c) return converted;  // kept as a run container
+    run_container_free(c);
+    return converted;
+}
+
+/* once converted, the original container is disposed here, rather than
+   in roaring_array
+*/
+
+// TODO: split into run-  array-  and bitset-  subfunctions for sanity;
+// a few function calls won't really matter.
+/* Switches container `c` to or from the run representation when that yields a smaller serialized size; `c` is freed whenever a different container is returned, and *typecode_after receives the resulting type. */
+void *convert_run_optimize(void *c, uint8_t typecode_original,
+                           uint8_t *typecode_after) {
+    if (typecode_original == RUN_CONTAINER_TYPE_CODE) {
+        void *newc = convert_run_to_efficient_container((run_container_t *)c,
+                                                        typecode_after);
+        if (newc != c) {  // a conversion happened: the old run container is no longer needed
+            container_free(c, typecode_original);
+        }
+        return newc;
+    } else if (typecode_original == ARRAY_CONTAINER_TYPE_CODE) {
+        // it might need to be converted to a run container.
+        array_container_t *c_qua_array = (array_container_t *)c;
+        int32_t n_runs = array_container_number_of_runs(c_qua_array);
+        int32_t size_as_run_container =
+            run_container_serialized_size_in_bytes(n_runs);
+        int32_t card = array_container_cardinality(c_qua_array);
+        int32_t size_as_array_container =
+            array_container_serialized_size_in_bytes(card);
+
+        if (size_as_run_container >= size_as_array_container) {  // ties keep the array
+            *typecode_after = ARRAY_CONTAINER_TYPE_CODE;
+            return c;
+        }
+        // else convert array to run container
+        run_container_t *answer = run_container_create_given_capacity(n_runs);
+        int prev = -2;  // sentinel: no real value equals prev + 1 on the first pass
+        int run_start = -1;  // -1 means no run has been opened yet
+
+        assert(card > 0);
+        for (int i = 0; i < card; ++i) {
+            uint16_t cur_val = c_qua_array->array[i];
+            if (cur_val != prev + 1) {
+                // new run starts; flush old one, if any
+                if (run_start != -1) add_run(answer, run_start, prev);
+                run_start = cur_val;
+            }
+            prev = c_qua_array->array[i];
+        }
+        assert(run_start >= 0);
+        // now prev is the last seen value
+        add_run(answer, run_start, prev);
+        *typecode_after = RUN_CONTAINER_TYPE_CODE;
+        array_container_free(c_qua_array);
+        return answer;
+    } else if (typecode_original ==
+               BITSET_CONTAINER_TYPE_CODE) {  // run conversions on bitset
+        // does bitset need conversion to run?
+        bitset_container_t *c_qua_bitset = (bitset_container_t *)c;
+        int32_t n_runs = bitset_container_number_of_runs(c_qua_bitset);
+        int32_t size_as_run_container =
+            run_container_serialized_size_in_bytes(n_runs);
+        int32_t size_as_bitset_container =
+            bitset_container_serialized_size_in_bytes();
+
+        if (size_as_bitset_container <= size_as_run_container) {
+            // no conversion needed.
+            *typecode_after = BITSET_CONTAINER_TYPE_CODE;
+            return c;
+        }
+        // bitset to runcontainer (ported from Java  RunContainer(
+        // BitmapContainer bc, int nbrRuns))
+        assert(n_runs > 0);  // no empty bitmaps
+        run_container_t *answer = run_container_create_given_capacity(n_runs);
+
+        int long_ctr = 0;  // index of the word currently being scanned
+        uint64_t cur_word = c_qua_bitset->array[0];
+        int run_count = 0;  // NOTE(review): incremented below but never read in this function
+        while (true) {
+            while (cur_word == UINT64_C(0) &&
+                   long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
+                cur_word = c_qua_bitset->array[++long_ctr];  // skip words with no remaining set bit
+
+            if (cur_word == UINT64_C(0)) {  // ran out of words: every run has been emitted
+                bitset_container_free(c_qua_bitset);
+                *typecode_after = RUN_CONTAINER_TYPE_CODE;
+                return answer;
+            }
+
+            int local_run_start = __builtin_ctzll(cur_word);
+            int run_start = local_run_start + 64 * long_ctr;
+            uint64_t cur_word_with_1s = cur_word | (cur_word - 1);  // fill the bits below the run start with ones
+
+            int run_end = 0;
+            while (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF) &&
+                   long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
+                cur_word_with_1s = c_qua_bitset->array[++long_ctr];  // the run continues into the next word
+
+            if (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF)) {  // the run reaches the end of the last word
+                run_end = 64 + long_ctr * 64;  // exclusive, I guess
+                add_run(answer, run_start, run_end - 1);
+                bitset_container_free(c_qua_bitset);
+                *typecode_after = RUN_CONTAINER_TYPE_CODE;
+                return answer;
+            }
+            int local_run_end = __builtin_ctzll(~cur_word_with_1s);  // first zero bit above the run
+            run_end = local_run_end + long_ctr * 64;
+            add_run(answer, run_start, run_end - 1);
+            run_count++;
+            cur_word = cur_word_with_1s & (cur_word_with_1s + 1);  // clear the run just emitted and everything below it
+        }
+        return answer;  // not reached: the loop above only exits through its return statements
+    } else {
+        assert(false);  // unknown typecode
+        __builtin_unreachable();
+        return NULL;
+    }
+}
+/* Builds a bitset from the runs of `run` and additionally sets the range given by min and max (presumably [min, max] inclusive - confirm), keeping the cardinality up to date. */
+bitset_container_t *bitset_container_from_run_range(const run_container_t *run,
+                                                    uint32_t min, uint32_t max) {
+    bitset_container_t *bitset = bitset_container_create();
+    int32_t union_cardinality = 0;
+    for (int32_t i = 0; i < run->n_runs; ++i) {
+        uint32_t rle_min = run->runs[i].value;
+        uint32_t rle_max = rle_min + run->runs[i].length;
+        bitset_set_lenrange(bitset->array, rle_min, rle_max - rle_min);
+        union_cardinality += run->runs[i].length + 1;  // a run of length L counts L + 1 values
+    }
+    union_cardinality += max - min + 1;  // count the extra range in full ...
+    union_cardinality -= bitset_lenrange_cardinality(bitset->array, min, max-min);  // ... minus the part the runs already set
+    bitset_set_lenrange(bitset->array, min, max - min);
+    bitset->cardinality = union_cardinality;
+    return bitset;
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/convert.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_andnot.c */
+/*
+ * mixed_andnot.c.  More methods since operation is not symmetric,
+ * except no "wide" andnot , so no lazy options motivated.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst, a valid array container that could be the same as src_1.*/
+void array_bitset_container_andnot(const array_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   array_container_t *dst) {
+    // follows Java implementation as of June 2016
+    if (dst->capacity < src_1->cardinality) {
+        array_container_grow(dst, src_1->cardinality, false);
+    }
+    int32_t newcard = 0;
+    const int32_t origcard = src_1->cardinality;  // read up front: dst may alias src_1
+    for (int i = 0; i < origcard; ++i) {
+        uint16_t key = src_1->array[i];
+        dst->array[newcard] = key;  // written unconditionally ...
+        newcard += 1 - bitset_container_contains(src_2, key);  // ... but only kept when key is absent from src_2
+    }
+    dst->cardinality = newcard;
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * src_1 (in place, by passing src_1 as the destination) */
+
+void array_bitset_container_iandnot(array_container_t *src_1,
+                                    const bitset_container_t *src_2) {
+    array_bitset_container_andnot(src_1, src_2, src_1);
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst, which does not initially have a valid container.
+ * Return true for a bitset result; false for array
+ */
+
+bool bitset_array_container_andnot(const bitset_container_t *src_1,
+                                   const array_container_t *src_2, void **dst) {
+    // Java did this directly, but we have option of asm or avx
+    bitset_container_t *result = bitset_container_create();
+    bitset_container_copy(src_1, result);  // work on a copy so src_1 stays untouched
+    result->cardinality =
+        (int32_t)bitset_clear_list(result->array, (uint64_t)result->cardinality,
+                                   src_2->array, (uint64_t)src_2->cardinality);  // the helper's return value becomes the new cardinality
+
+    // do required type conversions.
+    if (result->cardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(result);
+        bitset_container_free(result);  // the temporary bitset is no longer needed
+        return false;
+    }
+    *dst = result;
+    return true;
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_array_container_iandnot(bitset_container_t *src_1,
+                                    const array_container_t *src_2,
+                                    void **dst) {
+    *dst = src_1;  // default outcome: the result stays in src_1
+    src_1->cardinality =
+        (int32_t)bitset_clear_list(src_1->array, (uint64_t)src_1->cardinality,
+                                   src_2->array, (uint64_t)src_2->cardinality);  // the helper's return value becomes the new cardinality
+
+    if (src_1->cardinality <= DEFAULT_MAX_SIZE) {
+        *dst = array_container_from_bitset(src_1);
+        bitset_container_free(src_1);  // src_1 must not be used by the caller after this
+        return false;  // not bitset
+    } else
+        return true;
+}
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_andnot(const run_container_t *src_1,
+                                 const bitset_container_t *src_2, void **dst) {
+    // follows the Java implementation as of June 2016
+    int card = run_container_cardinality(src_1);
+    if (card <= DEFAULT_MAX_SIZE) {
+        // must be an array
+        array_container_t *answer = array_container_create_given_capacity(card);
+        answer->cardinality = 0;
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            rle16_t rle = src_1->runs[rlepos];
+            for (int run_value = rle.value; run_value <= rle.value + rle.length;
+                 ++run_value) {
+                if (!bitset_container_get(src_2, (uint16_t)run_value)) {  // keep only values absent from src_2
+                    answer->array[answer->cardinality++] = (uint16_t)run_value;
+                }
+            }
+        }
+        *dst = answer;
+        return false;
+    } else {  // we guess it will be a bitset, though have to check guess when
+              // done
+        bitset_container_t *answer = bitset_container_clone(src_2);
+
+        uint32_t last_pos = 0;  // first position not yet processed
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            rle16_t rle = src_1->runs[rlepos];
+
+            uint32_t start = rle.value;
+            uint32_t end = start + rle.length + 1;  // one past the last value of the run
+            bitset_reset_range(answer->array, last_pos, start);  // the gap before the run is not in src_1
+            bitset_flip_range(answer->array, start, end);  // inside the run, flipping src_2's bits keeps the values absent from src_2
+            last_pos = end;
+        }
+        bitset_reset_range(answer->array, last_pos, (uint32_t)(1 << 16));  // clear everything after the last run
+
+        answer->cardinality = bitset_container_compute_cardinality(answer);
+
+        if (answer->cardinality <= DEFAULT_MAX_SIZE) {
+            *dst = array_container_from_bitset(answer);
+            bitset_container_free(answer);
+            return false;  // not bitset
+        }
+        *dst = answer;
+        return true;  // bitset
+    }
+}
+
+/* "In-place" src_1 AND NOT src_2 for a run container and a bitset container.
+ * Nothing is actually reused: the answer is built by
+ * run_bitset_container_andnot() as a fresh container stored in *dst, and
+ * src_1 is then freed, so the caller must not touch src_1 afterwards.
+ * Returns true when *dst is a bitset container, false when it is an array
+ * container.
+ */
+bool run_bitset_container_iandnot(run_container_t *src_1,
+                                  const bitset_container_t *src_2, void **dst) {
+    // delegate to the allocating version, then release the consumed input
+    const bool dst_is_bitset = run_bitset_container_andnot(src_1, src_2, dst);
+    run_container_free(src_1);
+    return dst_is_bitset;
+}
+
+/* Compute src_1 AND NOT src_2 for a bitset container and a run container.
+ * The answer is always a newly allocated container stored in *dst (which
+ * holds no container on entry); src_1 and src_2 are left untouched.
+ * Returns true when *dst is a bitset container, false when the result had
+ * at most DEFAULT_MAX_SIZE values and was converted to an array container.
+ */
+
+bool bitset_run_container_andnot(const bitset_container_t *src_1,
+                                 const run_container_t *src_2, void **dst) {
+    // follows Java implementation: copy src_1, then punch out each run
+    bitset_container_t *work = bitset_container_create();
+    bitset_container_copy(src_1, work);
+
+    for (int32_t k = 0; k < src_2->n_runs; ++k) {
+        const uint32_t first = src_2->runs[k].value;
+        const uint32_t past_last = first + src_2->runs[k].length + UINT32_C(1);
+        bitset_reset_range(work->array, first, past_last);
+    }
+    work->cardinality = bitset_container_compute_cardinality(work);
+
+    if (work->cardinality > DEFAULT_MAX_SIZE) {
+        *dst = work;
+        return true;  // bitset
+    }
+    *dst = array_container_from_bitset(work);
+    bitset_container_free(work);
+    return false;  // not bitset
+}
+
+/* In-place src_1 AND NOT src_2 for a bitset container and a run container.
+ * Every run of src_2 is cleared directly inside src_1.  If more than
+ * DEFAULT_MAX_SIZE values remain, *dst is src_1 and true is returned;
+ * otherwise the remainder is converted to a new array container stored in
+ * *dst, src_1 is freed, and false is returned.  The caller is responsible
+ * for deallocating *dst in both cases. */
+
+bool bitset_run_container_iandnot(bitset_container_t *src_1,
+                                  const run_container_t *src_2, void **dst) {
+    // clear each run [value, value + length] of src_2 from src_1
+    for (int32_t k = 0; k < src_2->n_runs; ++k) {
+        const uint32_t first = src_2->runs[k].value;
+        const uint32_t past_last = first + src_2->runs[k].length + UINT32_C(1);
+        bitset_reset_range(src_1->array, first, past_last);
+    }
+    src_1->cardinality = bitset_container_compute_cardinality(src_1);
+
+    if (src_1->cardinality > DEFAULT_MAX_SIZE) {
+        *dst = src_1;
+        return true;  // still a bitset, reused as the result
+    }
+    *dst = array_container_from_bitset(src_1);
+    bitset_container_free(src_1);
+    return false;  // not bitset
+}
+
+/* helper. Writes the values covered by the runs of r that do not occur in
+ * a_in into a_out->array and returns how many were written (a_out's
+ * cardinality field is not updated here).  a_out must be a valid array
+ * container with adequate capacity.  Partly based on Java's
+ * Util.unsignedDifference, which does not use advanceUntil.
+ * TODO: Is it cheaper to avoid advanceUntil?
+ */
+
+static int run_array_array_subtract(const run_container_t *r,
+                                    const array_container_t *a_in,
+                                    array_container_t *a_out) {
+    int out_card = 0;
+    int32_t in_array_pos =
+        -1;  // since advanceUntil always assumes we start the search AFTER this
+
+    for (int rlepos = 0; rlepos < r->n_runs; rlepos++) {
+        int32_t start = r->runs[rlepos].value;
+        int32_t end = start + r->runs[rlepos].length + 1;
+        // the run covers the half-open interval [start, end)
+        in_array_pos = advanceUntil(a_in->array, in_array_pos,
+                                    a_in->cardinality, (uint16_t)start);
+
+        if (in_array_pos >= a_in->cardinality) {  // run has no items subtracted
+            for (int32_t i = start; i < end; ++i)
+                a_out->array[out_card++] = (uint16_t)i;
+        } else {
+            uint16_t next_nonincluded = a_in->array[in_array_pos];
+            if (next_nonincluded >= end) {
+                // another case when run goes unaltered
+                for (int32_t i = start; i < end; ++i)
+                    a_out->array[out_card++] = (uint16_t)i;
+                in_array_pos--;  // ensure we see this item again if necessary
+            } else {
+                for (int32_t i = start; i < end; ++i)
+                    if (i != next_nonincluded)
+                        a_out->array[out_card++] = (uint16_t)i;
+                    else  // i is excluded; 0 should ensure we don't match
+                        next_nonincluded =
+                            (in_array_pos + 1 >= a_in->cardinality)
+                                ? 0
+                                : a_in->array[++in_array_pos];
+                in_array_pos--;  // see again
+            }
+        }
+    }
+    return out_card;
+}
+
+/* Compute src_1 AND NOT src_2 (run minus array) into a new container stored
+ * in *dst, which does not indicate a valid container initially.  Returns
+ * the type code of *dst: run, array or bitset depending on the result.
+ * An empty src_1 yields an empty run container. */
+int run_array_container_andnot(const run_container_t *src_1,
+                               const array_container_t *src_2, void **dst) {
+    // follows the Java impl as of June 2016
+
+    int card = run_container_cardinality(src_1);
+    const int arbitrary_threshold = 32;
+    // the run merge below reads runs[0] and array[0]: handle empties first
+    if (card <= arbitrary_threshold) {
+        if (src_2->cardinality == 0 || src_1->n_runs == 0) {
+            *dst = run_container_clone(src_1);
+            return RUN_CONTAINER_TYPE_CODE;
+        }
+        // Java's "lazyandNot.toEfficientContainer" thing
+        run_container_t *answer = run_container_create_given_capacity(
+            card + array_container_cardinality(src_2));
+
+        int rlepos = 0;
+        int xrlepos = 0;  // "x" is src_2
+        rle16_t rle = src_1->runs[rlepos];
+        int32_t start = rle.value;
+        int32_t end = start + rle.length + 1;
+        int32_t xstart = src_2->array[xrlepos];
+
+        while ((rlepos < src_1->n_runs) && (xrlepos < src_2->cardinality)) {
+            if (end <= xstart) {
+                // output the first run
+                answer->runs[answer->n_runs++] =
+                    (rle16_t){.value = (uint16_t)start,
+                              .length = (uint16_t)(end - start - 1)};
+                rlepos++;
+                if (rlepos < src_1->n_runs) {
+                    start = src_1->runs[rlepos].value;
+                    end = start + src_1->runs[rlepos].length + 1;
+                }
+            } else if (xstart + 1 <= start) {
+                // exit the second run
+                xrlepos++;
+                if (xrlepos < src_2->cardinality) {
+                    xstart = src_2->array[xrlepos];
+                }
+            } else {
+                if (start < xstart) {
+                    answer->runs[answer->n_runs++] =
+                        (rle16_t){.value = (uint16_t)start,
+                                  .length = (uint16_t)(xstart - start - 1)};
+                }
+                if (xstart + 1 < end) {
+                    start = xstart + 1;
+                } else {
+                    rlepos++;
+                    if (rlepos < src_1->n_runs) {
+                        start = src_1->runs[rlepos].value;
+                        end = start + src_1->runs[rlepos].length + 1;
+                    }
+                }
+            }
+        }
+        if (rlepos < src_1->n_runs) {
+            answer->runs[answer->n_runs++] =
+                (rle16_t){.value = (uint16_t)start,
+                          .length = (uint16_t)(end - start - 1)};
+            rlepos++;
+            if (rlepos < src_1->n_runs) {
+                memcpy(answer->runs + answer->n_runs, src_1->runs + rlepos,
+                       (src_1->n_runs - rlepos) * sizeof(rle16_t));
+                answer->n_runs += (src_1->n_runs - rlepos);
+            }
+        }
+        uint8_t return_type;
+        *dst = convert_run_to_efficient_container(answer, &return_type);
+        if (answer != *dst) run_container_free(answer);
+        return return_type;
+    }
+    // else it's a bitmap or array
+
+    if (card <= DEFAULT_MAX_SIZE) {
+        array_container_t *ac = array_container_create_given_capacity(card);
+        // nb Java code used a generic iterator-based merge to compute
+        // difference
+        ac->cardinality = run_array_array_subtract(src_1, src_2, ac);
+        *dst = ac;
+        return ARRAY_CONTAINER_TYPE_CODE;
+    }
+    bitset_container_t *ans = bitset_container_from_run(src_1);
+    bool result_is_bitset = bitset_array_container_iandnot(ans, src_2, dst);
+    return (result_is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                             : ARRAY_CONTAINER_TYPE_CODE);
+}
+
+/* "In-place" src_1 AND NOT src_2 for a run container and an array container.
+ * The result is computed by run_array_container_andnot() into a new
+ * container stored in *dst (which has no container initially); src_1 is
+ * then freed unconditionally.  The caller is responsible for deallocating
+ * *dst.  Returns the type code of *dst as reported by
+ * run_array_container_andnot().  */
+
+int run_array_container_iandnot(run_container_t *src_1,
+                                const array_container_t *src_2, void **dst) {
+    // dummy implementation same as June 2016 Java
+    const int dst_typecode = run_array_container_andnot(src_1, src_2, dst);
+    run_container_free(src_1);
+    return dst_typecode;
+}
+
+/* Write src_1 AND NOT src_2 (array minus run) to dst.  dst must be a valid
+ * array container and is allowed to be src_1; it is grown when too small. */
+void array_run_container_andnot(const array_container_t *src_1,
+                                const run_container_t *src_2,
+                                array_container_t *dst) {
+    // basically following Java impl as of June 2016
+    if (src_1->cardinality > dst->capacity) {
+        array_container_grow(dst, src_1->cardinality, false);
+    }
+    // no runs to subtract: the result is a plain copy of src_1
+    if (src_2->n_runs == 0) {
+        memmove(dst->array, src_1->array,
+                sizeof(uint16_t) * src_1->cardinality);
+        dst->cardinality = src_1->cardinality;
+        return;
+    }
+    int32_t run_start = src_2->runs[0].value;
+    int32_t run_end = run_start + src_2->runs[0].length;
+    int which_run = 0;
+    // [run_start, run_end] is the current run, both ends inclusive
+    uint16_t val = 0;
+    int dest_card = 0;
+    for (int i = 0; i < src_1->cardinality; ++i) {
+        val = src_1->array[i];
+        if (val < run_start)
+            dst->array[dest_card++] = val;
+        else if (val <= run_end) {
+            ;  // omitted item
+        } else {
+            do {
+                if (which_run + 1 < src_2->n_runs) {
+                    ++which_run;
+                    run_start = src_2->runs[which_run].value;
+                    run_end = run_start + src_2->runs[which_run].length;
+
+                } else
+                    run_start = run_end = (1 << 16) + 1;  // past any uint16_t
+            } while (val > run_end);
+            --i;  // re-test val against the run just selected
+        }
+    }
+    dst->cardinality = dest_card;
+}
+
+/* In-place version: src_1 becomes src_1 AND NOT src_2.  The work is done
+ * by array_run_container_andnot() with src_1 as both input and output.
+ */
+
+void array_run_container_iandnot(array_container_t *src_1,
+                                 const run_container_t *src_2) {
+    array_run_container_andnot(src_1, src_2, src_1);
+}
+
+/* Compute src_1 AND NOT src_2 for two run containers.  *dst does not hold a
+ * valid container initially; on return it holds the result in whichever
+ * representation convert_run_to_efficient_container_and_free() selected,
+ * and the return value is that container's type code. */
+int run_run_container_andnot(const run_container_t *src_1,
+                             const run_container_t *src_2, void **dst) {
+    uint8_t result_type;
+    run_container_t *diff = run_container_create();
+    run_container_andnot(src_1, src_2, diff);
+    *dst = convert_run_to_efficient_container_and_free(diff, &result_type);
+    return result_type;
+}
+
+/* "In-place" src_1 AND NOT src_2 for two run containers.  The result is
+ * computed by run_run_container_andnot() into a new container stored in
+ * *dst (which has no container initially); src_1 is then freed
+ * unconditionally.  The caller is responsible for deallocating *dst.
+ * Returns the type code of *dst as reported by
+ * run_run_container_andnot().  */
+
+int run_run_container_iandnot(run_container_t *src_1,
+                              const run_container_t *src_2, void **dst) {
+    // following Java impl as of June 2016 (dummy)
+    const int dst_typecode = run_run_container_andnot(src_1, src_2, dst);
+    run_container_free(src_1);
+    return dst_typecode;
+}
+
+/*
+ * Write src_1 AND NOT src_2 to dst by forwarding to array_container_andnot().
+ * dst is a valid array container and may be the same as src_1.
+ */
+void array_array_container_andnot(const array_container_t *src_1,
+                                  const array_container_t *src_2,
+                                  array_container_t *dst) {
+    array_container_andnot(src_1, src_2, dst);
+}
+
+/* In-place src_1 = src_1 AND NOT src_2 via array_container_andnot(); an
+ * array-array andnot will always be able to reuse the space of src_1 */
+void array_array_container_iandnot(array_container_t *src_1,
+                                   const array_container_t *src_2) {
+    array_container_andnot(src_1, src_2, src_1);
+}
+
+/* Compute src_1 AND NOT src_2 for two bitset containers and store the
+ * result in *dst (which has no container initially).  Returns true when
+ * *dst is a bitset container; when the result holds at most
+ * DEFAULT_MAX_SIZE values it is converted to an array container and false
+ * is returned instead.  */
+bool bitset_bitset_container_andnot(const bitset_container_t *src_1,
+                                    const bitset_container_t *src_2,
+                                    void **dst) {
+    bitset_container_t *scratch = bitset_container_create();
+    const int remaining = bitset_container_andnot(src_1, src_2, scratch);
+    if (remaining > DEFAULT_MAX_SIZE) {
+        *dst = scratch;
+        return true;  // keep the bitset
+    }
+    // sparse result: hand back an array container and drop the scratch bitset
+    *dst = array_container_from_bitset(scratch);
+    bitset_container_free(scratch);
+    return false;  // not bitset
+}
+
+/* In-place src_1 AND NOT src_2 for two bitset containers; *dst has no
+ * container initially.  src_1 is overwritten with the difference.  If more
+ * than DEFAULT_MAX_SIZE values remain, *dst is src_1 and true is returned.
+ * Otherwise a new array container is stored in *dst, src_1 is freed and
+ * false is returned.  In both cases, the caller is responsible for
+ * deallocating *dst.  */
+
+bool bitset_bitset_container_iandnot(bitset_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     void **dst) {
+    const int remaining = bitset_container_andnot(src_1, src_2, src_1);
+    if (remaining > DEFAULT_MAX_SIZE) {
+        *dst = src_1;
+        return true;  // src_1 itself is the (bitset) result
+    }
+    // too sparse for a bitset: convert, then release src_1
+    *dst = array_container_from_bitset(src_1);
+    bitset_container_free(src_1);
+    return false;  // not bitset
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_andnot.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_equal.c */
+
+/* True iff the array and the bitset hold exactly the same set of values. */
+bool array_container_equal_bitset(const array_container_t* container1,
+                                  const bitset_container_t* container2) {
+    const int32_t n_array = container1->cardinality;
+    // cheap rejection, only possible when the bitset's cardinality is cached
+    if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY &&
+        container2->cardinality != n_array) {
+        return false;
+    }
+    // walk the set bits in increasing order, matching them against the array
+    int32_t matched = 0;
+    for (int32_t word_idx = 0; word_idx < BITSET_CONTAINER_SIZE_IN_WORDS;
+         ++word_idx) {
+        uint64_t bits = container2->array[word_idx];
+        for (; bits != 0; bits &= bits - 1) {  // clears the lowest set bit
+            const uint16_t value =
+                (uint16_t)(word_idx * 64 + __builtin_ctzll(bits));
+            if (matched >= n_array || container1->array[matched] != value) {
+                return false;
+            }
+            ++matched;
+        }
+    }
+    return matched == n_array;
+}
+
+/* True iff the run container and the array container hold the same values.
+ * After the cardinalities are found equal, only the first and last value of
+ * each run are compared with the array slice of matching length
+ * (NOTE(review): this relies on the array being sorted without duplicates). */
+bool run_container_equals_array(const run_container_t* container1,
+                                const array_container_t* container2) {
+    if (run_container_cardinality(container1) != container2->cardinality)
+        return false;
+    int32_t first_idx = 0;  // array index matching the start of the run
+    for (int k = 0; k < container1->n_runs; ++k) {
+        const uint32_t first_val = container1->runs[k].value;
+        const uint32_t span = container1->runs[k].length;
+        const uint32_t last_val = first_val + span;
+        const bool ends_match =
+            container2->array[first_idx] == first_val &&
+            container2->array[first_idx + span] == last_val;
+        if (!ends_match) return false;
+        first_idx += span + 1;
+    }
+    return true;
+}
+
+/* True iff the run container and the bitset container hold the same values:
+ * the cardinalities must agree and every value of every run must be set in
+ * the bitset. */
+bool run_container_equals_bitset(const run_container_t* container1,
+                                 const bitset_container_t* container2) {
+    // use the cached bitset cardinality when there is one, else count bits
+    const int32_t bitset_card =
+        (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)
+            ? container2->cardinality
+            : bitset_container_compute_cardinality(container2);
+    if (bitset_card != run_container_cardinality(container1)) {
+        return false;
+    }
+    for (int k = 0; k < container1->n_runs; ++k) {
+        const uint32_t first = container1->runs[k].value;
+        const uint32_t last = first + container1->runs[k].length;
+        // todo: this code could be much faster
+        for (uint32_t v = first; v <= last; ++v) {
+            if (!bitset_container_contains(container2, v)) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_equal.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_intersection.c */
+/*
+ * mixed_intersection.c
+ *
+ */
+
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst.  dst may be src_1 itself; it is grown to src_1's cardinality if
+ * its capacity is smaller. */
+void array_bitset_container_intersection(const array_container_t *src_1,
+                                         const bitset_container_t *src_2,
+                                         array_container_t *dst) {
+    const int32_t n_candidates = src_1->cardinality;
+    if (dst->capacity < n_candidates) {
+        array_container_grow(dst, n_candidates, false);
+    }
+
+    /*
+     * Branchless filter: every candidate is stored at the output cursor
+     * unconditionally and the cursor only moves forward when the bitset
+     * contains the value, i.e. instead of
+     *     if (bitset_container_contains(src_2, key)) dst->array[kept++] = key;
+     * When the outcome is unpredictable the branchy form suffers many
+     * mispredicted branches; the difference can be huge (from 3 cycles when
+     * predictable all the way to 16 cycles when unpredictable).
+     * See
+     * https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/bitset/c/arraybitsetintersection.c
+     */
+    int32_t kept = 0;  // dst could be src_1
+    for (int32_t idx = 0; idx < n_candidates; ++idx) {
+        const uint16_t candidate = src_1->array[idx];
+        dst->array[kept] = candidate;
+        kept += bitset_container_contains(src_2, candidate);
+    }
+    dst->cardinality = kept;
+}
+
+/* Compute the size of the intersection of src_1 and src_2. */
+int array_bitset_container_intersection_cardinality(
+    const array_container_t *src_1, const bitset_container_t *src_2) {
+    // the membership result is added directly, one term per array value
+    int32_t hits = 0;
+    const int32_t n_candidates = src_1->cardinality;
+    for (int32_t idx = 0; idx < n_candidates; ++idx) {
+        hits += bitset_container_contains(src_2, src_1->array[idx]);
+    }
+    return hits;
+}
+
+
+/* True iff src_1 and src_2 share at least one value. */
+bool array_bitset_container_intersect(const array_container_t *src_1,
+                                      const bitset_container_t *src_2) {
+    const uint16_t *values = src_1->array;
+    for (int32_t idx = 0; idx < src_1->cardinality; ++idx) {
+        if (bitset_container_contains(src_2, values[idx])) return true;
+    }
+    return false;
+}
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be equal to src_1. We assume that dst is a
+ * valid container. An empty src_2 leaves dst empty (cardinality 0). */
+void array_run_container_intersection(const array_container_t *src_1,
+                                      const run_container_t *src_2,
+                                      array_container_t *dst) {
+    if (run_container_is_full(src_2)) {
+        if (dst != src_1) array_container_copy(src_1, dst);
+        return;
+    }
+    if (dst->capacity < src_1->cardinality) {
+        array_container_grow(dst, src_1->cardinality, false);
+    }
+    if (src_2->n_runs == 0) {
+        dst->cardinality = 0;  // empty src_2: drop any values dst still holds
+        return;
+    }
+    int32_t rlepos = 0;
+    int32_t arraypos = 0;
+    rle16_t rle = src_2->runs[rlepos];
+    int32_t newcard = 0;
+    while (arraypos < src_1->cardinality) {
+        const uint16_t arrayval = src_1->array[arraypos];
+        while (rle.value + rle.length < arrayval) {  // frequently false
+            ++rlepos;
+            if (rlepos == src_2->n_runs) {
+                dst->cardinality = newcard;
+                return;  // we are done
+            }
+            rle = src_2->runs[rlepos];
+        }
+        if (rle.value > arrayval) {
+            arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality,
+                                    rle.value);
+        } else {
+            dst->array[newcard] = arrayval;
+            newcard++;
+            arraypos++;
+        }
+    }
+    dst->cardinality = newcard;
+}
+
+/* Compute the intersection of src_1 and src_2 and write the result to *dst.
+ * Returns true if *dst is a bitset_container_t, false if it is an
+ * array_container_t; *dst is NULL when an allocation failed.  If
+ * *dst == src_2 on entry, in-place processing of src_2 is attempted. */
+bool run_bitset_container_intersection(const run_container_t *src_1,
+                                       const bitset_container_t *src_2,
+                                       void **dst) {
+    if (run_container_is_full(src_1)) {
+        if (*dst != src_2) *dst = bitset_container_clone(src_2);
+        return true;
+    }
+    int32_t card = run_container_cardinality(src_1);
+    if (card <= DEFAULT_MAX_SIZE) {
+        // result can only be an array (assuming that we never make a
+        // RunContainer)
+        // Keep capacity == card.  Clamping it to src_2->cardinality is unsafe:
+        // the loop stores each value before testing it, so it needs a spare
+        // slot, and the cardinality may be BITSET_UNKNOWN_CARDINALITY.
+        array_container_t *answer = array_container_create_given_capacity(card);
+        *dst = answer;
+        if (*dst == NULL) {
+            return false;
+        }
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            rle16_t rle = src_1->runs[rlepos];
+            uint32_t endofrun = (uint32_t)rle.value + rle.length;
+            for (uint32_t runValue = rle.value; runValue <= endofrun;
+                 ++runValue) {
+                answer->array[answer->cardinality] = (uint16_t)runValue;
+                answer->cardinality +=
+                    bitset_container_contains(src_2, runValue);
+            }
+        }
+        return false;
+    }
+    if (*dst == src_2) {  // we attempt in-place
+        bitset_container_t *answer = (bitset_container_t *)*dst;
+        uint32_t start = 0;
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            const rle16_t rle = src_1->runs[rlepos];
+            uint32_t end = rle.value;
+            bitset_reset_range(src_2->array, start, end);
+
+            start = end + rle.length + 1;
+        }
+        bitset_reset_range(src_2->array, start, UINT32_C(1) << 16);
+        answer->cardinality = bitset_container_compute_cardinality(answer);
+        if (src_2->cardinality > DEFAULT_MAX_SIZE) {
+            return true;
+        } else {
+            array_container_t *newanswer = array_container_from_bitset(src_2);
+            if (newanswer == NULL) {
+                *dst = NULL;
+                return false;
+            }
+            *dst = newanswer;
+            return false;
+        }
+    } else {  // no inplace
+        // we expect the answer to be a bitmap (if we are lucky)
+        bitset_container_t *answer = bitset_container_clone(src_2);
+
+        *dst = answer;
+        if (answer == NULL) {
+            return true;
+        }
+        uint32_t start = 0;
+        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
+            const rle16_t rle = src_1->runs[rlepos];
+            uint32_t end = rle.value;
+            bitset_reset_range(answer->array, start, end);
+            start = end + rle.length + 1;
+        }
+        bitset_reset_range(answer->array, start, UINT32_C(1) << 16);
+        answer->cardinality = bitset_container_compute_cardinality(answer);
+
+        if (answer->cardinality > DEFAULT_MAX_SIZE) {
+            return true;
+        } else {
+            array_container_t *newanswer = array_container_from_bitset(answer);
+            bitset_container_free((bitset_container_t *)*dst);
+            if (newanswer == NULL) {
+                *dst = NULL;
+                return false;
+            }
+            *dst = newanswer;
+            return false;
+        }
+    }
+}
+
+/* Compute the size of the intersection between src_1 and src_2 . */
+int array_run_container_intersection_cardinality(const array_container_t *src_1,
+                                                 const run_container_t *src_2) {
+    if (run_container_is_full(src_2)) {
+        return src_1->cardinality;
+    }
+    const int32_t n_runs = src_2->n_runs;
+    const int32_t n_values = src_1->cardinality;
+    if (n_runs == 0) return 0;
+    int32_t common = 0;
+    int32_t run_idx = 0;
+    int32_t run_first = src_2->runs[0].value;
+    int32_t run_last = run_first + src_2->runs[0].length;
+    for (int32_t val_idx = 0; val_idx < n_values;) {
+        const uint16_t value = src_1->array[val_idx];
+        // move to the first run that does not end before value
+        while (run_last < value) {
+            if (++run_idx == n_runs) {
+                return common;  // we are done
+            }
+            run_first = src_2->runs[run_idx].value;
+            run_last = run_first + src_2->runs[run_idx].length;
+        }
+        if (run_first > value) {
+            val_idx = advanceUntil(src_1->array, val_idx, n_values,
+                                   (uint16_t)run_first);
+        } else {
+            ++common;
+            ++val_idx;
+        }
+    }
+    return common;
+}
+
+/* Compute the size of the intersection between src_1 and src_2: the bitset
+ * is counted run by run, unless src_1 is full. */
+int run_bitset_container_intersection_cardinality(
+    const run_container_t *src_1, const bitset_container_t *src_2) {
+    if (run_container_is_full(src_1)) {
+        return bitset_container_cardinality(src_2);
+    }
+    int total = 0;
+    for (int32_t k = 0; k < src_1->n_runs; ++k) {
+        const rle16_t *run = &src_1->runs[k];
+        total += bitset_lenrange_cardinality(src_2->array, run->value,
+                                             run->length);
+    }
+    return total;
+}
+
+
+/* True iff src_1 and src_2 share at least one value. */
+bool array_run_container_intersect(const array_container_t *src_1,
+                                   const run_container_t *src_2) {
+    if (run_container_is_full(src_2)) {
+        return !array_container_empty(src_1);
+    }
+    const int32_t n_runs = src_2->n_runs;
+    const int32_t n_values = src_1->cardinality;
+    if (n_runs == 0) return false;
+    int32_t run_idx = 0;
+    int32_t run_first = src_2->runs[0].value;
+    int32_t run_last = run_first + src_2->runs[0].length;
+    int32_t val_idx = 0;
+    while (val_idx < n_values) {
+        const uint16_t value = src_1->array[val_idx];
+        while (run_last < value) {
+            if (++run_idx == n_runs) {
+                return false;  // we are done
+            }
+            run_first = src_2->runs[run_idx].value;
+            run_last = run_first + src_2->runs[run_idx].length;
+        }
+        if (run_first <= value) {
+            return true;  // value lies inside the current run
+        }
+        val_idx = advanceUntil(src_1->array, val_idx, n_values,
+                               (uint16_t)run_first);
+    }
+    return false;
+}
+
+/* True iff at least one run of src_1 covers a set bit of src_2. */
+bool run_bitset_container_intersect(const run_container_t *src_1,
+                                    const bitset_container_t *src_2) {
+    if (run_container_is_full(src_1)) {
+        return !bitset_container_empty(src_2);
+    }
+    for (int32_t k = 0; k < src_1->n_runs; ++k) {
+        const rle16_t run = src_1->runs[k];
+        if (!bitset_lenrange_empty(src_2->array, run.value, run.length))
+            return true;
+    }
+    return false;
+}
+
+/*
+ * Compute the intersection between src_1 and src_2 and write the result
+ * to *dst. If the return value is true, the result is a bitset_container_t,
+ * otherwise it is an array_container_t.  *dst is NULL if allocation failed.
+ */
+bool bitset_bitset_container_intersection(const bitset_container_t *src_1,
+                                          const bitset_container_t *src_2,
+                                          void **dst) {
+    // count first so that the right kind of container is allocated once
+    const int common = bitset_container_and_justcard(src_1, src_2);
+    if (common <= DEFAULT_MAX_SIZE) {
+        array_container_t *arr = array_container_create_given_capacity(common);
+        *dst = arr;
+        if (arr != NULL) {
+            arr->cardinality = common;
+            bitset_extract_intersection_setbits_uint16(
+                src_1->array, src_2->array, BITSET_CONTAINER_SIZE_IN_WORDS,
+                arr->array, 0);
+        }
+        return false;  // not a bitset
+    }
+    bitset_container_t *bits = bitset_container_create();
+    *dst = bits;
+    if (bits != NULL) {
+        bitset_container_and_nocard(src_1, src_2, bits);
+        bits->cardinality = common;
+    }
+    return true;  // it is a bitset
+}
+
+/* In-place variant: a bitset result is written into src_1 itself. */
+bool bitset_bitset_container_intersection_inplace(
+    bitset_container_t *src_1, const bitset_container_t *src_2, void **dst) {
+    const int common = bitset_container_and_justcard(src_1, src_2);
+    if (common <= DEFAULT_MAX_SIZE) {
+        array_container_t *arr = array_container_create_given_capacity(common);
+        *dst = arr;
+        if (arr != NULL) {
+            arr->cardinality = common;
+            bitset_extract_intersection_setbits_uint16(
+                src_1->array, src_2->array, BITSET_CONTAINER_SIZE_IN_WORDS,
+                arr->array, 0);
+        }
+        return false;  // not a bitset
+    }
+    bitset_container_and_nocard(src_1, src_2, src_1);
+    src_1->cardinality = common;
+    *dst = src_1;
+    return true;  // it is a bitset
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_intersection.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_negation.c */
+/*
+ * mixed_negation.c
+ *
+ */
+
+#include <assert.h>
+#include <string.h>
+
+
+// TODO: make simplified and optimized negation code across
+// the full range.
+
+/* Negation across the entire range of the container.
+ * Compute the negation of src and write the result to *dst: dst is first
+ * set to contain all 1 << 16 values, then the values of src are cleared.
+ * The complement of a sufficiently sparse set will always be dense and
+ * hence a bitmap.
+ * We assume that dst is pre-allocated and a valid bitset container.
+ * There can be no in-place version. */
+void array_container_negation(const array_container_t *src,
+                              bitset_container_t *dst) {
+    const uint64_t universe = UINT64_C(1 << 16);
+    const uint64_t n_clear = (uint64_t)src->cardinality;
+    bitset_container_set_all(dst);
+    dst->cardinality =
+        (int32_t)bitset_clear_list(dst->array, universe, src->array, n_clear);
+}
+
+/* Negation across the entire range of the container.
+ * Delegates to bitset_container_negation_range over [0, 1 << 16) and writes
+ * the result to *dst. A true return value indicates a bitset result,
+ * otherwise the result is an array container.
+ * We assume that dst is not pre-allocated.
+ */
+bool bitset_container_negation(const bitset_container_t *src, void **dst) {
+    const int universe_end = (1 << 16);
+    return bitset_container_negation_range(src, 0, universe_end, dst);
+}
+
+/* In-place version of bitset_container_negation.
+ * Delegates to bitset_container_negation_range_inplace over [0, 1 << 16).
+ * If the output is a bitset_container_t (return value true), src itself is
+ * modified and returned through *dst. Otherwise (return value false) *dst
+ * receives an array container.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_container_negation_inplace(bitset_container_t *src, void **dst) {
+    const int universe_end = (1 << 16);
+    return bitset_container_negation_range_inplace(src, 0, universe_end, dst);
+}
+
+/* Negation across the entire range of the container.
+ * Delegates to run_container_negation_range over [0, 1 << 16) and writes the
+ * result to *dst. The return value is one of the *_TYPECODES defined in
+ * containers.h and tells which kind of container *dst holds.
+ * We assume that dst is not pre-allocated.
+ */
+int run_container_negation(const run_container_t *src, void **dst) {
+    const int universe_end = (1 << 16);
+    return run_container_negation_range(src, 0, universe_end, dst);
+}
+
+/*
+ * In-place version of run_container_negation: delegates to
+ * run_container_negation_range_inplace over [0, 1 << 16).
+ * If the output is a run_container_t and src has the capacity to hold the
+ * result, src is modified and no allocation is made.
+ * In all cases, the result is in *dst; the return value is its typecode.
+ */
+int run_container_negation_inplace(run_container_t *src, void **dst) {
+    const int universe_end = (1 << 16);
+    return run_container_negation_range_inplace(src, 0, universe_end, dst);
+}
+
+/* Negation across a range of the container.
+ * Flips membership of every value in [range_start, range_end) of src and
+ * writes the result to a newly created container stored in *dst; src is not
+ * modified. Returns true if the result is a bitset container and false for
+ * an array container. *dst is not preallocated.
+ * An empty range (range_start >= range_end) yields a clone of src.
+ * NOTE(review): the results of array_container_create_given_capacity and
+ * bitset_container_from_array are dereferenced without a NULL check.
+ */
+bool array_container_negation_range(const array_container_t *src,
+                                    const int range_start, const int range_end,
+                                    void **dst) {
+    /* close port of the Java implementation */
+    if (range_start >= range_end) {
+        *dst = array_container_clone(src);
+        return false;
+    }
+
+    // index of the first element of src that is >= range_start; a negative
+    // search result is turned into an index via -idx - 1
+    // (presumably the insertion point -- confirm against binarySearch)
+    int32_t start_index =
+        binarySearch(src->array, src->cardinality, (uint16_t)range_start);
+    if (start_index < 0) start_index = -start_index - 1;
+
+    // index of the last element of src that is <= range_end - 1; note the
+    // -idx - 2 conversion here (one less than for start_index)
+    int32_t last_index =
+        binarySearch(src->array, src->cardinality, (uint16_t)(range_end - 1));
+    if (last_index < 0) last_index = -last_index - 2;
+
+    // values currently inside the range disappear, all others in the range
+    // appear: derive the cardinality of the result up front
+    const int32_t current_values_in_range = last_index - start_index + 1;
+    const int32_t span_to_be_flipped = range_end - range_start;
+    const int32_t new_values_in_range =
+        span_to_be_flipped - current_values_in_range;
+    const int32_t cardinality_change =
+        new_values_in_range - current_values_in_range;
+    const int32_t new_cardinality = src->cardinality + cardinality_change;
+
+    if (new_cardinality > DEFAULT_MAX_SIZE) {
+        // too large for an array: convert to a bitset and flip the range there
+        bitset_container_t *temp = bitset_container_from_array(src);
+        bitset_flip_range(temp->array, (uint32_t)range_start,
+                          (uint32_t)range_end);
+        temp->cardinality = new_cardinality;
+        *dst = temp;
+        return true;
+    }
+
+    array_container_t *arr =
+        array_container_create_given_capacity(new_cardinality);
+    *dst = (void *)arr;
+    if(new_cardinality == 0) {
+      arr->cardinality = new_cardinality;
+      return false; // we are done.
+    }
+    // copy stuff before the active area
+    memcpy(arr->array, src->array, start_index * sizeof(uint16_t));
+
+    // work on the range: emit every value of the range that is NOT in src,
+    // walking src->array[start_index..last_index] in step
+    int32_t out_pos = start_index, in_pos = start_index;
+    int32_t val_in_range = range_start;
+    for (; val_in_range < range_end && in_pos <= last_index; ++val_in_range) {
+        if ((uint16_t)val_in_range != src->array[in_pos]) {
+            arr->array[out_pos++] = (uint16_t)val_in_range;
+        } else {
+            ++in_pos;
+        }
+    }
+    // src has no more values inside the range: the rest of the range is new
+    for (; val_in_range < range_end; ++val_in_range)
+        arr->array[out_pos++] = (uint16_t)val_in_range;
+
+    // content after the active range
+    memcpy(arr->array + out_pos, src->array + (last_index + 1),
+           (src->cardinality - (last_index + 1)) * sizeof(uint16_t));
+    arr->cardinality = new_cardinality;
+    return false;
+}
+
+/* "In-place" negation across a range of an array container.
+ * Even when the result would fit, it is unclear how to make a real in-place
+ * version without inefficient copying, so this computes the result into a
+ * new container via array_container_negation_range and then frees src.
+ * The result is in *dst; the return value is the one reported by
+ * array_container_negation_range (true for a bitset result).
+ */
+
+bool array_container_negation_range_inplace(array_container_t *src,
+                                            const int range_start,
+                                            const int range_end, void **dst) {
+    const bool result_is_bitset =
+        array_container_negation_range(src, range_start, range_end, dst);
+    // TODO : try a real inplace version
+    array_container_free(src);
+    return result_is_bitset;
+}
+
+/* Negation across a range of the container.
+ * Flips the bits of [range_start, range_end) in a clone of src and writes
+ * the result to *dst. A true return value indicates a bitset result,
+ * otherwise the clone is converted to an array container (and freed).
+ * We assume that dst is not pre-allocated; src is not modified.
+ */
+bool bitset_container_negation_range(const bitset_container_t *src,
+                                     const int range_start, const int range_end,
+                                     void **dst) {
+    // TODO maybe consider a density-based estimate and sometimes build the
+    // result directly as an array (converting back to bitset if wrong), or
+    // determine the actual result cardinality first and go directly for the
+    // known final container.
+
+    // keep computation using bitsets as long as possible.
+    bitset_container_t *flipped = bitset_container_clone(src);
+    bitset_flip_range(flipped->array, (uint32_t)range_start,
+                      (uint32_t)range_end);
+    flipped->cardinality = bitset_container_compute_cardinality(flipped);
+
+    if (flipped->cardinality <= DEFAULT_MAX_SIZE) {
+        // small enough for an array container: convert and drop the bitset
+        *dst = array_container_from_bitset(flipped);
+        bitset_container_free(flipped);
+        return false;
+    }
+
+    *dst = flipped;
+    return true;
+}
+
+/* In-place version of bitset_container_negation_range.
+ * The bits of [range_start, range_end) are flipped directly in src.
+ * If the result stays a bitset (cardinality > DEFAULT_MAX_SIZE), *dst is
+ * set to src and true is returned. Otherwise an array container built from
+ * src is stored in *dst, src is freed, and false is returned.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_container_negation_range_inplace(bitset_container_t *src,
+                                             const int range_start,
+                                             const int range_end, void **dst) {
+    bitset_flip_range(src->array, (uint32_t)range_start, (uint32_t)range_end);
+    src->cardinality = bitset_container_compute_cardinality(src);
+
+    const bool stays_bitset = (src->cardinality > DEFAULT_MAX_SIZE);
+    if (!stays_bitset) {
+        *dst = array_container_from_bitset(src);
+        bitset_container_free(src);
+    } else {
+        *dst = src;
+    }
+    return stays_bitset;
+}
+
+/* Negation across a range of the container.
+ * Computes the negation of src over [range_start, range_end) into a new
+ * container stored in *dst; src is not modified. The return value is one of
+ * the *_TYPECODES defined in containers.h and describes *dst.
+ * An empty range (range_end <= range_start) yields a clone of src.
+ * We assume that dst is not pre-allocated.
+ */
+int run_container_negation_range(const run_container_t *src,
+                                 const int range_start, const int range_end,
+                                 void **dst) {
+    // follows the Java implementation
+    if (range_end <= range_start) {
+        *dst = run_container_clone(src);
+        return RUN_CONTAINER_TYPE_CODE;
+    }
+
+    run_container_t *flipped =
+        run_container_create_given_capacity(src->n_runs + 1);
+
+    // runs starting before the range are copied verbatim
+    int idx = 0;
+    while (idx < src->n_runs && src->runs[idx].value < range_start) {
+        flipped->runs[idx] = src->runs[idx];
+        flipped->n_runs++;
+        ++idx;
+    }
+
+    // append the range itself (start, length - 1) in "exclusive" mode ...
+    run_container_smart_append_exclusive(
+        flipped, (uint16_t)range_start,
+        (uint16_t)(range_end - range_start - 1));
+
+    // ... followed by every remaining run of src, also in "exclusive" mode
+    while (idx < src->n_runs) {
+        run_container_smart_append_exclusive(flipped, src->runs[idx].value,
+                                             src->runs[idx].length);
+        ++idx;
+    }
+
+    // pick the most efficient representation; the run container is only
+    // released when the conversion produced a different container type
+    uint8_t result_typecode;
+    *dst = convert_run_to_efficient_container(flipped, &result_typecode);
+    if (result_typecode != RUN_CONTAINER_TYPE_CODE) {
+        run_container_free(flipped);
+    }
+
+    return result_typecode;
+}
+
+/*
+ * In-place negation of src over [range_start, range_end).
+ * Same as run_container_negation_range except that src's own run buffer is
+ * reused as the output whenever the checks below conclude the result fits;
+ * otherwise the non-inplace version is called and src is freed.
+ * An empty range (range_end <= range_start) returns src unchanged.
+ * In all cases, the result is in *dst and the return value is its typecode.
+ */
+int run_container_negation_range_inplace(run_container_t *src,
+                                         const int range_start,
+                                         const int range_end, void **dst) {
+    uint8_t return_typecode;
+
+    if (range_end <= range_start) {
+        *dst = src;
+        return RUN_CONTAINER_TYPE_CODE;
+    }
+
+    // TODO: efficient special case when range is 0 to 65535 inclusive
+
+    if (src->capacity == src->n_runs) {
+        // no excess room.  More checking to see if result can fit
+        bool last_val_before_range = false;
+        bool first_val_in_range = false;
+        bool last_val_in_range = false;
+        bool first_val_past_range = false;
+
+        // membership just before / at the start of the range
+        if (range_start > 0)
+            last_val_before_range =
+                run_container_contains(src, (uint16_t)(range_start - 1));
+        first_val_in_range = run_container_contains(src, (uint16_t)range_start);
+
+        if (last_val_before_range == first_val_in_range) {
+            // membership at the end of / just past the range
+            last_val_in_range =
+                run_container_contains(src, (uint16_t)(range_end - 1));
+            if (range_end != 0x10000)
+                first_val_past_range =
+                    run_container_contains(src, (uint16_t)range_end);
+
+            if (last_val_in_range ==
+                first_val_past_range) {  // no space for inplace
+                // fall back to the allocating version and release src
+                int ans = run_container_negation_range(src, range_start,
+                                                       range_end, dst);
+                run_container_free(src);
+                return ans;
+            }
+        }
+    }
+    // all other cases: result will fit
+
+    // ans aliases src: output runs overwrite the input buffer from the front
+    run_container_t *ans = src;
+    int my_nbr_runs = src->n_runs;
+
+    ans->n_runs = 0;
+    int k = 0;
+    // runs starting before the range stay where they are
+    for (; (k < my_nbr_runs) && (src->runs[k].value < range_start); ++k) {
+        // ans->runs[k] = src->runs[k]; (would be self-copy)
+        ans->n_runs++;
+    }
+
+    // as with Java implementation, use locals to give self a buffer of depth 1
+    // (each input run is read into a local before appends can overwrite it)
+    rle16_t buffered = (rle16_t){.value = (uint16_t)0, .length = (uint16_t)0};
+    rle16_t next = buffered;
+    if (k < my_nbr_runs) buffered = src->runs[k];
+
+    run_container_smart_append_exclusive(
+        ans, (uint16_t)range_start, (uint16_t)(range_end - range_start - 1));
+
+    for (; k < my_nbr_runs; ++k) {
+        // fetch the following run before appending the buffered one
+        if (k + 1 < my_nbr_runs) next = src->runs[k + 1];
+
+        run_container_smart_append_exclusive(ans, buffered.value,
+                                             buffered.length);
+        buffered = next;
+    }
+
+    // the run container is only freed when the conversion changed its type
+    *dst = convert_run_to_efficient_container(ans, &return_typecode);
+    if (return_typecode != RUN_CONTAINER_TYPE_CODE) run_container_free(ans);
+
+    return return_typecode;
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_negation.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_subset.c */
+
+/* Returns true iff every value of the array container1 is present in the
+ * bitset container2. When the bitset's cardinality is known and smaller
+ * than the array's, the answer is false without scanning.
+ */
+bool array_container_is_subset_bitset(const array_container_t* container1,
+                                      const bitset_container_t* container2) {
+    const bool card_known =
+        (container2->cardinality != BITSET_UNKNOWN_CARDINALITY);
+    if (card_known && container2->cardinality < container1->cardinality) {
+        return false;
+    }
+
+    int pos = 0;
+    while (pos < container1->cardinality) {
+        if (!bitset_container_contains(container2, container1->array[pos])) {
+            return false;
+        }
+        ++pos;
+    }
+    return true;
+}
+
+/* Returns true iff every value covered by the runs of container1 is present
+ * in the array container2.
+ * For each run [start, stop] the array is probed with advanceUntil for the
+ * positions of start and stop; the run is contained only if both are found
+ * exactly and the distance between the two positions equals stop - start.
+ * advanceUntil is used here as returning container2->cardinality when no
+ * suitable element exists, so both positions are checked against the
+ * cardinality before container2->array is indexed with them.
+ */
+bool run_container_is_subset_array(const run_container_t* container1,
+                                   const array_container_t* container2) {
+    if (run_container_cardinality(container1) > container2->cardinality)
+        return false;
+    int32_t start_pos = -1, stop_pos = -1;
+    for (int i = 0; i < container1->n_runs; ++i) {
+        int32_t start = container1->runs[i].value;
+        int32_t stop = start + container1->runs[i].length;
+        start_pos = advanceUntil(container2->array, stop_pos,
+                                 container2->cardinality, start);
+        stop_pos = advanceUntil(container2->array, stop_pos,
+                                container2->cardinality, stop);
+        // Either end of the run may fall past the last array element. The
+        // previous code only tested start_pos, so a run whose start was
+        // found but whose end was not led to an out-of-bounds read of
+        // container2->array[stop_pos] below.
+        if (start_pos == container2->cardinality ||
+            stop_pos == container2->cardinality) {
+            return false;
+        } else if (stop_pos - start_pos != stop - start ||
+                   container2->array[start_pos] != start ||
+                   container2->array[stop_pos] != stop) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/* Returns true iff every value of the array container1 is covered by some
+ * run of container2. Both sequences are walked in lockstep: an array value
+ * below the current run means it is not covered, a value above the current
+ * run advances to the next run, anything else is inside the run.
+ */
+bool array_container_is_subset_run(const array_container_t* container1,
+                                   const run_container_t* container2) {
+    if (container1->cardinality > run_container_cardinality(container2))
+        return false;
+
+    int value_idx = 0;
+    int run_idx = 0;
+    while (value_idx < container1->cardinality &&
+           run_idx < container2->n_runs) {
+        const uint32_t run_first = container2->runs[run_idx].value;
+        const uint32_t run_last = run_first + container2->runs[run_idx].length;
+        const uint32_t value = container1->array[value_idx];
+
+        if (value < run_first) {
+            return false;
+        }
+        if (value > run_last) {
+            run_idx++;
+        } else {  // the value of the array is in the run
+            value_idx++;
+        }
+    }
+    // subset only if every array value was matched before the runs ran out
+    return value_idx == container1->cardinality;
+}
+
+/* Returns true iff every value covered by the runs of container1 is set in
+ * the bitset container2. The bitset cardinality (computed on the fly when
+ * it is BITSET_UNKNOWN_CARDINALITY; container2 itself is not modified) is
+ * used for an early exit before the value-by-value check.
+ */
+bool run_container_is_subset_bitset(const run_container_t* container1,
+                                    const bitset_container_t* container2) {
+    // todo: this code could be much faster
+    int32_t bitset_card = container2->cardinality;
+    if (bitset_card == BITSET_UNKNOWN_CARDINALITY) {
+        bitset_card = bitset_container_compute_cardinality(container2);
+    }
+    if (bitset_card < run_container_cardinality(container1)) {
+        return false;
+    }
+
+    for (int run_idx = 0; run_idx < container1->n_runs; ++run_idx) {
+        const uint32_t first = container1->runs[run_idx].value;
+        const uint32_t last = first + container1->runs[run_idx].length;
+        // runs are inclusive on both ends
+        uint32_t value = first;
+        while (value <= last) {
+            if (!bitset_container_contains(container2, value)) {
+                return false;
+            }
+            ++value;
+        }
+    }
+    return true;
+}
+
+/* Returns true iff every bit set in the bitset container1 is covered by
+ * some run of container2. Words of the bitset and runs are consumed in
+ * lockstep; once the runs are exhausted, any remaining set bit means the
+ * bitset is not a subset.
+ */
+bool bitset_container_is_subset_run(const bitset_container_t* container1,
+                                    const run_container_t* container2) {
+    // todo: this code could be much faster
+    if (container1->cardinality != BITSET_UNKNOWN_CARDINALITY &&
+        container1->cardinality > run_container_cardinality(container2)) {
+        return false;
+    }
+
+    int32_t word_idx = 0, run_idx = 0;
+    while (word_idx < BITSET_CONTAINER_SIZE_IN_WORDS &&
+           run_idx < container2->n_runs) {
+        // bits of the current word that still need to be matched
+        uint64_t pending = container1->array[word_idx];
+        while (pending != 0 && run_idx < container2->n_runs) {
+            const uint32_t run_first = container2->runs[run_idx].value;
+            const uint32_t run_last =
+                run_first + container2->runs[run_idx].length;
+            // value represented by the lowest pending bit
+            const uint16_t value = word_idx * 64 + __builtin_ctzll(pending);
+            if (value < run_first) {
+                return false;
+            }
+            if (value > run_last) {
+                run_idx++;
+            } else {
+                pending &= pending - 1;  // clear the lowest set bit
+            }
+        }
+        if (pending != 0) {
+            // ran out of runs with bits left in this word
+            return false;
+        }
+        word_idx++;
+    }
+
+    // terminated iterating on the run containers, check that rest of bitset
+    // is empty
+    while (word_idx < BITSET_CONTAINER_SIZE_IN_WORDS) {
+        if (container1->array[word_idx] != 0) {
+            return false;
+        }
+        word_idx++;
+    }
+    return true;
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_subset.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_union.c */
+/*
+ * mixed_union.c
+ *
+ */
+
+#include <assert.h>
+#include <string.h>
+
+
+/* Compute the union of the array src_1 and the bitset src_2 and write the
+ * result to dst. src_2 is copied into dst unless they are the same
+ * container; the values of src_1 are then set in dst and dst's cardinality
+ * is updated from the value returned by bitset_set_list_withcard.
+ */
+void array_bitset_container_union(const array_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  bitset_container_t *dst) {
+    if (dst != src_2) {
+        bitset_container_copy(src_2, dst);
+    }
+    const int32_t card_before = dst->cardinality;
+    dst->cardinality = (int32_t)bitset_set_list_withcard(
+        dst->array, card_before, src_1->array, src_1->cardinality);
+}
+
+/* Lazy union of the array src_1 and the bitset src_2, written to dst.
+ * It is allowed for src_2 to be dst (no copy is made in that case).
+ * This version does not track the cardinality of dst: it is left as
+ * BITSET_UNKNOWN_CARDINALITY.
+ */
+void array_bitset_container_lazy_union(const array_container_t *src_1,
+                                       const bitset_container_t *src_2,
+                                       bitset_container_t *dst) {
+    if (dst != src_2) {
+        bitset_container_copy(src_2, dst);
+    }
+    bitset_set_list(dst->array, src_1->array, src_1->cardinality);
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
+}
+
+/* Union of the run container src_1 and the bitset src_2, written to dst
+ * (dst may be src_2). Each run is set as a bit range in dst and the
+ * cardinality of dst is recomputed at the end.
+ * A full src_1 must be handled by the caller (asserted).
+ */
+void run_bitset_container_union(const run_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                bitset_container_t *dst) {
+    assert(!run_container_is_full(src_1));  // catch this case upstream
+    if (dst != src_2) {
+        bitset_container_copy(src_2, dst);
+    }
+    int32_t run_idx = 0;
+    while (run_idx < src_1->n_runs) {
+        const rle16_t run = src_1->runs[run_idx];
+        bitset_set_lenrange(dst->array, run.value, run.length);
+        ++run_idx;
+    }
+    dst->cardinality = bitset_container_compute_cardinality(dst);
+}
+
+/* Lazy union of the run container src_1 and the bitset src_2, written to
+ * dst (dst may be src_2). Each run is set as a bit range in dst; the
+ * cardinality of dst is not computed and is left as
+ * BITSET_UNKNOWN_CARDINALITY.
+ * A full src_1 must be handled by the caller (asserted).
+ */
+void run_bitset_container_lazy_union(const run_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     bitset_container_t *dst) {
+    assert(!run_container_is_full(src_1));  // catch this case upstream
+    if (dst != src_2) {
+        bitset_container_copy(src_2, dst);
+    }
+    int32_t run_idx = 0;
+    while (run_idx < src_1->n_runs) {
+        const rle16_t run = src_1->runs[run_idx];
+        bitset_set_lenrange(dst->array, run.value, run.length);
+        ++run_idx;
+    }
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
+}
+
+/* Union of the array src_1 and the run container src_2, written to dst as
+ * a run container (why do we leave the result as a run container??).
+ * If src_2 is full it is simply copied. Otherwise both inputs are merged in
+ * increasing order of start value, appending to dst.
+ * Note: element 0 of both inputs is read without an emptiness check.
+ */
+void array_run_container_union(const array_container_t *src_1,
+                               const run_container_t *src_2,
+                               run_container_t *dst) {
+    if (run_container_is_full(src_2)) {
+        run_container_copy(src_2, dst);
+        return;
+    }
+    // TODO: see whether the "2*" is spurious
+    run_container_grow(dst, 2 * (src_1->cardinality + src_2->n_runs), false);
+
+    int32_t run_idx = 0;
+    int32_t value_idx = 0;
+    rle16_t last_appended;
+
+    // seed the output with whichever input starts first (runs win ties)
+    if (src_1->array[value_idx] < src_2->runs[run_idx].value) {
+        last_appended =
+            run_container_append_value_first(dst, src_1->array[value_idx]);
+        value_idx++;
+    } else {
+        last_appended = run_container_append_first(dst, src_2->runs[run_idx]);
+        run_idx++;
+    }
+
+    // merge while both inputs still have elements
+    while ((run_idx < src_2->n_runs) && (value_idx < src_1->cardinality)) {
+        if (src_1->array[value_idx] < src_2->runs[run_idx].value) {
+            run_container_append_value(dst, src_1->array[value_idx],
+                                       &last_appended);
+            value_idx++;
+        } else {
+            run_container_append(dst, src_2->runs[run_idx], &last_appended);
+            run_idx++;
+        }
+    }
+
+    // drain whichever input is left; at most one of these loops iterates
+    while (value_idx < src_1->cardinality) {
+        run_container_append_value(dst, src_1->array[value_idx],
+                                   &last_appended);
+        value_idx++;
+    }
+    while (run_idx < src_2->n_runs) {
+        run_container_append(dst, src_2->runs[run_idx], &last_appended);
+        run_idx++;
+    }
+}
+
+/* In-place union of the array src_1 into the run container src_2.
+ * A full src_2 is left untouched. Otherwise the existing runs of src_2 are
+ * moved to the tail of its (possibly grown) buffer and the merged result is
+ * rebuilt from the front of the same buffer.
+ * Note: element 0 of both inputs is read without an emptiness check.
+ */
+void array_run_container_inplace_union(const array_container_t *src_1,
+                                       run_container_t *src_2) {
+    if (run_container_is_full(src_2)) {
+        return;
+    }
+    // upper bound on the number of output runs, plus room to park the input
+    const int32_t maxoutput = src_1->cardinality + src_2->n_runs;
+    const int32_t neededcapacity = maxoutput + src_2->n_runs;
+    if (src_2->capacity < neededcapacity)
+        run_container_grow(src_2, neededcapacity, true);
+    // park the current runs after the output area so that appends to the
+    // front of the buffer do not overwrite runs that are still to be read
+    memmove(src_2->runs + maxoutput, src_2->runs,
+            src_2->n_runs * sizeof(rle16_t));
+    rle16_t *inputsrc2 = src_2->runs + maxoutput;
+    int32_t rlepos = 0;
+    int32_t arraypos = 0;
+    int src2nruns = src_2->n_runs;
+    // the container is rebuilt from scratch by the append calls below
+    src_2->n_runs = 0;
+
+    rle16_t previousrle;
+
+    // seed the output with whichever input starts first (runs win ties)
+    if (inputsrc2[rlepos].value <= src_1->array[arraypos]) {
+        previousrle = run_container_append_first(src_2, inputsrc2[rlepos]);
+        rlepos++;
+    } else {
+        previousrle =
+            run_container_append_value_first(src_2, src_1->array[arraypos]);
+        arraypos++;
+    }
+
+    // merge while both inputs still have elements
+    while ((rlepos < src2nruns) && (arraypos < src_1->cardinality)) {
+        if (inputsrc2[rlepos].value <= src_1->array[arraypos]) {
+            run_container_append(src_2, inputsrc2[rlepos], &previousrle);
+            rlepos++;
+        } else {
+            run_container_append_value(src_2, src_1->array[arraypos],
+                                       &previousrle);
+            arraypos++;
+        }
+    }
+    // drain whichever input still has elements
+    if (arraypos < src_1->cardinality) {
+        while (arraypos < src_1->cardinality) {
+            run_container_append_value(src_2, src_1->array[arraypos],
+                                       &previousrle);
+            arraypos++;
+        }
+    } else {
+        while (rlepos < src2nruns) {
+            run_container_append(src_2, inputsrc2[rlepos], &previousrle);
+            rlepos++;
+        }
+    }
+}
+
+/* Union of two array containers into a newly created container in *dst.
+ * Returns false when *dst is an array container and true when it is a
+ * bitset container. On allocation failure *dst is NULL and true is
+ * returned (otherwise the failure would not be caught).
+ * If the summed cardinalities exceed DEFAULT_MAX_SIZE the union is built as
+ * a bitset and converted back to an array when its actual cardinality turns
+ * out to be small enough.
+ */
+bool array_array_container_union(const array_container_t *src_1,
+                                 const array_container_t *src_2, void **dst) {
+    const int summed_card = src_1->cardinality + src_2->cardinality;
+
+    if (summed_card <= DEFAULT_MAX_SIZE) {
+        // guaranteed to fit in an array container
+        array_container_t *merged =
+            array_container_create_given_capacity(summed_card);
+        *dst = merged;
+        if (merged == NULL) {
+            return true;  // otherwise failure won't be caught
+        }
+        array_container_union(src_1, src_2, merged);
+        return false;  // not a bitset
+    }
+
+    bitset_container_t *bits = bitset_container_create();
+    *dst = bits;
+    if (bits == NULL) {
+        return true;  // expect a bitset
+    }
+    bitset_set_list(bits->array, src_1->array, src_1->cardinality);
+    bits->cardinality = (int32_t)bitset_set_list_withcard(
+        bits->array, src_1->cardinality, src_2->array, src_2->cardinality);
+    if (bits->cardinality > DEFAULT_MAX_SIZE) {
+        return true;  // stays a bitset
+    }
+
+    // need to convert!
+    *dst = array_container_from_bitset(bits);
+    bitset_container_free(bits);
+    return false;  // not going to be a bitset
+}
+
+/* Union of two array containers, reusing src_1's storage when possible.
+ * Returns true when the result is a bitset container (stored in *dst) and
+ * false when it is an array container. For an array result, *dst is either
+ *  - NULL: the union was computed directly inside src_1,
+ *  - a newly created array container (src_1 lacked capacity), or
+ *  - src_1 itself (after going through a temporary bitset).
+ * On allocation failure *dst is NULL and true is returned.
+ */
+bool array_array_container_inplace_union(array_container_t *src_1,
+                                 const array_container_t *src_2, void **dst) {
+    const int summed_card = src_1->cardinality + src_2->cardinality;
+    *dst = NULL;
+
+    if (summed_card <= DEFAULT_MAX_SIZE) {
+        if (src_1->capacity >= summed_card) {
+            // shift src_1's values to make room, then merge back to the front
+            memmove(src_1->array + src_2->cardinality, src_1->array,
+                    src_1->cardinality * sizeof(uint16_t));
+            src_1->cardinality = (int32_t)fast_union_uint16(
+                src_1->array + src_2->cardinality, src_1->cardinality,
+                src_2->array, src_2->cardinality, src_1->array);
+            return false;  // not a bitset
+        }
+        // be purposefully generous
+        array_container_t *merged =
+            array_container_create_given_capacity(2 * summed_card);
+        *dst = merged;
+        if (merged == NULL) {
+            return true;  // otherwise failure won't be caught
+        }
+        array_container_union(src_1, src_2, merged);
+        return false;  // not a bitset
+    }
+
+    bitset_container_t *bits = bitset_container_create();
+    *dst = bits;
+    if (bits == NULL) {
+        return true;  // expect a bitset
+    }
+    bitset_set_list(bits->array, src_1->array, src_1->cardinality);
+    bits->cardinality = (int32_t)bitset_set_list_withcard(
+        bits->array, src_1->cardinality, src_2->array, src_2->cardinality);
+    if (bits->cardinality > DEFAULT_MAX_SIZE) {
+        return true;  // stays a bitset
+    }
+
+    // need to convert! The values are written back into src_1.
+    if (src_1->capacity < bits->cardinality) {
+        array_container_grow(src_1, bits->cardinality, false);
+    }
+    bitset_extract_setbits_uint16(bits->array, BITSET_CONTAINER_SIZE_IN_WORDS,
+                                  src_1->array, 0);
+    src_1->cardinality = bits->cardinality;
+    *dst = src_1;
+    bitset_container_free(bits);
+    return false;  // not going to be a bitset
+}
+
+
+/* Lazy union of two array containers into a newly created container in
+ * *dst. When the summed cardinalities do not exceed ARRAY_LAZY_LOWERBOUND
+ * the result is an array container (return false). Otherwise the result is
+ * a bitset whose cardinality is left as BITSET_UNKNOWN_CARDINALITY (return
+ * true). On allocation failure *dst is NULL and true is returned.
+ */
+bool array_array_container_lazy_union(const array_container_t *src_1,
+                                      const array_container_t *src_2,
+                                      void **dst) {
+    const int summed_card = src_1->cardinality + src_2->cardinality;
+
+    if (summed_card <= ARRAY_LAZY_LOWERBOUND) {
+        array_container_t *merged =
+            array_container_create_given_capacity(summed_card);
+        *dst = merged;
+        if (merged == NULL) {
+            return true;  // otherwise failure won't be caught
+        }
+        array_container_union(src_1, src_2, merged);
+        return false;  // not a bitset
+    }
+
+    bitset_container_t *bits = bitset_container_create();
+    *dst = bits;
+    if (bits != NULL) {
+        bitset_set_list(bits->array, src_1->array, src_1->cardinality);
+        bitset_set_list(bits->array, src_2->array, src_2->cardinality);
+        bits->cardinality = BITSET_UNKNOWN_CARDINALITY;
+    }
+    return true;  // expect a bitset
+}
+
+
+/* Lazy union of two array containers, reusing src_1's storage when
+ * possible. When the summed cardinalities do not exceed
+ * ARRAY_LAZY_LOWERBOUND the result is an array container (return false):
+ * computed inside src_1 with *dst left NULL if src_1 has the capacity,
+ * otherwise in a newly created array container stored in *dst.
+ * Otherwise the result is a new bitset in *dst whose cardinality is left as
+ * BITSET_UNKNOWN_CARDINALITY (return true).
+ * On allocation failure *dst is NULL and true is returned.
+ */
+bool array_array_container_lazy_inplace_union(array_container_t *src_1,
+                                      const array_container_t *src_2,
+                                      void **dst) {
+    const int summed_card = src_1->cardinality + src_2->cardinality;
+    *dst = NULL;
+
+    if (summed_card <= ARRAY_LAZY_LOWERBOUND) {
+        if (src_1->capacity >= summed_card) {
+            // shift src_1's values to make room, then merge back to the front
+            memmove(src_1->array + src_2->cardinality, src_1->array,
+                    src_1->cardinality * sizeof(uint16_t));
+            src_1->cardinality = (int32_t)fast_union_uint16(
+                src_1->array + src_2->cardinality, src_1->cardinality,
+                src_2->array, src_2->cardinality, src_1->array);
+            return false;  // not a bitset
+        }
+        // be purposefully generous
+        array_container_t *merged =
+            array_container_create_given_capacity(2 * summed_card);
+        *dst = merged;
+        if (merged == NULL) {
+            return true;  // otherwise failure won't be caught
+        }
+        array_container_union(src_1, src_2, merged);
+        return false;  // not a bitset
+    }
+
+    bitset_container_t *bits = bitset_container_create();
+    *dst = bits;
+    if (bits != NULL) {
+        bitset_set_list(bits->array, src_1->array, src_1->cardinality);
+        bitset_set_list(bits->array, src_2->array, src_2->cardinality);
+        bits->cardinality = BITSET_UNKNOWN_CARDINALITY;
+    }
+    return true;  // expect a bitset
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_union.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_xor.c */
+/*
+ * mixed_xor.c
+ */
+
+#include <assert.h>
+#include <string.h>
+
+
+/* Compute the xor of the array src_1 and the bitset src_2 and write the
+ * result to *dst (which has no container initially).
+ * The xor is computed in a fresh bitset; if its cardinality does not exceed
+ * DEFAULT_MAX_SIZE it is converted to an array container.
+ * Result is true iff *dst is a bitset.
+ */
+bool array_bitset_container_xor(const array_container_t *src_1,
+                                const bitset_container_t *src_2, void **dst) {
+    bitset_container_t *flipped = bitset_container_create();
+    bitset_container_copy(src_2, flipped);
+    flipped->cardinality = (int32_t)bitset_flip_list_withcard(
+        flipped->array, flipped->cardinality, src_1->array,
+        src_1->cardinality);
+
+    if (flipped->cardinality > DEFAULT_MAX_SIZE) {
+        *dst = flipped;
+        return true;  // bitset
+    }
+
+    // do required type conversions.
+    *dst = array_container_from_bitset(flipped);
+    bitset_container_free(flipped);
+    return false;  // not bitset
+}
+
+/* Lazy xor of the array src_1 and the bitset src_2, written to dst.
+ * It is allowed for src_2 to be dst (no copy is made in that case).
+ * This version does not track the cardinality of dst: it is left as
+ * BITSET_UNKNOWN_CARDINALITY.
+ */
+
+void array_bitset_container_lazy_xor(const array_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     bitset_container_t *dst) {
+    if (dst != src_2) {
+        bitset_container_copy(src_2, dst);
+    }
+    bitset_flip_list(dst->array, src_1->array, src_1->cardinality);
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
+}
+
+/* Compute the xor of the run container src_1 and the bitset src_2 and write
+ * the result to *dst, which does not initially hold any container.
+ * The xor is computed in a fresh bitset by flipping the bit range of every
+ * run; if the resulting cardinality does not exceed DEFAULT_MAX_SIZE it is
+ * converted to an array container.
+ * Returns true iff *dst is a bitset container.
+ */
+
+bool run_bitset_container_xor(const run_container_t *src_1,
+                              const bitset_container_t *src_2, void **dst) {
+    bitset_container_t *flipped = bitset_container_create();
+    bitset_container_copy(src_2, flipped);
+
+    int32_t run_idx = 0;
+    while (run_idx < src_1->n_runs) {
+        const rle16_t run = src_1->runs[run_idx];
+        // runs are inclusive, bitset_flip_range takes an exclusive end
+        bitset_flip_range(flipped->array, run.value,
+                          run.value + run.length + UINT32_C(1));
+        ++run_idx;
+    }
+    flipped->cardinality = bitset_container_compute_cardinality(flipped);
+
+    if (flipped->cardinality > DEFAULT_MAX_SIZE) {
+        *dst = flipped;
+        return true;  // bitset
+    }
+
+    *dst = array_container_from_bitset(flipped);
+    bitset_container_free(flipped);
+    return false;  // not bitset
+}
+
+/* Lazy xor of the run container src_1 and the bitset src_2.
+ * dst is initialized and may be equal to src_2. The result is left as a
+ * bitset container, even if the actual cardinality would dictate an array
+ * container; its cardinality is set to BITSET_UNKNOWN_CARDINALITY.
+ */
+
+void run_bitset_container_lazy_xor(const run_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   bitset_container_t *dst) {
+    if (dst != src_2) {
+        bitset_container_copy(src_2, dst);
+    }
+    int32_t run_idx = 0;
+    while (run_idx < src_1->n_runs) {
+        const rle16_t run = src_1->runs[run_idx];
+        // runs are inclusive, bitset_flip_range takes an exclusive end
+        bitset_flip_range(dst->array, run.value,
+                          run.value + run.length + UINT32_C(1));
+        ++run_idx;
+    }
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
+}
+
+/* Symmetric difference of an array container (src_1) and a run container
+ * (src_2).  *dst holds no valid container on entry; on return it points to
+ * a newly allocated container whose type code is the return value.
+ */
+
+int array_run_container_xor(const array_container_t *src_1,
+                            const run_container_t *src_2, void **dst) {
+    // Loosely follows the Java XOR implementation as of May 2016; the C OR
+    // implementation works quite differently and can return a run container.
+    // TODO could optimize for full run containers.
+
+    // Small array operand: merge lazily into a run container (as the Java
+    // implementation does), then pick the most compact representation.
+    const int small_array_limit = 32;
+    if (src_1->cardinality < small_array_limit) {
+        run_container_t *merged = run_container_create();
+        array_run_container_lazy_xor(src_1, src_2, merged);  // keeps runs.
+        uint8_t resulting_type;
+        *dst = convert_run_to_efficient_container_and_free(merged,
+                                                           &resulting_type);
+        return resulting_type;
+    }
+
+    bool result_is_bitset;
+    if (run_container_cardinality(src_2) > DEFAULT_MAX_SIZE) {
+        // guess that it will end up as a bitset; the ixor performs any
+        // necessary type conversion and takes care of the temporary bitset
+        bitset_container_t *as_bitset = bitset_container_from_run(src_2);
+        result_is_bitset = bitset_array_container_ixor(as_bitset, src_1, dst);
+    } else {
+        // few values in the run container: expand it to a temporary array
+        // and xor the two arrays
+        array_container_t *as_array = array_container_from_run(src_2);
+        result_is_bitset = array_array_container_xor(as_array, src_1, dst);
+        array_container_free(as_array);
+    }
+    return result_is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                            : ARRAY_CONTAINER_TYPE_CODE;
+}
+
+/* Lazy symmetric difference of an array container (src_1) and a run
+ * container (src_2), written to dst.  dst must be a valid run container
+ * (assumed to be distinct from src_2).  The result is left as a run
+ * container even if another representation would be smaller.
+ */
+
+void array_run_container_lazy_xor(const array_container_t *src_1,
+                                  const run_container_t *src_2,
+                                  run_container_t *dst) {
+    run_container_grow(dst, src_1->cardinality + src_2->n_runs, false);
+    dst->n_runs = 0;
+
+    int32_t run_idx = 0;
+    int32_t value_idx = 0;
+    // Merge both inputs in increasing order of start value; on a tie the run
+    // is consumed first.
+    for (;;) {
+        const bool runs_left = run_idx < src_2->n_runs;
+        const bool values_left = value_idx < src_1->cardinality;
+        if (!runs_left && !values_left) break;
+
+        const bool take_run =
+            runs_left && (!values_left || (src_2->runs[run_idx].value <=
+                                           src_1->array[value_idx]));
+        if (take_run) {
+            run_container_smart_append_exclusive(dst,
+                                                 src_2->runs[run_idx].value,
+                                                 src_2->runs[run_idx].length);
+            run_idx++;
+        } else {
+            // a single array value is a run of length 0
+            run_container_smart_append_exclusive(dst, src_1->array[value_idx],
+                                                 0);
+            value_idx++;
+        }
+    }
+}
+
+/* Symmetric difference of two run containers.  *dst holds no valid
+ * container on entry; on return it points to a newly allocated container
+ * whose type code is the return value.
+ */
+
+int run_run_container_xor(const run_container_t *src_1,
+                          const run_container_t *src_2, void **dst) {
+    run_container_t *merged = run_container_create();
+    run_container_xor(src_1, src_2, merged);
+    // convert to the most compact representation, releasing `merged'
+    uint8_t resulting_type;
+    *dst = convert_run_to_efficient_container_and_free(merged, &resulting_type);
+    return resulting_type;
+}
+
+/*
+ * The Java implementations (as of May 2016) of array_run, run_run
+ * and bitset_run do not do anything different for the in-place case.
+ * We could adopt the mixed_union.c approach instead (i.e., using
+ * smart_append_exclusive).
+ *
+ */
+
+/* Symmetric difference of two array containers.  On return *dst points to
+ * a newly allocated container; the return value is true iff that container
+ * is a bitset container (false means array container). */
+bool array_array_container_xor(const array_container_t *src_1,
+                               const array_container_t *src_2, void **dst) {
+    // the sum of the cardinalities is an upper bound on the result size
+    const int card_bound = src_1->cardinality + src_2->cardinality;
+    if (card_bound <= DEFAULT_MAX_SIZE) {
+        array_container_t *small =
+            array_container_create_given_capacity(card_bound);
+        *dst = small;
+        array_container_xor(src_1, src_2, small);
+        return false;  // not a bitset
+    }
+
+    // Possibly large result: load src_1 into a bitset and flip src_2's values
+    // while tracking the cardinality.
+    bitset_container_t *bits = bitset_container_from_array(src_1);
+    *dst = bits;
+    bits->cardinality = (uint32_t)bitset_flip_list_withcard(
+        bits->array, src_1->cardinality, src_2->array, src_2->cardinality);
+    if (bits->cardinality > DEFAULT_MAX_SIZE) {
+        return true;  // stays a bitset
+    }
+
+    // need to convert: the result fits in an array container after all
+    *dst = array_container_from_bitset(bits);
+    bitset_container_free(bits);
+    return false;  // not going to be a bitset
+}
+
+/* Lazy symmetric difference of two array containers.  On return *dst points
+ * to a newly allocated container (or is NULL if the allocation failed).
+ * Returns false when an array container was produced and true when a bitset
+ * container was chosen; in the latter case the cardinality is left as
+ * BITSET_UNKNOWN_CARDINALITY. */
+bool array_array_container_lazy_xor(const array_container_t *src_1,
+                                    const array_container_t *src_2,
+                                    void **dst) {
+    // upper bound, but probably poor estimate for xor
+    const int card_bound = src_1->cardinality + src_2->cardinality;
+    if (card_bound <= ARRAY_LAZY_LOWERBOUND) {
+        array_container_t *small =
+            array_container_create_given_capacity(card_bound);
+        *dst = small;
+        if (small != NULL) {
+            array_container_xor(src_1, src_2, small);
+        }
+        return false;  // not a bitset
+    }
+
+    // expect a bitset (maybe, for XOR??)
+    bitset_container_t *bits = bitset_container_from_array(src_1);
+    *dst = bits;
+    if (bits == NULL) {
+        return true;
+    }
+    bitset_flip_list(bits->array, src_2->array, src_2->cardinality);
+    bits->cardinality = BITSET_UNKNOWN_CARDINALITY;
+    return true;
+}
+
+/* Symmetric difference of two bitset containers.  *dst holds no container
+ * on entry; on return it points to a newly allocated container.  Returns
+ * true iff *dst is a bitset container (false means array container).
+ */
+
+bool bitset_bitset_container_xor(const bitset_container_t *src_1,
+                                 const bitset_container_t *src_2, void **dst) {
+    bitset_container_t *combined = bitset_container_create();
+    const int result_card = bitset_container_xor(src_1, src_2, combined);
+    if (result_card > DEFAULT_MAX_SIZE) {
+        *dst = combined;
+        return true;
+    }
+    // small result: convert to an array container and drop the bitset
+    *dst = array_container_from_bitset(combined);
+    bitset_container_free(combined);
+    return false;  // not bitset
+}
+
+/* In-place symmetric difference of a bitset container (src_1) with an array
+ * container (src_2).  *dst holds no container on entry.  If the result is
+ * still a bitset, src_1 is modified and *dst is set to src_1; otherwise
+ * src_1 is freed and *dst is a new array container.  In both cases the
+ * caller is responsible for deallocating *dst.
+ * Returns true iff *dst is a bitset. */
+
+bool bitset_array_container_ixor(bitset_container_t *src_1,
+                                 const array_container_t *src_2, void **dst) {
+    *dst = src_1;
+    // flip the array's values and keep the cardinality up to date
+    src_1->cardinality = (uint32_t)bitset_flip_list_withcard(
+        src_1->array, src_1->cardinality, src_2->array, src_2->cardinality);
+
+    const bool fits_in_array = (src_1->cardinality <= DEFAULT_MAX_SIZE);
+    if (!fits_in_array) {
+        return true;
+    }
+    *dst = array_container_from_bitset(src_1);
+    bitset_container_free(src_1);
+    return false;  // not bitset
+}
+
+/* The functions below are the "in-place" variants; some of them are not
+ * *really* in-place: they compute into a new container and free src_1.
+ * TODO: write actual inplace routine if efficiency warrants it
+ * Anything inplace with a bitset is a good candidate
+ */
+
+/* XOR of two bitset containers: the result is stored in *dst and src_1 is
+ * freed.  Returns true iff *dst is a bitset container. */
+bool bitset_bitset_container_ixor(bitset_container_t *src_1,
+                                  const bitset_container_t *src_2, void **dst) {
+    const bool result_is_bitset =
+        bitset_bitset_container_xor(src_1, src_2, dst);
+    bitset_container_free(src_1);
+    return result_is_bitset;
+}
+
+/* XOR of an array container (src_1) with a bitset container (src_2): the
+ * result is stored in *dst and src_1 is freed.  Returns the value reported
+ * by array_bitset_container_xor. */
+bool array_bitset_container_ixor(array_container_t *src_1,
+                                 const bitset_container_t *src_2, void **dst) {
+    const bool result_is_bitset =
+        array_bitset_container_xor(src_1, src_2, dst);
+    array_container_free(src_1);
+    return result_is_bitset;
+}
+
+/* XOR of a run container (src_1) with a bitset container (src_2).  *dst
+ * holds no container on entry; on return it is either a bitset container
+ * (return value true) or an array container (return value false).  src_1
+ * is freed.
+ */
+
+bool run_bitset_container_ixor(run_container_t *src_1,
+                               const bitset_container_t *src_2, void **dst) {
+    const bool result_is_bitset = run_bitset_container_xor(src_1, src_2, dst);
+    run_container_free(src_1);
+    return result_is_bitset;
+}
+
+/* XOR of a bitset container (src_1) with a run container (src_2): the
+ * result is stored in *dst and src_1 is freed.  Returns true iff *dst is a
+ * bitset container. */
+bool bitset_run_container_ixor(bitset_container_t *src_1,
+                               const run_container_t *src_2, void **dst) {
+    // the helper takes the run container first
+    const bool result_is_bitset = run_bitset_container_xor(src_2, src_1, dst);
+    bitset_container_free(src_1);
+    return result_is_bitset;
+}
+
+/* XOR of an array container (src_1) with a run container (src_2).  *dst
+ * holds no valid container on entry; on return it points to a container
+ * whose type code is the return value.  src_1 is freed.
+ */
+
+int array_run_container_ixor(array_container_t *src_1,
+                             const run_container_t *src_2, void **dst) {
+    const int resulting_type = array_run_container_xor(src_1, src_2, dst);
+    array_container_free(src_1);
+    return resulting_type;
+}
+
+/* XOR of a run container (src_1) with an array container (src_2): the
+ * result is stored in *dst, src_1 is freed, and the type code of *dst is
+ * returned. */
+int run_array_container_ixor(run_container_t *src_1,
+                             const array_container_t *src_2, void **dst) {
+    // the helper takes the array container first
+    const int resulting_type = array_run_container_xor(src_2, src_1, dst);
+    run_container_free(src_1);
+    return resulting_type;
+}
+
+/* XOR of two array containers: the result is stored in *dst and src_1 is
+ * freed.  Returns true iff *dst is a bitset container. */
+bool array_array_container_ixor(array_container_t *src_1,
+                                const array_container_t *src_2, void **dst) {
+    const bool result_is_bitset = array_array_container_xor(src_1, src_2, dst);
+    array_container_free(src_1);
+    return result_is_bitset;
+}
+
+/* XOR of two run containers: the result is stored in *dst, src_1 is freed,
+ * and the type code of *dst is returned. */
+int run_run_container_ixor(run_container_t *src_1, const run_container_t *src_2,
+                           void **dst) {
+    const int resulting_type = run_run_container_xor(src_1, src_2, dst);
+    run_container_free(src_1);
+    return resulting_type;
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/mixed_xor.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/containers/run.c */
+#include <stdio.h>
+#include <stdlib.h>
+
+
+extern inline uint16_t run_container_minimum(const run_container_t *run);
+extern inline uint16_t run_container_maximum(const run_container_t *run);
+extern inline int32_t interleavedBinarySearch(const rle16_t *array,
+                                              int32_t lenarray, uint16_t ikey);
+extern inline bool run_container_contains(const run_container_t *run,
+                                          uint16_t pos);
+extern inline int run_container_index_equalorlarger(const run_container_t *arr, uint16_t x);
+extern bool run_container_is_full(const run_container_t *run);
+extern bool run_container_nonzero_cardinality(const run_container_t *r);
+extern void run_container_clear(run_container_t *run);
+extern int32_t run_container_serialized_size_in_bytes(int32_t num_runs);
+extern run_container_t *run_container_create_range(uint32_t start,
+                                                   uint32_t stop);
+
+/* Add the value `pos' to the run container `run'.
+ * Returns true if the value was inserted and false if it was already
+ * present.  When `pos' is adjacent to an existing run, that run is extended
+ * (and fused with its neighbour if the gap closes); otherwise a new run of
+ * length 0 is inserted. */
+bool run_container_add(run_container_t *run, uint16_t pos) {
+    // a non-negative search result is treated as "pos is already stored"
+    int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos);
+    if (index >= 0) return false;  // already there
+    index = -index - 2;            // points to preceding value, possibly -1
+    if (index >= 0) {              // possible match
+        // offset of pos from the start of the preceding run
+        int32_t offset = pos - run->runs[index].value;
+        int32_t le = run->runs[index].length;
+        if (offset <= le) return false;  // already there
+        if (offset == le + 1) {
+            // pos immediately follows the preceding run
+            // we may need to fuse
+            if (index + 1 < run->n_runs) {
+                if (run->runs[index + 1].value == pos + 1) {
+                    // indeed fusion is needed
+                    // the preceding run now extends to the end of the next run
+                    run->runs[index].length = run->runs[index + 1].value +
+                                              run->runs[index + 1].length -
+                                              run->runs[index].value;
+                    // presumably removes the now-redundant run at index + 1
+                    recoverRoomAtIndex(run, (uint16_t)(index + 1));
+                    return true;
+                }
+            }
+            // no fusion: simply grow the preceding run by one value
+            run->runs[index].length++;
+            return true;
+        }
+        if (index + 1 < run->n_runs) {
+            // pos is not adjacent to the preceding run, but it may
+            // immediately precede the next one
+            // we may need to fuse
+            if (run->runs[index + 1].value == pos + 1) {
+                // indeed fusion is needed
+                // extend the next run backwards by one value
+                run->runs[index + 1].value = pos;
+                run->runs[index + 1].length = run->runs[index + 1].length + 1;
+                return true;
+            }
+        }
+    }
+    if (index == -1) {
+        // no run precedes pos
+        // we may need to extend the first run
+        if (0 < run->n_runs) {
+            if (run->runs[0].value == pos + 1) {
+                run->runs[0].length++;
+                run->runs[0].value--;
+                return true;
+            }
+        }
+    }
+    // pos touches no existing run: insert a new single-value run after
+    // `index' (makeRoomAtIndex presumably opens the slot at index + 1)
+    makeRoomAtIndex(run, (uint16_t)(index + 1));
+    run->runs[index + 1].value = pos;
+    run->runs[index + 1].length = 0;
+    return true;
+}
+
+/* Allocate an empty run container with room for `size' runs.
+ * Returns NULL if either allocation fails.  For size <= 0 no run storage
+ * is allocated and the runs pointer is NULL. */
+run_container_t *run_container_create_given_capacity(int32_t size) {
+    /* Allocate the run container itself. */
+    run_container_t *container =
+        (run_container_t *)malloc(sizeof(run_container_t));
+    if (container == NULL) {
+        return NULL;
+    }
+
+    rle16_t *storage = NULL;
+    if (size > 0) {  // we don't want to rely on malloc(0)
+        storage = (rle16_t *)malloc(sizeof(rle16_t) * size);
+        if (storage == NULL) {
+            free(container);
+            return NULL;
+        }
+    }
+
+    container->runs = storage;
+    container->capacity = size;
+    container->n_runs = 0;
+    return container;
+}
+
+/* Release unused run storage so that the capacity equals the number of
+ * runs.  Returns the number of run slots saved, or 0 when nothing was
+ * changed (capacity already tight, or the reallocation failed and the
+ * original storage was kept). */
+int run_container_shrink_to_fit(run_container_t *src) {
+    if (src->n_runs == src->capacity) return 0;  // nothing to do
+    const int savings = src->capacity - src->n_runs;
+
+    if (src->n_runs == 0) {
+        // realloc(ptr, 0) is implementation-defined: it may free the block
+        // and return NULL, after which freeing the old pointer again would
+        // be a double free.  Release the storage explicitly instead.
+        free(src->runs);
+        src->runs = NULL;
+        src->capacity = 0;
+        return savings;
+    }
+
+    rle16_t *shrunk =
+        (rle16_t *)realloc(src->runs, src->n_runs * sizeof(rle16_t));
+    if (shrunk == NULL) {
+        // A failed realloc leaves the original block valid; keep it (and
+        // the old capacity) rather than discarding live runs.
+        return 0;
+    }
+    src->runs = shrunk;
+    src->capacity = src->n_runs;
+    return savings;
+}
+/* Allocate an empty run container with the default initial capacity
+ * (RUN_DEFAULT_INIT_SIZE).  Returns NULL in case of failure. */
+run_container_t *run_container_create(void) {
+    const int32_t initial_capacity = RUN_DEFAULT_INIT_SIZE;
+    return run_container_create_given_capacity(initial_capacity);
+}
+
+/* Return a newly allocated copy of `src' with the same capacity and runs,
+ * or NULL if the allocation fails.  The caller owns the returned
+ * container. */
+run_container_t *run_container_clone(const run_container_t *src) {
+    run_container_t *run = run_container_create_given_capacity(src->capacity);
+    if (run == NULL) return NULL;
+    run->capacity = src->capacity;
+    run->n_runs = src->n_runs;
+    // A container with no capacity has a NULL runs pointer, and passing NULL
+    // to memcpy is undefined behaviour even for a zero length, so only copy
+    // when there is something to copy.
+    if (src->n_runs > 0) {
+        memcpy(run->runs, src->runs, src->n_runs * sizeof(rle16_t));
+    }
+    return run;
+}
+
+/* Release the run storage (if any) and then the container itself. */
+void run_container_free(run_container_t *run) {
+    rle16_t *storage = run->runs;
+    // Jon Strabala reports that some tools complain if NULL is passed to free
+    if (storage != NULL) {
+        run->runs = NULL;  // pedantic
+        free(storage);
+    }
+    free(run);
+}
+
+/* Increase the capacity of `run' to at least `min' runs.  The capacity
+ * grows geometrically (x2 below 64 runs, x1.5 below 1024, x1.25 above) and
+ * is raised to `min' if that is still not enough.  When `copy' is true the
+ * existing runs are preserved (realloc); otherwise the old storage is
+ * discarded and fresh storage is allocated.  An allocation failure is
+ * reported on stderr and trips an assertion. */
+void run_container_grow(run_container_t *run, int32_t min, bool copy) {
+    int32_t grown;
+    if (run->capacity == 0) {
+        grown = RUN_DEFAULT_INIT_SIZE;
+    } else if (run->capacity < 64) {
+        grown = run->capacity * 2;
+    } else if (run->capacity < 1024) {
+        grown = run->capacity * 3 / 2;
+    } else {
+        grown = run->capacity * 5 / 4;
+    }
+    run->capacity = (grown < min) ? min : grown;
+    assert(run->capacity >= min);
+
+    if (!copy) {
+        // Jon Strabala reports that some tools complain otherwise
+        if (run->runs != NULL) {
+            free(run->runs);
+        }
+        run->runs = (rle16_t *)malloc(run->capacity * sizeof(rle16_t));
+    } else {
+        rle16_t *previous = run->runs;
+        run->runs =
+            (rle16_t *)realloc(previous, run->capacity * sizeof(rle16_t));
+        if (run->runs == NULL) free(previous);
+    }
+
+    // handle the case where the allocation fails
+    if (run->runs == NULL) {
+        fprintf(stderr, "could not allocate memory\n");
+    }
+    assert(run->runs != NULL);
+}
+
+/* Copy the runs of `src' into `dst', growing `dst' first (without
+ * preserving its old content) when its capacity is too small. */
+void run_container_copy(const run_container_t *src, run_container_t *dst) {
+    const int32_t run_count = src->n_runs;
+    const bool needs_more_room = run_count > dst->capacity;
+    if (needs_more_room) {
+        run_container_grow(dst, run_count, false);
+    }
+    memcpy(dst->runs, src->runs, sizeof(rle16_t) * run_count);
+    dst->n_runs = run_count;
+}
+
+/* Compute the union of `src_1' and `src_2' and write the result to `dst'
+ * It is assumed that `dst' is distinct from both `src_1' and `src_2'.
+ * If one operand has no runs, the result is a copy of the other one. */
+void run_container_union(const run_container_t *src_1,
+                         const run_container_t *src_2, run_container_t *dst) {
+    // TODO: this could be a lot more efficient
+
+    // An empty operand has no first run to seed the merge below (reading
+    // runs[0] would be out of bounds), so handle it up front.
+    if ((src_1->n_runs == 0) || (src_2->n_runs == 0)) {
+        const run_container_t *other = (src_1->n_runs == 0) ? src_2 : src_1;
+        if (other->n_runs == 0) {
+            dst->n_runs = 0;  // both operands are empty
+        } else {
+            run_container_copy(other, dst);
+        }
+        return;
+    }
+
+    // we start out with inexpensive checks: a full operand is the union
+    if (run_container_is_full(src_1)) {
+        run_container_copy(src_1, dst);
+        return;
+    }
+    if (run_container_is_full(src_2)) {
+        run_container_copy(src_2, dst);
+        return;
+    }
+
+    const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
+    if (dst->capacity < neededcapacity)
+        run_container_grow(dst, neededcapacity, false);
+    dst->n_runs = 0;
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+
+    // seed the output with whichever run starts first
+    rle16_t previousrle;
+    if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) {
+        previousrle = run_container_append_first(dst, src_1->runs[rlepos]);
+        rlepos++;
+    } else {
+        previousrle = run_container_append_first(dst, src_2->runs[xrlepos]);
+        xrlepos++;
+    }
+
+    // merge the remaining runs in increasing order of start value
+    while ((xrlepos < src_2->n_runs) && (rlepos < src_1->n_runs)) {
+        rle16_t newrl;
+        if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) {
+            newrl = src_1->runs[rlepos];
+            rlepos++;
+        } else {
+            newrl = src_2->runs[xrlepos];
+            xrlepos++;
+        }
+        run_container_append(dst, newrl, &previousrle);
+    }
+    // at most one of the two inputs still has runs left
+    while (xrlepos < src_2->n_runs) {
+        run_container_append(dst, src_2->runs[xrlepos], &previousrle);
+        xrlepos++;
+    }
+    while (rlepos < src_1->n_runs) {
+        run_container_append(dst, src_1->runs[rlepos], &previousrle);
+        rlepos++;
+    }
+}
+
+/* Compute the union of `src_1' and `src_2' and write the result to `src_1'.
+ * If `src_2' has no runs, `src_1' is left unchanged; if `src_1' has no
+ * runs, it becomes a copy of `src_2'.
+ */
+void run_container_union_inplace(run_container_t *src_1,
+                                 const run_container_t *src_2) {
+    // TODO: this could be a lot more efficient
+
+    // An empty operand has no first run to seed the merge below (reading
+    // runs[0] would be out of bounds), so handle it up front.
+    if (src_2->n_runs == 0) {
+        return;  // nothing to add
+    }
+    if (src_1->n_runs == 0) {
+        run_container_copy(src_2, src_1);
+        return;
+    }
+
+    // we start out with inexpensive checks
+    if (run_container_is_full(src_1)) {
+        return;
+    }
+    if (run_container_is_full(src_2)) {
+        run_container_copy(src_2, src_1);
+        return;
+    }
+
+    // we move the data to the end of the current array, so that the merged
+    // output can be written from the front without overwriting unread input
+    const int32_t maxoutput = src_1->n_runs + src_2->n_runs;
+    const int32_t neededcapacity = maxoutput + src_1->n_runs;
+    if (src_1->capacity < neededcapacity)
+        run_container_grow(src_1, neededcapacity, true);
+    memmove(src_1->runs + maxoutput, src_1->runs,
+            src_1->n_runs * sizeof(rle16_t));
+    rle16_t *inputsrc1 = src_1->runs + maxoutput;
+    const int32_t input1nruns = src_1->n_runs;
+    src_1->n_runs = 0;
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+
+    // seed the output with whichever run starts first
+    rle16_t previousrle;
+    if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) {
+        previousrle = run_container_append_first(src_1, inputsrc1[rlepos]);
+        rlepos++;
+    } else {
+        previousrle = run_container_append_first(src_1, src_2->runs[xrlepos]);
+        xrlepos++;
+    }
+    // merge the remaining runs in increasing order of start value
+    while ((xrlepos < src_2->n_runs) && (rlepos < input1nruns)) {
+        rle16_t newrl;
+        if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) {
+            newrl = inputsrc1[rlepos];
+            rlepos++;
+        } else {
+            newrl = src_2->runs[xrlepos];
+            xrlepos++;
+        }
+        run_container_append(src_1, newrl, &previousrle);
+    }
+    // at most one of the two inputs still has runs left
+    while (xrlepos < src_2->n_runs) {
+        run_container_append(src_1, src_2->runs[xrlepos], &previousrle);
+        xrlepos++;
+    }
+    while (rlepos < input1nruns) {
+        run_container_append(src_1, inputsrc1[rlepos], &previousrle);
+        rlepos++;
+    }
+}
+
+/* Write the symmetric difference of `src_1' and `src_2' to `dst'.
+ * `dst' is assumed to be distinct from both `src_1' and `src_2'. */
+void run_container_xor(const run_container_t *src_1,
+                       const run_container_t *src_2, run_container_t *dst) {
+    // don't bother to convert xor with full range into negation
+    // since negation is implemented similarly
+
+    const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
+    if (dst->capacity < neededcapacity) {
+        run_container_grow(dst, neededcapacity, false);
+    }
+    dst->n_runs = 0;
+
+    int32_t idx1 = 0;
+    int32_t idx2 = 0;
+    // Feed the runs of both inputs to the exclusive appender in increasing
+    // order of start value; on a tie the run of src_1 goes first.
+    for (;;) {
+        const bool left1 = idx1 < src_1->n_runs;
+        const bool left2 = idx2 < src_2->n_runs;
+        if (!left1 && !left2) break;
+
+        const bool take1 =
+            left1 && (!left2 || (src_1->runs[idx1].value <=
+                                 src_2->runs[idx2].value));
+        const rle16_t next = take1 ? src_1->runs[idx1++] : src_2->runs[idx2++];
+        run_container_smart_append_exclusive(dst, next.value, next.length);
+    }
+}
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst. It is assumed that dst is distinct from both src_1 and src_2.
+ * If either operand has no runs, dst is left empty. */
+void run_container_intersection(const run_container_t *src_1,
+                                const run_container_t *src_2,
+                                run_container_t *dst) {
+    // An empty operand has no runs[0] to read below, and the intersection
+    // with an empty set is empty.
+    if ((src_1->n_runs == 0) || (src_2->n_runs == 0)) {
+        dst->n_runs = 0;
+        return;
+    }
+    // intersecting with a full container yields the other operand
+    if (run_container_is_full(src_1)) {
+        run_container_copy(src_2, dst);
+        return;
+    }
+    if (run_container_is_full(src_2)) {
+        run_container_copy(src_1, dst);
+        return;
+    }
+    // TODO: this could be a lot more efficient, could use SIMD optimizations
+    const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
+    if (dst->capacity < neededcapacity)
+        run_container_grow(dst, neededcapacity, false);
+    dst->n_runs = 0;
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+    // current runs as half-open ranges [start, end) and [xstart, xend)
+    int32_t start = src_1->runs[rlepos].value;
+    int32_t end = start + src_1->runs[rlepos].length + 1;
+    int32_t xstart = src_2->runs[xrlepos].value;
+    int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
+    while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
+        if (end <= xstart) {
+            // the run of src_1 lies entirely before the run of src_2
+            ++rlepos;
+            if (rlepos < src_1->n_runs) {
+                start = src_1->runs[rlepos].value;
+                end = start + src_1->runs[rlepos].length + 1;
+            }
+        } else if (xend <= start) {
+            // the run of src_2 lies entirely before the run of src_1
+            ++xrlepos;
+            if (xrlepos < src_2->n_runs) {
+                xstart = src_2->runs[xrlepos].value;
+                xend = xstart + src_2->runs[xrlepos].length + 1;
+            }
+        } else {  // they overlap
+            const int32_t lateststart = start > xstart ? start : xstart;
+            const int32_t earliestend = end < xend ? end : xend;
+            // Whichever run ends first is exhausted (both when they end
+            // together).  Decide before start/end are overwritten.
+            const bool advance_1 = (end <= xend);
+            const bool advance_2 = (xend <= end);
+            if (advance_1) {
+                rlepos++;
+                if (rlepos < src_1->n_runs) {
+                    start = src_1->runs[rlepos].value;
+                    end = start + src_1->runs[rlepos].length + 1;
+                }
+            }
+            if (advance_2) {
+                xrlepos++;
+                if (xrlepos < src_2->n_runs) {
+                    xstart = src_2->runs[xrlepos].value;
+                    xend = xstart + src_2->runs[xrlepos].length + 1;
+                }
+            }
+            // emit the overlap [lateststart, earliestend)
+            dst->runs[dst->n_runs].value = (uint16_t)lateststart;
+            dst->runs[dst->n_runs].length =
+                (uint16_t)(earliestend - lateststart - 1);
+            dst->n_runs++;
+        }
+    }
+}
+
+/* Compute the size of the intersection of src_1 and src_2.
+ * Returns 0 if either operand has no runs. */
+int run_container_intersection_cardinality(const run_container_t *src_1,
+                                           const run_container_t *src_2) {
+    // An empty operand has no runs[0] to read below, and the intersection
+    // with an empty set is empty.
+    if ((src_1->n_runs == 0) || (src_2->n_runs == 0)) {
+        return 0;
+    }
+    // intersecting with a full container yields the other operand
+    if (run_container_is_full(src_1)) {
+        return run_container_cardinality(src_2);
+    }
+    if (run_container_is_full(src_2)) {
+        return run_container_cardinality(src_1);
+    }
+    int answer = 0;
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+    // current runs as half-open ranges [start, end) and [xstart, xend)
+    int32_t start = src_1->runs[rlepos].value;
+    int32_t end = start + src_1->runs[rlepos].length + 1;
+    int32_t xstart = src_2->runs[xrlepos].value;
+    int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
+    while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
+        if (end <= xstart) {
+            // the run of src_1 lies entirely before the run of src_2
+            ++rlepos;
+            if (rlepos < src_1->n_runs) {
+                start = src_1->runs[rlepos].value;
+                end = start + src_1->runs[rlepos].length + 1;
+            }
+        } else if (xend <= start) {
+            // the run of src_2 lies entirely before the run of src_1
+            ++xrlepos;
+            if (xrlepos < src_2->n_runs) {
+                xstart = src_2->runs[xrlepos].value;
+                xend = xstart + src_2->runs[xrlepos].length + 1;
+            }
+        } else {  // they overlap
+            const int32_t lateststart = start > xstart ? start : xstart;
+            const int32_t earliestend = end < xend ? end : xend;
+            // Whichever run ends first is exhausted (both when they end
+            // together).  Decide before start/end are overwritten.
+            const bool advance_1 = (end <= xend);
+            const bool advance_2 = (xend <= end);
+            if (advance_1) {
+                rlepos++;
+                if (rlepos < src_1->n_runs) {
+                    start = src_1->runs[rlepos].value;
+                    end = start + src_1->runs[rlepos].length + 1;
+                }
+            }
+            if (advance_2) {
+                xrlepos++;
+                if (xrlepos < src_2->n_runs) {
+                    xstart = src_2->runs[xrlepos].value;
+                    xend = xstart + src_2->runs[xrlepos].length + 1;
+                }
+            }
+            // the overlap [lateststart, earliestend) contributes its length
+            answer += earliestend - lateststart;
+        }
+    }
+    return answer;
+}
+
+/* Return true iff src_1 and src_2 have at least one value in common.
+ * Returns false if either operand has no runs. */
+bool run_container_intersect(const run_container_t *src_1,
+                                const run_container_t *src_2) {
+    // An empty operand has no runs[0] to read below, and nothing can
+    // intersect an empty set.
+    if ((src_1->n_runs == 0) || (src_2->n_runs == 0)) {
+        return false;
+    }
+    // a full container intersects anything that is not empty
+    if (run_container_is_full(src_1)) {
+        return !run_container_empty(src_2);
+    }
+    if (run_container_is_full(src_2)) {
+        return !run_container_empty(src_1);
+    }
+    int32_t rlepos = 0;
+    int32_t xrlepos = 0;
+    // current runs as half-open ranges [start, end) and [xstart, xend)
+    int32_t start = src_1->runs[rlepos].value;
+    int32_t end = start + src_1->runs[rlepos].length + 1;
+    int32_t xstart = src_2->runs[xrlepos].value;
+    int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
+    while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
+        if (end <= xstart) {
+            // the run of src_1 lies entirely before the run of src_2
+            ++rlepos;
+            if (rlepos < src_1->n_runs) {
+                start = src_1->runs[rlepos].value;
+                end = start + src_1->runs[rlepos].length + 1;
+            }
+        } else if (xend <= start) {
+            // the run of src_2 lies entirely before the run of src_1
+            ++xrlepos;
+            if (xrlepos < src_2->n_runs) {
+                xstart = src_2->runs[xrlepos].value;
+                xend = xstart + src_2->runs[xrlepos].length + 1;
+            }
+        } else {  // they overlap
+            return true;
+        }
+    }
+    return false;
+}
+
+
+/* Compute the difference of src_1 and src_2 (values of src_1 that are not
+ * in src_2) and write the result to dst. It is assumed that dst is distinct
+ * from both src_1 and src_2.  If src_1 has no runs, dst is left empty; if
+ * src_2 has no runs, dst becomes a copy of src_1. */
+void run_container_andnot(const run_container_t *src_1,
+                          const run_container_t *src_2, run_container_t *dst) {
+    // following Java implementation as of June 2016
+
+    // An empty operand has no runs[0] to read below; handle it up front.
+    if (src_1->n_runs == 0) {
+        dst->n_runs = 0;
+        return;
+    }
+    if (src_2->n_runs == 0) {
+        run_container_copy(src_1, dst);
+        return;
+    }
+
+    if (dst->capacity < src_1->n_runs + src_2->n_runs)
+        run_container_grow(dst, src_1->n_runs + src_2->n_runs, false);
+
+    dst->n_runs = 0;
+
+    int rlepos1 = 0;
+    int rlepos2 = 0;
+    // [start, end) is the not-yet-emitted part of the current run of src_1;
+    // [start2, end2) is the current run of src_2
+    int32_t start = src_1->runs[rlepos1].value;
+    int32_t end = start + src_1->runs[rlepos1].length + 1;
+    int32_t start2 = src_2->runs[rlepos2].value;
+    int32_t end2 = start2 + src_2->runs[rlepos2].length + 1;
+
+    while ((rlepos1 < src_1->n_runs) && (rlepos2 < src_2->n_runs)) {
+        if (end <= start2) {
+            // output the first run
+            dst->runs[dst->n_runs++] =
+                (rle16_t){.value = (uint16_t)start,
+                          .length = (uint16_t)(end - start - 1)};
+            rlepos1++;
+            if (rlepos1 < src_1->n_runs) {
+                start = src_1->runs[rlepos1].value;
+                end = start + src_1->runs[rlepos1].length + 1;
+            }
+        } else if (end2 <= start) {
+            // exit the second run
+            rlepos2++;
+            if (rlepos2 < src_2->n_runs) {
+                start2 = src_2->runs[rlepos2].value;
+                end2 = start2 + src_2->runs[rlepos2].length + 1;
+            }
+        } else {
+            // the runs overlap: keep the part of src_1 before the run of
+            // src_2, if any
+            if (start < start2) {
+                dst->runs[dst->n_runs++] =
+                    (rle16_t){.value = (uint16_t)start,
+                              .length = (uint16_t)(start2 - start - 1)};
+            }
+            if (end2 < end) {
+                // part of the run of src_1 survives past the run of src_2
+                start = end2;
+            } else {
+                // the run of src_1 is fully consumed
+                rlepos1++;
+                if (rlepos1 < src_1->n_runs) {
+                    start = src_1->runs[rlepos1].value;
+                    end = start + src_1->runs[rlepos1].length + 1;
+                }
+            }
+        }
+    }
+    // src_2 is exhausted: emit what is left of the current run of src_1 and
+    // copy its remaining runs unchanged
+    if (rlepos1 < src_1->n_runs) {
+        dst->runs[dst->n_runs++] = (rle16_t){
+            .value = (uint16_t)start, .length = (uint16_t)(end - start - 1)};
+        rlepos1++;
+        if (rlepos1 < src_1->n_runs) {
+            memcpy(dst->runs + dst->n_runs, src_1->runs + rlepos1,
+                   sizeof(rle16_t) * (src_1->n_runs - rlepos1));
+            dst->n_runs += src_1->n_runs - rlepos1;
+        }
+    }
+}
+
+int run_container_to_uint32_array(void *vout, const run_container_t *cont,
+                                  uint32_t base) {
+    // Expands every run into its members (each offset by base), written to
+    // vout in order; returns how many 32-bit values were written.
+    uint32_t *cursor = (uint32_t *)vout;
+    int written = 0;
+    for (int r = 0; r < cont->n_runs; ++r) {
+        const rle16_t run = cont->runs[r];
+        uint32_t member = base + run.value;
+        for (int k = 0; k <= run.length; ++k, ++member, ++written) {
+            // should be compiled as a MOV on x64
+            memcpy(cursor + written, &member, sizeof(uint32_t));
+        }
+    }
+    return written;
+}
+
+/*
+ * Debugging aid: printf each run as "[first,last]", both bounds inclusive.
+ */
+void run_container_printf(const run_container_t *cont) {
+    int idx = 0;
+    while (idx < cont->n_runs) {
+        const rle16_t run = cont->runs[idx++];
+        printf("[%d,%d]", run.value, run.value + run.length);
+    }
+}
+
+/*
+ * Print this container using printf as a comma-separated list of the 32-bit
+ * integers it holds, each offset by base. There is no trailing newline and an
+ * empty container prints nothing at all.
+ */
+void run_container_printf_as_uint32_array(const run_container_t *cont,
+                                          uint32_t base) {
+    // Only the very first value printed goes without a leading comma.
+    bool first = true;
+    for (int32_t r = 0; r < cont->n_runs; ++r) {
+        const uint32_t run_start = base + cont->runs[r].value;
+        const uint16_t le = cont->runs[r].length;
+        // a run holds le + 1 consecutive values, hence the inclusive bound
+        for (uint32_t j = 0; j <= le; ++j) {
+            printf(first ? "%u" : ",%u", run_start + j);
+            first = false;
+        }
+    }
+}
+
+int32_t run_container_serialize(const run_container_t *container, char *buf) {
+    // Layout: n_runs, capacity, then the raw runs. Returns the bytes written.
+    const int32_t runs_bytes = sizeof(rle16_t) * container->n_runs;
+    char *out = buf;
+    memcpy(out, &container->n_runs, sizeof(container->n_runs));
+    out += sizeof(container->n_runs);
+    memcpy(out, &container->capacity, sizeof(container->capacity));
+    out += sizeof(container->capacity);
+    memcpy(out, container->runs, runs_bytes);
+    return (int32_t)(out - buf) + runs_bytes;
+}
+
+int32_t run_container_write(const run_container_t *container, char *buf) {
+    const uint16_t n = (uint16_t)container->n_runs;  // 4-byte field, 2 stored
+    memcpy(buf, &n, sizeof(n));
+    memcpy(buf + sizeof(n), container->runs, n * sizeof(rle16_t));
+    return run_container_size_in_bytes(container);
+}
+
+int32_t run_container_read(int32_t cardinality, run_container_t *container,
+                           const char *buf) {
+    (void)cardinality;
+    // The stream stores a 16-bit count, but n_runs is a 4-byte field: load
+    // the count whole and assign it rather than patching two of its bytes.
+    uint16_t n;
+    memcpy(&n, buf, sizeof(n));
+    container->n_runs = n;
+    if (n > container->capacity) run_container_grow(container, n, false);
+    if (n > 0) memcpy(container->runs, buf + sizeof(n), n * sizeof(rle16_t));
+    return run_container_size_in_bytes(container);
+}
+
+uint32_t run_container_serialization_len(const run_container_t *container) {
+    size_t hdr = sizeof(container->n_runs) + sizeof(container->capacity);
+    return hdr + sizeof(rle16_t) * container->n_runs;  // header + run records
+}
+
+void *run_container_deserialize(const char *buf, size_t buf_len) {
+    // Rebuilds a heap-allocated run container from the layout written by
+    // run_container_serialize (n_runs, capacity, run records). Returns NULL
+    // on a malformed buffer or a failed allocation.
+    run_container_t *ptr;
+    if (buf_len < 8 /* n_runs + capacity */)
+        return (NULL);
+    else
+        buf_len -= 8;
+
+    if ((ptr = (run_container_t *)malloc(sizeof(run_container_t))) != NULL) {
+        size_t len;
+        int32_t off;
+        memcpy(&ptr->n_runs, buf, off = 4);
+        off += 4;  // skip the serialized capacity, it is recomputed below
+        len = sizeof(rle16_t) * ptr->n_runs;
+        if (len != buf_len) {
+            free(ptr);
+            return (NULL);
+        }
+
+        if ((ptr->runs = (rle16_t *)malloc(len)) == NULL) {
+            free(ptr);
+            return (NULL);
+        }
+        // Exactly n_runs slots were allocated, so that is the capacity; a
+        // larger serialized value would let capacity-gated writes overrun runs.
+        ptr->capacity = ptr->n_runs;
+        memcpy(ptr->runs, &buf[off], len);
+
+        /* Check if returned values are monotonically increasing */
+        for (int32_t i = 0, j = 0; i < ptr->n_runs; i++) {
+            if (ptr->runs[i].value < j) {
+                free(ptr->runs);
+                free(ptr);
+                return (NULL);
+            } else
+                j = ptr->runs[i].value;
+        }
+    }
+
+    return (ptr);
+}
+
+bool run_container_iterate(const run_container_t *cont, uint32_t base,
+                           roaring_iterator iterator, void *ptr) {
+    // Calls iterator on each value (offset by base); false once it declines.
+    for (int r = 0; r < cont->n_runs; ++r) {
+        const uint32_t first = base + cont->runs[r].value;
+        const int last_offset = cont->runs[r].length;
+        for (int k = 0; k <= last_offset; ++k)
+            if (!iterator(first + k, ptr)) return false;
+    }
+    return true;
+}
+
+bool run_container_iterate64(const run_container_t *cont, uint32_t base,
+                             roaring_iterator64 iterator, uint64_t high_bits,
+                             void *ptr) {
+    // 64-bit flavour of the iteration: each 32-bit value (offset by base) is
+    // OR-ed with high_bits before the callback sees it.
+    for (int r = 0; r < cont->n_runs; ++r) {
+        const uint32_t first = base + cont->runs[r].value;
+        const int last_offset = cont->runs[r].length;
+        for (int k = 0; k <= last_offset; ++k)
+            if (!iterator(high_bits | (uint64_t)(first + k), ptr)) return false;
+    }
+    return true;
+}
+
+bool run_container_equals(const run_container_t *container1,
+                          const run_container_t *container2) {
+    // Equal means: same number of runs and pairwise identical (value, length).
+    const int32_t n = container1->n_runs;
+    if (n != container2->n_runs) return false;
+    for (int32_t k = 0; k < n; ++k) {
+        const rle16_t a = container1->runs[k];
+        const rle16_t b = container2->runs[k];
+        if (a.value != b.value || a.length != b.length) return false;
+    }
+    return true;
+}
+
+bool run_container_is_subset(const run_container_t *container1,
+                             const run_container_t *container2) {
+    // Two-cursor walk over both run lists. The answer is true exactly when
+    // the walk consumes every run of container1 (so an empty container1 is
+    // always a subset); it is false as soon as a run of container1 starts
+    // before the container2 run currently under the cursor.
+    int i1 = 0, i2 = 0;
+
+    while (i1 < container1->n_runs && i2 < container2->n_runs) {
+        // inclusive bounds [start, stop] of the two runs being compared
+        const int start1 = container1->runs[i1].value;
+        const int stop1 = start1 + container1->runs[i1].length;
+        const int start2 = container2->runs[i2].value;
+        const int stop2 = start2 + container2->runs[i2].length;
+
+        if (start1 < start2) return false;
+
+        // From here on start1 >= start2. Whichever run ends first is done
+        // with; when both end together, both cursors move on.
+        const bool run1_done = stop1 <= stop2;
+        const bool run2_done = stop1 >= stop2;
+        if (run1_done) i1++;
+        if (run2_done) i2++;
+    }
+
+    return i1 == container1->n_runs;
+}
+
+// TODO: write smart_append_exclusive version to match the overloaded 1 param
+// Java version (or  is it even used?)
+
+// Follows the Java implementation closely: XORs run (start, length) into the
+// tail of src. length is the rle-value, i.e. run [10,12) uses length 1.
+void run_container_smart_append_exclusive(run_container_t *src,
+                                          const uint16_t start,
+                                          const uint16_t length) {
+    int old_end;  // exclusive end of the last run, assigned in the test below
+    rle16_t *last_run = src->n_runs ? src->runs + (src->n_runs - 1) : NULL;
+    rle16_t *appended_last_run = src->runs + src->n_runs;  // slot after the last
+    // src empty, or a gap before start: plain append (capacity is not checked)
+    if (!src->n_runs ||
+        (start > (old_end = last_run->value + last_run->length + 1))) {
+        *appended_last_run = (rle16_t){.value = start, .length = length};
+        src->n_runs++;
+        return;
+    }
+    if (old_end == start) {
+        // we merge
+        last_run->length += (length + 1);
+        return;
+    }
+    int new_end = start + length + 1;  // exclusive end of the new run
+    // from here on start < old_end (start >= last_run->value is assumed)
+    if (start == last_run->value) {
+        // wipe out previous
+        if (new_end < old_end) {  // only the tail of the old run survives
+            *last_run = (rle16_t){.value = (uint16_t)new_end,
+                                  .length = (uint16_t)(old_end - new_end - 1)};
+            return;
+        } else if (new_end > old_end) {  // only the new run's overhang survives
+            *last_run = (rle16_t){.value = (uint16_t)old_end,
+                                  .length = (uint16_t)(new_end - old_end - 1)};
+            return;
+        } else {  // identical runs cancel out
+            src->n_runs--;
+            return;
+        }
+    }
+    last_run->length = start - last_run->value - 1;  // keep the part before start
+    if (new_end < old_end) {  // tail of the old run becomes a new last run
+        *appended_last_run =
+            (rle16_t){.value = (uint16_t)new_end,
+                      .length = (uint16_t)(old_end - new_end - 1)};
+        src->n_runs++;
+    } else if (new_end > old_end) {  // overhang of the new run is appended
+        *appended_last_run =
+            (rle16_t){.value = (uint16_t)old_end,
+                      .length = (uint16_t)(new_end - old_end - 1)};
+        src->n_runs++;
+    }
+}
+
+bool run_container_select(const run_container_t *container,
+                          uint32_t *start_rank, uint32_t rank,
+                          uint32_t *element) {
+    // *start_rank is the rank of the first value here; advanced on a miss.
+    for (int r = 0; r < container->n_runs; r++) {
+        const rle16_t run = container->runs[r];
+        if (rank <= *start_rank + run.length) {
+            *element = run.value + rank - (*start_rank);
+            return true;
+        }
+        *start_rank += run.length + 1;  // the whole run precedes rank
+    }
+    return false;
+}
+
+int run_container_rank(const run_container_t *container, uint16_t x) {
+    // Number of stored values that are <= x.
+    const uint32_t target = x;
+    int below = 0;  // values held by the runs already passed
+    for (int r = 0; r < container->n_runs; r++) {
+        const uint32_t first = container->runs[r].value;
+        const uint32_t extra = container->runs[r].length;
+        if (target > first + extra) {
+            below += extra + 1;  // the whole run lies below x
+            continue;
+        }
+        if (target >= first) return below + (target - first) + 1;  // inside
+        break;  // x falls in the gap before this run
+    }
+    return below;
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/containers/run.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/roaring.c */
+#include <assert.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+extern inline bool roaring_bitmap_contains(const roaring_bitmap_t *r,
+                                           uint32_t val);
+
+// This is like roaring_bitmap_add, but it also reports which container was
+// touched: *typecode receives its type and *index its position in the
+// high_low_container, and the container itself is returned. Callers that
+// repeatedly add to the same container can use these to skip the lookup
+// (see roaring_bitmap_add_many).
+void *containerptr_roaring_bitmap_add(roaring_bitmap_t *r,
+                                                    uint32_t val,
+                                                    uint8_t *typecode,
+                                                    int *index) {
+    const uint16_t high = val >> 16;
+    const int pos = ra_get_index(&r->high_low_container, high);
+
+    if (pos < 0) {
+        // no container for these high bits yet: start a fresh array container
+        const int insert_at = -pos - 1;
+        array_container_t *fresh = array_container_create();
+        void *created = container_add(fresh, val & 0xFFFF,
+                                      ARRAY_CONTAINER_TYPE_CODE, typecode);
+        // we could just assume that it stays an array container
+        ra_insert_new_key_value_at(&r->high_low_container, insert_at, high,
+                                   created, *typecode);
+        *index = insert_at;
+        return created;
+    }
+
+    ra_unshare_container_at_index(&r->high_low_container, pos);
+    void *before =
+        ra_get_container_at_index(&r->high_low_container, pos, typecode);
+    uint8_t type_after = *typecode;
+    void *after = container_add(before, val & 0xFFFF, *typecode, &type_after);
+    *index = pos;
+    if (after == before) return before;
+
+    // the add produced a different container: swap it in, drop the old one
+    container_free(before, *typecode);
+    ra_set_container_at_index(&r->high_low_container, pos, after, type_after);
+    *typecode = type_after;
+    return after;
+}
+
+roaring_bitmap_t *roaring_bitmap_create() {
+    // Allocates an empty bitmap with copy-on-write off; NULL when either the
+    // malloc or ra_init fails.
+    roaring_bitmap_t *bitmap =
+        (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
+    if (bitmap == NULL) return NULL;
+
+    if (ra_init(&bitmap->high_low_container)) {
+        bitmap->copy_on_write = false;
+        return bitmap;
+    }
+    free(bitmap);  // initialisation failed: do not leak the struct
+    return NULL;
+}
+
+roaring_bitmap_t *roaring_bitmap_create_with_capacity(uint32_t cap) {
+    // Allocates an empty bitmap (copy-on-write off), handing cap on to
+    // ra_init_with_capacity. NULL on malloc or initialisation failure.
+    roaring_bitmap_t *bitmap =
+        (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
+    if (bitmap == NULL) return NULL;
+
+    if (!ra_init_with_capacity(&bitmap->high_low_container, cap)) {
+        free(bitmap);
+        return NULL;
+    }
+    bitmap->copy_on_write = false;
+    return bitmap;
+}
+
+void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args,
+                             const uint32_t *vals) {
+    // Adds vals[0..n_args) to r. The container touched by one value is kept
+    // at hand, so that a following value sharing its upper 16 bits is added
+    // to it directly, without looking the container up again.
+    if (n_args == 0) return;
+
+    void *container = NULL;  // container touched by the previous value
+    uint8_t typecode = 0;    // ... its typecode
+    int containerindex = 0;  // ... and its index in r->high_low_container
+    uint32_t prev = 0;       // previous value inserted
+
+    for (size_t i = 0; i < n_args; i++) {
+        uint32_t val;
+        memcpy(&val, vals + i, sizeof(val));
+        const bool same_container = (i > 0) && (((prev ^ val) >> 16) == 0);
+        prev = val;
+
+        if (!same_container) {
+            // first value, or different upper 16 bits: locate the container
+            container = containerptr_roaring_bitmap_add(r, val, &typecode,
+                                                        &containerindex);
+            continue;
+        }
+
+        // no need to seek the container, it is at hand: do the insertion
+        // directly, bypassing the roaring_bitmap_add call
+        uint8_t newtypecode = typecode;
+        void *container2 =
+            container_add(container, val & 0xFFFF, typecode, &newtypecode);
+        if (container2 != container) {
+            // rare instance when we need to change the container type
+            container_free(container, typecode);
+            ra_set_container_at_index(&r->high_low_container, containerindex,
+                                      container2, newtypecode);
+            typecode = newtypecode;
+            container = container2;
+        }
+    }
+}
+
+roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals) {
+    roaring_bitmap_t *answer = roaring_bitmap_create();  // NULL on failure
+    if (answer != NULL) roaring_bitmap_add_many(answer, n_args, vals);
+    return answer;
+}
+
+roaring_bitmap_t *roaring_bitmap_of(size_t n_args, ...) {
+    // Builds a bitmap holding the n_args uint32_t values that follow.
+    // todo: could be greatly optimized, but long lists are not expected here
+    roaring_bitmap_t *answer = roaring_bitmap_create();
+    if (answer == NULL) return NULL;  // creation failed: nothing to add to
+    va_list ap;
+    va_start(ap, n_args);
+    for (size_t i = 1; i <= n_args; i++) {
+        roaring_bitmap_add(answer, va_arg(ap, uint32_t));
+    }
+    va_end(ap);
+    return answer;
+}
+
+static inline uint32_t minimum_uint32(uint32_t a, uint32_t b) {
+    return (b <= a) ? b : a;  // the smaller of the two arguments
+}
+
+static inline uint64_t minimum_uint64(uint64_t a, uint64_t b) {
+    return (b <= a) ? b : a;  // the smaller of the two arguments
+}
+
+roaring_bitmap_t *roaring_bitmap_from_range(uint64_t min, uint64_t max,
+                                            uint32_t step) {
+    // Values min, min+step, ... below max (max clamped to 2^32); NULL if none.
+    if (max > UINT64_C(0x100000000)) max = UINT64_C(0x100000000);
+    if (step == 0 || max <= min) return NULL;
+    roaring_bitmap_t *answer = roaring_bitmap_create();
+    if (answer == NULL) return NULL;
+    if (step >= (1 << 16)) {
+        // Count in 64 bits: a 32-bit cursor would wrap past 2^32, drop back
+        // below max and keep inserting values outside the requested range.
+        for (uint64_t value = min; value < max; value += step)
+            roaring_bitmap_add(answer, (uint32_t)value);
+        return answer;
+    }
+    uint64_t min_tmp = min;
+    do {
+        uint32_t key = (uint32_t)min_tmp >> 16;
+        uint32_t container_min = min_tmp & 0xFFFF;
+        uint32_t container_max = (uint32_t)minimum_uint64(max - (key << 16), 1 << 16);
+        uint8_t type;
+        void *container = container_from_range(&type, container_min,
+                                               container_max, (uint16_t)step);
+        ra_append(&answer->high_low_container, key, container, type);
+        uint32_t gap = container_max - container_min + step - 1;
+        min_tmp += gap - (gap % step);
+    } while (min_tmp < max);
+    // cardinality of bitmap will be ((uint64_t) max - min + step - 1 ) / step
+    return answer;
+}
+
+void roaring_bitmap_add_range_closed(roaring_bitmap_t *ra, uint32_t min, uint32_t max) {
+    if (min > max) {  // empty interval: nothing to add
+        return;
+    }
+    // Adds every value of the closed interval [min, max] to ra.
+    uint32_t min_key = min >> 16;  // upper 16 bits of the first value
+    uint32_t max_key = max >> 16;  // upper 16 bits of the last value
+    // presumably: suffix = keys above max_key, prefix = keys below min_key
+    int32_t num_required_containers = max_key - min_key + 1;
+    int32_t suffix_length = count_greater(ra->high_low_container.keys,
+                                          ra->high_low_container.size,
+                                          max_key);
+    int32_t prefix_length = count_less(ra->high_low_container.keys,
+                                       ra->high_low_container.size - suffix_length,
+                                       min_key);
+    int32_t common_length = ra->high_low_container.size - prefix_length - suffix_length;
+    // NOTE(review): ra_shift_tail presumably moves the suffix to open slots
+    if (num_required_containers > common_length) {
+        ra_shift_tail(&ra->high_low_container, suffix_length,
+                      num_required_containers - common_length);
+    }
+    // walk the keys from max_key down; src = old slot read, dst = slot written
+    int32_t src = prefix_length + common_length - 1;
+    int32_t dst = ra->high_low_container.size - suffix_length - 1;
+    for (uint32_t key = max_key; key != min_key-1; key--) { // beware of min_key==0
+        uint32_t container_min = (min_key == key) ? (min & 0xffff) : 0;
+        uint32_t container_max = (max_key == key) ? (max & 0xffff) : 0xffff;
+        void* new_container;
+        uint8_t new_type;
+
+        if (src >= 0 && ra->high_low_container.keys[src] == key) {
+            ra_unshare_container_at_index(&ra->high_low_container, src);
+            new_container = container_add_range(ra->high_low_container.containers[src],
+                                                ra->high_low_container.typecodes[src],
+                                                container_min, container_max, &new_type);
+            if (new_container != ra->high_low_container.containers[src]) {
+                container_free(ra->high_low_container.containers[src],
+                               ra->high_low_container.typecodes[src]);
+            }
+            src--;
+        } else {  // no container for this key yet: build one for the sub-range
+            new_container = container_from_range(&new_type, container_min,
+                                                 container_max+1, 1);
+        }
+        ra_replace_key_and_container_at_index(&ra->high_low_container, dst,
+                                              key, new_container, new_type);
+        dst--;
+    }
+}
+
+void roaring_bitmap_remove_range_closed(roaring_bitmap_t *ra, uint32_t min, uint32_t max) {
+    if (min > max) {  // empty interval: nothing to remove
+        return;
+    }
+    // Removes every value of the closed interval [min, max] from ra.
+    uint32_t min_key = min >> 16;  // upper 16 bits of the first value
+    uint32_t max_key = max >> 16;  // upper 16 bits of the last value
+    // src scans the containers with keys up to max_key; dst is where kept ones go
+    int32_t src = count_less(ra->high_low_container.keys, ra->high_low_container.size, min_key);
+    int32_t dst = src;
+    while (src < ra->high_low_container.size && ra->high_low_container.keys[src] <= max_key) {
+        uint32_t container_min = (min_key == ra->high_low_container.keys[src]) ? (min & 0xffff) : 0;
+        uint32_t container_max = (max_key == ra->high_low_container.keys[src]) ? (max & 0xffff) : 0xffff;
+        ra_unshare_container_at_index(&ra->high_low_container, src);
+        void *new_container;
+        uint8_t new_type;
+        new_container = container_remove_range(ra->high_low_container.containers[src],
+                                               ra->high_low_container.typecodes[src],
+                                               container_min, container_max,
+                                               &new_type);
+        if (new_container != ra->high_low_container.containers[src]) {
+            container_free(ra->high_low_container.containers[src],
+                           ra->high_low_container.typecodes[src]);
+        }
+        if (new_container) {  // a NULL result is not kept (presumably: emptied)
+            ra_replace_key_and_container_at_index(&ra->high_low_container, dst,
+                                                  ra->high_low_container.keys[src],
+                                                  new_container, new_type);
+            dst++;
+        }
+        src++;
+    }
+    if (src > dst) {  // containers were dropped: shift the tail by dst - src (< 0)
+        ra_shift_tail(&ra->high_low_container, ra->high_low_container.size - src, dst - src);
+    }
+}
+
+void roaring_bitmap_add_range(roaring_bitmap_t *ra, uint64_t min, uint64_t max);
+void roaring_bitmap_remove_range(roaring_bitmap_t *ra, uint64_t min, uint64_t max);
+
+void roaring_bitmap_printf(const roaring_bitmap_t *ra) {
+    printf("{");  // output is wrapped in braces, one section per container
+    for (int c = 0; c < ra->high_low_container.size; ++c) {
+        if (c > 0) printf(",");  // separator between containers
+        container_printf_as_uint32_array(
+            ra->high_low_container.containers[c],
+            ra->high_low_container.typecodes[c],
+            ((uint32_t)ra->high_low_container.keys[c]) << 16);  // value base
+    }
+    printf("}");
+}
+
+void roaring_bitmap_printf_describe(const roaring_bitmap_t *ra) {
+    // One "key: container-name (cardinality)" entry per container, plus the
+    // share count for shared containers; entries are separated by ", ".
+    printf("{");
+    for (int c = 0; c < ra->high_low_container.size; ++c) {
+        void *container = ra->high_low_container.containers[c];
+        const uint8_t typecode = ra->high_low_container.typecodes[c];
+        // the separator goes before every entry but the first
+        if (c > 0) printf(", ");
+        printf("%d: %s (%d)", ra->high_low_container.keys[c],
+               get_full_container_name(container, typecode),
+               container_get_cardinality(container, typecode));
+        if (typecode == SHARED_CONTAINER_TYPE_CODE) {
+            printf("(shared count = %" PRIu32 " )",
+                   ((shared_container_t *)container)->counter);
+        }
+    }
+    printf("}");
+}
+
+typedef struct min_max_sum_s {  // running state for min_max_sum_fnc
+    uint32_t min;  // smallest value seen so far
+    uint32_t max;  // largest value seen so far
+    uint64_t sum;  // total of all values seen so far
+} min_max_sum_t;
+
+static bool min_max_sum_fnc(uint32_t value, void *param) {
+    min_max_sum_t *acc = (min_max_sum_t *)param;
+    acc->sum += value;
+    acc->max = (value > acc->max) ? value : acc->max;
+    acc->min = (value < acc->min) ? value : acc->min;
+    return true;  // we always process all data points
+}
+
+/**
+*  (For advanced users.)
+* Collect statistics about the bitmap: the cardinality, the min, max and sum
+* of the stored values, then per container kind the number of containers,
+* of values and of bytes. *stat is zeroed before anything is recorded.
+*/
+void roaring_bitmap_statistics(const roaring_bitmap_t *ra,
+                               roaring_statistics_t *stat) {
+    memset(stat, 0, sizeof(*stat));
+    stat->n_containers = ra->high_low_container.size;
+    stat->cardinality = roaring_bitmap_get_cardinality(ra);
+
+    // one pass over every value for the extrema and the sum
+    min_max_sum_t mms = {
+        .min = UINT32_C(0xFFFFFFFF), .max = UINT32_C(0), .sum = 0};
+    roaring_iterate(ra, &min_max_sum_fnc, &mms);
+    stat->min_value = mms.min;
+    stat->max_value = mms.max;
+    stat->sum_value = mms.sum;
+
+    // one pass over the containers for the per-kind breakdown
+    for (int c = 0; c < ra->high_low_container.size; ++c) {
+        void *container = ra->high_low_container.containers[c];
+        const uint8_t typecode = ra->high_low_container.typecodes[c];
+        const uint8_t truetype = get_container_type(container, typecode);
+        const uint32_t card = container_get_cardinality(container, typecode);
+        const uint32_t sbytes = container_size_in_bytes(container, typecode);
+
+        switch (truetype) {
+            case BITSET_CONTAINER_TYPE_CODE:
+                stat->n_bitset_containers++;
+                stat->n_values_bitset_containers += card;
+                stat->n_bytes_bitset_containers += sbytes;
+                break;
+            case ARRAY_CONTAINER_TYPE_CODE:
+                stat->n_array_containers++;
+                stat->n_values_array_containers += card;
+                stat->n_bytes_array_containers += sbytes;
+                break;
+            case RUN_CONTAINER_TYPE_CODE:
+                stat->n_run_containers++;
+                stat->n_values_run_containers += card;
+                stat->n_bytes_run_containers += sbytes;
+                break;
+            default:
+                assert(false);
+                __builtin_unreachable();
+        }
+    }
+}
+
+roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r) {
+    // Returns a newly allocated copy of r that inherits r's copy_on_write
+    // setting, or NULL when the allocation or ra_copy fails.
+    roaring_bitmap_t *clone =
+        (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
+    if (clone == NULL) return NULL;
+
+    if (!ra_copy(&r->high_low_container, &clone->high_low_container,
+                 r->copy_on_write)) {
+        free(clone);
+        return NULL;
+    }
+    clone->copy_on_write = r->copy_on_write;
+    return clone;
+}
+
+bool roaring_bitmap_overwrite(roaring_bitmap_t *dest,
+                              const roaring_bitmap_t *src) {
+    const bool cow = src->copy_on_write;  // forwarded to ra_overwrite as is
+    return ra_overwrite(&src->high_low_container, &dest->high_low_container, cow);
+}
+
+void roaring_bitmap_free(roaring_bitmap_t *r) {
+    if (r != NULL) ra_clear(&r->high_low_container);  // NULL: nothing to clear
+    free(r);  // free(NULL) is a no-op, so a NULL bitmap is tolerated
+}
+
+void roaring_bitmap_clear(roaring_bitmap_t *r) {
+  ra_reset(&r->high_low_container);  // all the work is delegated to ra_reset
+}
+
+void roaring_bitmap_add(roaring_bitmap_t *r, uint32_t val) {
+    // The upper 16 bits of val select the container, the lower 16 bits are
+    // stored in it; a missing container is created as an array container.
+    const uint16_t hb = val >> 16;
+    const int pos = ra_get_index(&r->high_low_container, hb);
+    uint8_t typecode;
+    if (pos < 0) {
+        array_container_t *fresh = array_container_create();
+        void *created = container_add(fresh, val & 0xFFFF,
+                                      ARRAY_CONTAINER_TYPE_CODE, &typecode);
+        // we could just assume that it stays an array container
+        ra_insert_new_key_value_at(&r->high_low_container, -pos - 1, hb,
+                                   created, typecode);
+        return;
+    }
+
+    ra_unshare_container_at_index(&r->high_low_container, pos);
+    void *before =
+        ra_get_container_at_index(&r->high_low_container, pos, &typecode);
+    uint8_t type_after = typecode;
+    void *after = container_add(before, val & 0xFFFF, typecode, &type_after);
+    if (after == before) return;  // updated in place, nothing to swap
+    container_free(before, typecode);
+    ra_set_container_at_index(&r->high_low_container, pos, after, type_after);
+}
+
+bool roaring_bitmap_add_checked(roaring_bitmap_t *r, uint32_t val) {
+    // Adds val and reports whether it was newly added. "New" is decided by
+    // the container being replaced, by a fresh container being created, or
+    // by the cardinality of the (unchanged) container having moved.
+    const uint16_t hb = val >> 16;
+    const int pos = ra_get_index(&r->high_low_container, hb);
+    uint8_t typecode;
+
+    if (pos < 0) {
+        // no container for this key yet, so the value is reported as new
+        array_container_t *fresh = array_container_create();
+        void *created = container_add(fresh, val & 0xFFFF,
+                                      ARRAY_CONTAINER_TYPE_CODE, &typecode);
+        // we could just assume that it stays an array container
+        ra_insert_new_key_value_at(&r->high_low_container, -pos - 1, hb,
+                                   created, typecode);
+        return true;
+    }
+
+    // the key already has a container: add to it
+    ra_unshare_container_at_index(&r->high_low_container, pos);
+    void *before =
+        ra_get_container_at_index(&r->high_low_container, pos, &typecode);
+    const int card_before = container_get_cardinality(before, typecode);
+
+    uint8_t type_after = typecode;
+    void *after = container_add(before, val & 0xFFFF, typecode, &type_after);
+    if (after != before) {
+        // a replaced container is reported as a successful add
+        container_free(before, typecode);
+        ra_set_container_at_index(&r->high_low_container, pos, after,
+                                  type_after);
+        return true;
+    }
+
+    // same container: the value was new only if the cardinality moved
+    const int card_after = container_get_cardinality(before, type_after);
+    return card_before != card_after;
+}
+
+void roaring_bitmap_remove(roaring_bitmap_t *r, uint32_t val) {
+    // Removes val if present; a container left empty is dropped from r.
+    const uint16_t hb = val >> 16;
+    const int pos = ra_get_index(&r->high_low_container, hb);
+    if (pos < 0) return;  // no container for this key: val is not stored
+    uint8_t typecode;
+    ra_unshare_container_at_index(&r->high_low_container, pos);
+    void *before =
+        ra_get_container_at_index(&r->high_low_container, pos, &typecode);
+    uint8_t type_after = typecode;
+    void *after =
+        container_remove(before, val & 0xFFFF, typecode, &type_after);
+    if (after != before) {
+        container_free(before, typecode);
+        ra_set_container_at_index(&r->high_low_container, pos, after,
+                                  type_after);
+    }
+    if (container_get_cardinality(after, type_after) == 0) {
+        ra_remove_at_index_and_free(&r->high_low_container, pos);
+    } else {
+        ra_set_container_at_index(&r->high_low_container, pos, after,
+                                  type_after);
+    }
+}
+
+bool roaring_bitmap_remove_checked(roaring_bitmap_t *r, uint32_t val) {
+    // Removes val if present and reports whether anything was removed, i.e.
+    // whether the cardinality of the affected container changed. A container
+    // left empty is dropped from r.
+    const uint16_t hb = val >> 16;
+    const int pos = ra_get_index(&r->high_low_container, hb);
+    if (pos < 0) return false;  // no container for this key
+
+    uint8_t typecode;
+    ra_unshare_container_at_index(&r->high_low_container, pos);
+    void *before =
+        ra_get_container_at_index(&r->high_low_container, pos, &typecode);
+    // cardinality before the removal, to tell whether val was present
+    const int card_before = container_get_cardinality(before, typecode);
+
+    uint8_t type_after = typecode;
+    void *after =
+        container_remove(before, val & 0xFFFF, typecode, &type_after);
+    // the removal may hand back a different container: swap it in
+    if (after != before) {
+        container_free(before, typecode);
+        ra_set_container_at_index(&r->high_low_container, pos, after,
+                                  type_after);
+    }
+
+    const int card_after = container_get_cardinality(after, type_after);
+    if (card_after == 0) {
+        // the container is now empty: drop it from the bitmap altogether
+        ra_remove_at_index_and_free(&r->high_low_container, pos);
+    } else {
+        ra_set_container_at_index(&r->high_low_container, pos, after,
+                                  type_after);
+    }
+
+    return card_before != card_after;
+}
+
+void roaring_bitmap_remove_many(roaring_bitmap_t *r, size_t n_args,
+                                const uint32_t *vals) {
+    // Removes vals[0..n_args) from r. The container position found for one
+    // value is reused for the next when both share their upper 16 bits.
+    if (n_args == 0 || r->high_low_container.size == 0) return;
+
+    int32_t pos = -1; // position of the container used in the previous iteration
+    for (size_t i = 0; i < n_args; i++) {
+        const uint16_t key = (uint16_t)(vals[i] >> 16);
+        if (pos < 0 || key != r->high_low_container.keys[pos]) {
+            pos = ra_get_index(&r->high_low_container, key);
+        }
+        if (pos < 0) continue;  // nothing stored under this key
+
+        void *old_container = r->high_low_container.containers[pos];
+        const uint8_t old_typecode = r->high_low_container.typecodes[pos];
+        uint8_t new_typecode;
+        void *new_container = container_remove(old_container, vals[i] & 0xffff,
+                                               old_typecode, &new_typecode);
+        if (new_container != old_container) {
+            container_free(old_container, old_typecode);
+            ra_replace_key_and_container_at_index(&r->high_low_container,
+                                                  pos, key, new_container,
+                                                  new_typecode);
+        }
+        if (!container_nonzero_cardinality(new_container, new_typecode)) {
+            // the container ran empty: release it and forget its position
+            container_free(new_container, new_typecode);
+            ra_remove_at_index(&r->high_low_container, pos);
+            pos = -1;
+        }
+    }
+}
+
+/**
+ * Intersection of x1 and x2, returned as a newly created bitmap.
+ * Both inputs are walked in key order; only keys present in both can
+ * contribute a container to the result.
+ *
+ * (there should be some SIMD optimizations possible here)
+ */
+roaring_bitmap_t *roaring_bitmap_and(const roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+    // the result cannot need more slots than the smaller input has
+    uint32_t neededcap = length2 < length1 ? length2 : length1;
+    roaring_bitmap_t *result = roaring_bitmap_create_with_capacity(neededcap);
+    result->copy_on_write = x1->copy_on_write && x2->copy_on_write;
+
+    int i1 = 0;
+    int i2 = 0;
+    while (i1 < length1 && i2 < length2) {
+        const uint16_t key1 = ra_get_key_at_index(&x1->high_low_container, i1);
+        const uint16_t key2 = ra_get_key_at_index(&x2->high_low_container, i2);
+
+        if (key1 < key2) {
+            // x1 is behind: jump forward to the first key >= key2
+            i1 = ra_advance_until(&x1->high_low_container, key2, i1);
+            continue;
+        }
+        if (key1 > key2) {
+            // x2 is behind: jump forward to the first key >= key1
+            i2 = ra_advance_until(&x2->high_low_container, key1, i2);
+            continue;
+        }
+
+        // same key on both sides: intersect the two containers
+        uint8_t type1, type2;
+        void *left =
+            ra_get_container_at_index(&x1->high_low_container, i1, &type1);
+        void *right =
+            ra_get_container_at_index(&x2->high_low_container, i2, &type2);
+        void *both = container_and(left, type1, right, type2, &result_type);
+        if (!container_nonzero_cardinality(both, result_type)) {
+            // an empty intersection is not stored; free it to avoid a leak
+            container_free(both, result_type);
+        } else {
+            ra_append(&result->high_low_container, key1, both, result_type);
+        }
+        ++i1;
+        ++i2;
+    }
+    return result;
+}
+
+/**
+ * Union of the bitmaps x[0] .. x[number - 1].
+ * With no input a freshly created bitmap is returned, with a single input
+ * a copy of it; otherwise the inputs are folded together with the lazy OR
+ * routines and the result is repaired once at the end.
+ */
+roaring_bitmap_t *roaring_bitmap_or_many(size_t number,
+                                         const roaring_bitmap_t **x) {
+    switch (number) {
+        case 0:
+            return roaring_bitmap_create();
+        case 1:
+            return roaring_bitmap_copy(x[0]);
+        default:
+            break;
+    }
+    roaring_bitmap_t *result =
+        roaring_bitmap_lazy_or(x[0], x[1], LAZY_OR_BITSET_CONVERSION);
+    size_t next = 2;
+    while (next < number) {
+        roaring_bitmap_lazy_or_inplace(result, x[next],
+                                       LAZY_OR_BITSET_CONVERSION);
+        next++;
+    }
+    roaring_bitmap_repair_after_lazy(result);
+    return result;
+}
+
+/**
+ * Symmetric difference (xor) of the bitmaps x[0] .. x[number - 1].
+ * With no input a freshly created bitmap is returned, with a single input
+ * a copy of it; otherwise the inputs are folded together with the lazy XOR
+ * routines and the result is repaired once at the end.
+ */
+roaring_bitmap_t *roaring_bitmap_xor_many(size_t number,
+                                          const roaring_bitmap_t **x) {
+    switch (number) {
+        case 0:
+            return roaring_bitmap_create();
+        case 1:
+            return roaring_bitmap_copy(x[0]);
+        default:
+            break;
+    }
+    roaring_bitmap_t *result = roaring_bitmap_lazy_xor(x[0], x[1]);
+    size_t next = 2;
+    while (next < number) {
+        roaring_bitmap_lazy_xor_inplace(result, x[next]);
+        next++;
+    }
+    roaring_bitmap_repair_after_lazy(result);
+    return result;
+}
+
+/**
+ * In-place intersection: x1 is replaced by (x1 AND x2); x2 is only read.
+ * Calling it with x1 == x2 returns immediately.
+ *
+ * Surviving containers are compacted towards the front of x1's array:
+ * `intersection_size` is both the number of containers kept so far and the
+ * slot the next kept container is written to.  Containers of x1 that are
+ * skipped over, become empty, or are left after x2 runs out are freed, and
+ * the array is finally downsized to `intersection_size` entries.
+ */
+void roaring_bitmap_and_inplace(roaring_bitmap_t *x1,
+                                const roaring_bitmap_t *x2) {
+    if (x1 == x2) return;
+    int pos1 = 0, pos2 = 0, intersection_size = 0;
+    const int length1 = ra_get_size(&x1->high_low_container);
+    const int length2 = ra_get_size(&x2->high_low_container);
+
+    // any skipped-over or newly emptied containers in x1
+    // have to be freed.
+    while (pos1 < length1 && pos2 < length2) {
+        const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+        const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        if (s1 == s2) {
+            uint8_t typecode1, typecode2, typecode_result;
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &typecode1);
+            // x1's container is about to be modified: obtain a writable one
+            c1 = get_writable_copy_if_shared(c1, &typecode1);
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &typecode2);
+            void *c =
+                container_iand(c1, typecode1, c2, typecode2, &typecode_result);
+            if (c != c1) {  // in this instance a new container was created, and
+                            // we need to free the old one
+                container_free(c1, typecode1);
+            }
+            if (container_nonzero_cardinality(c, typecode_result)) {
+                // keep it: write it into the next free slot at the front
+                ra_replace_key_and_container_at_index(&x1->high_low_container,
+                                                      intersection_size, s1, c,
+                                                      typecode_result);
+                intersection_size++;
+            } else {
+                container_free(c, typecode_result);
+            }
+            ++pos1;
+            ++pos2;
+        } else if (s1 < s2) {
+            // the x1 containers stepped over here have no match in x2
+            pos1 = ra_advance_until_freeing(&x1->high_low_container, s2, pos1);
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+        }
+    }
+
+    // if we ended early because x2 ran out, then all remaining in x1 should be
+    // freed
+    while (pos1 < length1) {
+        container_free(x1->high_low_container.containers[pos1],
+                       x1->high_low_container.typecodes[pos1]);
+        ++pos1;
+    }
+
+    // all containers after this have either been copied or freed
+    ra_downsize(&x1->high_low_container, intersection_size);
+}
+
+/**
+ * Union of x1 and x2, returned as a new bitmap.
+ *
+ * When one input has no containers the result is a copy of the other.
+ * Otherwise both key sequences are merged in order: containers stored under
+ * a common key are combined with container_or(), and a container whose key
+ * is found on one side only is obtained through get_copy_of_container().
+ * Whatever is left of the longer input is appended at the end.
+ */
+roaring_bitmap_t *roaring_bitmap_or(const roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+    if (length1 == 0) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (length2 == 0) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *result =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    result->copy_on_write = x1->copy_on_write && x2->copy_on_write;
+
+    int i1 = 0, i2 = 0;
+    uint8_t type1, type2;
+    uint16_t key1 = ra_get_key_at_index(&x1->high_low_container, i1);
+    uint16_t key2 = ra_get_key_at_index(&x2->high_low_container, i2);
+    for (;;) {
+        if (key1 < key2) {
+            // key found in x1 only
+            void *left = ra_get_container_at_index(&x1->high_low_container,
+                                                   i1, &type1);
+            left = get_copy_of_container(left, &type1, x1->copy_on_write);
+            if (x1->copy_on_write) {
+                // store what get_copy_of_container() returned back into x1
+                ra_set_container_at_index(&x1->high_low_container, i1, left,
+                                          type1);
+            }
+            ra_append(&result->high_low_container, key1, left, type1);
+            i1++;
+            if (i1 == length1) break;
+            key1 = ra_get_key_at_index(&x1->high_low_container, i1);
+
+        } else if (key2 < key1) {
+            // key found in x2 only
+            void *right = ra_get_container_at_index(&x2->high_low_container,
+                                                    i2, &type2);
+            right = get_copy_of_container(right, &type2, x2->copy_on_write);
+            if (x2->copy_on_write) {
+                // store what get_copy_of_container() returned back into x2
+                ra_set_container_at_index(&x2->high_low_container, i2, right,
+                                          type2);
+            }
+            ra_append(&result->high_low_container, key2, right, type2);
+            i2++;
+            if (i2 == length2) break;
+            key2 = ra_get_key_at_index(&x2->high_low_container, i2);
+
+        } else {
+            // same key on both sides
+            void *left = ra_get_container_at_index(&x1->high_low_container,
+                                                   i1, &type1);
+            void *right = ra_get_container_at_index(&x2->high_low_container,
+                                                    i2, &type2);
+            void *merged =
+                container_or(left, type1, right, type2, &result_type);
+            // the inputs are assumed to be non-empty, so their union is
+            // non-empty as well and is always stored
+            ra_append(&result->high_low_container, key1, merged, result_type);
+            ++i1;
+            ++i2;
+            if (i1 == length1 || i2 == length2) break;
+            key1 = ra_get_key_at_index(&x1->high_low_container, i1);
+            key2 = ra_get_key_at_index(&x2->high_low_container, i2);
+        }
+    }
+    // one side is exhausted: append the tail of the other one
+    if (i1 == length1) {
+        ra_append_copy_range(&result->high_low_container,
+                             &x2->high_low_container, i2, length2,
+                             x2->copy_on_write);
+    } else if (i2 == length2) {
+        ra_append_copy_range(&result->high_low_container,
+                             &x1->high_low_container, i1, length1,
+                             x1->copy_on_write);
+    }
+    return result;
+}
+
+/**
+ * In-place union: x1 is replaced by (x1 OR x2).
+ *
+ * Nothing is done when x2 has no containers; when x1 has none it is
+ * overwritten with x2.  Otherwise both key sequences are merged in order.
+ * `length1` is not const because containers taken from x2 are inserted
+ * into x1's array while it is being traversed.
+ */
+void roaring_bitmap_or_inplace(roaring_bitmap_t *x1,
+                               const roaring_bitmap_t *x2) {
+    uint8_t container_result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+    int pos1 = 0, pos2 = 0;
+    uint8_t container_type_1, container_type_2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {
+        if (s1 == s2) {
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            // a full container cannot gain anything from a union: skip it
+            if (!container_is_full(c1, container_type_1)) {
+                c1 = get_writable_copy_if_shared(c1, &container_type_1);
+
+                void *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                     pos2, &container_type_2);
+                void *c =
+                    container_ior(c1, container_type_1, c2, container_type_2,
+                                  &container_result_type);
+                if (c !=
+                    c1) {  // in this instance a new container was created, and
+                           // we need to free the old one
+                    container_free(c1, container_type_1);
+                }
+
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          container_result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            // key only in x1: its container stays as it is
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            // key only in x2: insert x2's container into x1 before pos1
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            c2 =
+                get_copy_of_container(c2, &container_type_2, x2->copy_on_write);
+            if (x2->copy_on_write) {
+                // store what get_copy_of_container() returned back into x2
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          container_type_2);
+            }
+
+            // void *c2_clone = container_clone(c2, container_type_2);
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       container_type_2);
+            // x1 grew by one entry; step past the entry just inserted
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    // x1 is exhausted: whatever remains in x2 is appended to x1
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, x2->copy_on_write);
+    }
+}
+
+/**
+ * Symmetric difference of x1 and x2, returned as a new bitmap.
+ *
+ * When one input has no containers the result is a copy of the other.
+ * Otherwise both key sequences are merged in order: containers stored under
+ * a common key are combined with container_xor() and kept only if the
+ * outcome is non-empty, while a container whose key is found on one side
+ * only is obtained through get_copy_of_container().  Whatever is left of
+ * the longer input is appended at the end.
+ */
+roaring_bitmap_t *roaring_bitmap_xor(const roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2) {
+    uint8_t result_type = 0;
+    const int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+    if (length1 == 0) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (length2 == 0) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *result =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    result->copy_on_write = x1->copy_on_write && x2->copy_on_write;
+
+    int i1 = 0, i2 = 0;
+    uint8_t type1, type2;
+    uint16_t key1 = ra_get_key_at_index(&x1->high_low_container, i1);
+    uint16_t key2 = ra_get_key_at_index(&x2->high_low_container, i2);
+    for (;;) {
+        if (key1 < key2) {
+            // key found in x1 only
+            void *left = ra_get_container_at_index(&x1->high_low_container,
+                                                   i1, &type1);
+            left = get_copy_of_container(left, &type1, x1->copy_on_write);
+            if (x1->copy_on_write) {
+                // store what get_copy_of_container() returned back into x1
+                ra_set_container_at_index(&x1->high_low_container, i1, left,
+                                          type1);
+            }
+            ra_append(&result->high_low_container, key1, left, type1);
+            i1++;
+            if (i1 == length1) break;
+            key1 = ra_get_key_at_index(&x1->high_low_container, i1);
+
+        } else if (key2 < key1) {
+            // key found in x2 only
+            void *right = ra_get_container_at_index(&x2->high_low_container,
+                                                    i2, &type2);
+            right = get_copy_of_container(right, &type2, x2->copy_on_write);
+            if (x2->copy_on_write) {
+                // store what get_copy_of_container() returned back into x2
+                ra_set_container_at_index(&x2->high_low_container, i2, right,
+                                          type2);
+            }
+            ra_append(&result->high_low_container, key2, right, type2);
+            i2++;
+            if (i2 == length2) break;
+            key2 = ra_get_key_at_index(&x2->high_low_container, i2);
+
+        } else {
+            // same key on both sides
+            void *left = ra_get_container_at_index(&x1->high_low_container,
+                                                   i1, &type1);
+            void *right = ra_get_container_at_index(&x2->high_low_container,
+                                                    i2, &type2);
+            void *mixed =
+                container_xor(left, type1, right, type2, &result_type);
+
+            if (!container_nonzero_cardinality(mixed, result_type)) {
+                // identical containers cancel out: nothing to store
+                container_free(mixed, result_type);
+            } else {
+                ra_append(&result->high_low_container, key1, mixed,
+                          result_type);
+            }
+            ++i1;
+            ++i2;
+            if (i1 == length1 || i2 == length2) break;
+            key1 = ra_get_key_at_index(&x1->high_low_container, i1);
+            key2 = ra_get_key_at_index(&x2->high_low_container, i2);
+        }
+    }
+    // one side is exhausted: append the tail of the other one
+    if (i1 == length1) {
+        ra_append_copy_range(&result->high_low_container,
+                             &x2->high_low_container, i2, length2,
+                             x2->copy_on_write);
+    } else if (i2 == length2) {
+        ra_append_copy_range(&result->high_low_container,
+                             &x1->high_low_container, i1, length1,
+                             x1->copy_on_write);
+    }
+    return result;
+}
+
+/**
+ * In-place symmetric difference: x1 is replaced by (x1 XOR x2).
+ * The two arguments must be distinct bitmaps (checked with assert only).
+ *
+ * Nothing is done when x2 has no containers; when x1 has none it is
+ * overwritten with x2.  `length1` is not const because x1's array both
+ * grows (containers inserted from x2) and shrinks (containers that cancel
+ * out) while it is being traversed.
+ */
+
+void roaring_bitmap_xor_inplace(roaring_bitmap_t *x1,
+                                const roaring_bitmap_t *x2) {
+    assert(x1 != x2);
+    uint8_t container_result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+
+    // XOR can have new containers inserted from x2, but can also
+    // lose containers when x1 and x2 are nonempty and identical.
+
+    int pos1 = 0, pos2 = 0;
+    uint8_t container_type_1, container_type_2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {
+        if (s1 == s2) {
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            // x1's container is about to be modified: obtain a writable one
+            c1 = get_writable_copy_if_shared(c1, &container_type_1);
+
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            // NOTE(review): unlike the in-place AND/OR above, c1 is not freed
+            // here when c != c1; presumably container_ixor() disposes of it
+            // itself -- confirm against its definition.
+            void *c = container_ixor(c1, container_type_1, c2, container_type_2,
+                                     &container_result_type);
+
+            if (container_nonzero_cardinality(c, container_result_type)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          container_result_type);
+                ++pos1;
+            } else {
+                // the containers cancelled out: drop the entry from x1.
+                // pos1 is not advanced because it now designates the entry
+                // that followed the removed one.
+                container_free(c, container_result_type);
+                ra_remove_at_index(&x1->high_low_container, pos1);
+                --length1;
+            }
+
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            // key only in x1: its container stays as it is
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            // key only in x2: insert x2's container into x1 before pos1
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            c2 =
+                get_copy_of_container(c2, &container_type_2, x2->copy_on_write);
+            if (x2->copy_on_write) {
+                // store what get_copy_of_container() returned back into x2
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          container_type_2);
+            }
+
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       container_type_2);
+            // x1 grew by one entry; step past the entry just inserted
+            pos1++;
+            length1++;
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    // x1 is exhausted: whatever remains in x2 is appended to x1
+    if (pos1 == length1) {
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, x2->copy_on_write);
+    }
+}
+
+/**
+ * Difference of x1 and x2 (x1 AND NOT x2), returned as a new bitmap.
+ *
+ * An x1 without containers yields a newly created bitmap and an x2 without
+ * containers yields a copy of x1.  Otherwise both key sequences are walked
+ * in order: containers under a common key are combined with
+ * container_andnot() and kept only if the outcome is non-empty, runs of x1
+ * keys that x2 lacks are copied over as a range, and x2 keys that x1 lacks
+ * are skipped.
+ */
+roaring_bitmap_t *roaring_bitmap_andnot(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2) {
+    uint8_t container_result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        roaring_bitmap_t *empty_bitmap = roaring_bitmap_create();
+        empty_bitmap->copy_on_write = x1->copy_on_write && x2->copy_on_write;
+        return empty_bitmap;
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(length1);
+    answer->copy_on_write = x1->copy_on_write && x2->copy_on_write;
+
+    int pos1 = 0, pos2 = 0;
+    uint8_t container_type_1, container_type_2;
+    uint16_t s1 = 0;
+    uint16_t s2 = 0;
+    while (true) {
+        // every branch below breaks as soon as pos1 or pos2 reaches the end,
+        // so both positions are valid whenever the keys are re-read here
+        s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+        s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        if (s1 == s2) {
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            void *c =
+                container_andnot(c1, container_type_1, c2, container_type_2,
+                                 &container_result_type);
+
+            if (container_nonzero_cardinality(c, container_result_type)) {
+                ra_append(&answer->high_low_container, s1, c,
+                          container_result_type);
+            } else {
+                container_free(c, container_result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+        } else if (s1 < s2) {  // s1 < s2
+            // x1 entries from pos1 up to (not including) next_pos1 have no
+            // counterpart in x2 and go to the answer unchanged
+            const int next_pos1 =
+                ra_advance_until(&x1->high_low_container, s2, pos1);
+            ra_append_copy_range(&answer->high_low_container,
+                                 &x1->high_low_container, pos1, next_pos1,
+                                 x1->copy_on_write);
+            // TODO : perhaps some of the copy_on_write should be based on
+            // answer rather than x1 (more stringent?).  Many similar cases
+            pos1 = next_pos1;
+            if (pos1 == length1) break;
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+            if (pos2 == length2) break;
+        }
+    }
+    // x2 is exhausted: the rest of x1 belongs to the answer unchanged
+    if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             x1->copy_on_write);
+    }
+    return answer;
+}
+
+/**
+ * In-place difference: x1 is replaced by (x1 AND NOT x2).
+ * The two arguments must be distinct bitmaps (checked with assert only).
+ *
+ * Surviving containers are compacted towards the front of x1's array:
+ * `intersection_size` counts the containers kept so far and is the slot
+ * the next kept container is written to.  The array is downsized to that
+ * count at the end.
+ */
+
+void roaring_bitmap_andnot_inplace(roaring_bitmap_t *x1,
+                                   const roaring_bitmap_t *x2) {
+    assert(x1 != x2);
+
+    uint8_t container_result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+    int intersection_size = 0;
+
+    // nothing to subtract
+    if (0 == length2) return;
+
+    if (0 == length1) {
+        roaring_bitmap_clear(x1);
+        return;
+    }
+
+    int pos1 = 0, pos2 = 0;
+    uint8_t container_type_1, container_type_2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {
+        if (s1 == s2) {
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            // x1's container is about to be modified: obtain a writable one
+            c1 = get_writable_copy_if_shared(c1, &container_type_1);
+
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            void *c =
+                container_iandnot(c1, container_type_1, c2, container_type_2,
+                                  &container_result_type);
+
+            if (container_nonzero_cardinality(c, container_result_type)) {
+                // keep it: write it into the next free slot at the front
+                ra_replace_key_and_container_at_index(&x1->high_low_container,
+                                                      intersection_size++, s1,
+                                                      c, container_result_type);
+            } else {
+                container_free(c, container_result_type);
+            }
+
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            // key only in x1: the container is kept untouched; it only has
+            // to move when earlier entries were dropped
+            if (pos1 != intersection_size) {
+                void *c1 = ra_get_container_at_index(&x1->high_low_container,
+                                                     pos1, &container_type_1);
+
+                ra_replace_key_and_container_at_index(&x1->high_low_container,
+                                                      intersection_size, s1, c1,
+                                                      container_type_1);
+            }
+            intersection_size++;
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+
+    if (pos1 < length1) {
+        // all containers between intersection_size and
+        // pos1 are junk.  However, they have either been moved
+        // (thus still referenced) or involved in an iandnot
+        // that will clean up all containers that could not be reused.
+        // Thus we should not free the junk containers between
+        // intersection_size and pos1.
+        if (pos1 > intersection_size) {
+            // left slide of remaining items
+            ra_copy_range(&x1->high_low_container, pos1, length1,
+                          intersection_size);
+        }
+        // else current placement is fine
+        intersection_size += (length1 - pos1);
+    }
+    ra_downsize(&x1->high_low_container, intersection_size);
+}
+
+/**
+ * Number of values stored in the bitmap, obtained by adding up the
+ * cardinality of every container.
+ */
+uint64_t roaring_bitmap_get_cardinality(const roaring_bitmap_t *ra) {
+    uint64_t total = 0;
+    int idx = 0;
+    while (idx < ra->high_low_container.size) {
+        total += container_get_cardinality(
+            ra->high_low_container.containers[idx],
+            ra->high_low_container.typecodes[idx]);
+        ++idx;
+    }
+    return total;
+}
+
+/**
+ * Number of values of the bitmap lying in [range_start, range_end).
+ * range_end is clamped to UINT32_MAX + 1; an empty or inverted range
+ * yields 0.
+ */
+uint64_t roaring_bitmap_range_cardinality(const roaring_bitmap_t *ra,
+                                          uint64_t range_start,
+                                          uint64_t range_end) {
+    if (range_end > UINT32_MAX) {
+        range_end = UINT32_MAX + UINT64_C(1);
+    }
+    if (range_start >= range_end) {
+        return 0;
+    }
+    // switch to an inclusive upper bound, so that from here on
+    // 0 <= range_start <= range_end <= UINT32_MAX
+    range_end -= 1;
+
+    const int first_key = (int)(range_start >> 16);
+    const int last_key = (int)(range_end >> 16);
+
+    uint64_t count = 0;
+
+    int idx = ra_get_index(&ra->high_low_container, first_key);
+    if (idx < 0) {
+        // no container for first_key: decode the index the scan starts at
+        idx = -idx - 1;
+    } else {
+        // values of the first container up to the end of the range ...
+        if (first_key != last_key) {
+            count += container_get_cardinality(
+                ra->high_low_container.containers[idx],
+                ra->high_low_container.typecodes[idx]);
+        } else {
+            count += container_rank(ra->high_low_container.containers[idx],
+                                    ra->high_low_container.typecodes[idx],
+                                    range_end & 0xffff);
+        }
+        // ... minus those lying before range_start
+        if ((range_start & 0xffff) != 0) {
+            count -= container_rank(ra->high_low_container.containers[idx],
+                                    ra->high_low_container.typecodes[idx],
+                                    (range_start & 0xffff) - 1);
+        }
+        idx++;
+    }
+
+    while (idx < ra->high_low_container.size) {
+        const uint16_t key = ra->high_low_container.keys[idx];
+        if (key > last_key) {
+            break;
+        }
+        if (key == last_key) {
+            // last container of the range: count only up to range_end
+            count += container_rank(ra->high_low_container.containers[idx],
+                                    ra->high_low_container.typecodes[idx],
+                                    range_end & 0xffff);
+            break;
+        }
+        // container strictly inside the range: count all of it
+        count += container_get_cardinality(
+            ra->high_low_container.containers[idx],
+            ra->high_low_container.typecodes[idx]);
+        idx++;
+    }
+
+    return count;
+}
+
+
+/** Returns true when the bitmap holds no container at all. */
+bool roaring_bitmap_is_empty(const roaring_bitmap_t *ra) {
+    return ra->high_low_container.size == 0;
+}
+
+/**
+ * Write the content of the bitmap into `ans` by delegating to
+ * ra_to_uint32_array().  No capacity is passed along, so the caller
+ * presumably has to size `ans` from roaring_bitmap_get_cardinality()
+ * -- TODO confirm.
+ */
+void roaring_bitmap_to_uint32_array(const roaring_bitmap_t *ra, uint32_t *ans) {
+    ra_to_uint32_array(&ra->high_low_container, ans);
+}
+
+/**
+ * Paginated variant of roaring_bitmap_to_uint32_array(): forwards
+ * `offset`, `limit` and `ans` to ra_range_uint32_array() and returns its
+ * result.  Presumably up to `limit` values are written, starting at the
+ * offset-th value, and false signals a failure -- TODO confirm against
+ * ra_range_uint32_array().
+ */
+bool roaring_bitmap_range_uint32_array(const roaring_bitmap_t *ra, size_t offset, size_t limit,  uint32_t *ans) {
+    return ra_range_uint32_array(&ra->high_low_container, offset, limit, ans);
+}
+
+/**
+ * Convert array and bitmap containers to run containers when that is more
+ * efficient, and convert run containers back when that saves space.
+ * Returns true if the bitmap holds at least one run container afterwards.
+ */
+bool roaring_bitmap_run_optimize(roaring_bitmap_t *r) {
+    bool has_run = false;
+    int idx = 0;
+    while (idx < r->high_low_container.size) {
+        uint8_t type_before, type_after;
+        // TODO: this introduces extra cloning!
+        ra_unshare_container_at_index(&r->high_low_container, idx);
+        void *before = ra_get_container_at_index(&r->high_low_container, idx,
+                                                 &type_before);
+        void *after = convert_run_optimize(before, type_before, &type_after);
+        if (type_after == RUN_CONTAINER_TYPE_CODE) {
+            has_run = true;
+        }
+        ra_set_container_at_index(&r->high_low_container, idx, after,
+                                  type_after);
+        idx++;
+    }
+    return has_run;
+}
+
+/**
+ * Shrink every container, then the container array itself.
+ * Returns the sum of what container_shrink_to_fit() and ra_shrink_to_fit()
+ * report (presumably the number of bytes released -- TODO confirm).
+ */
+size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r) {
+    size_t total = 0;
+    int idx = 0;
+    while (idx < r->high_low_container.size) {
+        uint8_t typecode;
+        void *container =
+            ra_get_container_at_index(&r->high_low_container, idx, &typecode);
+        total += container_shrink_to_fit(container, typecode);
+        idx++;
+    }
+    return total + ra_shrink_to_fit(&r->high_low_container);
+}
+
+/**
+ *  Remove run-length encoding even when it is more space efficient
+ *  return whether a change was applied
+ *
+ *  Every container whose underlying type is a run container is replaced
+ *  by the bitset or array container produced by
+ *  convert_to_bitset_or_array_container().  Other containers are left
+ *  untouched.
+ */
+bool roaring_bitmap_remove_run_compression(roaring_bitmap_t *r) {
+    bool answer = false;
+    for (int i = 0; i < r->high_low_container.size; i++) {
+        uint8_t typecode_original, typecode_after;
+        void *c = ra_get_container_at_index(&r->high_low_container, i,
+                                            &typecode_original);
+        if (get_container_type(c, typecode_original) ==
+            RUN_CONTAINER_TYPE_CODE) {
+            answer = true;
+            if (typecode_original == SHARED_CONTAINER_TYPE_CODE) {
+                // the run container sits inside a shared wrapper: convert the
+                // wrapped container, then give up this bitmap's reference to
+                // the wrapper
+                run_container_t *truec =
+                    (run_container_t *)((shared_container_t *)c)->container;
+                int32_t card = run_container_cardinality(truec);
+                void *c1 = convert_to_bitset_or_array_container(
+                    truec, card, &typecode_after);
+                shared_container_free((shared_container_t *)c);
+                ra_set_container_at_index(&r->high_low_container, i, c1,
+                                          typecode_after);
+
+            } else {
+                // NOTE(review): the original run container `c` is not freed
+                // in this branch.  Confirm whether
+                // convert_to_bitset_or_array_container() disposes of its
+                // input; if it does not, `c` leaks here.
+                int32_t card = run_container_cardinality((run_container_t *)c);
+                void *c1 = convert_to_bitset_or_array_container(
+                    (run_container_t *)c, card, &typecode_after);
+                ra_set_container_at_index(&r->high_low_container, i, c1,
+                                          typecode_after);
+            }
+        }
+    }
+    return answer;
+}
+
+/**
+ * Serialize the bitmap into buf using whichever of two encodings is smaller
+ * and return the number of bytes written.
+ *
+ * The first byte is a tag:
+ *  - SERIALIZATION_CONTAINER: the portable format follows (chosen when it is
+ *    strictly smaller than the array form);
+ *  - SERIALIZATION_ARRAY_UINT32: a 32-bit element count followed by the
+ *    elements as uint32_t values.
+ *
+ * buf must be large enough; roaring_bitmap_size_in_bytes() computes the
+ * same size with the same rule.
+ */
+size_t roaring_bitmap_serialize(const roaring_bitmap_t *ra, char *buf) {
+    size_t portablesize = roaring_bitmap_portable_size_in_bytes(ra);
+    uint64_t cardinality = roaring_bitmap_get_cardinality(ra);
+    uint64_t sizeasarray = cardinality * sizeof(uint32_t) + sizeof(uint32_t);
+    if (portablesize < sizeasarray) {
+        buf[0] = SERIALIZATION_CONTAINER;
+        return roaring_bitmap_portable_serialize(ra, buf + 1) + 1;
+    } else {
+        // Narrow explicitly before copying: copying the first four bytes of
+        // the 64-bit counter would store its high (zero) half on big-endian
+        // hosts. This branch is only taken when the array form is no larger
+        // than the portable form, so the count is expected to fit in 32 bits.
+        const uint32_t card32 = (uint32_t)cardinality;
+        buf[0] = SERIALIZATION_ARRAY_UINT32;
+        memcpy(buf + 1, &card32, sizeof(card32));
+        // NOTE(review): the destination is offset by 5 bytes from buf and is
+        // therefore not guaranteed to be 4-byte aligned - confirm that
+        // roaring_bitmap_to_uint32_array tolerates that on all targets.
+        roaring_bitmap_to_uint32_array(
+            ra, (uint32_t *)(buf + 1 + sizeof(uint32_t)));
+        return 1 + (size_t)sizeasarray;
+    }
+}
+
+/**
+ * Number of bytes roaring_bitmap_serialize() writes for this bitmap: one tag
+ * byte plus the smaller of the portable encoding and the
+ * "count + uint32 array" encoding (the array form wins ties).
+ */
+size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *ra) {
+    const size_t portable_bytes = roaring_bitmap_portable_size_in_bytes(ra);
+    const uint64_t array_bytes =
+        roaring_bitmap_get_cardinality(ra) * sizeof(uint32_t) +
+        sizeof(uint32_t);
+    if (portable_bytes < array_bytes) {
+        return portable_bytes + 1;
+    }
+    return (size_t)array_bytes + 1;
+}
+
+/**
+ * Size in bytes of the portable serialization of the bitmap, as computed by
+ * ra_portable_size_in_bytes() on its container array.
+ */
+size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *ra) {
+    const roaring_array_t *containers = &ra->high_low_container;
+    return ra_portable_size_in_bytes(containers);
+}
+
+
+/**
+ * Build a bitmap from a portable serialization, passing maxbytes through to
+ * ra_portable_deserialize() as the input size limit.
+ *
+ * Returns a newly allocated bitmap with copy-on-write disabled, or NULL if
+ * the allocation fails or ra_portable_deserialize() reports failure.
+ */
+roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, size_t maxbytes) {
+    roaring_bitmap_t *bitmap =
+        (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
+    if (bitmap == NULL) {
+        return NULL;
+    }
+    size_t consumed;
+    if (!ra_portable_deserialize(&bitmap->high_low_container, buf, maxbytes,
+                                 &consumed)) {
+        free(bitmap);
+        return NULL;
+    }
+    // on success the reader must not have gone past the caller's limit
+    assert(consumed <= maxbytes);
+    bitmap->copy_on_write = false;
+    return bitmap;
+}
+
+/**
+ * Unbounded variant of roaring_bitmap_portable_deserialize_safe(): the size
+ * limit is SIZE_MAX, so the caller must trust that buf holds a complete
+ * serialization.
+ */
+roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf) {
+    const size_t no_limit = SIZE_MAX;
+    return roaring_bitmap_portable_deserialize_safe(buf, no_limit);
+}
+
+
+/**
+ * Forwards to ra_portable_deserialize_size() with the same buffer and byte
+ * limit and returns its result unchanged.
+ */
+size_t roaring_bitmap_portable_deserialize_size(const char *buf, size_t maxbytes) {
+    const size_t result = ra_portable_deserialize_size(buf, maxbytes);
+    return result;
+}
+
+
+/**
+ * Write the portable serialization of the bitmap into buf by delegating to
+ * ra_portable_serialize() on its container array; returns that function's
+ * result.
+ */
+size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *ra,
+                                         char *buf) {
+    const roaring_array_t *containers = &ra->high_low_container;
+    return ra_portable_serialize(containers, buf);
+}
+
+/**
+ * Rebuild a bitmap from a buffer produced by roaring_bitmap_serialize().
+ *
+ * The first byte selects the encoding:
+ *  - SERIALIZATION_ARRAY_UINT32: a 32-bit count followed by that many
+ *    uint32_t values;
+ *  - SERIALIZATION_CONTAINER: the portable format.
+ * Any other tag yields NULL. NULL is also returned when the bitmap cannot be
+ * allocated.
+ *
+ * The buffer length is not known here, so the content must be trusted.
+ */
+roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf) {
+    const char *bufaschar = (const char *)buf;
+    if (*(const unsigned char *)buf == SERIALIZATION_ARRAY_UINT32) {
+        /* This looks like a compressed set of uint32_t elements */
+        uint32_t card;
+        memcpy(&card, bufaschar + 1, sizeof(uint32_t));
+        const char *elems = bufaschar + 1 + sizeof(uint32_t);
+
+        roaring_bitmap_t *answer = roaring_bitmap_create();
+        if (answer == NULL) {
+            return NULL;
+        }
+        // The payload starts 5 bytes into the buffer, so it is generally not
+        // aligned for uint32_t. Read each element with memcpy rather than
+        // dereferencing a misaligned uint32_t pointer.
+        for (uint32_t i = 0; i < card; i++) {
+            uint32_t value;
+            memcpy(&value, elems + (size_t)i * sizeof(uint32_t),
+                   sizeof(value));
+            roaring_bitmap_add(answer, value);
+        }
+        return answer;
+    } else if (bufaschar[0] == SERIALIZATION_CONTAINER) {
+        return roaring_bitmap_portable_deserialize(bufaschar + 1);
+    } else
+        return (NULL);
+}
+
+/**
+ * Walk the containers in index order and hand each one to
+ * container_iterate() together with the callback, the user pointer and the
+ * container's key shifted into the upper 16 bits.
+ *
+ * Stops and returns false as soon as container_iterate() returns false;
+ * returns true once every container has been visited.
+ */
+bool roaring_iterate(const roaring_bitmap_t *ra, roaring_iterator iterator,
+                     void *ptr) {
+    const roaring_array_t *hlc = &ra->high_low_container;
+    for (int idx = 0; idx < hlc->size; ++idx) {
+        const uint32_t base = ((uint32_t)hlc->keys[idx]) << 16;
+        const bool keep_going = container_iterate(
+            hlc->containers[idx], hlc->typecodes[idx], base, iterator, ptr);
+        if (!keep_going) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * Same traversal as roaring_iterate(), but each container is handed to
+ * container_iterate64() along with high_bits, which is passed through
+ * unchanged.
+ *
+ * Stops and returns false as soon as container_iterate64() returns false;
+ * returns true once every container has been visited.
+ */
+bool roaring_iterate64(const roaring_bitmap_t *ra, roaring_iterator64 iterator,
+                       uint64_t high_bits, void *ptr) {
+    const roaring_array_t *hlc = &ra->high_low_container;
+    for (int idx = 0; idx < hlc->size; ++idx) {
+        const uint32_t base = ((uint32_t)hlc->keys[idx]) << 16;
+        const bool keep_going =
+            container_iterate64(hlc->containers[idx], hlc->typecodes[idx],
+                                base, iterator, high_bits, ptr);
+        if (!keep_going) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/****
+* begin roaring_uint32_iterator_t
+*****/
+
+/**
+ * Position the iterator on the first value of the container designated by
+ * newit->container_index. The caller must have set newit->parent and
+ * newit->container_index beforehand.
+ *
+ * If container_index is at or past the number of containers, current_value
+ * is set to UINT32_MAX, has_value to false, and false is returned.
+ * Otherwise the container pointer, typecode and highbits are cached in the
+ * iterator (after unwrapping a shared container), current_value receives the
+ * first value of that container, has_value is set to true and true is
+ * returned.
+ *
+ * The container is assumed to be non-empty.
+ */
+static bool loadfirstvalue(roaring_uint32_iterator_t *newit) {
+    newit->in_container_index = 0;
+    newit->run_index = 0;
+    newit->current_value = 0;
+    if (newit->container_index >=
+        newit->parent->high_low_container.size) {  // otherwise nothing
+        newit->current_value = UINT32_MAX;
+        return (newit->has_value = false);
+    }
+    // assume not empty
+    newit->has_value = true;
+    // we precompute container, typecode and highbits so that successive
+    // iterators do not have to grab them from odd memory locations
+    // and have to worry about the (easily predicted) container_unwrap_shared
+    // call.
+    newit->container =
+        newit->parent->high_low_container.containers[newit->container_index];
+    newit->typecode =
+        newit->parent->high_low_container.typecodes[newit->container_index];
+    newit->highbits =
+        ((uint32_t)
+             newit->parent->high_low_container.keys[newit->container_index])
+        << 16;
+    newit->container =
+        container_unwrap_shared(newit->container, &(newit->typecode));
+    uint32_t wordindex;
+    uint64_t word;  // used for bitsets
+    switch (newit->typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            wordindex = 0;
+            // no bound check on wordindex: this scan relies on the container
+            // having at least one set bit
+            while ((word = ((const bitset_container_t *)(newit->container))
+                               ->array[wordindex]) == 0)
+                wordindex++;  // advance
+            // here "word" is non-zero
+            newit->in_container_index = wordindex * 64 + __builtin_ctzll(word);
+            newit->current_value = newit->highbits | newit->in_container_index;
+            break;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            newit->current_value =
+                newit->highbits |
+                ((const array_container_t *)(newit->container))->array[0];
+            break;
+        case RUN_CONTAINER_TYPE_CODE:
+            newit->current_value =
+                newit->highbits |
+                (((const run_container_t *)(newit->container))->runs[0].value);
+            // despite its name, in_run_index stores the last value of the
+            // current run (run start + run length), not a position
+            newit->in_run_index =
+                newit->current_value +
+                (((const run_container_t *)(newit->container))->runs[0].length);
+            break;
+        default:
+            // if this ever happens, bug!
+            assert(false);
+    }  // switch (typecode)
+    return true;
+}
+
+/**
+ * Position the iterator inside the container designated by
+ * newit->container_index, on a value chosen from the low 16 bits of val
+ * (presumably the first value >= val - this depends on the
+ * *_index_equalorlarger helpers; confirm against their definitions).
+ *
+ * Prerequisite: the value should be in range of the container, i.e. the
+ * container must hold at least one value whose low 16 bits are >= those of
+ * val. No "not found" case is handled: has_value is set to true and true is
+ * always returned.
+ *
+ * As in loadfirstvalue(), the container pointer, typecode and highbits are
+ * cached in the iterator after unwrapping a shared container.
+ */
+static bool loadfirstvalue_largeorequal(roaring_uint32_iterator_t *newit, uint32_t val) {
+    uint16_t lb = val & 0xFFFF;
+    newit->in_container_index = 0;
+    newit->run_index = 0;
+    newit->current_value = 0;
+    // assume it is found
+    newit->has_value = true;
+    newit->container =
+        newit->parent->high_low_container.containers[newit->container_index];
+    newit->typecode =
+        newit->parent->high_low_container.typecodes[newit->container_index];
+    newit->highbits =
+        ((uint32_t)
+             newit->parent->high_low_container.keys[newit->container_index])
+        << 16;
+    newit->container =
+        container_unwrap_shared(newit->container, &(newit->typecode));
+    switch (newit->typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            // for bitsets the index returned is used directly as the low
+            // 16 bits of the value
+            newit->in_container_index =  bitset_container_index_equalorlarger((const bitset_container_t *)(newit->container), lb);
+            newit->current_value = newit->highbits | newit->in_container_index;
+            break;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            // for arrays the index returned is a position in the array
+            newit->in_container_index = array_container_index_equalorlarger((const array_container_t *)(newit->container), lb);
+            newit->current_value =
+                newit->highbits |
+                ((const array_container_t *)(newit->container))->array[newit->in_container_index];
+            break;
+        case RUN_CONTAINER_TYPE_CODE:
+            newit->run_index = run_container_index_equalorlarger((const run_container_t *)(newit->container), lb);
+            // if the selected run starts at or before lb, val itself is used
+            // as the current value; otherwise the run's first value is used
+            if(((const run_container_t *)(newit->container))->runs[newit->run_index].value <= lb) {
+              newit->current_value = val;
+            } else {
+              newit->current_value =
+                newit->highbits |
+                (((const run_container_t *)(newit->container))->runs[newit->run_index].value);
+            }
+            // in_run_index stores the last value of the selected run
+            // (run start + run length)
+            newit->in_run_index =
+                (newit->highbits | (((const run_container_t *)(newit->container))->runs[newit->run_index].value)) +
+                (((const run_container_t *)(newit->container))->runs[newit->run_index].length);
+
+            break;
+        default:
+            // if this ever happens, bug!
+            assert(false);
+    }  // switch (typecode)
+    return true;
+}
+
+/**
+ * Initialise a caller-provided iterator so that it refers to ra and points
+ * at the first value of its first container. has_value records whether
+ * loadfirstvalue() found one.
+ */
+void roaring_init_iterator(const roaring_bitmap_t *ra,
+                           roaring_uint32_iterator_t *newit) {
+    newit->container_index = 0;
+    newit->parent = ra;
+    const bool found = loadfirstvalue(newit);
+    newit->has_value = found;
+}
+
+/**
+ * Allocate an iterator with malloc() and initialise it on ra with
+ * roaring_init_iterator(). Returns NULL if the allocation fails. The result
+ * is released with roaring_free_uint32_iterator().
+ */
+roaring_uint32_iterator_t *roaring_create_iterator(const roaring_bitmap_t *ra) {
+    roaring_uint32_iterator_t *it =
+        (roaring_uint32_iterator_t *)malloc(sizeof(roaring_uint32_iterator_t));
+    if (it != NULL) {
+        roaring_init_iterator(ra, it);
+    }
+    return it;
+}
+
+/**
+ * Duplicate an iterator: the copy is a byte-for-byte image of *it, so it
+ * refers to the same parent bitmap and the same position.
+ *
+ * Returns a malloc()-allocated iterator, or NULL if the allocation fails
+ * (the same convention as roaring_create_iterator()).
+ */
+roaring_uint32_iterator_t *roaring_copy_uint32_iterator(
+    const roaring_uint32_iterator_t *it) {
+    roaring_uint32_iterator_t *newit =
+        (roaring_uint32_iterator_t *)malloc(sizeof(roaring_uint32_iterator_t));
+    // memcpy into a NULL destination is undefined behaviour
+    if (newit == NULL) return NULL;
+    memcpy(newit, it, sizeof(roaring_uint32_iterator_t));
+    return newit;
+}
+
+/**
+ * Reposition the iterator using val: the container is selected from the
+ * upper 16 bits of val and the position inside it from the lower 16 bits.
+ *
+ * - If a container exists for the upper 16 bits and its maximum is at least
+ *   the lower 16 bits of val, the position is resolved inside that
+ *   container by loadfirstvalue_largeorequal().
+ * - Otherwise the iterator moves to the first value of the following
+ *   container (index + 1 when the key exists, -index - 1 when ra_get_index()
+ *   returned a negative result).
+ *
+ * Returns the resulting it->has_value.
+ */
+bool roaring_move_uint32_iterator_equalorlarger(roaring_uint32_iterator_t *it, uint32_t val) {
+    const roaring_array_t *hlc = &it->parent->high_low_container;
+    const uint16_t key = val >> 16;
+    const int pos = ra_get_index(hlc, key);
+    if (pos < 0) {
+        // there is no matching container, so we are going for the next one
+        it->container_index = -pos - 1;
+    } else {
+        const uint32_t container_max =
+            container_maximum(hlc->containers[pos], hlc->typecodes[pos]);
+        const uint16_t low = val & 0xFFFF;
+        if (container_max >= low) {
+            // the value is necessarily within the range of the container
+            it->container_index = pos;
+            it->has_value = loadfirstvalue_largeorequal(it, val);
+            return it->has_value;
+        }
+        // will have to load the first value of the next container
+        it->container_index = pos + 1;
+    }
+    it->has_value = loadfirstvalue(it);
+    return it->has_value;
+}
+
+
+/**
+ * Move the iterator to the next value and return whether one exists.
+ *
+ * The function first tries to advance inside the current container,
+ * according to its type. When the container is exhausted it increments
+ * container_index and loads the first value of the next container with
+ * loadfirstvalue(), which also reports when there is none.
+ *
+ * If container_index is already past the last container, has_value is set
+ * to false and false is returned.
+ */
+bool roaring_advance_uint32_iterator(roaring_uint32_iterator_t *it) {
+    if (it->container_index >= it->parent->high_low_container.size) {
+        return (it->has_value = false);
+    }
+    uint32_t wordindex;  // used for bitsets
+    uint64_t word;       // used for bitsets
+    switch (it->typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            it->in_container_index++;
+            wordindex = it->in_container_index / 64;
+            if (wordindex >= BITSET_CONTAINER_SIZE_IN_WORDS) break;
+            // keep only the bits at or above the new position in this word
+            word = ((const bitset_container_t *)(it->container))
+                       ->array[wordindex] &
+                   (UINT64_MAX << (it->in_container_index % 64));
+            // next part could be optimized/simplified
+            while ((word == 0) &&
+                   (wordindex + 1 < BITSET_CONTAINER_SIZE_IN_WORDS)) {
+                wordindex++;
+                word = ((const bitset_container_t *)(it->container))
+                           ->array[wordindex];
+            }
+            if (word != 0) {
+                it->in_container_index = wordindex * 64 + __builtin_ctzll(word);
+                it->current_value = it->highbits | it->in_container_index;
+                return (it->has_value = true);
+            }
+            break;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            it->in_container_index++;
+            if (it->in_container_index <
+                ((const array_container_t *)(it->container))->cardinality) {
+                it->current_value = it->highbits |
+                                    ((const array_container_t *)(it->container))
+                                        ->array[it->in_container_index];
+                // has_value is left as it was on this path
+                return true;
+            }
+            break;
+        case RUN_CONTAINER_TYPE_CODE:
+            if(it->current_value == UINT32_MAX) {
+              return (it->has_value = false); // without this, we risk an overflow to zero
+            }
+            it->current_value++;
+            // in_run_index holds the last value of the current run
+            if (it->current_value <= it->in_run_index) {
+                return (it->has_value = true);
+            }
+            it->run_index++;
+            if (it->run_index <
+                ((const run_container_t *)(it->container))->n_runs) {
+                it->current_value =
+                    it->highbits | (((const run_container_t *)(it->container))
+                                        ->runs[it->run_index]
+                                        .value);
+                it->in_run_index = it->current_value +
+                                   ((const run_container_t *)(it->container))
+                                       ->runs[it->run_index]
+                                       .length;
+                return (it->has_value = true);
+            }
+            break;
+        default:
+            // if this ever happens, bug!
+            assert(false);
+    }  // switch (typecode)
+    // moving to next container
+    it->container_index++;
+    return (it->has_value = loadfirstvalue(it));
+}
+
+/**
+ * Bulk read: copy up to count values, starting with the iterator's current
+ * value, into buf and advance the iterator past the values copied.
+ *
+ * Returns the number of values written. A result smaller than count means
+ * the iterator ran out of values (has_value is then false). When count
+ * values were written and more remain, the iterator is left positioned on
+ * the next unread value.
+ *
+ * buf must have room for count values.
+ */
+uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it, uint32_t* buf, uint32_t count) {
+  uint32_t ret = 0;
+  uint32_t num_values;
+  uint32_t wordindex;  // used for bitsets
+  uint64_t word;       // used for bitsets
+  const array_container_t* acont; //TODO remove
+  const run_container_t* rcont; //TODO remove
+  const bitset_container_t* bcont; //TODO remove
+
+  // each pass drains (part of) the current container
+  while (it->has_value && ret < count) {
+    switch (it->typecode) {
+      case BITSET_CONTAINER_TYPE_CODE:
+        bcont = (const bitset_container_t*)(it->container);
+        wordindex = it->in_container_index / 64;
+        // keep only the bits at or above the current position in this word
+        word = bcont->array[wordindex] & (UINT64_MAX << (it->in_container_index % 64));
+        do {
+          while (word != 0 && ret < count) {
+            buf[0] = it->highbits | (wordindex * 64 + __builtin_ctzll(word));
+            word = word & (word - 1);  // clear the lowest set bit
+            buf++;
+            ret++;
+          }
+          while (word == 0 && wordindex+1 < BITSET_CONTAINER_SIZE_IN_WORDS) {
+            wordindex++;
+            word = bcont->array[wordindex];
+          }
+        } while (word != 0 && ret < count);
+        // a non-zero word here means unread bits remain in this container
+        it->has_value = (word != 0);
+        if (it->has_value) {
+          it->in_container_index = wordindex * 64 + __builtin_ctzll(word);
+          it->current_value = it->highbits | it->in_container_index;
+        }
+        break;
+      case ARRAY_CONTAINER_TYPE_CODE:
+        acont = (const array_container_t *)(it->container);
+        num_values = minimum_uint32(acont->cardinality - it->in_container_index, count - ret);
+        for (uint32_t i = 0; i < num_values; i++) {
+          buf[i] = it->highbits | acont->array[it->in_container_index + i];
+        }
+        buf += num_values;
+        ret += num_values;
+        it->in_container_index += num_values;
+        it->has_value = (it->in_container_index < acont->cardinality);
+        if (it->has_value) {
+          it->current_value = it->highbits | acont->array[it->in_container_index];
+        }
+        break;
+      case RUN_CONTAINER_TYPE_CODE:
+        rcont = (const run_container_t*)(it->container);
+        //"in_run_index" name is misleading, read it as "max_value_in_current_run"
+        do {
+          num_values = minimum_uint32(it->in_run_index - it->current_value + 1, count - ret);
+          for (uint32_t i = 0; i < num_values; i++) {
+            buf[i] = it->current_value + i;
+          }
+          it->current_value += num_values; // this can overflow to zero: UINT32_MAX+1=0
+          buf += num_values;
+          ret += num_values;
+
+          // the run is finished when we moved past its last value, or when
+          // the addition above wrapped around to zero
+          if (it->current_value > it->in_run_index || it->current_value == 0) {
+            it->run_index++;
+            if (it->run_index < rcont->n_runs) {
+              it->current_value = it->highbits | rcont->runs[it->run_index].value;
+              it->in_run_index = it->current_value + rcont->runs[it->run_index].length;
+            } else {
+              it->has_value = false;
+            }
+          }
+        } while ((ret < count) && it->has_value);
+        break;
+      default:
+        assert(false);
+    }
+    // still positioned inside this container: the request must be satisfied
+    if (it->has_value) {
+      assert(ret == count);
+      return ret;
+    }
+    // container exhausted: continue with the next one, if any
+    it->container_index++;
+    it->has_value = loadfirstvalue(it);
+  }
+  return ret;
+}
+
+
+
+/**
+ * Release an iterator with free(). The parent bitmap is not touched.
+ */
+void roaring_free_uint32_iterator(roaring_uint32_iterator_t *it) {
+    free(it);
+}
+
+/****
+* end of roaring_uint32_iterator_t
+*****/
+
+/**
+ * Return true when both bitmaps have the same number of containers, the
+ * same key at every index, and container_equals() reports equality for
+ * every pair of containers.
+ *
+ * All keys are compared before any container content is compared.
+ */
+bool roaring_bitmap_equals(const roaring_bitmap_t *ra1,
+                           const roaring_bitmap_t *ra2) {
+    const roaring_array_t *lhs = &ra1->high_low_container;
+    const roaring_array_t *rhs = &ra2->high_low_container;
+    if (lhs->size != rhs->size) {
+        return false;
+    }
+    int k;
+    // first pass: key comparison only
+    for (k = 0; k < lhs->size; ++k) {
+        if (lhs->keys[k] != rhs->keys[k]) {
+            return false;
+        }
+    }
+    // second pass: container contents
+    for (k = 0; k < lhs->size; ++k) {
+        if (!container_equals(lhs->containers[k], lhs->typecodes[k],
+                              rhs->containers[k], rhs->typecodes[k])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * Return true when ra1 is a subset of ra2.
+ *
+ * Both key sequences are walked in parallel. A key of ra1 that is smaller
+ * than the current key of ra2 means ra1 has a container ra2 lacks, so the
+ * answer is false. On equal keys the decision is delegated to
+ * container_is_subset(). The result is true only if every container of ra1
+ * was matched.
+ */
+bool roaring_bitmap_is_subset(const roaring_bitmap_t *ra1,
+                              const roaring_bitmap_t *ra2) {
+    const roaring_array_t *sub = &ra1->high_low_container;
+    const roaring_array_t *sup = &ra2->high_low_container;
+    const int n_sub = sub->size;
+    const int n_sup = sup->size;
+    int i_sub = 0;
+    int i_sup = 0;
+
+    while (i_sub < n_sub && i_sup < n_sup) {
+        const uint16_t key_sub = ra_get_key_at_index(sub, i_sub);
+        const uint16_t key_sup = ra_get_key_at_index(sup, i_sup);
+
+        if (key_sub < key_sup) {
+            return false;
+        }
+        if (key_sub > key_sup) {
+            i_sup = ra_advance_until(sup, key_sub, i_sup);
+            continue;
+        }
+        // same key: compare the two containers
+        uint8_t type_sub, type_sup;
+        void *c_sub = ra_get_container_at_index(sub, i_sub, &type_sub);
+        void *c_sup = ra_get_container_at_index(sup, i_sup, &type_sup);
+        if (!container_is_subset(c_sub, type_sub, c_sup, type_sup)) {
+            return false;
+        }
+        ++i_sub;
+        ++i_sup;
+    }
+    return i_sub == n_sub;
+}
+
+/**
+ * Helper for roaring_bitmap_flip(): compute the container for key hb with
+ * the low-bit range [lb_start, lb_end] (inclusive) negated, and insert it
+ * into ans_arr.
+ *
+ * If x1_arr has no container for hb, the result is a container of ones over
+ * the range. Otherwise it is container_not_range() of the existing
+ * container; a result with zero cardinality is freed instead of inserted.
+ *
+ * The insertion position is derived from ra_get_index(ans_arr, hb) as
+ * -index - 1, so hb is expected to be absent from ans_arr.
+ */
+static void insert_flipped_container(roaring_array_t *ans_arr,
+                                     const roaring_array_t *x1_arr, uint16_t hb,
+                                     uint16_t lb_start, uint16_t lb_end) {
+    const int src_pos = ra_get_index(x1_arr, hb);
+    const int dst_pos = ra_get_index(ans_arr, hb);
+    const uint32_t first = (uint32_t)lb_start;
+    const uint32_t past_last = (uint32_t)(lb_end + 1);
+    uint8_t out_type;
+    void *result;
+    if (src_pos < 0) {
+        result = container_range_of_ones(first, past_last, &out_type);
+    } else {
+        uint8_t in_type;
+        void *source = ra_get_container_at_index(x1_arr, src_pos, &in_type);
+        result =
+            container_not_range(source, in_type, first, past_last, &out_type);
+        if (!container_get_cardinality(result, out_type)) {
+            container_free(result, out_type);
+            return;
+        }
+    }
+    ra_insert_new_key_value_at(ans_arr, -dst_pos - 1, hb, result, out_type);
+}
+
+/**
+ * Helper for roaring_bitmap_flip_inplace(): negate the low-bit range
+ * [lb_start, lb_end] (inclusive) of the container stored under key hb.
+ *
+ * If no container exists for hb, a container of ones over the range is
+ * inserted at the position derived from ra_get_index(). Otherwise the
+ * container is flipped with container_inot_range(); a result with zero
+ * cardinality is freed and its slot removed, any other result is stored
+ * back in the same slot.
+ */
+static void inplace_flip_container(roaring_array_t *x1_arr, uint16_t hb,
+                                   uint16_t lb_start, uint16_t lb_end) {
+    const int pos = ra_get_index(x1_arr, hb);
+    const uint32_t first = (uint32_t)lb_start;
+    const uint32_t past_last = (uint32_t)(lb_end + 1);
+    uint8_t out_type;
+    if (pos < 0) {
+        void *ones = container_range_of_ones(first, past_last, &out_type);
+        ra_insert_new_key_value_at(x1_arr, -pos - 1, hb, ones, out_type);
+        return;
+    }
+    uint8_t in_type;
+    void *target = ra_get_container_at_index(x1_arr, pos, &in_type);
+    void *result =
+        container_inot_range(target, in_type, first, past_last, &out_type);
+    // if a new container was created, the old one was already freed
+    if (container_get_cardinality(result, out_type)) {
+        ra_set_container_at_index(x1_arr, pos, result, out_type);
+        return;
+    }
+    container_free(result, out_type);
+    ra_remove_at_index(x1_arr, pos);
+}
+
+/**
+ * Helper for roaring_bitmap_flip(): compute the full negation of the
+ * container stored under key hb in x1_arr and insert it into ans_arr.
+ *
+ * If x1_arr has no container for hb, the result is a container of ones over
+ * [0, 0x10000). Otherwise it is container_not() of the existing container; a
+ * result with zero cardinality is freed instead of inserted.
+ *
+ * The insertion position is derived from ra_get_index(ans_arr, hb) as
+ * -index - 1, so hb is expected to be absent from ans_arr.
+ */
+static void insert_fully_flipped_container(roaring_array_t *ans_arr,
+                                           const roaring_array_t *x1_arr,
+                                           uint16_t hb) {
+    const int src_pos = ra_get_index(x1_arr, hb);
+    const int dst_pos = ra_get_index(ans_arr, hb);
+    uint8_t out_type;
+    void *result;
+    if (src_pos < 0) {
+        result = container_range_of_ones(0U, 0x10000U, &out_type);
+    } else {
+        uint8_t in_type;
+        void *source = ra_get_container_at_index(x1_arr, src_pos, &in_type);
+        result = container_not(source, in_type, &out_type);
+        if (!container_get_cardinality(result, out_type)) {
+            container_free(result, out_type);
+            return;
+        }
+    }
+    ra_insert_new_key_value_at(ans_arr, -dst_pos - 1, hb, result, out_type);
+}
+
+/**
+ * Helper for roaring_bitmap_flip_inplace(): negate the whole container
+ * stored under key hb.
+ *
+ * If no container exists for hb, a container of ones over [0, 0x10000) is
+ * inserted at the position derived from ra_get_index(). Otherwise the
+ * container is flipped with container_inot(); a result with zero
+ * cardinality is freed and its slot removed, any other result is stored
+ * back in the same slot.
+ */
+static void inplace_fully_flip_container(roaring_array_t *x1_arr, uint16_t hb) {
+    const int pos = ra_get_index(x1_arr, hb);
+    uint8_t out_type;
+    if (pos < 0) {
+        void *ones = container_range_of_ones(0U, 0x10000U, &out_type);
+        ra_insert_new_key_value_at(x1_arr, -pos - 1, hb, ones, out_type);
+        return;
+    }
+    uint8_t in_type;
+    void *target = ra_get_container_at_index(x1_arr, pos, &in_type);
+    void *result = container_inot(target, in_type, &out_type);
+
+    if (container_get_cardinality(result, out_type)) {
+        ra_set_container_at_index(x1_arr, pos, result, out_type);
+        return;
+    }
+    container_free(result, out_type);
+    ra_remove_at_index(x1_arr, pos);
+}
+
+/**
+ * Return a new bitmap equal to x1 with every value in
+ * [range_start, range_end) negated. x1 is the source of all containers
+ * outside the range, which are appended as copies.
+ *
+ * An empty range (range_start >= range_end) yields a plain copy of x1.
+ * range_end is clamped to 2^32.
+ *
+ * The result is built in increasing key order: copies of the containers
+ * before the range, then the flipped containers (partial first container,
+ * fully flipped middle containers, partial last container), then copies of
+ * the containers after the range.
+ */
+roaring_bitmap_t *roaring_bitmap_flip(const roaring_bitmap_t *x1,
+                                      uint64_t range_start,
+                                      uint64_t range_end) {
+    if (range_start >= range_end) {
+        return roaring_bitmap_copy(x1);
+    }
+    if(range_end >= UINT64_C(0x100000000)) {
+        range_end = UINT64_C(0x100000000);
+    }
+
+    roaring_bitmap_t *ans = roaring_bitmap_create();
+    ans->copy_on_write = x1->copy_on_write;
+
+    // split both inclusive bounds into container key (high 16 bits) and
+    // position inside the container (low 16 bits)
+    uint16_t hb_start = (uint16_t)(range_start >> 16);
+    const uint16_t lb_start = (uint16_t)range_start;  // & 0xFFFF;
+    uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
+    const uint16_t lb_end = (uint16_t)(range_end - 1);  // & 0xFFFF;
+
+    ra_append_copies_until(&ans->high_low_container, &x1->high_low_container,
+                           hb_start, x1->copy_on_write);
+    if (hb_start == hb_end) {
+        insert_flipped_container(&ans->high_low_container,
+                                 &x1->high_low_container, hb_start, lb_start,
+                                 lb_end);
+    } else {
+        // start and end containers are distinct
+        if (lb_start > 0) {
+            // handle first (partial) container
+            insert_flipped_container(&ans->high_low_container,
+                                     &x1->high_low_container, hb_start,
+                                     lb_start, 0xFFFF);
+            ++hb_start;  // for the full containers.  Can't wrap.
+        }
+
+        if (lb_end != 0xFFFF) --hb_end;  // later we'll handle the partial block
+
+        // hb is 32-bit so that hb <= hb_end terminates even when hb_end is
+        // 0xFFFF
+        for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
+            insert_fully_flipped_container(&ans->high_low_container,
+                                           &x1->high_low_container, hb);
+        }
+
+        // handle a partial final container
+        if (lb_end != 0xFFFF) {
+            insert_flipped_container(&ans->high_low_container,
+                                     &x1->high_low_container, hb_end + 1, 0,
+                                     lb_end);
+            ++hb_end;  // restore the real last key for the copy below
+        }
+    }
+    ra_append_copies_after(&ans->high_low_container, &x1->high_low_container,
+                           hb_end, x1->copy_on_write);
+    return ans;
+}
+
+/**
+ * Negate every value in [range_start, range_end) directly inside x1.
+ *
+ * An empty range (range_start >= range_end) is a no-op. range_end is
+ * clamped to 2^32.
+ *
+ * Containers are processed in increasing key order: a partial first
+ * container, the fully covered middle containers, then a partial last
+ * container. Containers outside the range are not touched.
+ */
+void roaring_bitmap_flip_inplace(roaring_bitmap_t *x1, uint64_t range_start,
+                                 uint64_t range_end) {
+    if (range_start >= range_end) {
+        return;  // empty range
+    }
+    if(range_end >= UINT64_C(0x100000000)) {
+        range_end = UINT64_C(0x100000000);
+    }
+
+    // split both inclusive bounds into container key (high 16 bits) and
+    // position inside the container (low 16 bits)
+    uint16_t hb_start = (uint16_t)(range_start >> 16);
+    const uint16_t lb_start = (uint16_t)range_start;
+    uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
+    const uint16_t lb_end = (uint16_t)(range_end - 1);
+
+    if (hb_start == hb_end) {
+        inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
+                               lb_end);
+    } else {
+        // start and end containers are distinct
+        if (lb_start > 0) {
+            // handle first (partial) container
+            inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
+                                   0xFFFF);
+            ++hb_start;  // for the full containers.  Can't wrap.
+        }
+
+        // the last container is only partially covered: exclude it from the
+        // full-container loop
+        if (lb_end != 0xFFFF) --hb_end;
+
+        // hb is 32-bit so that hb <= hb_end terminates even when hb_end is
+        // 0xFFFF
+        for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
+            inplace_fully_flip_container(&x1->high_low_container, hb);
+        }
+        // handle a partial final container
+        if (lb_end != 0xFFFF) {
+            inplace_flip_container(&x1->high_low_container, hb_end + 1, 0,
+                                   lb_end);
+            ++hb_end;  // hb_end is not read again after this point
+        }
+    }
+}
+
+/**
+ * Compute the union of x1 and x2 into a new bitmap using the lazy container
+ * operations (container_lazy_or / container_lazy_ior).
+ *
+ * If either input has no containers, the result is a copy of the other one.
+ * Otherwise both key sequences are merged in increasing key order:
+ *  - on a common key the two containers are combined lazily;
+ *  - a key present on one side only contributes a copy of that container
+ *    (obtained with get_copy_of_container()).
+ *
+ * When bitsetconversion is true and neither container under a common key is
+ * a bitset, x1's container is first converted with container_to_bitset()
+ * and the union is done in place on that converted container.
+ *
+ * The result is copy-on-write only if both inputs are.
+ *
+ * Note: when an input has copy_on_write set, containers of that input are
+ * written back into it with ra_set_container_at_index(), so the inputs may
+ * be modified even though they are const-qualified.
+ *
+ * NOTE(review): results of lazy operations presumably need a repair step
+ * before general use - confirm against the public header.
+ */
+roaring_bitmap_t *roaring_bitmap_lazy_or(const roaring_bitmap_t *x1,
+                                         const roaring_bitmap_t *x2,
+                                         const bool bitsetconversion) {
+    uint8_t container_result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {
+        return roaring_bitmap_copy(x2);
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    answer->copy_on_write = x1->copy_on_write && x2->copy_on_write;
+    int pos1 = 0, pos2 = 0;
+    uint8_t container_type_1, container_type_2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    // merge loop: exits as soon as one of the inputs is exhausted
+    while (true) {
+        if (s1 == s2) {
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            void *c;
+            if (bitsetconversion && (get_container_type(c1, container_type_1) !=
+                                     BITSET_CONTAINER_TYPE_CODE) &&
+                (get_container_type(c2, container_type_2) !=
+                 BITSET_CONTAINER_TYPE_CODE)) {
+                // neither side is a bitset: convert x1's container and
+                // accumulate into the converted container in place
+                void *newc1 =
+                    container_mutable_unwrap_shared(c1, &container_type_1);
+                newc1 = container_to_bitset(newc1, container_type_1);
+                container_type_1 = BITSET_CONTAINER_TYPE_CODE;
+                c = container_lazy_ior(newc1, container_type_1, c2,
+                                       container_type_2,
+                                       &container_result_type);
+                if (c != newc1) {  // should not happen
+                    container_free(newc1, container_type_1);
+                }
+            } else {
+                c = container_lazy_or(c1, container_type_1, c2,
+                                      container_type_2, &container_result_type);
+            }
+            // since we assume that the initial containers are non-empty,
+            // the
+            // result here
+            // can only be non-empty
+            ra_append(&answer->high_low_container, s1, c,
+                      container_result_type);
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            c1 =
+                get_copy_of_container(c1, &container_type_1, x1->copy_on_write);
+            // with copy-on-write, the container returned by
+            // get_copy_of_container() is also stored back into x1
+            if (x1->copy_on_write) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c1,
+                                          container_type_1);
+            }
+            ra_append(&answer->high_low_container, s1, c1, container_type_1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            c2 =
+                get_copy_of_container(c2, &container_type_2, x2->copy_on_write);
+            // same write-back as above, for x2
+            if (x2->copy_on_write) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          container_type_2);
+            }
+            ra_append(&answer->high_low_container, s2, c2, container_type_2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    // append whatever remains of the input that was not exhausted
+    if (pos1 == length1) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2,
+                             x2->copy_on_write);
+    } else if (pos2 == length2) {
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             x1->copy_on_write);
+    }
+    return answer;
+}
+
+void roaring_bitmap_lazy_or_inplace(roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2,
+                                    const bool bitsetconversion) {
+    uint8_t container_result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+    // In-place union x1 |= x2; same-key containers go through container_lazy_ior.
+    if (0 == length2) return;
+    // x1 is empty: the result is just the content of x2
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+    int pos1 = 0, pos2 = 0;
+    uint8_t container_type_1, container_type_2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {  // merge walk: pos1/pos2 are cursors into x1 and x2
+        if (s1 == s2) {
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            if (!container_is_full(c1, container_type_1)) {  // a full c1 is left as is
+                if ((bitsetconversion == false) ||
+                    (get_container_type(c1, container_type_1) ==
+                     BITSET_CONTAINER_TYPE_CODE)) {
+                    c1 = get_writable_copy_if_shared(c1, &container_type_1);
+                } else {
+                    // conversion requested and c1 is not a bitset: convert it
+                    void *oldc1 = c1;
+                    uint8_t oldt1 = container_type_1;
+                    c1 = container_mutable_unwrap_shared(c1, &container_type_1);
+                    c1 = container_to_bitset(c1, container_type_1);
+                    container_free(oldc1, oldt1);
+                    container_type_1 = BITSET_CONTAINER_TYPE_CODE;
+                }
+                // c1 is now the operand that container_lazy_ior may update
+                void *c2 = ra_get_container_at_index(&x2->high_low_container,
+                                                     pos2, &container_type_2);
+                void *c = container_lazy_ior(c1, container_type_1, c2,
+                                             container_type_2,
+                                             &container_result_type);
+                if (c !=
+                    c1) {  // in this instance a new container was created, and
+                           // we need to free the old one
+                    container_free(c1, container_type_1);
+                }
+                // store the result (and its type) back into x1's slot
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          container_result_type);
+            }
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // key only in x1: its container is already in place
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2: key only in x2, a copy is inserted into x1
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            // whatever get_copy_of_container returns is written back to x2 when copy_on_write is set
+            c2 =
+                get_copy_of_container(c2, &container_type_2, x2->copy_on_write);
+            if (x2->copy_on_write) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          container_type_2);
+            }
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       container_type_2);
+            pos1++;     // step over the entry that was just inserted
+            length1++;  // x1 now holds one more entry
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    if (pos1 == length1) {  // x1 exhausted: append what is left of x2
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, x2->copy_on_write);
+    }
+}
+
+roaring_bitmap_t *roaring_bitmap_lazy_xor(const roaring_bitmap_t *x1,
+                                          const roaring_bitmap_t *x2) {  // returns a new bitmap
+    uint8_t container_result_type = 0;
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    if (0 == length1) {  // XOR with an empty bitmap is a copy of the other one
+        return roaring_bitmap_copy(x2);
+    }
+    if (0 == length2) {
+        return roaring_bitmap_copy(x1);
+    }
+    roaring_bitmap_t *answer =
+        roaring_bitmap_create_with_capacity(length1 + length2);
+    answer->copy_on_write = x1->copy_on_write && x2->copy_on_write;
+    int pos1 = 0, pos2 = 0;
+    uint8_t container_type_1, container_type_2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {  // merge walk: pos1/pos2 are cursors into x1 and x2
+        if (s1 == s2) {
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            void *c =
+                container_lazy_xor(c1, container_type_1, c2, container_type_2,
+                                   &container_result_type);
+            // only a non-empty result is stored; an empty one is freed right away
+            if (container_nonzero_cardinality(c, container_result_type)) {
+                ra_append(&answer->high_low_container, s1, c,
+                          container_result_type);
+            } else {
+                container_free(c, container_result_type);
+            }
+
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // key only in x1: its container is copied over
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            c1 =
+                get_copy_of_container(c1, &container_type_1, x1->copy_on_write);
+            if (x1->copy_on_write) {  // write the returned container back to x1
+                ra_set_container_at_index(&x1->high_low_container, pos1, c1,
+                                          container_type_1);
+            }
+            ra_append(&answer->high_low_container, s1, c1, container_type_1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2: key only in x2, its container is copied over
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            c2 =
+                get_copy_of_container(c2, &container_type_2, x2->copy_on_write);
+            if (x2->copy_on_write) {  // write the returned container back to x2
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          container_type_2);
+            }
+            ra_append(&answer->high_low_container, s2, c2, container_type_2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    if (pos1 == length1) {  // x1 exhausted: the rest of x2 is copied as is
+        ra_append_copy_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2,
+                             x2->copy_on_write);
+    } else if (pos2 == length2) {  // x2 exhausted: the rest of x1 is copied as is
+        ra_append_copy_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1,
+                             x1->copy_on_write);
+    }
+    return answer;
+}
+
+void roaring_bitmap_lazy_xor_inplace(roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2) {
+    assert(x1 != x2);  // the operands must be distinct bitmaps
+    uint8_t container_result_type = 0;
+    int length1 = x1->high_low_container.size;
+    const int length2 = x2->high_low_container.size;
+    // In-place x1 ^= x2; same-key containers go through container_lazy_ixor.
+    if (0 == length2) return;
+    // x1 is empty: the result is just the content of x2
+    if (0 == length1) {
+        roaring_bitmap_overwrite(x1, x2);
+        return;
+    }
+    int pos1 = 0, pos2 = 0;
+    uint8_t container_type_1, container_type_2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    while (true) {  // merge walk: pos1/pos2 are cursors into x1 and x2
+        if (s1 == s2) {
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            c1 = get_writable_copy_if_shared(c1, &container_type_1);
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            void *c =
+                container_lazy_ixor(c1, container_type_1, c2, container_type_2,
+                                    &container_result_type);
+            if (container_nonzero_cardinality(c, container_result_type)) {
+                ra_set_container_at_index(&x1->high_low_container, pos1, c,
+                                          container_result_type);
+                ++pos1;
+            } else {  // empty result: free it and drop the key from x1
+                container_free(c, container_result_type);
+                ra_remove_at_index(&x1->high_low_container, pos1);
+                --length1;  // pos1 stays put: it now designates the next entry
+            }
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // key only in x1: its container is already in place
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2: key only in x2, a copy is inserted into x1
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            // whatever get_copy_of_container returns is written back to x2 when copy_on_write is set
+            c2 =
+                get_copy_of_container(c2, &container_type_2, x2->copy_on_write);
+            if (x2->copy_on_write) {
+                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
+                                          container_type_2);
+            }
+            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
+                                       container_type_2);
+            pos1++;     // step over the entry that was just inserted
+            length1++;  // x1 now holds one more entry
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    if (pos1 == length1) {  // x1 exhausted: append what is left of x2
+        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
+                             pos2, length2, x2->copy_on_write);
+    }
+}
+
+void roaring_bitmap_repair_after_lazy(roaring_bitmap_t *ra) {
+    // Pass every slot through container_repair_after_lazy and store the outcome.
+    roaring_array_t *hlc = &ra->high_low_container;
+    for (int idx = 0; idx < hlc->size; ++idx) {
+        uint8_t typecode = hlc->typecodes[idx];
+        void *repaired =
+            container_repair_after_lazy(hlc->containers[idx], &typecode);
+        hlc->containers[idx] = repaired;
+        hlc->typecodes[idx] = typecode;
+    }
+}
+
+
+
+/**
+* roaring_bitmap_rank returns the number of integers that are smaller or equal
+* to x.
+*/
+uint64_t roaring_bitmap_rank(const roaring_bitmap_t *bm, uint32_t x) {
+    const roaring_array_t *hlc = &bm->high_low_container;
+    const uint32_t high = x >> 16;
+    uint64_t total = 0;
+    for (int idx = 0; idx < hlc->size; idx++) {
+        const uint32_t key = hlc->keys[idx];
+        if (key > high) break;  // this container starts above x: stop counting
+        if (key == high) {
+            // container sharing x's high bits: rank of the low 16 bits in it
+            total += container_rank(hlc->containers[idx], hlc->typecodes[idx],
+                                    x & 0xFFFF);
+            break;
+        }
+        // key < high: every value of this container counts
+        total += container_get_cardinality(hlc->containers[idx],
+                                           hlc->typecodes[idx]);
+    }
+    return total;
+}
+
+/**
+* roaring_bitmap_minimum returns the smallest value in the set.
+* Returns UINT32_MAX if the set is empty.
+*/
+uint32_t roaring_bitmap_minimum(const roaring_bitmap_t *bm) {
+    const roaring_array_t *hlc = &bm->high_low_container;
+    // No containers at all: UINT32_MAX is the value reported for an empty set.
+    if (hlc->size <= 0) return UINT32_MAX;
+    // Otherwise combine the first key with the minimum of the first container.
+    const uint32_t high = hlc->keys[0];
+    const uint32_t low =
+        container_minimum(hlc->containers[0], hlc->typecodes[0]);
+    return (high << 16) | low;
+}
+
+/**
+* roaring_bitmap_maximum returns the greatest value in the set.
+* Returns 0 if the set is empty.
+*/
+uint32_t roaring_bitmap_maximum(const roaring_bitmap_t *bm) {
+    const roaring_array_t *hlc = &bm->high_low_container;
+    // No containers at all: 0 is the value reported for an empty set.
+    if (hlc->size <= 0) {
+        return 0;
+    }
+    // Otherwise combine the last key with the maximum of the last container.
+    const int32_t last = hlc->size - 1;
+    const uint32_t high = hlc->keys[last];
+    const uint32_t low =
+        container_maximum(hlc->containers[last], hlc->typecodes[last]);
+    return (high << 16) | low;
+}
+
+bool roaring_bitmap_select(const roaring_bitmap_t *bm, uint32_t rank,
+                           uint32_t *element) {
+    // Looks up the value with the given rank and stores it in *element.
+    // Returns true on success, false if no container reports that rank.
+    void *container;
+    uint8_t typecode;
+    uint32_t start_rank = 0;  // handed by pointer to every container_select call
+    int i = 0;
+    bool valid = false;
+    while (!valid && i < bm->high_low_container.size) {
+        container = bm->high_low_container.containers[i];
+        typecode = bm->high_low_container.typecodes[i];
+        valid =
+            container_select(container, typecode, &start_rank, rank, element);
+        i++;
+    }
+    if (!valid) return false;
+    // i is one past the matching container. The key is widened to uint32_t first:
+    // a uint16_t promotes to int, and (key >= 0x8000) << 16 overflows it (UB).
+    const uint32_t key = bm->high_low_container.keys[i - 1];
+    *element |= key << 16;
+    return true;
+}
+
+bool roaring_bitmap_intersect(const roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2) {
+    const int length1 = x1->high_low_container.size,
+              length2 = x2->high_low_container.size;
+    // Returns true as soon as two same-key containers share a value.
+    int pos1 = 0, pos2 = 0;
+
+    while (pos1 < length1 && pos2 < length2) {
+        const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+        const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        if (s1 == s2) {
+            uint8_t container_type_1, container_type_2;
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            if (container_intersect(c1, container_type_1, c2, container_type_2)) return true;
+            ++pos1;
+            ++pos2;
+        } else if (s1 < s2) {  // s1 < s2: move x1's cursor towards s2
+            pos1 = ra_advance_until(&x1->high_low_container, s2, pos1);
+        } else {  // s1 > s2: move x2's cursor towards s1
+            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
+        }
+    }
+    return false;  // no pair of same-key containers intersected
+}
+
+
+uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2) {
+    const roaring_array_t *ra1 = &x1->high_low_container;
+    const roaring_array_t *ra2 = &x2->high_low_container;
+    const int n1 = ra1->size, n2 = ra2->size;
+    uint64_t total = 0;
+    int i1 = 0, i2 = 0;
+
+    // Only keys present in both bitmaps contribute to the intersection size.
+    while (i1 < n1 && i2 < n2) {
+        const uint16_t k1 = ra_get_key_at_index(ra1, i1);
+        const uint16_t k2 = ra_get_key_at_index(ra2, i2);
+        if (k1 < k2) {
+            // x1 is behind: move its cursor towards k2
+            i1 = ra_advance_until(ra1, k2, i1);
+        } else if (k1 > k2) {
+            // x2 is behind: move its cursor towards k1
+            i2 = ra_advance_until(ra2, k1, i2);
+        } else {
+            uint8_t t1, t2;
+            void *c1 = ra_get_container_at_index(ra1, i1, &t1);
+            void *c2 = ra_get_container_at_index(ra2, i2, &t2);
+            total += container_and_cardinality(c1, t1, c2, t2);
+            ++i1;
+            ++i2;
+        }
+    }
+    return total;
+}
+
+double roaring_bitmap_jaccard_index(const roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2) {
+    const uint64_t inter = roaring_bitmap_and_cardinality(x1, x2);
+    const uint64_t sum = roaring_bitmap_get_cardinality(x1) +
+                         roaring_bitmap_get_cardinality(x2);
+    return (double)inter / (double)(sum - inter);  // |x1 AND x2| / |x1 OR x2|
+}
+
+uint64_t roaring_bitmap_or_cardinality(const roaring_bitmap_t *x1,
+                                       const roaring_bitmap_t *x2) {
+    // Inclusion-exclusion: |x1 OR x2| = |x1| + |x2| - |x1 AND x2|.
+    uint64_t total = roaring_bitmap_get_cardinality(x1);
+    total += roaring_bitmap_get_cardinality(x2);
+    return total - roaring_bitmap_and_cardinality(x1, x2);
+}
+
+uint64_t roaring_bitmap_andnot_cardinality(const roaring_bitmap_t *x1,
+                                           const roaring_bitmap_t *x2) {
+    // |x1 AND NOT x2| = |x1| - |x1 AND x2|.
+    const uint64_t shared = roaring_bitmap_and_cardinality(x1, x2);
+    return roaring_bitmap_get_cardinality(x1) - shared;
+}
+
+uint64_t roaring_bitmap_xor_cardinality(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2) {
+    // |x1 XOR x2| = |x1| + |x2| - 2 |x1 AND x2|.
+    const uint64_t shared = roaring_bitmap_and_cardinality(x1, x2);
+    return roaring_bitmap_get_cardinality(x1) +
+           roaring_bitmap_get_cardinality(x2) - 2 * shared;
+}
+
+
+/**
+ * Check whether a range of values from range_start (included) to range_end (excluded) is present
+ */
+bool roaring_bitmap_contains_range(const roaring_bitmap_t *r, uint64_t range_start, uint64_t range_end) {
+    if(range_end >= UINT64_C(0x100000000)) {
+        range_end = UINT64_C(0x100000000);  // clamp to the 32-bit value space
+    }
+    if (range_start >= range_end) return true;  // empty range are always contained!
+    if (range_end - range_start == 1) return roaring_bitmap_contains(r, (uint32_t)range_start);
+    uint16_t hb_rs = (uint16_t)(range_start >> 16);
+    uint16_t hb_re = (uint16_t)((range_end - 1) >> 16);
+    const int32_t span = hb_re - hb_rs;
+    const int32_t hlc_sz = ra_get_size(&r->high_low_container);
+    // the range touches span + 1 distinct keys; each of them needs a container
+    if (hlc_sz < span + 1) return false;
+    // is / ie: positions of the first and last key of the range
+    int32_t is = ra_get_index(&r->high_low_container, hb_rs);
+    int32_t ie = ra_get_index(&r->high_low_container, hb_re);
+    // a negative index means the key is absent, so part of the range is missing
+    if ((is < 0) || (ie < 0) || ((ie - is) != span)) {
+       return false;
+    }
+    const uint32_t lb_rs = range_start & 0xFFFF;
+    const uint32_t lb_re = ((range_end - 1) & 0xFFFF) + 1;
+    uint8_t typecode;
+    void *container = ra_get_container_at_index(&r->high_low_container, is, &typecode);
+    if (hb_rs == hb_re) {  // the whole range falls inside a single container
+      return container_contains_range(container, lb_rs, lb_re, typecode);
+    }
+    if (!container_contains_range(container, lb_rs, 1 << 16, typecode)) {
+      return false;
+    }
+    assert(ie < hlc_sz); // would indicate an algorithmic bug
+    container = ra_get_container_at_index(&r->high_low_container, ie, &typecode);
+    if (!container_contains_range(container, 0, lb_re, typecode)) {
+        return false;
+    }
+    for (int32_t i = is + 1; i < ie; ++i) {  // containers strictly in between must be full
+        container = ra_get_container_at_index(&r->high_low_container, i, &typecode);
+        if (!container_is_full(container, typecode) ) {
+          return false;
+        }
+    }
+    return true;
+}
+
+
+bool roaring_bitmap_is_strict_subset(const roaring_bitmap_t *ra1,
+                                            const roaring_bitmap_t *ra2) {
+    // Strict subset: ra2 must hold more values than ra1 and include all of ra1.
+    const uint64_t card1 = roaring_bitmap_get_cardinality(ra1);
+    if (roaring_bitmap_get_cardinality(ra2) <= card1) return false;
+    return roaring_bitmap_is_subset(ra1, ra2);
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/roaring.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/roaring_array.c */
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+
+
+// Convention: [0,ra->size) all elements are initialized
+//  [ra->size, ra->allocation_size) is junk and contains nothing needing freeing
+
+extern inline int32_t ra_get_size(const roaring_array_t *ra);
+extern inline int32_t ra_get_index(const roaring_array_t *ra, uint16_t x);
+extern inline void *ra_get_container_at_index(const roaring_array_t *ra,
+                                              uint16_t i, uint8_t *typecode);
+extern inline void ra_unshare_container_at_index(roaring_array_t *ra,
+                                                 uint16_t i);
+extern inline void ra_replace_key_and_container_at_index(roaring_array_t *ra,
+                                                         int32_t i,
+                                                         uint16_t key, void *c,
+                                                         uint8_t typecode);
+extern inline void ra_set_container_at_index(const roaring_array_t *ra,
+                                             int32_t i, void *c,
+                                             uint8_t typecode);
+
+#define INITIAL_CAPACITY 4
+
+static bool realloc_array(roaring_array_t *ra, int32_t new_capacity) {
+    // Resizes the storage of ra to new_capacity entries, keeping the first
+    // ra->size entries. Returns false (ra untouched) if malloc fails.
+    //
+    // The three arrays live in a single allocation, laid out as
+    //   [containers: void* x cap][keys: uint16_t x cap][typecodes: uint8_t x cap]
+    // so ra->containers is also the pointer that owns the block. Because the
+    // keys and typecodes sit at capacity-dependent offsets, a plain realloc
+    // of the block would leave them at the wrong place; a fresh block is
+    // allocated and each array is copied to its new position instead.
+    //
+    // A new_capacity of 0 releases the storage entirely.
+    // NOTE(review): new_capacity is assumed to be >= ra->size; the copies
+    // below are sized by ra->size - confirm against callers.
+
+    if ( new_capacity == 0 ) {
+      free(ra->containers);
+      ra->containers = NULL;
+      ra->keys = NULL;
+      ra->typecodes = NULL;
+      ra->allocation_size = 0;
+      return true;
+    }
+    const size_t memoryneeded =
+        new_capacity * (sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t));
+    void *bigalloc = malloc(memoryneeded);
+    if (!bigalloc) return false;
+    void *oldbigalloc = ra->containers;
+    void **newcontainers = (void **)bigalloc;
+    uint16_t *newkeys = (uint16_t *)(newcontainers + new_capacity);
+    uint8_t *newtypecodes = (uint8_t *)(newkeys + new_capacity);
+    assert((char *)(newtypecodes + new_capacity) ==
+           (char *)bigalloc + memoryneeded);
+    if(ra->size > 0) {
+      memcpy(newcontainers, ra->containers, sizeof(void *) * ra->size);
+      memcpy(newkeys, ra->keys, sizeof(uint16_t) * ra->size);
+      memcpy(newtypecodes, ra->typecodes, sizeof(uint8_t) * ra->size);
+    }
+    ra->containers = newcontainers;
+    ra->keys = newkeys;
+    ra->typecodes = newtypecodes;
+    ra->allocation_size = new_capacity;
+    free(oldbigalloc);
+    return true;
+}
+
+bool ra_init_with_capacity(roaring_array_t *new_ra, uint32_t cap) {
+    // Sets up an empty array with room for cap entries; false on NULL or failed malloc.
+    if (!new_ra) return false;
+    new_ra->keys = NULL;
+    new_ra->containers = NULL;
+    new_ra->typecodes = NULL;
+    new_ra->allocation_size = 0;  // becomes cap only once the storage exists
+    new_ra->size = 0;
+    if (cap == 0) return true;
+    void *bigalloc =
+        malloc(cap * (sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t)));
+    if (bigalloc == NULL) return false;  // array stays empty with zero capacity
+    new_ra->containers = (void **)bigalloc;  // one block: containers, keys, typecodes
+    new_ra->keys = (uint16_t *)(new_ra->containers + cap);
+    new_ra->typecodes = (uint8_t *)(new_ra->keys + cap);
+    new_ra->allocation_size = cap;
+    return true;
+}
+
+int ra_shrink_to_fit(roaring_array_t *ra) {
+    // Bytes used per slot: one key, one container pointer and one typecode.
+    const size_t slot_bytes = sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t);
+    // Computed before reallocating, while allocation_size is still the old one.
+    const int freed = (ra->allocation_size - ra->size) * slot_bytes;
+    if (!realloc_array(ra, ra->size)) return 0;  // nothing reclaimed on failure
+    ra->allocation_size = ra->size;
+    return freed;
+}
+
+bool ra_init(roaring_array_t *t) {  // empty array with INITIAL_CAPACITY slots
+    return ra_init_with_capacity(t, INITIAL_CAPACITY);
+}
+
+bool ra_copy(const roaring_array_t *source, roaring_array_t *dest,
+             bool copy_on_write) {
+    // Initializes dest as a copy of source: containers are shared when
+    // copy_on_write is set, cloned otherwise. False if an allocation fails.
+    if (!ra_init_with_capacity(dest, source->size)) return false;
+    const int32_t count = source->size;
+    dest->size = count;
+    dest->allocation_size = count;
+    if (count > 0) {
+        memcpy(dest->keys, source->keys, count * sizeof(uint16_t));
+    }
+    if (copy_on_write) {
+        // first pass every source container through get_copy_of_container...
+        for (int32_t k = 0; k < count; ++k) {
+            source->containers[k] = get_copy_of_container(
+                source->containers[k], &source->typecodes[k], copy_on_write);
+        }
+        // ...then dest takes the very same pointers and typecodes
+        if (count > 0) {
+            memcpy(dest->containers, source->containers, count * sizeof(void *));
+            memcpy(dest->typecodes, source->typecodes, count * sizeof(uint8_t));
+        }
+        return true;
+    }
+    if (count > 0) {
+        memcpy(dest->typecodes, source->typecodes, count * sizeof(uint8_t));
+    }
+    for (int32_t k = 0; k < count; k++) {
+        void *copy = container_clone(source->containers[k], source->typecodes[k]);
+        dest->containers[k] = copy;
+        if (copy != NULL) continue;
+        // cloning failed: release the clones made so far and give up
+        for (int32_t done = 0; done < k; done++) {
+            container_free(dest->containers[done], dest->typecodes[done]);
+        }
+        ra_clear_without_containers(dest);
+        return false;
+    }
+    return true;
+}
+
+bool ra_overwrite(const roaring_array_t *source, roaring_array_t *dest,
+                  bool copy_on_write) {
+    // Replaces the content of dest with a copy of source (containers shared
+    // when copy_on_write, cloned otherwise). Returns false if memory runs out.
+    ra_clear_containers(dest);  // we are going to overwrite them
+    // The freed containers must no longer be listed: otherwise a failed
+    // reallocation would return with dest still holding dangling pointers.
+    dest->size = 0;
+    if (dest->allocation_size < source->size &&
+        !realloc_array(dest, source->size)) return false;
+    const int32_t count = source->size;
+    if (count == 0) return true;  // nothing to copy; the arrays may be NULL
+    dest->size = count;
+    memcpy(dest->keys, source->keys, count * sizeof(uint16_t));
+    if (copy_on_write) {
+        // pass the source containers through get_copy_of_container, then shallow-copy
+        for (int32_t i = 0; i < count; ++i) {
+            source->containers[i] = get_copy_of_container(
+                source->containers[i], &source->typecodes[i], copy_on_write);
+        }
+        memcpy(dest->containers, source->containers, count * sizeof(void *));
+        memcpy(dest->typecodes, source->typecodes, count * sizeof(uint8_t));
+        return true;
+    }
+    memcpy(dest->typecodes, source->typecodes, count * sizeof(uint8_t));
+    for (int32_t i = 0; i < count; i++) {
+        dest->containers[i] =
+            container_clone(source->containers[i], source->typecodes[i]);
+        if (dest->containers[i] == NULL) {
+            for (int32_t j = 0; j < i; j++) {
+                container_free(dest->containers[j], dest->typecodes[j]);
+            }
+            ra_clear_without_containers(dest);
+            return false;
+        }
+    }
+    return true;
+}
+
+void ra_clear_containers(roaring_array_t *ra) {
+    // Frees each stored container; the arrays and ra->size are left untouched.
+    for (int32_t slot = 0; slot < ra->size; ++slot)
+        container_free(ra->containers[slot], ra->typecodes[slot]);
+}
+
+void ra_reset(roaring_array_t *ra) {  // empties ra: containers and storage
+  ra_clear_containers(ra);  // free the containers themselves
+  ra->size = 0;             // no live entries remain
+  ra_shrink_to_fit(ra);     // with size 0 this reallocates down to capacity 0
+}
+
+void ra_clear_without_containers(roaring_array_t *ra) {
+    // keys and typecodes share the containers allocation: one free() covers all
+    free(ra->containers);
+    ra->containers = NULL;
+    ra->keys = NULL;
+    ra->typecodes = NULL;
+    ra->size = ra->allocation_size = 0;
+}
+
+void ra_clear(roaring_array_t *ra) {  // full teardown of ra
+    ra_clear_containers(ra);          // first the containers it references
+    ra_clear_without_containers(ra);  // then the arrays; ra is left empty
+}
+
+bool extend_array(roaring_array_t *ra, int32_t k) {
+    // Ensures there is room for k more entries; returns false if growing fails.
+    const int32_t needed = ra->size + k;
+    assert(needed <= MAX_CONTAINERS);
+    if (needed <= ra->allocation_size) return true;  // already large enough
+    // Growth policy: twice the need while size < 1024, 5/4 of it afterwards.
+    int32_t grown;
+    if (ra->size < 1024) {
+        grown = 2 * needed;
+    } else {
+        grown = 5 * needed / 4;
+    }
+    return realloc_array(ra, grown > MAX_CONTAINERS ? MAX_CONTAINERS : grown);
+}
+
+void ra_append(roaring_array_t *ra, uint16_t key, void *container,
+               uint8_t typecode) {
+    // Stores (key, container, typecode) as the new last entry of ra.
+    // NOTE(review): the result of extend_array is not checked here.
+    extend_array(ra, 1);
+    const int32_t last = ra->size++;
+    ra->typecodes[last] = typecode;
+    ra->containers[last] = container;
+    ra->keys[last] = key;
+}
+
+void ra_append_copy(roaring_array_t *ra, const roaring_array_t *sa,
+                    uint16_t index, bool copy_on_write) {
+    // Appends a copy of the entry at position index of sa to the end of ra.
+    extend_array(ra, 1);
+    const int32_t tail = ra->size;
+    void *appended;
+    if (!copy_on_write) {
+        appended = container_clone(sa->containers[index], sa->typecodes[index]);
+    } else {
+        // the same container ends up referenced by both sa and ra
+        sa->containers[index] = get_copy_of_container(
+            sa->containers[index], &sa->typecodes[index], copy_on_write);
+        appended = sa->containers[index];
+    }
+    // the slot at tail held no live data, so nothing has to be freed
+    ra->keys[tail] = sa->keys[index];
+    ra->containers[tail] = appended;
+    ra->typecodes[tail] = sa->typecodes[index];
+    ra->size++;
+}
+
+void ra_append_copies_until(roaring_array_t *ra, const roaring_array_t *sa,
+                            uint16_t stopping_key, bool copy_on_write) {
+    // Copies the leading entries of sa whose key is below stopping_key.
+    int32_t idx = 0;
+    while (idx < sa->size && sa->keys[idx] < stopping_key)
+        ra_append_copy(ra, sa, idx++, copy_on_write);
+}
+
+void ra_append_copy_range(roaring_array_t *ra, const roaring_array_t *sa,
+                          int32_t start_index, int32_t end_index,
+                          bool copy_on_write) {
+    // Appends copies of sa's entries in [start_index, end_index) to ra.
+    extend_array(ra, end_index - start_index);  // room for the whole range
+    for (int32_t src = start_index, dst = ra->size; src < end_index; ++src, ++dst) {
+        if (copy_on_write) {
+            // the same container ends up referenced by both sa and ra
+            sa->containers[src] = get_copy_of_container(
+                sa->containers[src], &sa->typecodes[src], copy_on_write);
+            ra->containers[dst] = sa->containers[src];
+        } else {
+            ra->containers[dst] =
+                container_clone(sa->containers[src], sa->typecodes[src]);
+        }
+        ra->keys[dst] = sa->keys[src];
+        ra->typecodes[dst] = sa->typecodes[src];
+        ra->size = dst + 1;
+    }
+}
+
+void ra_append_copies_after(roaring_array_t *ra, const roaring_array_t *sa,
+                            uint16_t before_start, bool copy_on_write) {
+    // Appends copies of the entries of sa located after key before_start.
+    const int found = ra_get_index(sa, before_start);
+    // found >= 0: the key exists, so start right after it. Otherwise the
+    // starting position is recovered from the negative result as -found - 1.
+    const int first = (found >= 0) ? found + 1 : -found - 1;
+    ra_append_copy_range(ra, sa, first, sa->size, copy_on_write);
+}
+
+/**
+ * Appends entries [start_index, end_index) of `sa` to the end of `ra` by
+ * copying the key, container pointer and typecode of each entry.
+ *
+ * No container is cloned and `sa` is left unmodified, so after the call
+ * both arrays hold the same container pointers for that range.
+ */
+void ra_append_move_range(roaring_array_t *ra, roaring_array_t *sa,
+                          int32_t start_index, int32_t end_index) {
+    extend_array(ra, end_index - start_index);
+
+    for (int32_t src = start_index; src < end_index; ++src) {
+        const int32_t dst = ra->size;
+
+        ra->containers[dst] = sa->containers[src];
+        ra->typecodes[dst] = sa->typecodes[src];
+        ra->keys[dst] = sa->keys[src];
+        ra->size = dst + 1;
+    }
+}
+
+/**
+ * Appends entries [start_index, end_index) of `sa` to the end of `ra`.
+ *
+ * The work is exactly what ra_append_copy_range does (reserve room, then
+ * per entry either share the container via get_copy_of_container when
+ * `copy_on_write` is set or clone it with container_clone), so this
+ * function forwards to it.
+ */
+void ra_append_range(roaring_array_t *ra, roaring_array_t *sa,
+                     int32_t start_index, int32_t end_index,
+                     bool copy_on_write) {
+    ra_append_copy_range(ra, sa, start_index, end_index, copy_on_write);
+}
+
+/**
+ * Looks up key `x` in `ra` with binarySearch.
+ *
+ * On a hit, stores the entry's typecode in `*typecode` and returns its
+ * container pointer. When binarySearch reports a negative index, returns
+ * NULL and leaves `*typecode` untouched.
+ */
+void *ra_get_container(roaring_array_t *ra, uint16_t x, uint8_t *typecode) {
+    const int pos = binarySearch(ra->keys, (int32_t)ra->size, x);
+    if (pos >= 0) {
+        *typecode = ra->typecodes[pos];
+        return ra->containers[pos];
+    }
+    return NULL;
+}
+
+extern void *ra_get_container_at_index(const roaring_array_t *ra, uint16_t i,
+                                       uint8_t *typecode);
+
+/**
+ * Looks up key `x` in `ra` and returns the result of
+ * get_writable_copy_if_shared for the matching container.
+ *
+ * `*typecode` is first set to the stored typecode and then passed by
+ * pointer to get_writable_copy_if_shared. Returns NULL, without touching
+ * `*typecode`, when the key is not found.
+ */
+void *ra_get_writable_container(roaring_array_t *ra, uint16_t x,
+                                uint8_t *typecode) {
+    const int pos = binarySearch(ra->keys, (int32_t)ra->size, x);
+    if (pos >= 0) {
+        *typecode = ra->typecodes[pos];
+        return get_writable_copy_if_shared(ra->containers[pos], typecode);
+    }
+    return NULL;
+}
+
+/**
+ * Index-based variant of ra_get_writable_container: asserts that `i` is
+ * below ra->size, stores the typecode of entry `i` in `*typecode`, and
+ * returns the result of get_writable_copy_if_shared for that entry's
+ * container.
+ */
+void *ra_get_writable_container_at_index(roaring_array_t *ra, uint16_t i,
+                                         uint8_t *typecode) {
+    assert(i < ra->size);
+    void *const stored = ra->containers[i];
+    *typecode = ra->typecodes[i];
+    return get_writable_copy_if_shared(stored, typecode);
+}
+
+/** Returns the key stored at position `i` of `ra`; `i` is not range-checked. */
+uint16_t ra_get_key_at_index(const roaring_array_t *ra, uint16_t i) {
+    const uint16_t *slot = ra->keys + i;
+    return *slot;
+}
+
+extern int32_t ra_get_index(const roaring_array_t *ra, uint16_t x);
+
+extern int32_t ra_advance_until(const roaring_array_t *ra, uint16_t x,
+                                int32_t pos);
+
+/**
+ * Starting at `pos`, walks forward over the entries of `ra` whose key is
+ * below `x`, releasing each skipped container with container_free.
+ *
+ * Returns the first position whose key is not below `x`, or ra->size when
+ * the end is reached. The keys, container pointers and size of `ra` are
+ * left as they were.
+ */
+int32_t ra_advance_until_freeing(roaring_array_t *ra, uint16_t x, int32_t pos) {
+    for (; pos < ra->size; ++pos) {
+        if (ra->keys[pos] >= x) break;
+        container_free(ra->containers[pos], ra->typecodes[pos]);
+    }
+    return pos;
+}
+
+/**
+ * Inserts (key, container, typecode) at position `i` of `ra`.
+ *
+ * extend_array is asked for one more slot, the entries from `i` to the end
+ * are shifted one position to the right in all three parallel arrays, and
+ * the new entry is written into the gap. The caller picks `i`; nothing here
+ * checks that the keys stay ordered.
+ */
+void ra_insert_new_key_value_at(roaring_array_t *ra, int32_t i, uint16_t key,
+                                void *container, uint8_t typecode) {
+    extend_array(ra, 1);
+    // Number of entries that sit at or after the insertion point.
+    const size_t tail = (size_t)(ra->size - i);
+    memmove(ra->containers + i + 1, ra->containers + i, sizeof(void *) * tail);
+    memmove(ra->typecodes + i + 1, ra->typecodes + i, sizeof(uint8_t) * tail);
+    memmove(ra->keys + i + 1, ra->keys + i, sizeof(uint16_t) * tail);
+    ra->containers[i] = container;
+    ra->typecodes[i] = typecode;
+    ra->keys[i] = key;
+    ra->size++;
+}
+
+// Note: the Java routine set the dropped slots to 0 to enable GC, and was
+// named "resize" even though it was only ever used to shrink. Growing is not
+// allowed here because it would break the convention that every slot below
+// ra->size holds a valid container.
+
+/**
+ * Sets ra->size to `new_length`, which is asserted not to exceed the
+ * current size. Containers in the dropped slots are not freed here.
+ */
+void ra_downsize(roaring_array_t *ra, int32_t new_length) {
+    assert(ra->size >= new_length);
+    ra->size = new_length;
+}
+
+/**
+ * Removes entry `i` from `ra` by shifting every later entry one position to
+ * the left in the three parallel arrays and decrementing the size.
+ *
+ * The container that was stored at `i` is not freed.
+ */
+void ra_remove_at_index(roaring_array_t *ra, int32_t i) {
+    // Entries strictly after position i.
+    const size_t tail = (size_t)(ra->size - i - 1);
+    memmove(ra->keys + i, ra->keys + i + 1, sizeof(uint16_t) * tail);
+    memmove(ra->typecodes + i, ra->typecodes + i + 1, sizeof(uint8_t) * tail);
+    memmove(ra->containers + i, ra->containers + i + 1, sizeof(void *) * tail);
+    ra->size -= 1;
+}
+
+/**
+ * Releases the container stored at position `i` with container_free, then
+ * removes the entry with ra_remove_at_index.
+ */
+void ra_remove_at_index_and_free(roaring_array_t *ra, int32_t i) {
+    void *const victim = ra->containers[i];
+    const uint8_t victim_type = ra->typecodes[i];
+    container_free(victim, victim_type);
+    ra_remove_at_index(ra, i);
+}
+
+/**
+ * Slides entries [begin, end) of `ra` left so that they start at
+ * `new_begin` (asserted to be below `begin`).
+ *
+ * Used by the in-place andNot only, to move the containers of the mutated
+ * bitmap that come after the largest container of the argument bitmap. The
+ * size is not adjusted, so a call to ra_downsize is expected to follow.
+ * Overwritten slots are not freed here: the caller must already have freed
+ * any overwritten container that is not kept elsewhere.
+ */
+void ra_copy_range(roaring_array_t *ra, uint32_t begin, uint32_t end,
+                   uint32_t new_begin) {
+    assert(begin <= end);
+    assert(new_begin < begin);
+
+    const int moved = end - begin;
+
+    memmove(ra->keys + new_begin, ra->keys + begin,
+            sizeof(uint16_t) * moved);
+    memmove(ra->typecodes + new_begin, ra->typecodes + begin,
+            sizeof(uint8_t) * moved);
+    memmove(ra->containers + new_begin, ra->containers + begin,
+            sizeof(void *) * moved);
+}
+
+/**
+ * Moves the last `count` entries of `ra` by `distance` positions and adds
+ * `distance` to the size.
+ *
+ * A positive distance first asks extend_array for that many extra slots. A
+ * zero or negative distance performs no capacity change.
+ */
+void ra_shift_tail(roaring_array_t *ra, int32_t count, int32_t distance) {
+    if (distance > 0) {
+        extend_array(ra, distance);
+    }
+    const int32_t from = ra->size - count;
+    const int32_t to = from + distance;
+    memmove(ra->containers + to, ra->containers + from,
+            sizeof(void *) * count);
+    memmove(ra->typecodes + to, ra->typecodes + from,
+            sizeof(uint8_t) * count);
+    memmove(ra->keys + to, ra->keys + from,
+            sizeof(uint16_t) * count);
+    ra->size += distance;
+}
+
+
+/**
+ * Returns the smaller of two size estimates for `ra`:
+ *  - a container-based total: 1 type byte, 4 length bytes, the
+ *    roaring_array_t struct, the per-entry key/pointer/typecode slots, and
+ *    for each container its container_serialization_len plus a 16-bit key;
+ *  - a plain array of 32-bit values: 1 type byte, a 32-bit count and 4 bytes
+ *    per value.
+ * The comparison between the two leaves the type byte of the second form
+ * out, exactly as the returned value then adds it back.
+ */
+size_t ra_size_in_bytes(roaring_array_t *ra) {
+    size_t total =
+        1 /* initial byte type */ + 4 /* tot_len */ + sizeof(roaring_array_t) +
+        ra->size * (sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t));
+    size_t values = 0;
+    for (int32_t k = 0; k < ra->size; k++) {
+        void *const c = ra->containers[k];
+        const uint8_t type = ra->typecodes[k];
+        total += (container_serialization_len(c, type) + sizeof(uint16_t));
+        values += container_get_cardinality(c, type);
+    }
+
+    const size_t as_plain_array = values * sizeof(uint32_t) + sizeof(uint32_t);
+    return (as_plain_array < total) ? (as_plain_array + 1) : total;
+}
+
+/**
+ * Writes every value of `ra` into `ans`, one container after another.
+ *
+ * Each container is expanded by container_to_uint32_array with its key
+ * shifted into the upper 16 bits as the base; the count that call returns
+ * advances the write position. `ans` must be large enough for all values;
+ * no bound is checked here.
+ */
+void ra_to_uint32_array(const roaring_array_t *ra, uint32_t *ans) {
+    uint32_t *out = ans;
+    for (int32_t k = 0; k < ra->size; ++k) {
+        const uint32_t base = ((uint32_t)ra->keys[k]) << 16;
+        out += container_to_uint32_array(out, ra->containers[k],
+                                         ra->typecodes[k], base);
+    }
+}
+
+/**
+ * Copies a window of the values of `ra` into `ans`: `limit` values starting
+ * at the `offset`-th value (0-based), in container order.
+ *
+ * Whole containers overlapping the window are expanded into a zero-filled
+ * scratch buffer, and `limit` entries are then copied from it to `ans`. When
+ * the array holds fewer than `offset + limit` values, the missing trailing
+ * entries of `ans` are therefore zero. When no container overlaps the window
+ * `ans` is left untouched.
+ *
+ * Returns false only when the scratch buffer cannot be allocated.
+ *
+ * `ra` is not modified: shared containers are unwrapped through a local copy
+ * of the typecode. container_unwrap_shared replaces the typecode it is given
+ * by pointer with the inner container's type, so handing it the stored
+ * ra->typecodes[i] would leave that slot describing the inner container while
+ * ra->containers[i] still points at the shared wrapper.
+ */
+bool ra_range_uint32_array(const roaring_array_t *ra, size_t offset, size_t limit, uint32_t *ans) {
+    size_t ctr = 0;          // values held by the containers visited so far
+    size_t dtr = 0;          // values expanded into the scratch buffer so far
+    size_t t_limit = 0;      // cardinality of the current container
+    bool first = false;      // set once the container holding `offset` is seen
+    size_t first_skip = 0;   // scratch values that precede `offset`
+    uint32_t *t_ans = NULL;  // scratch buffer
+    size_t cur_len = 0;      // capacity of t_ans, in values
+
+    for (int i = 0; i < ra->size; ++i) {
+        uint8_t typecode = ra->typecodes[i];
+        const void *container =
+            container_unwrap_shared(ra->containers[i], &typecode);
+        switch (typecode) {
+            case BITSET_CONTAINER_TYPE_CODE:
+                t_limit = ((const bitset_container_t *)container)->cardinality;
+                break;
+            case ARRAY_CONTAINER_TYPE_CODE:
+                t_limit = ((const array_container_t *)container)->cardinality;
+                break;
+            case RUN_CONTAINER_TYPE_CODE:
+                t_limit = run_container_cardinality((const run_container_t *)container);
+                break;
+        }
+        if (ctr + t_limit - 1 >= offset && ctr < offset + limit) {
+            if (!first) {
+                // First overlapping container: values before `offset` inside
+                // it are expanded too and skipped in the final copy.
+                first_skip = offset - ctr;
+                first = true;
+                cur_len = first_skip + limit;
+                t_ans = (uint32_t *)calloc(cur_len, sizeof(*t_ans));
+                if (t_ans == NULL) {
+                    return false;
+                }
+            }
+            if (dtr + t_limit > cur_len) {
+                // The whole container is expanded, so the buffer may need to
+                // grow past first_skip + limit.
+                const size_t new_len = cur_len + t_limit;
+                uint32_t *append_ans =
+                    (uint32_t *)calloc(new_len, sizeof(*append_ans));
+                if (append_ans == NULL) {
+                    free(t_ans);
+                    return false;
+                }
+                memcpy(append_ans, t_ans, dtr * sizeof(uint32_t));
+                free(t_ans);
+                t_ans = append_ans;
+                cur_len = new_len;
+            }
+            container_to_uint32_array(t_ans + dtr, container, typecode,
+                                      ((uint32_t)ra->keys[i]) << 16);
+            dtr += t_limit;
+        }
+        ctr += t_limit;
+        if (dtr - first_skip >= limit) break;
+    }
+    if (t_ans != NULL) {
+        memcpy(ans, t_ans + first_skip, limit * sizeof(uint32_t));
+        free(t_ans);
+    }
+    return true;
+}
+
+/**
+ * Returns true when get_container_type reports RUN_CONTAINER_TYPE_CODE for
+ * at least one entry of `ra`, false otherwise (including for an empty
+ * array).
+ */
+bool ra_has_run_container(const roaring_array_t *ra) {
+    int32_t k = 0;
+    while (k < ra->size) {
+        if (get_container_type(ra->containers[k], ra->typecodes[k]) ==
+            RUN_CONTAINER_TYPE_CODE) {
+            return true;
+        }
+        ++k;
+    }
+    return false;
+}
+
+/**
+ * Returns the size in bytes of the portable-format header of `ra`.
+ *
+ * Without run containers: 4 bytes of cookie, 4 bytes of size and 8 bytes
+ * per container. With run containers: 4 bytes of cookie (which also packs
+ * the size), one bit per container rounded up to whole bytes, and per
+ * container 4 bytes when the offsets are omitted (size below
+ * NO_OFFSET_THRESHOLD) or 8 bytes otherwise.
+ */
+uint32_t ra_portable_header_size(const roaring_array_t *ra) {
+    if (!ra_has_run_container(ra)) {
+        return 4 + 4 + 8 * ra->size;
+    }
+    // For small bitmaps the offsets are omitted.
+    const int32_t per_container = (ra->size < NO_OFFSET_THRESHOLD) ? 4 : 8;
+    return 4 + (ra->size + 7) / 8 + per_container * ra->size;
+}
+
+/**
+ * Returns the number of bytes of the portable serialization of `ra`: the
+ * header size from ra_portable_header_size plus container_size_in_bytes of
+ * every container.
+ */
+size_t ra_portable_size_in_bytes(const roaring_array_t *ra) {
+    size_t total = ra_portable_header_size(ra);
+
+    int32_t k = 0;
+    while (k < ra->size) {
+        total += container_size_in_bytes(ra->containers[k], ra->typecodes[k]);
+        ++k;
+    }
+    return total;
+}
+
+/**
+ * Writes `ra` to `buf` in the portable format and returns the number of
+ * bytes written.
+ *
+ * Layout: cookie (with the container count packed in its upper 16 bits when
+ * run containers are present, followed by a one-bit-per-container run
+ * bitmap; otherwise followed by a 32-bit container count), then a
+ * (key, cardinality - 1) pair of 16-bit values per container, then the
+ * 32-bit container offsets (omitted when run containers are present and the
+ * count is below NO_OFFSET_THRESHOLD), then the containers themselves.
+ *
+ * No bound is checked on `buf`; presumably the caller sizes it with
+ * ra_portable_size_in_bytes (confirm at call sites).
+ *
+ * The run bitmap is assembled directly inside `buf`, so the function makes
+ * no allocation and cannot fail on memory exhaustion.
+ */
+size_t ra_portable_serialize(const roaring_array_t *ra, char *buf) {
+    char *initbuf = buf;
+    uint32_t startOffset = 0;
+    bool hasrun = ra_has_run_container(ra);
+    if (hasrun) {
+        // Shift as unsigned: ra->size - 1 may reach 65535, which does not
+        // fit a signed 32-bit value once moved into the upper half.
+        uint32_t cookie = SERIAL_COOKIE | ((uint32_t)(ra->size - 1) << 16);
+        memcpy(buf, &cookie, sizeof(cookie));
+        buf += sizeof(cookie);
+        uint32_t s = (ra->size + 7) / 8;
+        uint8_t *bitmapOfRunContainers = (uint8_t *)buf;
+        memset(bitmapOfRunContainers, 0, s);
+        for (int32_t i = 0; i < ra->size; ++i) {
+            if (get_container_type(ra->containers[i], ra->typecodes[i]) ==
+                RUN_CONTAINER_TYPE_CODE) {
+                bitmapOfRunContainers[i / 8] |= (1 << (i % 8));
+            }
+        }
+        buf += s;
+        if (ra->size < NO_OFFSET_THRESHOLD) {
+            startOffset = 4 + 4 * ra->size + s;
+        } else {
+            startOffset = 4 + 8 * ra->size + s;
+        }
+    } else {  // backwards compatibility
+        uint32_t cookie = SERIAL_COOKIE_NO_RUNCONTAINER;
+
+        memcpy(buf, &cookie, sizeof(cookie));
+        buf += sizeof(cookie);
+        memcpy(buf, &ra->size, sizeof(ra->size));
+        buf += sizeof(ra->size);
+
+        startOffset = 4 + 4 + 4 * ra->size + 4 * ra->size;
+    }
+    for (int32_t k = 0; k < ra->size; ++k) {
+        memcpy(buf, &ra->keys[k], sizeof(ra->keys[k]));
+        buf += sizeof(ra->keys[k]);
+        // get_cardinality returns a value in [1,1<<16], subtracting one
+        // we get [0,1<<16 - 1] which fits in 16 bits
+        uint16_t card = (uint16_t)(
+            container_get_cardinality(ra->containers[k], ra->typecodes[k]) - 1);
+        memcpy(buf, &card, sizeof(card));
+        buf += sizeof(card);
+    }
+    if ((!hasrun) || (ra->size >= NO_OFFSET_THRESHOLD)) {
+        // writing the containers offsets
+        for (int32_t k = 0; k < ra->size; k++) {
+            memcpy(buf, &startOffset, sizeof(startOffset));
+            buf += sizeof(startOffset);
+            startOffset =
+                startOffset +
+                container_size_in_bytes(ra->containers[k], ra->typecodes[k]);
+        }
+    }
+    for (int32_t k = 0; k < ra->size; ++k) {
+        buf += container_write(ra->containers[k], ra->typecodes[k], buf);
+    }
+    return buf - initbuf;
+}
+
+// Quickly checks whether there is a serialized bitmap at the pointer,
+// not exceeding size "maxbytes" in bytes. This function does not allocate
+// memory dynamically.
+//
+// This function returns 0 if and only if no valid bitmap is found.
+// Otherwise, it returns how many bytes are occupied.
+//
+// Only the framing is validated (cookie, container count, and that every
+// section fits within maxbytes); container contents are not inspected.
+// A container count that is negative or above 1<<16 is rejected up front,
+// before it is used in any size computation.
+//
+size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) {
+    size_t bytestotal = sizeof(int32_t);// for cookie
+    if(bytestotal > maxbytes) return 0;
+    uint32_t cookie;
+    memcpy(&cookie, buf, sizeof(int32_t));
+    buf += sizeof(uint32_t);
+    if ((cookie & 0xFFFF) != SERIAL_COOKIE &&
+        cookie != SERIAL_COOKIE_NO_RUNCONTAINER) {
+        return 0;
+    }
+    int32_t size;
+
+    if ((cookie & 0xFFFF) == SERIAL_COOKIE)
+        size = (cookie >> 16) + 1;
+    else {
+        bytestotal += sizeof(int32_t);
+        if(bytestotal > maxbytes) return 0;
+        memcpy(&size, buf, sizeof(int32_t));
+        buf += sizeof(uint32_t);
+    }
+    // The count comes straight from the input when there is no run cookie,
+    // so it can be any 32-bit pattern, including a negative value.
+    if (size < 0 || size > (1<<16)) {
+       return 0; // logically impossible
+    }
+    const char *bitmapOfRunContainers = NULL;
+    bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
+    if (hasrun) {
+        int32_t s = (size + 7) / 8;
+        bytestotal += s;
+        if(bytestotal > maxbytes) return 0;
+        bitmapOfRunContainers = buf;
+        buf += s;
+    }
+    bytestotal += size * 2 * sizeof(uint16_t);
+    if(bytestotal > maxbytes) return 0;
+    const uint16_t *keyscards = (const uint16_t *)buf;
+    buf += size * 2 * sizeof(uint16_t);
+    if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) {
+        // skipping the offsets
+        bytestotal += size * 4;
+        if(bytestotal > maxbytes) return 0;
+        buf += size * 4;
+    }
+    // Reading the containers
+    for (int32_t k = 0; k < size; ++k) {
+        uint16_t tmp;
+        // the stored value is cardinality - 1
+        memcpy(&tmp, keyscards + 2*k+1, sizeof(tmp));
+        uint32_t thiscard = tmp + 1;
+        bool isbitmap = (thiscard > DEFAULT_MAX_SIZE);
+        bool isrun = false;
+        if(hasrun) {
+          if((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) {
+            isbitmap = false;
+            isrun = true;
+          }
+        }
+        if (isbitmap) {
+            size_t containersize = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+            bytestotal += containersize;
+            if(bytestotal > maxbytes) return 0;
+            buf += containersize;
+        } else if (isrun) {
+            // the run count must be readable before it is trusted
+            bytestotal += sizeof(uint16_t);
+            if(bytestotal > maxbytes) return 0;
+            uint16_t n_runs;
+            memcpy(&n_runs, buf, sizeof(uint16_t));
+            buf += sizeof(uint16_t);
+            size_t containersize = n_runs * sizeof(rle16_t);
+            bytestotal += containersize;
+            if(bytestotal > maxbytes) return 0;
+            buf += containersize;
+        } else {
+            size_t containersize = thiscard * sizeof(uint16_t);
+            bytestotal += containersize;
+            if(bytestotal > maxbytes) return 0;
+            buf += containersize;
+        }
+    }
+    return bytestotal;
+}
+
+
+// This function populates answer from the content of buf (reading up to maxbytes bytes).
+// The function returns false if a properly serialized bitmap cannot be found; a
+// diagnostic is then written to stderr, and if answer had already been initialized
+// it is released with ra_clear before returning.
+// If it returns true, readbytes is populated by how many bytes were read, we have that *readbytes <= maxbytes.
+// A container count that is negative or above 1<<16 is treated as corrupted data and
+// rejected before any memory is allocated for answer.
+bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const size_t maxbytes, size_t * readbytes) {
+    *readbytes = sizeof(int32_t);// for cookie
+    if(*readbytes > maxbytes) {
+      fprintf(stderr, "Ran out of bytes while reading first 4 bytes.\n");
+      return false;
+    }
+    uint32_t cookie;
+    memcpy(&cookie, buf, sizeof(int32_t));
+    buf += sizeof(uint32_t);
+    if ((cookie & 0xFFFF) != SERIAL_COOKIE &&
+        cookie != SERIAL_COOKIE_NO_RUNCONTAINER) {
+        fprintf(stderr, "I failed to find one of the right cookies. Found %" PRIu32 "\n",
+                cookie);
+        return false;
+    }
+    int32_t size;
+
+    if ((cookie & 0xFFFF) == SERIAL_COOKIE)
+        size = (cookie >> 16) + 1;
+    else {
+        *readbytes += sizeof(int32_t);
+        if(*readbytes > maxbytes) {
+          fprintf(stderr, "Ran out of bytes while reading second part of the cookie.\n");
+          return false;
+        }
+        memcpy(&size, buf, sizeof(int32_t));
+        buf += sizeof(uint32_t);
+    }
+    // Without the run cookie the count is read verbatim from the input, so a
+    // negative value is possible and must not reach the size arithmetic or
+    // ra_init_with_capacity below.
+    if (size < 0 || size > (1<<16)) {
+       fprintf(stderr, "You cannot have so many containers, the data must be corrupted: %" PRId32 "\n",
+                size);
+       return false; // logically impossible
+    }
+    const char *bitmapOfRunContainers = NULL;
+    bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
+    if (hasrun) {
+        int32_t s = (size + 7) / 8;
+        *readbytes += s;
+        if(*readbytes > maxbytes) {// data is corrupted?
+          fprintf(stderr, "Ran out of bytes while reading run bitmap.\n");
+          return false;
+        }
+        bitmapOfRunContainers = buf;
+        buf += s;
+    }
+    const uint16_t *keyscards = (const uint16_t *)buf;
+
+    *readbytes += size * 2 * sizeof(uint16_t);
+    if(*readbytes > maxbytes) {
+      fprintf(stderr, "Ran out of bytes while reading key-cardinality array.\n");
+      return false;
+    }
+    buf += size * 2 * sizeof(uint16_t);
+
+    bool is_ok = ra_init_with_capacity(answer, size);
+    if (!is_ok) {
+        fprintf(stderr, "Failed to allocate memory for roaring array. Bailing out.\n");
+        return false;
+    }
+
+    // keys sit at the even positions of the key/cardinality array
+    for (int32_t k = 0; k < size; ++k) {
+        uint16_t tmp;
+        memcpy(&tmp, keyscards + 2*k, sizeof(tmp));
+        answer->keys[k] = tmp;
+    }
+    if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) {
+        *readbytes += size * 4;
+        if(*readbytes > maxbytes) {// data is corrupted?
+          fprintf(stderr, "Ran out of bytes while reading offsets.\n");
+          ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
+          return false;
+        }
+
+        // skipping the offsets
+        buf += size * 4;
+    }
+    // Reading the containers
+    for (int32_t k = 0; k < size; ++k) {
+        uint16_t tmp;
+        // the stored value is cardinality - 1
+        memcpy(&tmp, keyscards + 2*k+1, sizeof(tmp));
+        uint32_t thiscard = tmp + 1;
+        bool isbitmap = (thiscard > DEFAULT_MAX_SIZE);
+        bool isrun = false;
+        if(hasrun) {
+          if((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) {
+            isbitmap = false;
+            isrun = true;
+          }
+        }
+        if (isbitmap) {
+            // we check that the read is allowed
+            size_t containersize = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+            *readbytes += containersize;
+            if(*readbytes > maxbytes) {
+              fprintf(stderr, "Running out of bytes while reading a bitset container.\n");
+              ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
+              return false;
+            }
+            // it is now safe to read
+            bitset_container_t *c = bitset_container_create();
+            if(c == NULL) {// memory allocation failure
+              fprintf(stderr, "Failed to allocate memory for a bitset container.\n");
+              ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
+              return false;
+            }
+            answer->size++;
+            buf += bitset_container_read(thiscard, c, buf);
+            answer->containers[k] = c;
+            answer->typecodes[k] = BITSET_CONTAINER_TYPE_CODE;
+        } else if (isrun) {
+            // we check that the read is allowed
+            *readbytes += sizeof(uint16_t);
+            if(*readbytes > maxbytes) {
+              fprintf(stderr, "Running out of bytes while reading a run container (header).\n");
+              ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
+              return false;
+            }
+            uint16_t n_runs;
+            memcpy(&n_runs, buf, sizeof(uint16_t));
+            size_t containersize = n_runs * sizeof(rle16_t);
+            *readbytes += containersize;
+            if(*readbytes > maxbytes) {// data is corrupted?
+              fprintf(stderr, "Running out of bytes while reading a run container.\n");
+              ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
+              return false;
+            }
+            // it is now safe to read
+
+            run_container_t *c = run_container_create();
+            if(c == NULL) {// memory allocation failure
+              fprintf(stderr, "Failed to allocate memory for a run container.\n");
+              ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
+              return false;
+            }
+            answer->size++;
+            buf += run_container_read(thiscard, c, buf);
+            answer->containers[k] = c;
+            answer->typecodes[k] = RUN_CONTAINER_TYPE_CODE;
+        } else {
+            // we check that the read is allowed
+            size_t containersize = thiscard * sizeof(uint16_t);
+            *readbytes += containersize;
+            if(*readbytes > maxbytes) {// data is corrupted?
+              fprintf(stderr, "Running out of bytes while reading an array container.\n");
+              ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
+              return false;
+            }
+            // it is now safe to read
+            array_container_t *c =
+                array_container_create_given_capacity(thiscard);
+            if(c == NULL) {// memory allocation failure
+              fprintf(stderr, "Failed to allocate memory for an array container.\n");
+              ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
+              return false;
+            }
+            answer->size++;
+            buf += array_container_read(thiscard, c, buf);
+            answer->containers[k] = c;
+            answer->typecodes[k] = ARRAY_CONTAINER_TYPE_CODE;
+        }
+    }
+    return true;
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/roaring_array.c */
+/* begin file /opt/bitmap/CRoaring-0.2.57/src/roaring_priority_queue.c */
+
+// One entry of the priority queue used by roaring_bitmap_or_many_heap.
+struct roaring_pq_element_s {
+    // Ordering key of the heap; filled from
+    // roaring_bitmap_portable_size_in_bytes(bitmap) by the code in this file.
+    uint64_t size;
+    // false for the caller-provided inputs stored by create_pq, true for
+    // intermediate results created by roaring_bitmap_or_many_heap.
+    bool is_temporary;
+    // The bitmap this entry stands for.
+    roaring_bitmap_t *bitmap;
+};
+
+typedef struct roaring_pq_element_s roaring_pq_element_t;
+
+// Binary min-heap of roaring_pq_element_t, ordered by the `size` field
+// (see compare, pq_add and percolate_down).
+struct roaring_pq_s {
+    // Heap storage; allocated by create_pq and released by pq_free.
+    roaring_pq_element_t *elements;
+    // Number of entries currently in the heap.
+    uint64_t size;
+};
+
+typedef struct roaring_pq_s roaring_pq_t;
+
+/** Heap ordering predicate: true when `t1` has a strictly smaller size than `t2`. */
+static inline bool compare(roaring_pq_element_t *t1, roaring_pq_element_t *t2) {
+    return t2->size > t1->size;
+}
+
+/**
+ * Inserts a copy of `*t` into the heap.
+ *
+ * The element is placed in the first free slot and then sifted up: each
+ * parent that compares greater than `t` is pulled down one level until the
+ * final position is found. The storage is assumed to have room for one
+ * more element; no capacity is checked or grown here.
+ */
+static void pq_add(roaring_pq_t *pq, roaring_pq_element_t *t) {
+    uint64_t slot = pq->size;
+    pq->elements[slot] = *t;
+    pq->size = slot + 1;
+    while (slot > 0) {
+        const uint64_t parent = (slot - 1) >> 1;
+        roaring_pq_element_t above = pq->elements[parent];
+        if (!compare(t, &above)) break;
+        pq->elements[slot] = above;
+        slot = parent;
+    }
+    pq->elements[slot] = *t;
+}
+
+/**
+ * Releases the element storage of `pq` and then `pq` itself. The bitmaps
+ * referenced by the elements are not freed.
+ */
+static void pq_free(roaring_pq_t *pq) {
+    roaring_pq_element_t *storage = pq->elements;
+    pq->elements = NULL;  // paranoid
+    free(storage);
+    free(pq);
+}
+
+/**
+ * Sifts the element at index `i` down the heap.
+ *
+ * While `i` has at least one child, the smaller child (the left one on a
+ * tie) is compared with the element being moved; if it is strictly smaller
+ * it is pulled up and the walk continues from its slot. The moved element
+ * is written once, at its final position.
+ */
+static void percolate_down(roaring_pq_t *pq, uint32_t i) {
+    const uint32_t count = (uint32_t)pq->size;
+    const uint32_t first_leaf = count >> 1;
+    roaring_pq_element_t moving = pq->elements[i];
+    while (i < first_leaf) {
+        uint32_t child = (i << 1) + 1;
+        const uint32_t right = child + 1;
+        if (right < count &&
+            compare(&pq->elements[right], &pq->elements[child])) {
+            child = right;
+        }
+        roaring_pq_element_t smallest = pq->elements[child];
+        if (!compare(&smallest, &moving)) {
+            break;
+        }
+        pq->elements[i] = smallest;
+        i = child;
+    }
+    pq->elements[i] = moving;
+}
+
+/**
+ * Builds a heap over the `length` bitmaps of `arr`.
+ *
+ * Every element references its input bitmap (not a copy), is marked
+ * non-temporary, and is keyed by roaring_bitmap_portable_size_in_bytes.
+ * The heap property is then established bottom-up with percolate_down.
+ *
+ * Returns NULL when either allocation fails; nothing is leaked in that
+ * case. The result is released with pq_free.
+ */
+static roaring_pq_t *create_pq(const roaring_bitmap_t **arr, uint32_t length) {
+    roaring_pq_t *answer = (roaring_pq_t *)malloc(sizeof(roaring_pq_t));
+    if (answer == NULL) {
+        return NULL;
+    }
+    answer->elements =
+        (roaring_pq_element_t *)malloc(sizeof(roaring_pq_element_t) * length);
+    if (answer->elements == NULL) {
+        free(answer);
+        return NULL;
+    }
+    answer->size = length;
+    for (uint32_t i = 0; i < length; i++) {
+        answer->elements[i].bitmap = (roaring_bitmap_t *)arr[i];
+        answer->elements[i].is_temporary = false;
+        answer->elements[i].size =
+            roaring_bitmap_portable_size_in_bytes(arr[i]);
+    }
+    for (int32_t i = (length >> 1); i >= 0; i--) {
+        percolate_down(answer, i);
+    }
+    return answer;
+}
+
+/**
+ * Removes and returns the root (smallest) element of the heap.
+ *
+ * When more than one element is present, the last element takes the root
+ * slot and is sifted down with percolate_down; otherwise only the size is
+ * decremented. The heap is expected to be non-empty.
+ */
+static roaring_pq_element_t pq_poll(roaring_pq_t *pq) {
+    roaring_pq_element_t top = pq->elements[0];
+    const bool refill = pq->size > 1;
+    pq->size--;
+    if (refill) {
+        pq->elements[0] = pq->elements[pq->size];
+        percolate_down(pq, 0);
+    }
+    return top;
+}
+
+// Computes the lazy union of x1 and x2 and returns it.
+//
+// This function consumes and frees the inputs: containers are moved (not
+// cloned) into the result or merged in place, and both input bitmap
+// structs are released before returning. If one input has no containers it
+// is freed and the *other input itself* is returned, so the result is not
+// always a fresh bitmap.
+//
+// The containers are combined with container_lazy_ior, so the result is
+// expected to be fixed up later (roaring_bitmap_or_many_heap calls
+// roaring_bitmap_repair_after_lazy on the final bitmap).
+static roaring_bitmap_t *lazy_or_from_lazy_inputs(roaring_bitmap_t *x1,
+                                                  roaring_bitmap_t *x2) {
+    uint8_t container_result_type = 0;
+    const int length1 = ra_get_size(&x1->high_low_container),
+              length2 = ra_get_size(&x2->high_low_container);
+    if (0 == length1) {
+        roaring_bitmap_free(x1);
+        return x2;
+    }
+    if (0 == length2) {
+        roaring_bitmap_free(x2);
+        return x1;
+    }
+    // initial capacity: the smaller of the two container counts
+    uint32_t neededcap = length1 > length2 ? length2 : length1;
+    roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(neededcap);
+    int pos1 = 0, pos2 = 0;
+    uint8_t container_type_1, container_type_2;
+    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+    // merge of the two key sequences; leaves as soon as one side is exhausted
+    while (true) {
+        if (s1 == s2) {
+            // todo: unsharing can be inefficient as it may create a clone where
+            // none
+            // is needed, but it has the benefit of being easy to reason about.
+            ra_unshare_container_at_index(&x1->high_low_container, pos1);
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            assert(container_type_1 != SHARED_CONTAINER_TYPE_CODE);
+            ra_unshare_container_at_index(&x2->high_low_container, pos2);
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            assert(container_type_2 != SHARED_CONTAINER_TYPE_CODE);
+            void *c;
+
+            // when only c2 is a bitset, it is used as the in-place
+            // destination; otherwise c1 is
+            if ((container_type_2 == BITSET_CONTAINER_TYPE_CODE) &&
+                (container_type_1 != BITSET_CONTAINER_TYPE_CODE)) {
+                c = container_lazy_ior(c2, container_type_2, c1,
+                                       container_type_1,
+                                       &container_result_type);
+                container_free(c1, container_type_1);
+                // the destination is freed only if the result is a
+                // different container
+                if (c != c2) {
+                    container_free(c2, container_type_2);
+                }
+            } else {
+                c = container_lazy_ior(c1, container_type_1, c2,
+                                       container_type_2,
+                                       &container_result_type);
+                container_free(c2, container_type_2);
+                if (c != c1) {
+                    container_free(c1, container_type_1);
+                }
+            }
+            // since we assume that the initial containers are non-empty, the
+            // result here
+            // can only be non-empty
+            ra_append(&answer->high_low_container, s1, c,
+                      container_result_type);
+            ++pos1;
+            ++pos2;
+            if (pos1 == length1) break;
+            if (pos2 == length2) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+
+        } else if (s1 < s2) {  // s1 < s2
+            // key only in x1: the container pointer is handed over as is
+            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
+                                                 &container_type_1);
+            ra_append(&answer->high_low_container, s1, c1, container_type_1);
+            pos1++;
+            if (pos1 == length1) break;
+            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
+
+        } else {  // s1 > s2
+            // key only in x2: the container pointer is handed over as is
+            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
+                                                 &container_type_2);
+            ra_append(&answer->high_low_container, s2, c2, container_type_2);
+            pos2++;
+            if (pos2 == length2) break;
+            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
+        }
+    }
+    // hand over whatever remains on the side that was not exhausted
+    if (pos1 == length1) {
+        ra_append_move_range(&answer->high_low_container,
+                             &x2->high_low_container, pos2, length2);
+    } else if (pos2 == length2) {
+        ra_append_move_range(&answer->high_low_container,
+                             &x1->high_low_container, pos1, length1);
+    }
+    // every input container was either moved into answer or freed above, so
+    // only the array storage and the bitmap structs are released here
+    ra_clear_without_containers(&x1->high_low_container);
+    ra_clear_without_containers(&x2->high_low_container);
+    free(x1);
+    free(x2);
+    return answer;
+}
+
+/**
+ * Compute the union of 'number' bitmaps using a heap. This can
+ * sometimes be faster than roaring_bitmap_or_many which uses
+ * a naive algorithm. Caller is responsible for freeing the
+ * result.
+ *
+ * The two smallest bitmaps (by portable serialized size) are repeatedly
+ * taken from the heap, combined with a lazy union and pushed back, until a
+ * single bitmap remains; that bitmap is then fixed up with
+ * roaring_bitmap_repair_after_lazy. The input bitmaps are never modified:
+ * only intermediate results (marked temporary) are updated in place or
+ * consumed.
+ *
+ * Returns NULL if the heap cannot be allocated.
+ */
+roaring_bitmap_t *roaring_bitmap_or_many_heap(uint32_t number,
+                                              const roaring_bitmap_t **x) {
+    if (number == 0) {
+        return roaring_bitmap_create();
+    }
+    if (number == 1) {
+        return roaring_bitmap_copy(x[0]);
+    }
+    roaring_pq_t *pq = create_pq(x, number);
+    if (pq == NULL) {
+        return NULL;
+    }
+    while (pq->size > 1) {
+        roaring_pq_element_t x1 = pq_poll(pq);
+        roaring_pq_element_t x2 = pq_poll(pq);
+
+        if (x1.is_temporary && x2.is_temporary) {
+            // Both operands are consumed. The result is normally a fresh
+            // bitmap, but in degenerate cases it is x1.bitmap or x2.bitmap
+            // itself; either way it is owned by the heap, hence temporary.
+            roaring_bitmap_t *newb =
+                lazy_or_from_lazy_inputs(x1.bitmap, x2.bitmap);
+            uint64_t bsize = roaring_bitmap_portable_size_in_bytes(newb);
+            roaring_pq_element_t newelement = {
+                .size = bsize, .is_temporary = true, .bitmap = newb};
+            pq_add(pq, &newelement);
+        } else if (x2.is_temporary) {
+            // accumulate the caller's bitmap into the temporary one
+            roaring_bitmap_lazy_or_inplace(x2.bitmap, x1.bitmap, false);
+            x2.size = roaring_bitmap_portable_size_in_bytes(x2.bitmap);
+            pq_add(pq, &x2);
+        } else if (x1.is_temporary) {
+            roaring_bitmap_lazy_or_inplace(x1.bitmap, x2.bitmap, false);
+            x1.size = roaring_bitmap_portable_size_in_bytes(x1.bitmap);
+
+            pq_add(pq, &x1);
+        } else {
+            // two caller-owned inputs: the union goes into a new bitmap
+            roaring_bitmap_t *newb =
+                roaring_bitmap_lazy_or(x1.bitmap, x2.bitmap, false);
+            uint64_t bsize = roaring_bitmap_portable_size_in_bytes(newb);
+            roaring_pq_element_t newelement = {
+                .size = bsize, .is_temporary = true, .bitmap = newb};
+
+            pq_add(pq, &newelement);
+        }
+    }
+    roaring_pq_element_t X = pq_poll(pq);
+    roaring_bitmap_t *answer = X.bitmap;
+    roaring_bitmap_repair_after_lazy(answer);
+    pq_free(pq);
+    return answer;
+}
+/* end file /opt/bitmap/CRoaring-0.2.57/src/roaring_priority_queue.c */
diff --git a/contrib/croaring/roaring.h b/contrib/croaring/roaring.h
new file mode 100644
index 00000000000..6583188c56e
--- /dev/null
+++ b/contrib/croaring/roaring.h
@@ -0,0 +1,7166 @@
+/* auto-generated on Tue Dec 18 09:42:59 CST 2018. Do not edit! */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/roaring_version.h */
+// /include/roaring/roaring_version.h automatically generated by release.py, do not change by hand 
+#ifndef ROARING_INCLUDE_ROARING_VERSION
+#define ROARING_INCLUDE_ROARING_VERSION
+#define ROARING_VERSION "0.2.57"  // string literal; the former "= 0.2.57," expansion was not a usable expression
+enum {
+    ROARING_VERSION_MAJOR = 0,     // numeric components, must agree with ROARING_VERSION
+    ROARING_VERSION_MINOR = 2,
+    ROARING_VERSION_REVISION = 57
+};
+#endif // ROARING_INCLUDE_ROARING_VERSION
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/roaring_version.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/portability.h */
+/*
+ * portability.h
+ *
+ */
+
+#ifndef INCLUDE_PORTABILITY_H_
+#define INCLUDE_PORTABILITY_H_
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE  // feature-test macro; must be set before the libc headers included below
+#endif
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS 1  // presumably for the <inttypes.h> format macros in C++ builds - confirm
+#endif
+
+#if !(defined(_POSIX_C_SOURCE)) || (_POSIX_C_SOURCE < 200809L)
+#define _POSIX_C_SOURCE 200809L  // the <stdlib.h> include below relies on this to expose posix_memalign
+#endif
+#if !(defined(_XOPEN_SOURCE)) || (_XOPEN_SOURCE < 700)
+#define _XOPEN_SOURCE 700  // raised to at least 700 when absent or lower
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>  // will provide posix_memalign with _POSIX_C_SOURCE as defined above
+#if !(defined(__APPLE__)) && !(defined(__FreeBSD__))
+#include <malloc.h>  // this should never be needed but there are some reports that it is needed.
+#endif
+
+
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(_WIN64)
+#pragma message( \
+    "You appear to be attempting a 32-bit build under Visual Studio. We recommend a 64-bit build instead.")
+#endif  // 32-bit MSVC build: advisory message only, the build continues
+
+#if defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ != 8
+#error This code assumes  64-bit long longs (by use of the GCC intrinsics). Your system is not currently supported.
+#endif  // hard stop when long long is not 8 bytes wide
+
+#if defined(_MSC_VER)
+#define __restrict__ __restrict  // lets the GCC-style __restrict__ spelling compile under MSVC
+#endif
+
+#ifndef DISABLE_X64  // some users may want to compile as if they did not have
+                     // an x64 processor
+
+///////////////////////
+/// We support X64 hardware in the following manner:
+///
+/// if IS_X64 is defined then we have at least SSE and SSE2
+/// (All Intel processors sold in the recent past have at least SSE and SSE2 support,
+/// going back to the Pentium 4.)
+///
+/// if USESSE4 is defined then we assume at least SSE4.2, SSE4.1,
+///                   SSSE3, SSE3... + IS_X64
+/// if USEAVX is defined, then we assume AVX2, AVX + USESSE4
+///
+/// So if you have hardware that supports AVX but not AVX2, then "USEAVX"
+/// won't be enabled.
+/// If you have hardware that supports SSE4.1, but not SSE4.2, then USESSE4
+/// won't be defined.
+//////////////////////
+
+// unless DISABLEAVX was defined, if we have __AVX2__, we enable AVX
+#if (!defined(USEAVX)) && (!defined(DISABLEAVX)) && (defined(__AVX2__))
+#define USEAVX
+#endif
+
+// if we have both __POPCNT__ and __SSE4_2__, we enable SSE4
+#if (defined(__POPCNT__)) && (defined(__SSE4_2__))
+#define USESSE4
+#endif
+
+#if defined(USEAVX) || defined(__x86_64__) || defined(_M_X64)
+// we have an x64 processor
+#define IS_X64
+// we include the intrinsic header
+#ifndef _MSC_VER
+/* Non-Microsoft C/C++-compatible compiler */
+#include <x86intrin.h>  // on some recent GCC, this will declare posix_memalign
+#endif  // !_MSC_VER
+#endif  // USEAVX || __x86_64__ || _M_X64
+
+#ifndef _MSC_VER
+/* Non-Microsoft C/C++-compatible compiler, assumes that it supports inline
+ * assembly */
+#define ROARING_INLINE_ASM
+#endif  // !_MSC_VER
+
+#ifdef USEAVX
+#define USESSE4             // if we have AVX, then we have SSE4
+#define USE_BMI             // we assume that AVX2 and BMI go hand and hand
+#define USEAVX2FORDECODING  // optimization
+// vector operations should work on not just AVX
+#define ROARING_VECTOR_OPERATIONS_ENABLED  // vector unions (optimization)
+#endif  // USEAVX
+
+#endif  // DISABLE_X64
+
+#ifdef _MSC_VER
+/* Microsoft C/C++-compatible compiler */
+#include <intrin.h>
+
+#ifndef __clang__  // if one compiles with MSVC *with* clang, then these
+                   // intrinsics are defined!!!
+// sadly there is no way to check whether we are missing these intrinsics
+// specifically.
+
+/* wrappers for Visual Studio built-ins that look like gcc built-ins */
+/* count trailing zeros; result might be undefined when input_num is zero */
+static inline int __builtin_ctzll(unsigned long long input_num) {
+    unsigned long index;
+#ifdef _WIN64  // highly recommended!!!
+    _BitScanForward64(&index, input_num);
+#else  // if we must support 32-bit Windows
+    if ((uint32_t)input_num != 0) {  // the low 32 bits contain a set bit
+        _BitScanForward(&index, (uint32_t)input_num);
+    } else {  // otherwise scan the high 32 bits and offset the result
+        _BitScanForward(&index, (uint32_t)(input_num >> 32));
+        index += 32;
+    }
+#endif
+    return index;
+}
+
+/* count leading zeros; result might be undefined when input_num is zero */
+static inline int __builtin_clzll(unsigned long long input_num) {
+    unsigned long index;
+#ifdef _WIN64  // highly recommended!!!
+    _BitScanReverse64(&index, input_num);
+#else  // if we must support 32-bit Windows
+    if (input_num > 0xFFFFFFFF) {  // a bit is set in the high 32 bits
+        _BitScanReverse(&index, (uint32_t)(input_num >> 32));
+        index += 32;
+    } else {
+        _BitScanReverse(&index, (uint32_t)(input_num));
+    }
+#endif
+    return 63 - index;  // index of the highest set bit -> number of zero bits above it
+}
+
+/* population count: number of bits set in input_num */
+#ifdef USESSE4
+/* POPCNT support was added to processors around the release of SSE4.2 */
+/* USESSE4 flag guarantees POPCNT support */
+static inline int __builtin_popcountll(unsigned long long input_num) {
+#ifdef _WIN64  // highly recommended!!!
+	return (int)__popcnt64(input_num);
+#else  // if we must support 32-bit Windows
+	return (int)(__popcnt((uint32_t)input_num) +
+		__popcnt((uint32_t)(input_num >> 32)));
+#endif
+}
+#else
+/* software implementation avoids POPCNT */
+static inline int __builtin_popcountll(unsigned long long input_num) {
+	const uint64_t m1 = 0x5555555555555555; //binary: 0101...
+	const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
+	const uint64_t m4 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
+	const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3...
+
+	input_num -= (input_num >> 1) & m1;  // every 2-bit field now holds its own bit count
+	input_num = (input_num & m2) + ((input_num >> 2) & m2);  // counts per 4-bit field
+	input_num = (input_num + (input_num >> 4)) & m4;  // counts per byte
+	return (input_num * h01) >> 56;  // the multiply sums all byte counts into the top byte
+}
+#endif
+
+/* Use #define so this is effective even under /Ob0 (no inline) */
+#define __builtin_unreachable() __assume(0)
+#endif  // !__clang__
+
+#endif  // _MSC_VER
+
+// without the following, we get lots of warnings about posix_memalign
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif  //__cplusplus // C++ does not have a well defined signature
+// Allocates size bytes aligned to alignment; release the block with aligned_free.
+// portable version of posix_memalign; returns NULL when posix_memalign reports an error
+static inline void *aligned_malloc(size_t alignment, size_t size) {
+    void *p;
+#ifdef _MSC_VER
+    p = _aligned_malloc(size, alignment);  // note the (size, alignment) argument order
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+    p = __mingw_aligned_malloc(size, alignment);
+#else
+    // somehow, if this is used before including "x86intrin.h", it creates an
+    // implicit defined warning.
+    if (posix_memalign(&p, alignment, size) != 0) return NULL;
+#endif
+    return p;
+}
+
+static inline void aligned_free(void *memblock) {  // releases a block obtained from aligned_malloc
+#ifdef _MSC_VER
+    _aligned_free(memblock);  // matches _aligned_malloc
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+    __mingw_aligned_free(memblock);  // matches __mingw_aligned_malloc
+#else
+    free(memblock);  // posix_memalign memory is released with plain free
+#endif
+}
+
+#if defined(_MSC_VER)
+#define ALIGNED(x) __declspec(align(x))  // alignment specifier, MSVC spelling
+#else
+#if defined(__GNUC__)
+#define ALIGNED(x) __attribute__((aligned(x)))  // GCC-compatible spelling
+#endif  // note: ALIGNED stays undefined for any other compiler
+#endif
+
+#ifdef __GNUC__
+#define WARN_UNUSED __attribute__((warn_unused_result))
+#else
+#define WARN_UNUSED  // expands to nothing when the attribute is unavailable
+#endif
+// Reads the bytes 0x00,0xff as a uint16_t: 0x00ff (< 0x100) on big-endian hosts, 0xff00 on little-endian ones.
+#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
+
+static inline int hamming(uint64_t x) {  // number of bits set in x (population count)
+#ifdef USESSE4
+    return (int) _mm_popcnt_u64(x);
+#else
+    // compiler builtin; under Visual Studio without clang this resolves to the
+    // __builtin_popcountll wrapper defined earlier in this header
+    return __builtin_popcountll(x);
+#endif
+}
+
+#ifndef UINT64_C  // fallback used only when <stdint.h> did not provide the macro
+#define UINT64_C(c) (c##ULL)
+#endif
+
+#ifndef UINT32_C  // fallback used only when <stdint.h> did not provide the macro
+#define UINT32_C(c) (c##UL)
+#endif
+
+#endif /* INCLUDE_PORTABILITY_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/portability.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/perfparameters.h */
+#ifndef PERFPARAMETERS_H_
+#define PERFPARAMETERS_H_
+
+#include <stdbool.h>
+
+/**
+During lazy computations, we can transform array containers into bitset
+containers as long as we can expect them to have ARRAY_LAZY_LOWERBOUND
+values.
+*/
+enum { ARRAY_LAZY_LOWERBOUND = 1024 };
+
+/* default initial size of a run container;
+   setting it to zero delays the malloc. */
+enum { RUN_DEFAULT_INIT_SIZE = 0 };
+
+/* default initial size of an array container;
+   setting it to zero delays the malloc. */
+enum { ARRAY_DEFAULT_INIT_SIZE = 0 };
+
+/* automatic bitset conversion during lazy or (define beforehand to override) */
+#ifndef LAZY_OR_BITSET_CONVERSION
+#define LAZY_OR_BITSET_CONVERSION true
+#endif
+
+/* automatically attempt to convert a bitset to a full run during lazy
+ * evaluation (define beforehand to override) */
+#ifndef LAZY_OR_BITSET_CONVERSION_TO_FULL
+#define LAZY_OR_BITSET_CONVERSION_TO_FULL true
+#endif
+
+/* automatically attempt to convert a bitset to a full run (define beforehand to override) */
+#ifndef OR_BITSET_CONVERSION_TO_FULL
+#define OR_BITSET_CONVERSION_TO_FULL true
+#endif
+
+#endif  // PERFPARAMETERS_H_
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/perfparameters.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/array_util.h */
+#ifndef ARRAY_UTIL_H
+#define ARRAY_UTIL_H
+
+#include <stddef.h>  // for size_t
+#include <stdint.h>
+
+
+/*
+ *  Classic binary search over a sorted array of 16-bit values.
+ *  The array must already be sorted; the cost is logarithmic in lenarray.
+ *  Let x be the returned value:
+ *     x >= 0 : ikey was found and array[x] == ikey
+ *     x <  0 : ikey is absent; -x-1 is the index at which ikey would have
+ *              to be inserted for the array to stay sorted.
+ */
+inline int32_t binarySearch(const uint16_t *array, int32_t lenarray,
+                            uint16_t ikey) {
+    int32_t first = 0;             // lowest index still under consideration
+    int32_t last = lenarray - 1;   // highest index still under consideration
+    while (first <= last) {
+        const int32_t probe = (first + last) >> 1;
+        const uint16_t seen = array[probe];
+        if (seen == ikey) return probe;
+        if (seen < ikey) {
+            first = probe + 1;
+        } else {
+            last = probe - 1;
+        }
+    }
+    // not found: first is the insertion point, reported as -(first + 1)
+    return -(first + 1);
+}
+
+/**
+ * Galloping (exponential) search strictly after index pos.
+ * The array must be sorted; the cost is logarithmic in the distance covered.
+ * Returns the smallest index x > pos with array[x] >= min. When every value
+ *    after pos is smaller than min the result is length; when pos + 1 is
+ *    already >= length, pos + 1 itself is returned.
+ */
+static inline int32_t advanceUntil(const uint16_t *array, int32_t pos,
+                                   int32_t length, uint16_t min) {
+    int32_t lo = pos + 1;  // first candidate index
+
+    // fast exit: nothing left to scan, or the very next value already fits
+    if (lo >= length) return lo;
+    if (array[lo] >= min) return lo;
+
+    // gallop: double the stride until we reach a value >= min or run off
+    // the end of the array
+    int32_t stride = 1;
+    for (; lo + stride < length; stride <<= 1) {
+        if (array[lo + stride] >= min) break;
+    }
+
+    // far end of the bracket, clamped to the last valid index
+    int32_t hi = lo + stride;
+    if (hi >= length) hi = length - 1;
+
+    const uint16_t edge = array[hi];
+    if (edge == min) {
+        return hi;
+    }
+    if (edge < min) {
+        // even the last value examined is below min: no match at all
+        return length;
+    }
+
+    // the previous (half-size) stride still pointed at a value below min
+    lo += stride >> 1;
+
+    // plain bisection, keeping array[lo] < min < array[hi]
+    while (hi - lo != 1) {
+        const int32_t mid = (lo + hi) >> 1;
+        const uint16_t here = array[mid];
+        if (here == min) return mid;
+        if (here < min) {
+            lo = mid;
+        } else {
+            hi = mid;
+        }
+    }
+    return hi;
+}
+
+/**
+ * Counts the array entries that are strictly smaller than ikey.
+ * The array must be sorted and free of duplicates.
+ */
+static inline int32_t count_less(const uint16_t *array, int32_t lenarray,
+                                 uint16_t ikey) {
+    const int32_t hit = (lenarray == 0) ? -1 : binarySearch(array, lenarray, ikey);
+    if (hit < 0) return -(hit + 1);  // absent: decode the insertion point
+    return hit;                      // present: its index equals the number of smaller values
+}
+
+/**
+ * Counts the array entries that are strictly greater than ikey.
+ * The array must be sorted and free of duplicates.
+ */
+static inline int32_t count_greater(const uint16_t *array, int32_t lenarray,
+                                    uint16_t ikey) {
+    if (lenarray == 0) {
+        return 0;
+    }
+    const int32_t hit = binarySearch(array, lenarray, ikey);
+    // number of entries that are <= ikey: the match and everything before
+    // it when found, otherwise everything before the insertion point
+    const int32_t not_greater = (hit >= 0) ? hit + 1 : -hit - 1;
+    return lenarray - not_greater;
+}
+
+/**
+ * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
+ * Optimized by D. Lemire on May 3rd 2013
+ *
+ * C should have capacity greater than the minimum of s_a and s_b + 8
+ * where 8 is sizeof(__m128i)/sizeof(uint16_t).
+ */
+int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
+                           const uint16_t *__restrict__ B, size_t s_b,
+                           uint16_t *C);
+
+/**
+ * Compute the cardinality of the intersection using SSE4 instructions
+ */
+int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A,
+                                       size_t s_a,
+                                       const uint16_t *__restrict__ B,
+                                       size_t s_b);
+
+/* Computes the intersection between one small and one large set of uint16_t.
+ * Stores the result into buffer and returns the number of elements. */
+int32_t intersect_skewed_uint16(const uint16_t *smallarray, size_t size_s,
+                                const uint16_t *largearray, size_t size_l,
+                                uint16_t *buffer);
+
+/* Computes the size of the intersection between one small and one large set of
+ * uint16_t. */
+int32_t intersect_skewed_uint16_cardinality(const uint16_t *smallarray,
+                                            size_t size_s,
+                                            const uint16_t *largearray,
+                                            size_t size_l);
+
+
+/* Checks whether the intersection between one small and one large set of uint16_t is non-empty. */
+bool intersect_skewed_uint16_nonempty(const uint16_t *smallarray, size_t size_s,
+                                const uint16_t *largearray, size_t size_l);
+/**
+ * Generic intersection function for two sets of uint16_t (A and B, lengths lenA and lenB).
+ */
+int32_t intersect_uint16(const uint16_t *A, const size_t lenA,
+                         const uint16_t *B, const size_t lenB, uint16_t *out);
+/**
+ * Compute the size of the intersection (generic).
+ */
+int32_t intersect_uint16_cardinality(const uint16_t *A, const size_t lenA,
+                                     const uint16_t *B, const size_t lenB);
+
+/**
+ * Checks whether the intersection is non-empty (generic).
+ */
+bool intersect_uint16_nonempty(const uint16_t *A, const size_t lenA,
+                         const uint16_t *B, const size_t lenB);
+/**
+ * Generic union function for two sets of uint16_t.
+ */
+size_t union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
+                    size_t size_2, uint16_t *buffer);
+
+/**
+ * Generic XOR function for two sets of uint16_t.
+ */
+int32_t xor_uint16(const uint16_t *array_1, int32_t card_1,
+                   const uint16_t *array_2, int32_t card_2, uint16_t *out);
+
+/**
+ * Generic difference function (ANDNOT) for two sets of uint16_t.
+ */
+int difference_uint16(const uint16_t *a1, int length1, const uint16_t *a2,
+                      int length2, uint16_t *a_out);
+
+/**
+ * Generic intersection function for two sets of uint32_t.
+ */
+size_t intersection_uint32(const uint32_t *A, const size_t lenA,
+                           const uint32_t *B, const size_t lenB, uint32_t *out);
+
+/**
+ * Generic intersection function for uint32_t sets, returns just the cardinality.
+ */
+size_t intersection_uint32_card(const uint32_t *A, const size_t lenA,
+                                const uint32_t *B, const size_t lenB);
+
+/**
+ * Generic union function for two sets of uint32_t.
+ */
+size_t union_uint32(const uint32_t *set_1, size_t size_1, const uint32_t *set_2,
+                    size_t size_2, uint32_t *buffer);
+
+/**
+ * A fast SSE-based union function.
+ */
+uint32_t union_vector16(const uint16_t *__restrict__ set_1, uint32_t size_1,
+                        const uint16_t *__restrict__ set_2, uint32_t size_2,
+                        uint16_t *__restrict__ buffer);
+/**
+ * A fast SSE-based XOR function.
+ */
+uint32_t xor_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
+                      const uint16_t *__restrict__ array2, uint32_t length2,
+                      uint16_t *__restrict__ output);
+
+/**
+ * A fast SSE-based difference function.
+ */
+int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
+                            const uint16_t *__restrict__ B, size_t s_b,
+                            uint16_t *C);
+
+/**
+ * Generic union function for uint32_t sets, returns just the cardinality.
+ */
+size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
+                         const uint32_t *set_2, size_t size_2);
+
+/**
+ * Combines union_uint16 and union_vector16 optimally.
+ */
+size_t fast_union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
+                    size_t size_2, uint16_t *buffer);
+
+
+#endif
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/array_util.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/roaring_types.h */
+/*
+  Typedefs used by various components
+*/
+
+#ifndef ROARING_TYPES_H
+#define ROARING_TYPES_H
+// Callback types: each call receives one value (32- or 64-bit) plus a caller-supplied pointer.
+typedef bool (*roaring_iterator)(uint32_t value, void *param);  // NOTE(review): meaning of the bool result is not visible here - confirm
+typedef bool (*roaring_iterator64)(uint64_t value, void *param);
+
+/**
+*  (For advanced users.)
+* The roaring_statistics_t can be used to collect detailed statistics about
+* the composition of a roaring bitmap.
+*/
+typedef struct roaring_statistics_s {
+    uint32_t n_containers; /* number of containers */
+
+    uint32_t n_array_containers;  /* number of array containers */
+    uint32_t n_run_containers;    /* number of run containers */
+    uint32_t n_bitset_containers; /* number of bitmap containers */
+
+    uint32_t
+        n_values_array_containers;    /* number of values in array containers */
+    uint32_t n_values_run_containers; /* number of values in run containers */
+    uint32_t
+        n_values_bitset_containers; /* number of values in  bitmap containers */
+
+    uint32_t n_bytes_array_containers;  /* number of allocated bytes in array
+                                           containers */
+    uint32_t n_bytes_run_containers;    /* number of allocated bytes in run
+                                           containers */
+    uint32_t n_bytes_bitset_containers; /* number of allocated bytes in  bitmap
+                                           containers */
+
+    uint32_t
+        max_value; /* the maximal value, undefined if cardinality is zero */
+    uint32_t
+        min_value; /* the minimal value, undefined if cardinality is zero */
+    uint64_t sum_value; /* the sum of all values (could be used to compute
+                           average) */
+
+    uint64_t cardinality; /* total number of values stored in the bitmap */
+
+    // and n_values_arrays, n_values_rle, n_values_bitmap
+} roaring_statistics_t;
+
+#endif /* ROARING_TYPES_H */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/roaring_types.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/utilasm.h */
+/*
+ * utilasm.h
+ *
+ */
+
+#ifndef INCLUDE_UTILASM_H_
+#define INCLUDE_UTILASM_H_
+
+
+#if defined(USE_BMI) & defined(ROARING_INLINE_ASM)  // bitwise & of two 0/1 operands, same result as && here
+#define ASMBITMANIPOPTIMIZATION  // optimization flag
+// destReg = srcReg >> bitsReg, using shrx
+#define ASM_SHIFT_RIGHT(srcReg, bitsReg, destReg) \
+    __asm volatile("shrx %1, %2, %0"              \
+                   : "=r"(destReg)                \
+                   :             /* write */      \
+                   "r"(bitsReg), /* read only */  \
+                   "r"(srcReg)   /* read only */  \
+                   )
+// srcReg >>= bitsReg, using shrx
+#define ASM_INPLACESHIFT_RIGHT(srcReg, bitsReg)  \
+    __asm volatile("shrx %1, %0, %0"             \
+                   : "+r"(srcReg)                \
+                   :            /* read/write */ \
+                   "r"(bitsReg) /* read only */  \
+                   )
+// destReg = srcReg << bitsReg, using shlx
+#define ASM_SHIFT_LEFT(srcReg, bitsReg, destReg) \
+    __asm volatile("shlx %1, %2, %0"             \
+                   : "=r"(destReg)               \
+                   :             /* write */     \
+                   "r"(bitsReg), /* read only */ \
+                   "r"(srcReg)   /* read only */ \
+                   )
+// set bit at position testBit within testByte to 1 and
+// increment count if that bit was previously clear
+#define ASM_SET_BIT_INC_WAS_CLEAR(testByte, testBit, count) \
+    __asm volatile(                                         \
+        "bts %2, %0\n"                                      \
+        "sbb $-1, %1\n"                                     \
+        : "+r"(testByte), /* read/write */                  \
+          "+r"(count)                                       \
+        :            /* read/write */                       \
+        "r"(testBit) /* read only */                        \
+        )
+// clear bit at position testBit within testByte and decrement count if that bit was previously set
+#define ASM_CLEAR_BIT_DEC_WAS_SET(testByte, testBit, count) \
+    __asm volatile(                                         \
+        "btr %2, %0\n"                                      \
+        "sbb $0, %1\n"                                      \
+        : "+r"(testByte), /* read/write */                  \
+          "+r"(count)                                       \
+        :            /* read/write */                       \
+        "r"(testBit) /* read only */                        \
+        )
+// test bit testBit of testByte: count becomes all ones (-1) when it is set, 0 otherwise
+#define ASM_BT64(testByte, testBit, count) \
+    __asm volatile(                        \
+        "bt %2,%1\n"                       \
+        "sbb %0,%0" /*could use setb */    \
+        : "=r"(count)                      \
+        :              /* write */         \
+        "r"(testByte), /* read only */     \
+        "r"(testBit)   /* read only */     \
+        )
+
+#endif  // USE_BMI
+#endif  /* INCLUDE_UTILASM_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/utilasm.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/bitset_util.h */
+#ifndef BITSET_UTIL_H
+#define BITSET_UTIL_H
+
+#include <stdint.h>
+
+
+/*
+ * Turn on every bit whose index lies in [start,end); does nothing when start == end.
+ */
+static inline void bitset_set_range(uint64_t *bitmap, uint32_t start,
+                                    uint32_t end) {
+    if (start == end) return;  // empty range
+    const uint64_t head = (~UINT64_C(0)) << (start % 64);       // bits at or above start in its word
+    const uint64_t tail = (~UINT64_C(0)) >> ((~end + 1) % 64);  // bits below end in the last word
+    const uint32_t w0 = start / 64, w1 = (end - 1) / 64;
+    if (w0 == w1) {
+        bitmap[w0] |= head & tail;  // the whole range sits inside one word
+    } else {
+        bitmap[w0] |= head;
+        for (uint32_t w = w0 + 1; w < w1; w++) bitmap[w] = ~UINT64_C(0);
+        bitmap[w1] |= tail;
+    }
+}
+
+
+/*
+ * Count the set bits with indexes in [start,start+lenminusone] (both ends inclusive).
+ */
+static inline int bitset_lenrange_cardinality(uint64_t *bitmap, uint32_t start,
+                                              uint32_t lenminusone) {
+    uint32_t firstword = start / 64;
+    uint32_t endword = (start + lenminusone) / 64;
+    if (firstword == endword) {  // range sits inside one word
+        return hamming(bitmap[firstword] &
+                       ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))  // lenminusone+1 low bits ...
+                           << (start % 64));                          // ... moved up to start
+    }
+    int answer = hamming(bitmap[firstword] & ((~UINT64_C(0)) << (start % 64)));  // first word: bits at or above start
+    for (uint32_t i = firstword + 1; i < endword; i++) {
+        answer += hamming(bitmap[i]);  // full words in between
+    }
+    answer +=
+        hamming(bitmap[endword] &
+                (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64));  // last word: bits up to the final index
+    return answer;
+}
+
+/*
+ * Check whether no bit is set in [start,start+lenminusone] (both ends inclusive).
+ */
+static inline bool bitset_lenrange_empty(uint64_t *bitmap, uint32_t start,
+        uint32_t lenminusone) {
+    uint32_t firstword = start / 64;
+    uint32_t endword = (start + lenminusone) / 64;
+    if (firstword == endword) {  // range sits inside one word
+      return (bitmap[firstword] & ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
+              << (start % 64)) == 0;
+    }
+    if(((bitmap[firstword] & ((~UINT64_C(0)) << (start%64)))) != 0) return false;  // first word: bits at or above start
+    for (uint32_t i = firstword + 1; i < endword; i++) {
+     if(bitmap[i] != 0) return false;  // full words in between
+    }
+    if((bitmap[endword] & (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64)) != 0) return false;  // last word: bits up to the final index
+    return true;
+}
+
+
+/*
+ * Set all bits in indexes [start,start+lenminusone] (both ends inclusive) to true.
+ */
+static inline void bitset_set_lenrange(uint64_t *bitmap, uint32_t start,
+                                       uint32_t lenminusone) {
+    uint32_t firstword = start / 64;
+    uint32_t endword = (start + lenminusone) / 64;
+    if (firstword == endword) {  // range sits inside one word
+        bitmap[firstword] |= ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
+                             << (start % 64);
+        return;
+    }
+    uint64_t temp = bitmap[endword];  // saved: the two-words-per-step loop below may overwrite bitmap[endword]
+    bitmap[firstword] |= (~UINT64_C(0)) << (start % 64);
+    for (uint32_t i = firstword + 1; i < endword; i += 2)  // i + 1 can reach endword, never beyond it
+        bitmap[i] = bitmap[i + 1] = ~UINT64_C(0);
+    bitmap[endword] =
+        temp | (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64);  // rebuilt from the saved word
+}
+
+/*
+ * Flip all the bits in indexes [start,end); does nothing when start == end.
+ */
+static inline void bitset_flip_range(uint64_t *bitmap, uint32_t start,
+                                     uint32_t end) {
+    if (start == end) return;
+    uint32_t firstword = start / 64;
+    uint32_t endword = (end - 1) / 64;
+    bitmap[firstword] ^= ~((~UINT64_C(0)) << (start % 64));  // pre-flip the bits below start; a later flip undoes it
+    for (uint32_t i = firstword; i < endword; i++) bitmap[i] = ~bitmap[i];  // complement every word before endword, firstword included
+    bitmap[endword] ^= ((~UINT64_C(0)) >> ((~end + 1) % 64));  // last word: flip the bits below end (all 64 when end % 64 == 0)
+}
+
+/*
+ * Turn off every bit whose index lies in [start,end); does nothing when start == end.
+ */
+static inline void bitset_reset_range(uint64_t *bitmap, uint32_t start,
+                                      uint32_t end) {
+    if (start == end) return;  // empty range
+    const uint64_t head = (~UINT64_C(0)) << (start % 64);       // bits at or above start in its word
+    const uint64_t tail = (~UINT64_C(0)) >> ((~end + 1) % 64);  // bits below end in the last word
+    const uint32_t w0 = start / 64, w1 = (end - 1) / 64;
+    if (w0 == w1) {
+        bitmap[w0] &= ~(head & tail);  // the whole range sits inside one word
+    } else {
+        bitmap[w0] &= ~head;
+        for (uint32_t w = w0 + 1; w < w1; w++) bitmap[w] = UINT64_C(0);
+        bitmap[w1] &= ~tail;
+    }
+}
+
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "vout", values start at "base".
+ *
+ * The "vout" buffer should be large enough to store the actual number of bits
+ * set.
+ *
+ * Returns how many values were actually decoded.
+ *
+ * This function should only be expected to be faster than
+ * bitset_extract_setbits
+ * when the density of the bitset is high.
+ *
+ * This function uses AVX2 decoding.
+ */
+size_t bitset_extract_setbits_avx2(uint64_t *bitset, size_t length, void *vout,
+                                   size_t outcapacity, uint32_t base);
+
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "vout", values start at "base".
+ *
+ * The "vout" buffer should be large enough to store the actual number of bits
+ * set.
+ *
+ * Returns how many values were actually decoded.
+ */
+size_t bitset_extract_setbits(uint64_t *bitset, size_t length, void *vout,
+                              uint32_t base);
+
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "out" as 16-bit integers, values start at "base" (can
+ * be set to zero)
+ *
+ * The "out" buffer should be large enough to store the actual number of bits
+ * set.
+ *
+ * Returns how many values were actually decoded.
+ *
+ * This function should only be expected to be faster than
+ * bitset_extract_setbits_uint16
+ * when the density of the bitset is high.
+ *
+ * This function uses SSE decoding.
+ */
+size_t bitset_extract_setbits_sse_uint16(const uint64_t *bitset, size_t length,
+                                         uint16_t *out, size_t outcapacity,
+                                         uint16_t base);
+
+/*
+ * Given a bitset containing "length" 64-bit words, write out the position
+ * of all the set bits to "out",  values start at "base"
+ * (can be set to zero)
+ *
+ * The "out" buffer should be large enough to store the actual number of bits
+ * set.
+ *
+ * Returns how many values were actually decoded.
+ */
+size_t bitset_extract_setbits_uint16(const uint64_t *bitset, size_t length,
+                                     uint16_t *out, uint16_t base);
+
+/*
+ * Given two bitsets containing "length" 64-bit words, write out the position
+ * of all the common set bits to "out", values start at "base"
+ * (can be set to zero)
+ *
+ * The "out" buffer should be large enough to store the actual number of bits
+ * set.
+ *
+ * Returns how many values were actually decoded.
+ */
+size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__ bitset1,
+                                                  const uint64_t * __restrict__ bitset2,
+                                                  size_t length, uint16_t *out,
+                                                  uint16_t base);
+
+/*
+ * Given a bitset having cardinality card, set all bit values in the list (there
+ * are length of them)
+ * and return the updated cardinality. This evidently assumes that the bitset
+ * already contained data.
+ */
+uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
+                                  const uint16_t *list, uint64_t length);
+/*
+ * Given a bitset, set all bit values in the list (there
+ * are length of them). No cardinality is returned.
+ */
+void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length);
+
+/*
+ * Given a bitset having cardinality card, unset all bit values in the list
+ * (there are length of them)
+ * and return the updated cardinality. This evidently assumes that the bitset
+ * already contained data.
+ */
+uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
+                           uint64_t length);
+
+/*
+ * Given a bitset having cardinality card, toggle all bit values in the list
+ * (there are length of them)
+ * and return the updated cardinality. This evidently assumes that the bitset
+ * already contained data.
+ */
+
+uint64_t bitset_flip_list_withcard(void *bitset, uint64_t card,
+                                   const uint16_t *list, uint64_t length);
+// Variant without a cardinality result; presumably the same toggling as above - confirm in the implementation.
+void bitset_flip_list(void *bitset, const uint16_t *list, uint64_t length);
+
+#ifdef USEAVX
+/***
+ * BEGIN Harley-Seal popcount functions.
+ */
+
+/**
+ * Compute the population count of a 256-bit word, as one count per 64-bit lane.
+ * This is not especially fast, but it is convenient as part of other functions.
+ */
+static inline __m256i popcount256(__m256i v) {
+    const __m256i lookuppos = _mm256_setr_epi8(  // table indexed by nibble value: 4 + popcount(nibble)
+        /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
+        /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
+        /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
+        /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4,
+
+        /* 0 */ 4 + 0, /* 1 */ 4 + 1, /* 2 */ 4 + 1, /* 3 */ 4 + 2,
+        /* 4 */ 4 + 1, /* 5 */ 4 + 2, /* 6 */ 4 + 2, /* 7 */ 4 + 3,
+        /* 8 */ 4 + 1, /* 9 */ 4 + 2, /* a */ 4 + 2, /* b */ 4 + 3,
+        /* c */ 4 + 2, /* d */ 4 + 3, /* e */ 4 + 3, /* f */ 4 + 4);
+    const __m256i lookupneg = _mm256_setr_epi8(  // table indexed by nibble value: 4 - popcount(nibble)
+        /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
+        /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
+        /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
+        /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4,
+
+        /* 0 */ 4 - 0, /* 1 */ 4 - 1, /* 2 */ 4 - 1, /* 3 */ 4 - 2,
+        /* 4 */ 4 - 1, /* 5 */ 4 - 2, /* 6 */ 4 - 2, /* 7 */ 4 - 3,
+        /* 8 */ 4 - 1, /* 9 */ 4 - 2, /* a */ 4 - 2, /* b */ 4 - 3,
+        /* c */ 4 - 2, /* d */ 4 - 3, /* e */ 4 - 3, /* f */ 4 - 4);
+    const __m256i low_mask = _mm256_set1_epi8(0x0f);  // keeps the low nibble of every byte
+
+    const __m256i lo = _mm256_and_si256(v, low_mask);  // low nibble of each byte
+    const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);  // high nibble of each byte
+    const __m256i popcnt1 = _mm256_shuffle_epi8(lookuppos, lo);  // per byte: 4 + popcount(low nibble)
+    const __m256i popcnt2 = _mm256_shuffle_epi8(lookupneg, hi);  // per byte: 4 - popcount(high nibble)
+    return _mm256_sad_epu8(popcnt1, popcnt2);  // |popcnt1 - popcnt2| is the byte's popcount; summed per 8 bytes
+}
+
+/** Bitwise carry-save adder over 256 bits (a full adder at every bit position):
+ *  *l receives the sum bits a^b^c, *h the carry bits (majority of a, b, c). */
+static inline void CSA(__m256i *h, __m256i *l, __m256i a, __m256i b,
+                       __m256i c) {
+    const __m256i a_xor_b = _mm256_xor_si256(a, b);
+    const __m256i carry_from_c = _mm256_and_si256(a_xor_b, c);
+    *h = _mm256_or_si256(_mm256_and_si256(a, b), carry_from_c);
+    *l = _mm256_xor_si256(a_xor_b, c);
+}
+
+/**
+ * Harley-Seal AVX2 popcount: number of set bits in `size` 256-bit words at data
+ */
+inline static uint64_t avx2_harley_seal_popcount256(const __m256i *data,
+                                                    const uint64_t size) {
+    __m256i total = _mm256_setzero_si256();
+    __m256i ones = _mm256_setzero_si256();
+    __m256i twos = _mm256_setzero_si256();
+    __m256i fours = _mm256_setzero_si256();
+    __m256i eights = _mm256_setzero_si256();
+    __m256i sixteens = _mm256_setzero_si256();
+    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
+
+    const uint64_t limit = size - size % 16;  // size rounded down to a multiple of 16
+    uint64_t i = 0;
+
+    for (; i < limit; i += 16) {  // consume 16 words per iteration
+        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i),
+            _mm256_lddqu_si256(data + i + 1));
+        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 2),
+            _mm256_lddqu_si256(data + i + 3));
+        CSA(&foursA, &twos, twos, twosA, twosB);
+        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 4),
+            _mm256_lddqu_si256(data + i + 5));
+        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 6),
+            _mm256_lddqu_si256(data + i + 7));
+        CSA(&foursB, &twos, twos, twosA, twosB);
+        CSA(&eightsA, &fours, fours, foursA, foursB);
+        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 8),
+            _mm256_lddqu_si256(data + i + 9));
+        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 10),
+            _mm256_lddqu_si256(data + i + 11));
+        CSA(&foursA, &twos, twos, twosA, twosB);
+        CSA(&twosA, &ones, ones, _mm256_lddqu_si256(data + i + 12),
+            _mm256_lddqu_si256(data + i + 13));
+        CSA(&twosB, &ones, ones, _mm256_lddqu_si256(data + i + 14),
+            _mm256_lddqu_si256(data + i + 15));
+        CSA(&foursB, &twos, twos, twosA, twosB);
+        CSA(&eightsB, &fours, fours, foursA, foursB);
+        CSA(&sixteens, &eights, eights, eightsA, eightsB);
+
+        total = _mm256_add_epi64(total, popcount256(sixteens));  // only the weight-16 carries are counted per round
+    }
+
+    total = _mm256_slli_epi64(total, 4);  // * 16
+    total = _mm256_add_epi64(
+        total, _mm256_slli_epi64(popcount256(eights), 3));  // += 8 * ...
+    total = _mm256_add_epi64(
+        total, _mm256_slli_epi64(popcount256(fours), 2));  // += 4 * ...
+    total = _mm256_add_epi64(
+        total, _mm256_slli_epi64(popcount256(twos), 1));  // += 2 * ...
+    total = _mm256_add_epi64(total, popcount256(ones));
+    for (; i < size; i++)  // remaining (size % 16) words, one at a time
+        total =
+            _mm256_add_epi64(total, popcount256(_mm256_lddqu_si256(data + i)));
+
+    return (uint64_t)(_mm256_extract_epi64(total, 0)) +  // add up the four 64-bit lanes
+           (uint64_t)(_mm256_extract_epi64(total, 1)) +
+           (uint64_t)(_mm256_extract_epi64(total, 2)) +
+           (uint64_t)(_mm256_extract_epi64(total, 3));
+}
+
+#define AVXPOPCNTFNC(opname, avx_intrinsic) /* popcount of data1 OP data2 */    \
+    static inline uint64_t avx2_harley_seal_popcount256_##opname(              \
+        const __m256i *data1, const __m256i *data2, const uint64_t size) {     \
+        __m256i total = _mm256_setzero_si256();                                \
+        __m256i ones = _mm256_setzero_si256();                                 \
+        __m256i twos = _mm256_setzero_si256();                                 \
+        __m256i fours = _mm256_setzero_si256();                                \
+        __m256i eights = _mm256_setzero_si256();                               \
+        __m256i sixteens = _mm256_setzero_si256();                             \
+        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;                \
+        __m256i A1, A2;                                                        \
+        const uint64_t limit = size - size % 16;                               \
+        uint64_t i = 0;                                                        \
+        for (; i < limit; i += 16) { /* 16 words per round */                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),                  \
+                               _mm256_lddqu_si256(data2 + i));                 \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1),              \
+                               _mm256_lddqu_si256(data2 + i + 1));             \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2),              \
+                               _mm256_lddqu_si256(data2 + i + 2));             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3),              \
+                               _mm256_lddqu_si256(data2 + i + 3));             \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursA, &twos, twos, twosA, twosB);                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4),              \
+                               _mm256_lddqu_si256(data2 + i + 4));             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5),              \
+                               _mm256_lddqu_si256(data2 + i + 5));             \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6),              \
+                               _mm256_lddqu_si256(data2 + i + 6));             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7),              \
+                               _mm256_lddqu_si256(data2 + i + 7));             \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursB, &twos, twos, twosA, twosB);                           \
+            CSA(&eightsA, &fours, fours, foursA, foursB);                      \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8),              \
+                               _mm256_lddqu_si256(data2 + i + 8));             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9),              \
+                               _mm256_lddqu_si256(data2 + i + 9));             \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10),             \
+                               _mm256_lddqu_si256(data2 + i + 10));            \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11),             \
+                               _mm256_lddqu_si256(data2 + i + 11));            \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursA, &twos, twos, twosA, twosB);                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12),             \
+                               _mm256_lddqu_si256(data2 + i + 12));            \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13),             \
+                               _mm256_lddqu_si256(data2 + i + 13));            \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14),             \
+                               _mm256_lddqu_si256(data2 + i + 14));            \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15),             \
+                               _mm256_lddqu_si256(data2 + i + 15));            \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursB, &twos, twos, twosA, twosB);                           \
+            CSA(&eightsB, &fours, fours, foursA, foursB);                      \
+            CSA(&sixteens, &eights, eights, eightsA, eightsB);                 \
+            total = _mm256_add_epi64(total, popcount256(sixteens));            \
+        }                                                                      \
+        total = _mm256_slli_epi64(total, 4);                                   \
+        total = _mm256_add_epi64(total,                                        \
+                                 _mm256_slli_epi64(popcount256(eights), 3));   \
+        total =                                                                \
+            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
+        total =                                                                \
+            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1));  \
+        total = _mm256_add_epi64(total, popcount256(ones));                    \
+        for (; i < size; i++) { /* leftover words, one at a time */            \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),                  \
+                               _mm256_lddqu_si256(data2 + i));                 \
+            total = _mm256_add_epi64(total, popcount256(A1));                  \
+        }                                                                      \
+        return (uint64_t)(_mm256_extract_epi64(total, 0)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 1)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 2)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 3));                     \
+    }                                                                          \
+    static inline uint64_t avx2_harley_seal_popcount256andstore_##opname(      \
+        const __m256i *__restrict__ data1, const __m256i *__restrict__ data2,  \
+        __m256i *__restrict__ out, const uint64_t size) { /* fills out[] */    \
+        __m256i total = _mm256_setzero_si256();                                \
+        __m256i ones = _mm256_setzero_si256();                                 \
+        __m256i twos = _mm256_setzero_si256();                                 \
+        __m256i fours = _mm256_setzero_si256();                                \
+        __m256i eights = _mm256_setzero_si256();                               \
+        __m256i sixteens = _mm256_setzero_si256();                             \
+        __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;                \
+        __m256i A1, A2;                                                        \
+        const uint64_t limit = size - size % 16;                               \
+        uint64_t i = 0;                                                        \
+        for (; i < limit; i += 16) { /* 16 words per round */                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),                  \
+                               _mm256_lddqu_si256(data2 + i));                 \
+            _mm256_storeu_si256(out + i, A1);                                  \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 1),              \
+                               _mm256_lddqu_si256(data2 + i + 1));             \
+            _mm256_storeu_si256(out + i + 1, A2);                              \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 2),              \
+                               _mm256_lddqu_si256(data2 + i + 2));             \
+            _mm256_storeu_si256(out + i + 2, A1);                              \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 3),              \
+                               _mm256_lddqu_si256(data2 + i + 3));             \
+            _mm256_storeu_si256(out + i + 3, A2);                              \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursA, &twos, twos, twosA, twosB);                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 4),              \
+                               _mm256_lddqu_si256(data2 + i + 4));             \
+            _mm256_storeu_si256(out + i + 4, A1);                              \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 5),              \
+                               _mm256_lddqu_si256(data2 + i + 5));             \
+            _mm256_storeu_si256(out + i + 5, A2);                              \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 6),              \
+                               _mm256_lddqu_si256(data2 + i + 6));             \
+            _mm256_storeu_si256(out + i + 6, A1);                              \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 7),              \
+                               _mm256_lddqu_si256(data2 + i + 7));             \
+            _mm256_storeu_si256(out + i + 7, A2);                              \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursB, &twos, twos, twosA, twosB);                           \
+            CSA(&eightsA, &fours, fours, foursA, foursB);                      \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 8),              \
+                               _mm256_lddqu_si256(data2 + i + 8));             \
+            _mm256_storeu_si256(out + i + 8, A1);                              \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 9),              \
+                               _mm256_lddqu_si256(data2 + i + 9));             \
+            _mm256_storeu_si256(out + i + 9, A2);                              \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 10),             \
+                               _mm256_lddqu_si256(data2 + i + 10));            \
+            _mm256_storeu_si256(out + i + 10, A1);                             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 11),             \
+                               _mm256_lddqu_si256(data2 + i + 11));            \
+            _mm256_storeu_si256(out + i + 11, A2);                             \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursA, &twos, twos, twosA, twosB);                           \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 12),             \
+                               _mm256_lddqu_si256(data2 + i + 12));            \
+            _mm256_storeu_si256(out + i + 12, A1);                             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 13),             \
+                               _mm256_lddqu_si256(data2 + i + 13));            \
+            _mm256_storeu_si256(out + i + 13, A2);                             \
+            CSA(&twosA, &ones, ones, A1, A2);                                  \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 14),             \
+                               _mm256_lddqu_si256(data2 + i + 14));            \
+            _mm256_storeu_si256(out + i + 14, A1);                             \
+            A2 = avx_intrinsic(_mm256_lddqu_si256(data1 + i + 15),             \
+                               _mm256_lddqu_si256(data2 + i + 15));            \
+            _mm256_storeu_si256(out + i + 15, A2);                             \
+            CSA(&twosB, &ones, ones, A1, A2);                                  \
+            CSA(&foursB, &twos, twos, twosA, twosB);                           \
+            CSA(&eightsB, &fours, fours, foursA, foursB);                      \
+            CSA(&sixteens, &eights, eights, eightsA, eightsB);                 \
+            total = _mm256_add_epi64(total, popcount256(sixteens));            \
+        }                                                                      \
+        total = _mm256_slli_epi64(total, 4);                                   \
+        total = _mm256_add_epi64(total,                                        \
+                                 _mm256_slli_epi64(popcount256(eights), 3));   \
+        total =                                                                \
+            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(fours), 2)); \
+        total =                                                                \
+            _mm256_add_epi64(total, _mm256_slli_epi64(popcount256(twos), 1));  \
+        total = _mm256_add_epi64(total, popcount256(ones));                    \
+        for (; i < size; i++) { /* leftover words, one at a time */            \
+            A1 = avx_intrinsic(_mm256_lddqu_si256(data1 + i),                  \
+                               _mm256_lddqu_si256(data2 + i));                 \
+            _mm256_storeu_si256(out + i, A1);                                  \
+            total = _mm256_add_epi64(total, popcount256(A1));                  \
+        }                                                                      \
+        return (uint64_t)(_mm256_extract_epi64(total, 0)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 1)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 2)) +                    \
+               (uint64_t)(_mm256_extract_epi64(total, 3));                     \
+    }
+
+AVXPOPCNTFNC(or, _mm256_or_si256)             // data1 | data2
+AVXPOPCNTFNC(union, _mm256_or_si256)          // same operation as "or"
+AVXPOPCNTFNC(and, _mm256_and_si256)           // data1 & data2
+AVXPOPCNTFNC(intersection, _mm256_and_si256)  // same operation as "and"
+AVXPOPCNTFNC(xor, _mm256_xor_si256)           // data1 ^ data2
+AVXPOPCNTFNC(andnot, _mm256_andnot_si256)     // (~data1) & data2
+
+/***
+ * END Harley-Seal popcount functions.
+ */
+
+#endif  // USEAVX
+
+#endif
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/bitset_util.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/array.h */
+/*
+ * array.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_ARRAY_H_
+#define INCLUDE_CONTAINERS_ARRAY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+
+/* Containers with DEFAULT_MAX_SIZE or fewer integers should be arrays */
+enum { DEFAULT_MAX_SIZE = 4096 };
+
+/* struct array_container_s - sparse representation of a bitmap
+ *
+ * @cardinality: number of values currently stored in `array` (and the bitmap)
+ * @capacity:    number of uint16_t slots allocated for `array`
+ * @array:       the stored 16-bit values, as a sorted list
+ */
+struct array_container_s {
+    int32_t cardinality;  // == capacity means no free slot is left
+    int32_t capacity;
+    uint16_t *array;
+};
+
+typedef struct array_container_s array_container_t;
+
+/* Create a new array with the default capacity. Return NULL in case of failure.
+ * See also array_container_create_given_capacity. */
+array_container_t *array_container_create(void);
+
+/* Create a new array with a specified capacity size. Return NULL in case of
+ * failure. */
+array_container_t *array_container_create_given_capacity(int32_t size);
+
+/* Create a new array containing all values in [min,max). */
+array_container_t * array_container_create_range(uint32_t min, uint32_t max);
+
+/*
+ * Shrink the capacity to the actual size, return the number of bytes saved.
+ */
+int array_container_shrink_to_fit(array_container_t *src);
+
+/* Free memory owned by `array'. */
+void array_container_free(array_container_t *array);
+
+/* Duplicate container */
+array_container_t *array_container_clone(const array_container_t *src);
+
+int32_t array_container_serialize(const array_container_t *container,
+                                  char *buf) WARN_UNUSED;
+
+uint32_t array_container_serialization_len(const array_container_t *container);
+
+void *array_container_deserialize(const char *buf, size_t buf_len);
+
+/* Get the cardinality of `array', i.e. how many values it currently holds. */
+static inline int array_container_cardinality(const array_container_t *array) {
+    return array->cardinality;
+}
+
+static inline bool array_container_nonzero_cardinality(
+    const array_container_t *array) {  // true iff at least one value is stored
+    return array->cardinality > 0;
+}
+
+/* Copy one container into another. We assume that they are distinct. */
+void array_container_copy(const array_container_t *src, array_container_t *dst);
+
+/*  Add all the values in [min,max) (included) at a distance k*step from min.
+    The container must have a size less or equal to DEFAULT_MAX_SIZE after this
+   addition. */
+void array_container_add_from_range(array_container_t *arr, uint32_t min,
+                                    uint32_t max, uint16_t step);
+
+/* Set the cardinality to zero (does not release memory). */
+static inline void array_container_clear(array_container_t *array) {
+    array->cardinality = 0;  // the buffer and the capacity field are untouched
+}
+
+static inline bool array_container_empty(const array_container_t *array) {
+    return array->cardinality == 0;  // true iff no value is stored
+}
+
+/* Check whether the cardinality is equal to the capacity (this does not mean
+ * that it contains 1<<16 elements). */
+static inline bool array_container_full(const array_container_t *array) {
+    return array->cardinality == array->capacity;  // no free slot left
+}
+
+
+/* Compute the union of `src_1' and `src_2' and write the result to `dst'
+ * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
+void array_container_union(const array_container_t *src_1,
+                           const array_container_t *src_2,
+                           array_container_t *dst);
+
+/* symmetric difference, see array_container_union */
+void array_container_xor(const array_container_t *array_1,
+                         const array_container_t *array_2,
+                         array_container_t *out);
+
+/* Computes the intersection of src_1 and src_2 and write the result to
+ * dst. It is assumed that dst is distinct from both src_1 and src_2. */
+void array_container_intersection(const array_container_t *src_1,
+                                  const array_container_t *src_2,
+                                  array_container_t *dst);
+
+/* Check whether src_1 and src_2 intersect. */
+bool array_container_intersect(const array_container_t *src_1,
+                                  const array_container_t *src_2);
+
+
+/* Computes the size of the intersection between two arrays.
+ */
+int array_container_intersection_cardinality(const array_container_t *src_1,
+                                             const array_container_t *src_2);
+
+/* Computes the intersection of src_1 and src_2 and writes the result to
+ * src_1.
+ * */
+void array_container_intersection_inplace(array_container_t *src_1,
+                                          const array_container_t *src_2);
+
+/*
+ * Write out the 16-bit integers contained in this container as a list of 32-bit
+ * integers using base
+ * as the starting value (it might be expected that base has zeros in its 16
+ * least significant bits).
+ * The function returns the number of values written.
+ * The caller is responsible for allocating enough memory in out.
+ */
+int array_container_to_uint32_array(void *vout, const array_container_t *cont,
+                                    uint32_t base);
+
+/* Compute the number of runs */
+int32_t array_container_number_of_runs(const array_container_t *a);
+
+/*
+ * Print this container using printf (useful for debugging).
+ */
+void array_container_printf(const array_container_t *v);
+
+/*
+ * Print this container using printf as a comma-separated list of 32-bit
+ * integers starting at base.
+ */
+void array_container_printf_as_uint32_array(const array_container_t *v,
+                                            uint32_t base);
+
+/**
+ * Return the serialized size in bytes of a container having cardinality "card".
+ */
+static inline int32_t array_container_serialized_size_in_bytes(int32_t card) {
+    return card * 2 + 2;  // two bytes per 16-bit value, plus two extra bytes
+}
+
+/**
+ * Increase capacity to at least min.
+ * Whether the existing data needs to be copied over depends on the "preserve"
+ * parameter. If preserve is false, then the new content will be uninitialized,
+ * otherwise the old content is copied.
+ */
+void array_container_grow(array_container_t *container, int32_t min,
+                          bool preserve);
+
+bool array_container_iterate(const array_container_t *cont, uint32_t base,
+                             roaring_iterator iterator, void *ptr);
+bool array_container_iterate64(const array_container_t *cont, uint32_t base,
+                               roaring_iterator64 iterator, uint64_t high_bits,
+                               void *ptr);
+
+/**
+ * Writes the underlying array to buf, outputs how many bytes were written.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes written should be
+ * array_container_size_in_bytes(container).
+ *
+ */
+int32_t array_container_write(const array_container_t *container, char *buf);
+/**
+ * Reads the instance from buf, outputs how many bytes were read.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes read should be array_container_size_in_bytes(container).
+ * You need to provide the (known) cardinality.
+ */
+int32_t array_container_read(int32_t cardinality, array_container_t *container,
+                             const char *buf);
+
+/**
+ * Return the serialized size in bytes of a container (see
+ * array_container_write): two bytes for each stored value.
+ * This is meant to be compatible with the Java and Go versions of Roaring and
+ * assumes that the cardinality of the container is already known (the count
+ * itself is not part of the size returned here).
+ *
+ */
+static inline int32_t array_container_size_in_bytes(
+    const array_container_t *container) {
+    return container->cardinality * sizeof(uint16_t);
+}
+
+/**
+ * Return true if the two arrays have the same content.
+ */
+bool array_container_equals(const array_container_t *container1,
+                            const array_container_t *container2);
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool array_container_is_subset(const array_container_t *container1,
+                               const array_container_t *container2);
+
+/**
+ * Look up the element of the given rank, where this container's first element
+ * is taken to have rank *start_rank.
+ * If the rank falls inside this container, store the value in *element and
+ * return true; otherwise advance *start_rank by the cardinality, return false.
+ */
+static inline bool array_container_select(const array_container_t *container,
+                                          uint32_t *start_rank, uint32_t rank,
+                                          uint32_t *element) {
+    const int card = array_container_cardinality(container);
+    if (rank < *start_rank + card) {
+        // The requested rank lands inside this container.
+        *element = container->array[rank - *start_rank];
+        return true;
+    }
+    *start_rank += card;  // not here: skip all of this container's values
+    return false;
+}
+
+/* Computes the  difference of array1 and array2 and write the result
+ * to array out.
+ * Array out does not need to be distinct from array_1
+ */
+void array_container_andnot(const array_container_t *array_1,
+                            const array_container_t *array_2,
+                            array_container_t *out);
+
+/* Append pos to the set. Assumes that pos is larger than any value already
+ * stored.  */
+static inline void array_container_append(array_container_t *arr,
+                                          uint16_t pos) {
+    // Out of room: ask for one more slot, keeping the existing contents.
+    if (arr->cardinality == arr->capacity) {
+        const int32_t wanted = arr->capacity + 1;
+        array_container_grow(arr, wanted, true);
+    }
+    arr->array[arr->cardinality] = pos;
+    arr->cardinality++;
+}
+
+/**
+ * Add value to the set, provided the cardinality stays within max_cardinality.
+ * Return code:
+ * 1  -- value was added
+ * 0  -- value was already present
+ * -1 -- value was not added because cardinality would exceed max_cardinality
+ */
+static inline int array_container_try_add(array_container_t *arr, uint16_t value,
+                                          int32_t max_cardinality) {
+    const int32_t n = arr->cardinality;
+    const bool has_room = n < max_cardinality;
+    // Fast path: value sorts after everything stored, so it can be appended.
+    if (has_room && (n == 0 || arr->array[n - 1] < value)) {
+        array_container_append(arr, value);
+        return 1;
+    }
+
+    const int32_t loc = binarySearch(arr->array, n, value);
+    if (loc >= 0) {
+        return 0;  // already stored
+    }
+    if (!has_room) {
+        return -1;  // inserting would exceed max_cardinality
+    }
+    if (arr->cardinality == arr->capacity) {
+        array_container_grow(arr, arr->capacity + 1, true);
+    }
+    // A miss yields the insertion index as -loc - 1. The slot pointer is taken
+    // after the grow call, which presumably may replace arr->array.
+    const int32_t at = -loc - 1;
+    uint16_t *slot = arr->array + at;
+    memmove(slot + 1, slot, (n - at) * sizeof(uint16_t));
+    *slot = value;
+    arr->cardinality++;
+    return 1;
+}
+
+/* Add value to the set. Returns true if value was not already present.  */
+static inline bool array_container_add(array_container_t *arr, uint16_t value) {
+    return array_container_try_add(arr, value, INT32_MAX) == 1;  // INT32_MAX: no practical size cap
+}
+
+/* Remove pos from the set. Returns true if pos was present.  */
+static inline bool array_container_remove(array_container_t *arr,
+                                          uint16_t pos) {
+    const int32_t at = binarySearch(arr->array, arr->cardinality, pos);
+    if (at < 0) {
+        return false;  // not stored; nothing to do
+    }
+    // Close the gap by shifting the tail one slot to the left.
+    uint16_t *hole = arr->array + at;
+    memmove(hole, hole + 1, (arr->cardinality - at - 1) * sizeof(uint16_t));
+    arr->cardinality--;
+    return true;
+}
+
+/* Check whether pos is present.  */
+inline bool array_container_contains(const array_container_t *arr,
+                                     uint16_t pos) {
+    // Hybrid search over the sorted values: bisect while the window still
+    // spans more than 16 slots, then finish with a short linear scan.
+    const uint16_t *values = (const uint16_t *)arr->array;
+    int32_t lo = 0;
+    int32_t hi = arr->cardinality - 1;
+
+    while (hi >= lo + 16) {
+        const int32_t mid = (lo + hi) >> 1;
+        const uint16_t probe = values[mid];
+        if (probe == pos) {
+            return true;
+        }
+        if (probe < pos) {
+            lo = mid + 1;
+        } else {
+            hi = mid - 1;
+        }
+    }
+
+    for (int i = lo; i <= hi; i++) {
+        const uint16_t v = values[i];
+        if (v >= pos) {
+            // a hit, or the scan is already past where pos would sit
+            return v == pos;
+        }
+    }
+    return false;
+}
+
+/* Check whether every value in [range_start, range_end) is present; an empty
+ * range (range_start >= range_end) is trivially contained. */
+static inline bool array_container_contains_range(const array_container_t *arr,
+                                                    uint32_t range_start, uint32_t range_end) {
+    // Guard first: range_end - 1 below would wrap around for an empty range.
+    if (range_start >= range_end) return true;
+    const uint16_t rs_included = range_start;
+    const uint16_t re_included = range_end - 1;
+    const uint16_t *carr = (const uint16_t *) arr->array;
+    const int32_t start = advanceUntil(carr, -1, arr->cardinality, rs_included);
+    const int32_t end = advanceUntil(carr, start - 1, arr->cardinality, re_included);
+    // Sorted set: both endpoints found, exactly (re - rs) slots apart, suffices.
+    return (start < arr->cardinality) && (end < arr->cardinality)
+            && (((uint16_t)(end - start)) == re_included - rs_included)
+            && (carr[start] == rs_included) && (carr[end] == re_included);
+}
+
+/* First element of the array (its smallest value); an empty container yields 0. */
+inline uint16_t array_container_minimum(const array_container_t *arr) {
+    const bool empty = (arr->cardinality == 0);
+    return empty ? 0 : arr->array[0];
+}
+
+/* Last element of the array (its largest value); an empty container yields 0. */
+inline uint16_t array_container_maximum(const array_container_t *arr) {
+    const int32_t count = arr->cardinality;
+    return (count == 0) ? 0 : arr->array[count - 1];
+}
+
+/* Counts the stored values that are less than or equal to x. */
+inline int array_container_rank(const array_container_t *arr, uint16_t x) {
+    const int32_t found = binarySearch(arr->array, arr->cardinality, x);
+    if (found < 0) {
+        /* Miss: the negative search result is decoded as -found - 1, which is
+         * taken as the count of stored values below x. */
+        return -found - 1;
+    }
+    return found + 1;  /* hit at index `found': that slot plus all before it */
+}
+
+/* Returns the index of the first value equal or larger than x, or -1 */
+inline int array_container_index_equalorlarger(const array_container_t *arr, uint16_t x) {
+    const int32_t found = binarySearch(arr->array, arr->cardinality, x);
+    if (found >= 0) {
+        return found;  /* x itself is stored at this index */
+    }
+    /* x is absent: -found - 1 is the slot of the next larger value. If that
+     * slot is past the end, no stored value is >= x. */
+    const int32_t next = -found - 1;
+    if (next < arr->cardinality) return next;
+    return -1;
+}
+
+/*
+ * Inserts the whole closed range [min,max], trusting two caller-supplied hints:
+ *   nvals_less    - how many stored values are below $min
+ *   nvals_greater - how many stored values are above $max
+ */
+static inline void array_container_add_range_nvals(array_container_t *array,
+                                                   uint32_t min, uint32_t max,
+                                                   int32_t nvals_less,
+                                                   int32_t nvals_greater) {
+    const uint32_t span = max - min;  /* count of inserted values, minus one */
+    const int32_t merged = nvals_less + (span + 1) + nvals_greater;
+    if (merged > array->capacity) {
+        array_container_grow(array, merged, true);
+    }
+    /* Park the values above max at the tail, then fill the range in front. */
+    memmove(array->array + (merged - nvals_greater),
+            array->array + (array->cardinality - nvals_greater),
+            nvals_greater * sizeof(uint16_t));
+    for (uint32_t k = 0; k <= span; k++) array->array[nvals_less + k] = min + k;
+    array->cardinality = merged;
+}
+
+/**
+ * Inserts every value of the closed range [min,max] into the array.
+ */
+static inline void array_container_add_range(array_container_t *array,
+                                             uint32_t min, uint32_t max) {
+    const int32_t above = count_greater(array->array, array->cardinality, max);
+    const int32_t below = count_less(array->array, array->cardinality - above, min);
+    array_container_add_range_nvals(array, min, max, below, above);
+}
+
+/*
+ * Drops the `count' consecutive slots array[pos] .. array[pos+count-1]
+ */
+static inline void array_container_remove_range(array_container_t *array,
+                                                uint32_t pos, uint32_t count) {
+  if (count == 0) return;  /* empty window: leave the array alone */
+  const uint32_t after = pos + count;  /* first slot that survives */
+  memmove(array->array + pos, array->array + after,
+          (array->cardinality - after) * sizeof(uint16_t));
+  array->cardinality -= count;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* INCLUDE_CONTAINERS_ARRAY_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/array.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/bitset.h */
+/*
+ * bitset.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_BITSET_H_
+#define INCLUDE_CONTAINERS_BITSET_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifdef USEAVX /* AVX build: request alignment to the size of one __m256i */
+#define ALIGN_AVX __attribute__((aligned(sizeof(__m256i))))
+#else /* no USEAVX: the marker expands to nothing */
+#define ALIGN_AVX
+#endif /* USEAVX */
+
+enum { /* constants shared by the bitset container routines */
+    BITSET_CONTAINER_SIZE_IN_WORDS = (1 << 16) / 64,  /* 65536 bits stored as 64-bit words = 1024 words */
+    BITSET_UNKNOWN_CARDINALITY = -1  /* value of `cardinality' while the count has not been computed */
+};
+
+struct bitset_container_s { /* bitmap over the 16-bit positions 0..65535 */
+    int32_t cardinality;  /* number of set bits, or BITSET_UNKNOWN_CARDINALITY */
+    uint64_t *array;      /* bit words: position `pos' is bit (pos & 63) of array[pos >> 6] */
+};
+
+typedef struct bitset_container_s bitset_container_t;
+
+/* Create a new bitset. Return NULL in case of failure. */
+bitset_container_t *bitset_container_create(void);
+
+/* Free memory. */
+void bitset_container_free(bitset_container_t *bitset);
+
+/* Clear bitset (sets bits to 0). */
+void bitset_container_clear(bitset_container_t *bitset);
+
+/* Set all bits to 1. */
+void bitset_container_set_all(bitset_container_t *bitset);
+
+/* Duplicate bitset */
+bitset_container_t *bitset_container_clone(const bitset_container_t *src);
+
+int32_t bitset_container_serialize(const bitset_container_t *container,
+                                   char *buf) WARN_UNUSED;
+
+uint32_t bitset_container_serialization_len(void);
+
+void *bitset_container_deserialize(const char *buf, size_t buf_len);
+
+/* Set the bits in [begin,end). WARNING: as of April 2016, this method is slow
+ * and
+ * should not be used in performance-sensitive code. Ever.  */
+void bitset_container_set_range(bitset_container_t *bitset, uint32_t begin,
+                                uint32_t end);
+
+#ifdef ASMBITMANIPOPTIMIZATION
+/* Set the bit at position `pos' (build variant based on the ASM_* macros).  */
+static inline void bitset_container_set(bitset_container_t *bitset,
+                                        uint16_t pos) {
+    uint64_t shift = 6;  /* 64 bits per word, so pos >> 6 is the word index */
+    uint64_t offset;
+    uint64_t p = pos;
+    ASM_SHIFT_RIGHT(p, shift, offset);  /* presumably offset = p >> shift; macro defined elsewhere */
+    uint64_t load = bitset->array[offset];
+    ASM_SET_BIT_INC_WAS_CLEAR(load, p, bitset->cardinality);  /* NOTE(review): by its name, sets bit p in load and increments cardinality if it was clear -- confirm */
+    bitset->array[offset] = load;
+}
+
+/* Clear the bit at position `pos' (build variant based on the ASM_* macros).  */
+static inline void bitset_container_unset(bitset_container_t *bitset,
+                                          uint16_t pos) {
+    uint64_t shift = 6;  /* 64 bits per word, so pos >> 6 is the word index */
+    uint64_t offset;
+    uint64_t p = pos;
+    ASM_SHIFT_RIGHT(p, shift, offset);  /* presumably offset = p >> shift; macro defined elsewhere */
+    uint64_t load = bitset->array[offset];
+    ASM_CLEAR_BIT_DEC_WAS_SET(load, p, bitset->cardinality);  /* NOTE(review): by its name, clears bit p in load and decrements cardinality if it was set -- confirm */
+    bitset->array[offset] = load;
+}
+
+/* Add `pos' to `bitset'. Returns true if `pos' was not present. Might be slower
+ * than bitset_container_set (build variant based on the ASM_* macros).  */
+static inline bool bitset_container_add(bitset_container_t *bitset,
+                                        uint16_t pos) {
+    uint64_t shift = 6;  /* 64 bits per word, so pos >> 6 is the word index */
+    uint64_t offset;
+    uint64_t p = pos;
+    ASM_SHIFT_RIGHT(p, shift, offset);  /* presumably offset = p >> shift; macro defined elsewhere */
+    uint64_t load = bitset->array[offset];
+    // could be possibly slightly further optimized
+    const int32_t oldcard = bitset->cardinality;  /* snapshot to detect a change below */
+    ASM_SET_BIT_INC_WAS_CLEAR(load, p, bitset->cardinality);  /* NOTE(review): expected to bump cardinality only when the bit was clear -- confirm */
+    bitset->array[offset] = load;
+    return bitset->cardinality - oldcard;  /* nonzero difference converts to true */
+}
+
+/* Remove `pos' from `bitset'. Returns true if `pos' was present.  Might be
+ * slower than bitset_container_unset (build variant based on the ASM_* macros).  */
+static inline bool bitset_container_remove(bitset_container_t *bitset,
+                                           uint16_t pos) {
+    uint64_t shift = 6;  /* 64 bits per word, so pos >> 6 is the word index */
+    uint64_t offset;
+    uint64_t p = pos;
+    ASM_SHIFT_RIGHT(p, shift, offset);  /* presumably offset = p >> shift; macro defined elsewhere */
+    uint64_t load = bitset->array[offset];
+    // could be possibly slightly further optimized
+    const int32_t oldcard = bitset->cardinality;  /* snapshot to detect a change below */
+    ASM_CLEAR_BIT_DEC_WAS_SET(load, p, bitset->cardinality);  /* NOTE(review): expected to lower cardinality only when the bit was set -- confirm */
+    bitset->array[offset] = load;
+    return oldcard - bitset->cardinality;  /* nonzero difference converts to true */
+}
+
+/* Get the value of the bit at position `pos' (build variant based on the ASM_* macros).  */
+inline bool bitset_container_get(const bitset_container_t *bitset,
+                                 uint16_t pos) {
+    uint64_t word = bitset->array[pos >> 6];  /* word that holds the bit */
+    const uint64_t p = pos;
+    ASM_INPLACESHIFT_RIGHT(word, p);  /* NOTE(review): presumably shifts word right by p modulo 64 in place -- confirm against the macro */
+    return word & 1;  /* lowest bit after the shift is the answer */
+}
+
+#else
+
+/* Turn on bit `pos'; the cardinality grows by one when the bit was previously 0.  */
+static inline void bitset_container_set(bitset_container_t *bitset,
+                                        uint16_t pos) {
+    uint64_t *const word = &bitset->array[pos >> 6];
+    const int shift = pos & 63;
+    const uint64_t before = *word;
+    *word = before | (UINT64_C(1) << shift);
+    bitset->cardinality += (uint32_t)((before ^ *word) >> shift);  /* 1 iff the bit flipped */
+}
+
+/* Turn off bit `pos'; the cardinality drops by one when the bit was previously 1.  */
+static inline void bitset_container_unset(bitset_container_t *bitset,
+                                          uint16_t pos) {
+    uint64_t *const word = &bitset->array[pos >> 6];
+    const int shift = pos & 63;
+    const uint64_t before = *word;
+    *word = before & ~(UINT64_C(1) << shift);
+    bitset->cardinality -= (uint32_t)((before ^ *word) >> shift);  /* 1 iff the bit flipped */
+}
+
+/* Insert `pos'; the result is true when the bit was 0 beforehand. Does a
+ * little more work than bitset_container_set.  */
+static inline bool bitset_container_add(bitset_container_t *bitset,
+                                        uint16_t pos) {
+    uint64_t *const word = &bitset->array[pos >> 6];
+    const int shift = pos & 63;
+    const uint64_t before = *word;
+    *word = before | (UINT64_C(1) << shift);
+    const uint64_t flipped = (before ^ *word) >> shift;  /* 1 iff newly set */
+    bitset->cardinality += (uint32_t)flipped;
+    return flipped > 0;
+}
+
+/* Erase `pos'; the result is true when the bit was 1 beforehand.  Does a
+ * little more work than bitset_container_unset.  */
+static inline bool bitset_container_remove(bitset_container_t *bitset,
+                                           uint16_t pos) {
+    uint64_t *const word = &bitset->array[pos >> 6];
+    const int shift = pos & 63;
+    const uint64_t before = *word;
+    *word = before & ~(UINT64_C(1) << shift);
+    const uint64_t flipped = (before ^ *word) >> shift;  /* 1 iff newly cleared */
+    bitset->cardinality -= (uint32_t)flipped;
+    return flipped > 0;
+}
+
+/* Read the bit at position `pos': true when it is set.  */
+inline bool bitset_container_get(const bitset_container_t *bitset,
+                                 uint16_t pos) {
+    const int shift = pos & 63;
+    return ((bitset->array[pos >> 6] >> shift) & 1) != 0;
+}
+
+#endif
+
+/*
+* Tests whether every bit from pos_start (included) up to pos_end (excluded)
+* is set.
+*/
+static inline bool bitset_container_get_range(const bitset_container_t *bitset,
+                                                uint32_t pos_start, uint32_t pos_end) {
+    const uint32_t head_word = pos_start >> 6;
+    const uint32_t tail_word = pos_end >> 6;
+
+    /* head_mask keeps bits at/above pos_start's offset, tail_mask bits below pos_end's. */
+    const uint64_t head_mask = ~((1ULL << (pos_start & 0x3F)) - 1);
+    const uint64_t tail_mask = (1ULL << (pos_end & 0x3F)) - 1;
+
+    if (head_word == tail_word) {
+        /* Range lies inside one word: test the overlap of both masks. */
+        const uint64_t both = head_mask & tail_mask;
+        return (bitset->array[tail_word] & both) == both;
+    }
+    if ((bitset->array[head_word] & head_mask) != head_mask) return false;
+    /* tail_word equals the word count when the range ends at the very top: no word to test. */
+    if ((tail_word < BITSET_CONTAINER_SIZE_IN_WORDS) &&
+        ((bitset->array[tail_word] & tail_mask) != tail_mask)) return false;
+    /* Words strictly between head and tail must be all ones. */
+    for (uint16_t w = head_word + 1; (w < BITSET_CONTAINER_SIZE_IN_WORDS) && (w < tail_word); ++w) {
+        if (~bitset->array[w] != 0) return false;
+    }
+    return true;
+}
+
+/* Check whether `pos' is present in `bitset'.  Calls bitset_container_get. */
+inline bool bitset_container_contains(const bitset_container_t *bitset,
+                                      uint16_t pos) {
+    return bitset_container_get(bitset, pos);
+}
+
+/*
+* Check whether a range of bits from position `pos_start' (included) to `pos_end' (excluded)
+* is present in `bitset'.  Calls bitset_container_get_range.
+*/
+static inline bool bitset_container_contains_range(const bitset_container_t *bitset,
+					uint32_t pos_start, uint32_t pos_end) {
+    return bitset_container_get_range(bitset, pos_start, pos_end);
+}
+
+/* Get the number of bits set: returns the stored `cardinality' field as is, without recounting */
+static inline int bitset_container_cardinality(
+    const bitset_container_t *bitset) {
+    return bitset->cardinality;  /* may be BITSET_UNKNOWN_CARDINALITY if the count was left lazy */
+}
+
+
+
+
+/* Copy one container into another. We assume that they are distinct. */
+void bitset_container_copy(const bitset_container_t *source,
+                           bitset_container_t *dest);
+
+/*  Add all the values [min,max) at a distance k*step from min: min,
+ * min+step,.... */
+void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min,
+                                     uint32_t max, uint16_t step);
+
+/* Get the number of bits set (force computation). This does not modify bitset.
+ * To update the cardinality, you should do
+ * bitset->cardinality =  bitset_container_compute_cardinality(bitset).*/
+int bitset_container_compute_cardinality(const bitset_container_t *bitset);
+
+/* True when at least one bit is set (bitset_container_empty is the opposite test).
+   An unknown cardinality is computed here and written back into the struct. */
+static inline bool bitset_container_nonzero_cardinality(
+    bitset_container_t *bitset) {
+    const bool lazy = (bitset->cardinality == BITSET_UNKNOWN_CARDINALITY);
+    if (lazy) {
+        // materialize the deferred count (an early exit on a nonzero word would also do)
+        bitset->cardinality = bitset_container_compute_cardinality(bitset);
+    }
+    return 0 < bitset->cardinality;
+}
+
+/* True when no bit is set (inverse of bitset_container_nonzero_cardinality);
+ *  the struct is left untouched even when the cardinality is unknown. */
+static inline bool bitset_container_empty(
+    const bitset_container_t *bitset) {
+  if (bitset->cardinality != BITSET_UNKNOWN_CARDINALITY) {
+      return bitset->cardinality == 0;  /* count is known: use it */
+  }
+  /* Unknown count: look for any nonzero word instead of counting bits. */
+  int w = 0;
+  while (w < BITSET_CONTAINER_SIZE_IN_WORDS && bitset->array[w] == 0) w++;
+  return w == BITSET_CONTAINER_SIZE_IN_WORDS;
+}
+
+
+/* Get whether there is at least one bit set (negation of bitset_container_empty);
+   the bitset is never modified, so an unknown cardinality is not written back */
+static inline bool bitset_container_const_nonzero_cardinality(
+    const bitset_container_t *bitset) {
+    return !bitset_container_empty(bitset);
+}
+
+/*
+ * Check whether the two bitsets intersect
+ */
+bool bitset_container_intersect(const bitset_container_t *src_1,
+                                  const bitset_container_t *src_2);
+
+/* Computes the union of bitsets `src_1' and `src_2' into `dst'  and return the
+ * cardinality. */
+int bitset_container_or(const bitset_container_t *src_1,
+                        const bitset_container_t *src_2,
+                        bitset_container_t *dst);
+
+/* Computes the union of bitsets `src_1' and `src_2' and return the cardinality.
+ */
+int bitset_container_or_justcard(const bitset_container_t *src_1,
+                                 const bitset_container_t *src_2);
+
+/* Computes the union of bitsets `src_1' and `src_2' into `dst' and return the
+ * cardinality. Same as bitset_container_or. */
+int bitset_container_union(const bitset_container_t *src_1,
+                           const bitset_container_t *src_2,
+                           bitset_container_t *dst);
+
+/* Computes the union of bitsets `src_1' and `src_2'  and return the
+ * cardinality. Same as bitset_container_or_justcard. */
+int bitset_container_union_justcard(const bitset_container_t *src_1,
+                                    const bitset_container_t *src_2);
+
+/* Computes the union of bitsets `src_1' and `src_2' into `dst', but does not
+ * update the cardinality. Provided to optimize chained operations. */
+int bitset_container_or_nocard(const bitset_container_t *src_1,
+                               const bitset_container_t *src_2,
+                               bitset_container_t *dst);
+
+/* Computes the intersection of bitsets `src_1' and `src_2' into `dst' and
+ * return the cardinality. */
+int bitset_container_and(const bitset_container_t *src_1,
+                         const bitset_container_t *src_2,
+                         bitset_container_t *dst);
+
+/* Computes the intersection of bitsets `src_1' and `src_2'  and return the
+ * cardinality. */
+int bitset_container_and_justcard(const bitset_container_t *src_1,
+                                  const bitset_container_t *src_2);
+
+/* Computes the intersection of bitsets `src_1' and `src_2' into `dst' and
+ * return the cardinality. Same as bitset_container_and. */
+int bitset_container_intersection(const bitset_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  bitset_container_t *dst);
+
+/* Computes the intersection of bitsets `src_1' and `src_2' and return the
+ * cardinality. Same as bitset_container_and_justcard. */
+int bitset_container_intersection_justcard(const bitset_container_t *src_1,
+                                           const bitset_container_t *src_2);
+
+/* Computes the intersection of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_and_nocard(const bitset_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                bitset_container_t *dst);
+
+/* Computes the exclusive or of bitsets `src_1' and `src_2' into `dst' and
+ * return the cardinality. */
+int bitset_container_xor(const bitset_container_t *src_1,
+                         const bitset_container_t *src_2,
+                         bitset_container_t *dst);
+
+/* Computes the exclusive or of bitsets `src_1' and `src_2' and return the
+ * cardinality. */
+int bitset_container_xor_justcard(const bitset_container_t *src_1,
+                                  const bitset_container_t *src_2);
+
+/* Computes the exclusive or of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_xor_nocard(const bitset_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                bitset_container_t *dst);
+
+/* Computes the and not of bitsets `src_1' and `src_2' into `dst' and return the
+ * cardinality. */
+int bitset_container_andnot(const bitset_container_t *src_1,
+                            const bitset_container_t *src_2,
+                            bitset_container_t *dst);
+
+/* Computes the and not of bitsets `src_1' and `src_2'  and return the
+ * cardinality. */
+int bitset_container_andnot_justcard(const bitset_container_t *src_1,
+                                     const bitset_container_t *src_2);
+
+/* Computes the and not of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_andnot_nocard(const bitset_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   bitset_container_t *dst);
+
+/*
+ * Write out the 16-bit integers contained in this container as a list of 32-bit
+ * integers using base
+ * as the starting value (it might be expected that base has zeros in its 16
+ * least significant bits).
+ * The function returns the number of values written.
+ * The caller is responsible for allocating enough memory in out.
+ * The out pointer should point to enough memory (the cardinality times 32
+ * bits).
+ */
+int bitset_container_to_uint32_array(void *out, const bitset_container_t *cont,
+                                     uint32_t base);
+
+/*
+ * Print this container using printf (useful for debugging).
+ */
+void bitset_container_printf(const bitset_container_t *v);
+
+/*
+ * Print this container using printf as a comma-separated list of 32-bit
+ * integers starting at base.
+ */
+void bitset_container_printf_as_uint32_array(const bitset_container_t *v,
+                                             uint32_t base);
+
+/**
+ * Serialized size of a bitset container in bytes: 8 bytes for each word.
+ */
+static inline int32_t bitset_container_serialized_size_in_bytes(void) {
+    return 8 * BITSET_CONTAINER_SIZE_IN_WORDS;
+}
+
+/**
+ * Return the number of runs.
+ */
+int bitset_container_number_of_runs(bitset_container_t *b);
+
+bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base,
+                              roaring_iterator iterator, void *ptr);
+bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base,
+                                roaring_iterator64 iterator, uint64_t high_bits,
+                                void *ptr);
+
+/**
+ * Writes the underlying array to buf, outputs how many bytes were written.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes written should be
+ * bitset_container_size_in_bytes(container).
+ */
+int32_t bitset_container_write(const bitset_container_t *container, char *buf);
+
+/**
+ * Reads the instance from buf, outputs how many bytes were read.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes read should be bitset_container_size_in_bytes(container).
+ * You need to provide the (known) cardinality.
+ */
+int32_t bitset_container_read(int32_t cardinality,
+                              bitset_container_t *container, const char *buf);
+/**
+ * Size in bytes of a container as written by bitset_container_write.
+ * The value does not depend on the container contents: it is always the
+ * word count times sizeof(uint64_t).
+ * This is meant to be compatible with the Java and Go versions of Roaring and
+ * assumes that the cardinality is already known or can be computed.
+ */
+static inline int32_t bitset_container_size_in_bytes(
+    const bitset_container_t *container) {
+    (void)container;  /* parameter kept for the signature only */
+    return (int32_t)(sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+}
+
+/**
+ * Return true if the two containers have the same content.
+ */
+bool bitset_container_equals(const bitset_container_t *container1,
+                             const bitset_container_t *container2);
+
+/**
+* Return true if container1 is a subset of container2.
+*/
+bool bitset_container_is_subset(const bitset_container_t *container1,
+                                const bitset_container_t *container2);
+
+/**
+ * If the element of given rank is in this container, supposing that the first
+ * element has rank start_rank, then the function returns true and sets element
+ * accordingly.
+ * Otherwise, it returns false and update start_rank.
+ */
+bool bitset_container_select(const bitset_container_t *container,
+                             uint32_t *start_rank, uint32_t rank,
+                             uint32_t *element);
+
+/* Returns the smallest value (assumes not empty) */
+uint16_t bitset_container_minimum(const bitset_container_t *container);
+
+/* Returns the largest value (assumes not empty) */
+uint16_t bitset_container_maximum(const bitset_container_t *container);
+
+/* Returns the number of values equal or smaller than x */
+int bitset_container_rank(const bitset_container_t *container, uint16_t x);
+
+/* Returns the index of the first value equal or larger than x, or -1 */
+int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x);
+#endif /* INCLUDE_CONTAINERS_BITSET_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/bitset.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/run.h */
+/*
+ * run.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_RUN_H_
+#define INCLUDE_CONTAINERS_RUN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+
+/* struct rle16_s - run length pair
+ *
+ * @value:  start position of the run
+ * @length: length of the run is `length + 1`
+ *
+ * An RLE pair {v, l} represents the integers in the closed interval
+ * [v, v+l], e.g. {3, 2} = [3, 4, 5].
+ */
+struct rle16_s {
+    uint16_t value;
+    uint16_t length;
+};
+
+typedef struct rle16_s rle16_t;
+
+/* struct run_container_s - run container bitmap
+ *
+ * @n_runs:   number of rle16_t pairs currently stored in `runs`.
+ * @capacity: number of rle16_t pairs `runs` can hold.
+ * @runs:     array of rle16_t pairs.
+ *
+ */
+struct run_container_s {
+    int32_t n_runs;
+    int32_t capacity;
+    rle16_t *runs;
+};
+
+typedef struct run_container_s run_container_t;
+
+/* Create a new run container. Return NULL in case of failure. */
+run_container_t *run_container_create(void);
+
+/* Create a new run container with given capacity. Return NULL in case of
+ * failure. */
+run_container_t *run_container_create_given_capacity(int32_t size);
+
+/*
+ * Shrink the capacity to the actual size, return the number of bytes saved.
+ */
+int run_container_shrink_to_fit(run_container_t *src);
+
+/* Free memory owned by `run'. */
+void run_container_free(run_container_t *run);
+
+/* Duplicate container */
+run_container_t *run_container_clone(const run_container_t *src);
+
+int32_t run_container_serialize(const run_container_t *container,
+                                char *buf) WARN_UNUSED;
+
+uint32_t run_container_serialization_len(const run_container_t *container);
+
+void *run_container_deserialize(const char *buf, size_t buf_len);
+
+/*
+ * Drops the run stored at `index' and shifts the following runs down by one.
+ */
+static inline void recoverRoomAtIndex(run_container_t *run, uint16_t index) {
+    rle16_t *hole = run->runs + index;
+    memmove(hole, hole + 1, (run->n_runs - index - 1) * sizeof(rle16_t));
+    run->n_runs--;
+}
+
+/**
+ * Bisects the `value' fields of an rle16_t array for ikey. Yields the
+ * matching index, or -(insertion index) - 1 when no run starts at ikey.
+ */
+inline int32_t interleavedBinarySearch(const rle16_t *array, int32_t lenarray,
+                                       uint16_t ikey) {
+    int32_t lo = 0;
+    int32_t hi = lenarray - 1;
+    while (lo <= hi) {
+        const int32_t mid = (lo + hi) >> 1;
+        const uint16_t start = array[mid].value;
+        if (start == ikey) return mid;
+        if (start < ikey) {
+            lo = mid + 1;
+        } else {
+            hi = mid - 1;
+        }
+    }
+    return -(lo + 1);
+}
+
+/*
+ * Bisects for the run whose span [value, value+length] covers $ikey. Returns
+ * its index, or -(insertion index) - 1 when no run covers the key.
+ */
+static inline int32_t rle16_find_run(const rle16_t *array, int32_t lenarray,
+                                     uint16_t ikey) {
+    int32_t lo = 0;
+    int32_t hi = lenarray - 1;
+    while (lo <= hi) {
+        const int32_t mid = (lo + hi) >> 1;
+        const uint16_t first = array[mid].value;
+        const uint16_t last = array[mid].value + array[mid].length;
+        if (first <= ikey && ikey <= last) return mid;
+        if (ikey > last) {
+            lo = mid + 1;
+        } else {
+            hi = mid - 1;
+        }
+    }
+    return -(lo + 1);
+}
+
+
+/**
+ * Returns the number of runs which can't be merged with the key because they
+ * end more than one position below it.
+ * Note that [5,6,7,8] can be merged with the key 9 and won't be counted.
+ */
+static inline int32_t rle16_count_less(const rle16_t* array, int32_t lenarray,
+                                       uint16_t key) {
+    if (lenarray == 0) return 0;
+    int32_t lo = 0;
+    int32_t hi = lenarray - 1;
+    while (lo <= hi) {
+        const int32_t mid = (lo + hi) >> 1;
+        const uint16_t first = array[mid].value;
+        const uint16_t last = array[mid].value + array[mid].length;
+        if (last + UINT32_C(1) < key) {  // uint32 arithmetic
+            lo = mid + 1;  /* run ends too early to touch the key */
+            continue;
+        }
+        /* Key inside or adjacent to this run: exactly `mid' runs precede it. */
+        if (key >= first) return mid;
+        hi = mid - 1;
+    }
+    return lo;
+}
+
+static inline int32_t rle16_count_greater(const rle16_t* array, int32_t lenarray,
+                                          uint16_t key) {
+    /* Counts the runs that start more than one position above key (too far to merge). */
+    if (lenarray == 0) return 0;
+    int32_t lo = 0;
+    int32_t hi = lenarray - 1;
+    while (lo <= hi) {
+        const int32_t mid = (lo + hi) >> 1;
+        const uint16_t first = array[mid].value;
+        const uint16_t last = array[mid].value + array[mid].length;
+        if (last < key) {
+            lo = mid + 1;
+            continue;
+        }
+        if (key + UINT32_C(1) >= first) return lenarray - (mid + 1);  // uint32 arithmetic
+        hi = mid - 1;
+    }
+    return lenarray - lo;
+}
+
+/**
+ * increase capacity to at least min. Whether the
+ * existing data needs to be copied over depends on copy. If "copy" is false,
+ * then the new content will be uninitialized, otherwise a copy is made.
+ */
+void run_container_grow(run_container_t *run, int32_t min, bool copy);
+
+/**
+ * Opens a free slot at `index' by shifting the runs from there on up by one.
+ */
+static inline void makeRoomAtIndex(run_container_t *run, uint16_t index) {
+    /* Growing (a realloc) followed by the shift (a memmove) may copy the
+     * array twice. The slot address is taken only after growing. */
+    const int32_t wanted = run->n_runs + 1;
+    if (wanted > run->capacity)
+        run_container_grow(run, wanted, true);
+    rle16_t *slot = run->runs + index;
+    memmove(slot + 1, slot, (run->n_runs - index) * sizeof(rle16_t));
+    run->n_runs++;
+}
+
+/* Add `pos' to `run'. Returns true if `pos' was not present. */
+bool run_container_add(run_container_t *run, uint16_t pos);
+
+/* Remove `pos' from `run'. Returns true if `pos' was present. May shrink, delete or split a run. */
+static inline bool run_container_remove(run_container_t *run, uint16_t pos) {
+    int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos);
+    if (index >= 0) {  /* a run starts exactly at pos */
+        int32_t le = run->runs[index].length;
+        if (le == 0) {  /* single-value run: delete the whole run */
+            recoverRoomAtIndex(run, (uint16_t)index);
+        } else {  /* drop the first value: start one later, one value shorter */
+            run->runs[index].value++;
+            run->runs[index].length--;
+        }
+        return true;
+    }
+    index = -index - 2;  // decode the miss: index of the run starting before pos, possibly -1
+    if (index >= 0) {    // possible match inside that preceding run
+        int32_t offset = pos - run->runs[index].value;  /* distance of pos from the run start */
+        int32_t le = run->runs[index].length;
+        if (offset < le) {
+            // pos lies strictly inside the run: break it in two; left part ends at pos - 1
+            run->runs[index].length = (uint16_t)(offset - 1);
+            // right part covers pos + 1 up to the old end and is inserted right after
+            uint16_t newvalue = pos + 1;
+            int32_t newlength = le - offset - 1;
+            makeRoomAtIndex(run, (uint16_t)(index + 1));
+            run->runs[index + 1].value = newvalue;
+            run->runs[index + 1].length = (uint16_t)newlength;
+            return true;
+
+        } else if (offset == le) {  /* pos is the last value of the run: shorten by one */
+            run->runs[index].length--;
+            return true;
+        }
+    }
+    // no match: pos falls in a gap, before the first run, or after the last one
+    return false;
+}
+
+/* Membership test: reports whether `pos' is covered by one of the runs.  */
+inline bool run_container_contains(const run_container_t *run, uint16_t pos) {
+    const int32_t hit = interleavedBinarySearch(run->runs, run->n_runs, pos);
+    if (hit >= 0) return true;  /* some run starts exactly at pos */
+    const int32_t prev = -hit - 2;  /* run starting before pos, or -1 if none */
+    if (prev == -1) return false;
+    /* pos is covered when it lies within `length' steps of that run's start. */
+    const rle16_t *candidate = &run->runs[prev];
+    const int32_t offset = pos - candidate->value;
+    const int32_t le = candidate->length;
+    return offset <= le;
+}
+
+/*
+* Check whether all positions in a range of positions from pos_start (included)
+* to pos_end (excluded) are present in `run'.
+*/
+static inline bool run_container_contains_range(const run_container_t *run,
+                                                uint32_t pos_start, uint32_t pos_end) {
+    uint32_t count = 0;  /* running tally compared against the range width at the end */
+    int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos_start);  /* pos_start is narrowed to the uint16_t key parameter */
+    if (index < 0) {  /* no run starts at pos_start: fall back to the run starting before it */
+        index = -index - 2;
+        if ((index == -1) || ((pos_start - run->runs[index].value) > run->runs[index].length)){
+            return false;  /* pos_start itself is not covered by any run */
+        }
+    }
+    for (int32_t i = index; i < run->n_runs; ++i) {  /* walk the runs from the one holding pos_start */
+        const uint32_t stop = run->runs[i].value + run->runs[i].length;  /* last value of run i */
+        if (run->runs[i].value >= pos_end) break;  /* run begins at or past the range end */
+        if (stop >= pos_end) {  /* run reaches the range end: add its part below pos_end and finish */
+            count += (((pos_end - run->runs[i].value) > 0) ? (pos_end - run->runs[i].value) : 0);  /* NOTE(review): operands are unsigned, so `> 0' only tests for nonzero */
+            break;
+        }
+        const uint32_t min = (stop - pos_start) > 0 ? (stop - pos_start) : 0;  /* NOTE(review): unsigned as well; the ternary cannot clamp a negative difference */
+        count += (min < run->runs[i].length) ? min : run->runs[i].length;  /* adds at most `length' for a run that ends inside the range */
+    }
+    return count >= (pos_end - pos_start - 1);  /* unsigned: for pos_end == pos_start the right side wraps to UINT32_MAX */
+}
+
+#ifdef USEAVX
+
+/* Get the cardinality of `run' (USEAVX variant). Requires an actual computation. */
+static inline int run_container_cardinality(const run_container_t *run) {
+    const int32_t n_runs = run->n_runs;
+    const rle16_t *runs = run->runs;
+
+    /* by initializing with n_runs, we omit counting the +1 for each pair. */
+    int sum = n_runs;
+    int32_t k = 0;
+    const int32_t step = sizeof(__m256i) / sizeof(rle16_t);
+    if (n_runs > step) {
+        __m256i total = _mm256_setzero_si256();
+        for (; k + step <= n_runs; k += step) {
+            __m256i ymm1 = _mm256_lddqu_si256((const __m256i *)(runs + k));
+            __m256i justlengths = _mm256_srli_epi32(ymm1, 16);
+            total = _mm256_add_epi32(total, justlengths);
+        }
+        // spill the vector of partial sums to memory and add them up
+        uint32_t buffer[sizeof(__m256i) / sizeof(rle16_t)];
+        _mm256_storeu_si256((__m256i *)buffer, total);
+        sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) +
+               (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]);
+    }
+    for (; k < n_runs; ++k) {
+        sum += runs[k].length;
+    }
+
+    return sum;
+}
+
+#else
+
+/* Scalar variant: get the cardinality of `run' by visiting every run. */
+static inline int run_container_cardinality(const run_container_t *run) {
+    const rle16_t *const pairs = run->runs;
+
+    /* a run stores its value count minus one, so the missing "+1" of each
+     * run is accounted for by starting the total at the number of runs. */
+    int total = run->n_runs;
+    for (int32_t left = run->n_runs; left > 0; --left) {
+        total += pairs[left - 1].length;
+    }
+
+    return total;
+}
+#endif
+
+/* True iff `run' holds at least one value; inverse of run_container_empty */
+static inline bool run_container_nonzero_cardinality(
+    const run_container_t *run) {
+    return 0 < run->n_runs;  // a stored run is never empty
+}
+
+/* True iff `run' holds no value; inverse of run_container_nonzero_cardinality */
+static inline bool run_container_empty(
+    const run_container_t *run) {
+    return 0 == run->n_runs;  // a stored run is never empty
+}
+
+
+
+/* Copy one container into another. We assume that they are distinct. */
+void run_container_copy(const run_container_t *src, run_container_t *dst);
+
+/* Set the cardinality to zero by resetting n_runs (does not release memory). */
+static inline void run_container_clear(run_container_t *run) {
+    run->n_runs = 0;
+}
+
+/**
+ * Append the run `vl' at the end of the run container, merging it into the
+ * last run when the two touch or overlap. `previousrl' is treated as the
+ * description of the last run and is kept up to date by this function.
+ * No check is made that `vl' actually belongs at the end of the container.
+ * No capacity check is made either: the caller is responsible for making sure
+ * that the container can hold one more run.
+ *
+ * This is not a safe function, it is meant for performance: use with care.
+ */
+static inline void run_container_append(run_container_t *run, rle16_t vl,
+                                        rle16_t *previousrl) {
+    const uint32_t lastend = previousrl->value + previousrl->length;
+    if (vl.value > lastend + 1) {
+        // a gap separates vl from the last run: store vl as a new run
+        run->runs[run->n_runs++] = vl;
+        *previousrl = vl;
+        return;
+    }
+    // one past the last value of vl
+    const uint32_t vlstop = vl.value + vl.length + UINT32_C(1);
+    if (vlstop <= lastend) return;  // vl adds nothing to the last run
+    previousrl->length = (uint16_t)(vlstop - 1 - previousrl->value);
+    run->runs[run->n_runs - 1] = *previousrl;
+}
+
+/**
+ * Variant of run_container_append for a container assumed to hold no run yet:
+ * stores `vl' without any merging and returns it.
+ */
+static inline rle16_t run_container_append_first(run_container_t *run,
+                                                 rle16_t vl) {
+    run->runs[run->n_runs++] = vl;
+    return vl;
+}
+
+/**
+ * Append the single value `val' at the end of the run container, extending the
+ * last run when `val' directly follows it. `previousrl' is treated as the
+ * description of the last run and is kept up to date by this function.
+ * No check is made that `val' actually belongs at the end of the container.
+ * No capacity check is made either: the caller is responsible for that.
+ *
+ * This is not a safe function, it is meant for performance: use with care.
+ */
+static inline void run_container_append_value(run_container_t *run,
+                                              uint16_t val,
+                                              rle16_t *previousrl) {
+    // first value located right after the last run
+    const uint32_t adjacent = previousrl->value + previousrl->length + 1;
+    if (val > adjacent) {
+        // val starts a new run that contains only itself
+        previousrl->value = val;
+        previousrl->length = 0;
+        run->runs[run->n_runs++] = *previousrl;
+        return;
+    }
+    if (val != adjacent) return;  // val is not beyond the last run: no change
+    previousrl->length++;
+    run->runs[run->n_runs - 1] = *previousrl;
+}
+
+/**
+ * Variant of run_container_append_value for a container assumed to hold no
+ * run yet: stores the single-value run {val, 0} and returns it.
+ */
+static inline rle16_t run_container_append_value_first(run_container_t *run,
+                                                       uint16_t val) {
+    rle16_t single;
+    single.length = 0;  // a length of 0 stands for exactly one value
+    single.value = val;
+
+    const int32_t slot = run->n_runs;
+    run->runs[slot] = single;
+    run->n_runs = slot + 1;
+    return single;
+}
+
+/* Check whether the container spans the whole chunk (cardinality = 1<<16).
+ * Constant time; n_runs is tested first so runs[0] is only read if it exists. */
+static inline bool run_container_is_full(const run_container_t *run) {
+    return (run->n_runs == 1) && (run->runs[0].value == 0) &&
+           (run->runs[0].length == 0xFFFF);
+}
+
+/* Compute the union of `src_1' and `src_2' and write the result to `dst'
+ * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
+void run_container_union(const run_container_t *src_1,
+                         const run_container_t *src_2, run_container_t *dst);
+
+/* Compute the union of `src_1' and `src_2' and write the result to `src_1' */
+void run_container_union_inplace(run_container_t *src_1,
+                                 const run_container_t *src_2);
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst. It is assumed that dst is distinct from both src_1 and src_2. */
+void run_container_intersection(const run_container_t *src_1,
+                                const run_container_t *src_2,
+                                run_container_t *dst);
+
+/* Compute the size of the intersection of src_1 and src_2 . */
+int run_container_intersection_cardinality(const run_container_t *src_1,
+                                           const run_container_t *src_2);
+
+/* Check whether src_1 and src_2 intersect. */
+bool run_container_intersect(const run_container_t *src_1,
+                                const run_container_t *src_2);
+
+/* Compute the symmetric difference of `src_1' and `src_2' and write the result
+ * to `dst'
+ * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
+void run_container_xor(const run_container_t *src_1,
+                       const run_container_t *src_2, run_container_t *dst);
+
+/*
+ * Write out the 16-bit integers contained in this container as a list of 32-bit
+ * integers using base
+ * as the starting value (it might be expected that base has zeros in its 16
+ * least significant bits).
+ * The function returns the number of values written.
+ * The caller is responsible for allocating enough memory in out.
+ */
+int run_container_to_uint32_array(void *vout, const run_container_t *cont,
+                                  uint32_t base);
+
+/*
+ * Print this container using printf (useful for debugging).
+ */
+void run_container_printf(const run_container_t *v);
+
+/*
+ * Print this container using printf as a comma-separated list of 32-bit
+ * integers starting at base.
+ */
+void run_container_printf_as_uint32_array(const run_container_t *v,
+                                          uint32_t base);
+
+/**
+ * Return the serialized size in bytes of a container having "num_runs" runs.
+ */
+static inline int32_t run_container_serialized_size_in_bytes(int32_t num_runs) {
+    // one 16-bit field, then one rle16_t (2 2-byte entries) for each run
+    return sizeof(rle16_t) * num_runs + sizeof(uint16_t);
+}
+
+bool run_container_iterate(const run_container_t *cont, uint32_t base,
+                           roaring_iterator iterator, void *ptr);
+bool run_container_iterate64(const run_container_t *cont, uint32_t base,
+                             roaring_iterator64 iterator, uint64_t high_bits,
+                             void *ptr);
+
+/**
+ * Writes the underlying array to buf, outputs how many bytes were written.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes written should be run_container_size_in_bytes(container).
+ */
+int32_t run_container_write(const run_container_t *container, char *buf);
+
+/**
+ * Reads the instance from buf, outputs how many bytes were read.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes read should be run_container_size_in_bytes(container).
+ * The cardinality parameter is provided for consistency with other containers,
+ * but
+ * it might be effectively ignored.
+ */
+int32_t run_container_read(int32_t cardinality, run_container_t *container,
+                           const char *buf);
+
+/** Serialized size in bytes of `container' (see run_container_write); meant
+ * to be compatible with the Java and Go versions of Roaring.
+ */
+static inline int32_t run_container_size_in_bytes(
+    const run_container_t *container) {
+    const int32_t stored_runs = container->n_runs;
+    return run_container_serialized_size_in_bytes(stored_runs);
+}
+
+/**
+ * Return true if the two containers have the same content.
+ */
+bool run_container_equals(const run_container_t *container1,
+                          const run_container_t *container2);
+
+/**
+* Return true if container1 is a subset of container2.
+*/
+bool run_container_is_subset(const run_container_t *container1,
+                             const run_container_t *container2);
+
+/**
+ * Used in a start-finish scan that appends segments, for XOR and NOT
+ */
+
+void run_container_smart_append_exclusive(run_container_t *src,
+                                          const uint16_t start,
+                                          const uint16_t length);
+
+/**
+* Create a container that consists of the single run [start,stop).
+* It is required that stop > start; the caller is responsible for this check.
+* It is required that stop <= (1<<16); the caller is responsible for this check.
+* The cardinality of the created container is stop - start.
+* Returns NULL on failure
+*/
+static inline run_container_t *run_container_create_range(uint32_t start,
+                                                          uint32_t stop) {
+    run_container_t *created = run_container_create_given_capacity(1);
+    if (!created) return created;  // creation failed: hand back the NULL
+    // a run stores its first value and its number of values minus one
+    rle16_t whole;
+    whole.length = (uint16_t)(stop - start - 1);
+    whole.value = (uint16_t)start;
+    run_container_append_first(created, whole);
+    return created;
+}
+
+/**
+ * If the element of given rank is in this container, supposing that the first
+ * element has rank start_rank, then the function returns true and sets element
+ * accordingly.
+ * Otherwise, it returns false and update start_rank.
+ */
+bool run_container_select(const run_container_t *container,
+                          uint32_t *start_rank, uint32_t rank,
+                          uint32_t *element);
+
+/* Compute the difference of src_1 and src_2 and write the result to
+ * dst. It is assumed that dst is distinct from both src_1 and src_2. */
+
+void run_container_andnot(const run_container_t *src_1,
+                          const run_container_t *src_2, run_container_t *dst);
+
+/* Returns the smallest value, or 0 when the container holds no run */
+inline uint16_t run_container_minimum(const run_container_t *run) {
+    // the empty case is answered first so that runs[0] is only read if present
+    return (run->n_runs == 0) ? 0 : run->runs[0].value;
+}
+
+/* Returns the largest value, or 0 when the container holds no run */
+inline uint16_t run_container_maximum(const run_container_t *run) {
+    const int32_t last = run->n_runs - 1;  // -1 when there is no run
+    return (last == -1) ? 0 : run->runs[last].value + run->runs[last].length;
+}
+
+/* Returns the number of values equal or smaller than x */
+int run_container_rank(const run_container_t *arr, uint16_t x);
+
+/* Returns the index of the first run containing a value at least as large as x, or -1 */
+inline int run_container_index_equalorlarger(const run_container_t *arr, uint16_t x) {
+    const int32_t hit = interleavedBinarySearch(arr->runs, arr->n_runs, x);
+    // a non-negative search result is returned unchanged
+    if (hit >= 0) return hit;
+    // otherwise -hit - 2 designates the preceding run, possibly -1
+    const int32_t before = -hit - 2;
+    if (before != -1) {
+        const int32_t gap = x - arr->runs[before].value;
+        // the preceding run qualifies when it extends at least up to x
+        if (gap <= (int32_t)arr->runs[before].length) return before;
+    }
+    // fall back to the run that follows, provided there is one
+    const int32_t after = before + 1;
+    return (after < arr->n_runs) ? after : -1;
+}
+
+/* Add all values in range [min, max]. The nruns_less leading runs and the
+ * nruns_greater trailing runs are kept as they are; the runs in between (if
+ * any) are replaced, together with [min, max], by a single run. */
+static inline void run_container_add_range_nruns(run_container_t* run,
+                                                 uint32_t min, uint32_t max,
+                                                 int32_t nruns_less,
+                                                 int32_t nruns_greater) {
+    int32_t nruns_common = run->n_runs - nruns_less - nruns_greater;
+    if (nruns_common == 0) {
+        makeRoomAtIndex(run, nruns_less);
+        run->runs[nruns_less].value = min;
+        run->runs[nruns_less].length = max - min;
+    } else {
+        uint32_t common_min = run->runs[nruns_less].value;
+        uint32_t common_max = run->runs[nruns_less + nruns_common - 1].value +
+                              run->runs[nruns_less + nruns_common - 1].length;
+        uint32_t result_min = (common_min < min) ? common_min : min;
+        uint32_t result_max = (common_max > max) ? common_max : max;
+
+        run->runs[nruns_less].value = result_min;
+        run->runs[nruns_less].length = result_max - result_min;
+
+        memmove(&(run->runs[nruns_less + 1]),
+                &(run->runs[run->n_runs - nruns_greater]),
+                nruns_greater*sizeof(rle16_t));
+        run->n_runs = nruns_less + 1 + nruns_greater;
+    }
+}
+
+/**
+ * Add all values in range [min, max] through run_container_add_range_nruns
+ */
+static inline void run_container_add_range(run_container_t* run,
+                                           uint32_t min, uint32_t max) {
+    const int32_t above = rle16_count_greater(run->runs, run->n_runs, max);
+    const int32_t below = rle16_count_less(run->runs, run->n_runs - above, min);
+    run_container_add_range_nruns(run, min, max, below, above);
+}
+
+/**
+ * Shifts last $count elements either left (distance < 0) or right (distance > 0)
+ */
+static inline void run_container_shift_tail(run_container_t* run,
+                                            int32_t count, int32_t distance) {
+    // the shifted tail ends at index n_runs + distance, so that is the capacity
+    // needed (count + distance alone ignores the runs kept before the tail)
+    if (distance > 0 && run->capacity < run->n_runs + distance) {
+        run_container_grow(run, run->n_runs + distance, true);
+    }
+    int32_t srcpos = run->n_runs - count;
+    int32_t dstpos = srcpos + distance;
+    memmove(&(run->runs[dstpos]), &(run->runs[srcpos]), sizeof(rle16_t) * count);
+    run->n_runs += distance;
+}
+
+/**
+ * Remove all elements in range [min, max] (both bounds included)
+ */
+static inline void run_container_remove_range(run_container_t *run, uint32_t min, uint32_t max) {
+    int32_t first = rle16_find_run(run->runs, run->n_runs, min);
+    int32_t last = rle16_find_run(run->runs, run->n_runs, max);
+
+    if (first >= 0 && min > run->runs[first].value &&
+        max < run->runs[first].value + run->runs[first].length) {
+        // [min, max] lies strictly inside run `first': split it into two runs
+
+        // right subinterval
+        makeRoomAtIndex(run, first+1);
+        run->runs[first+1].value = max + 1;
+        run->runs[first+1].length = (run->runs[first].value + run->runs[first].length) - (max + 1);
+
+        // left subinterval
+        run->runs[first].length = (min - 1) - run->runs[first].value;
+
+        return;
+    }
+
+    // update left-most partial run
+    if (first >= 0) {
+        if (min > run->runs[first].value) {
+            run->runs[first].length = (min - 1) - run->runs[first].value;
+            first++;
+        }
+    } else {
+        first = -first-1;  // presumably an insertion index; see rle16_find_run
+    }
+
+    // update right-most run
+    if (last >= 0) {
+        uint16_t run_max = run->runs[last].value + run->runs[last].length;
+        if (run_max > max) {
+            run->runs[last].value = max + 1;
+            run->runs[last].length = run_max - (max + 1);
+            last--;
+        }
+    } else {
+        last = (-last-1) - 1;
+    }
+
+    // remove intermediate runs first..last by shifting the tail over them
+    if (first <= last) {
+        run_container_shift_tail(run, run->n_runs - (last+1), -(last-first+1));
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* INCLUDE_CONTAINERS_RUN_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/run.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/convert.h */
+/*
+ * convert.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_CONVERT_H_
+#define INCLUDE_CONTAINERS_CONVERT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Convert an array into a bitset. The input container is not freed or modified.
+ */
+bitset_container_t *bitset_container_from_array(const array_container_t *arr);
+
+/* Convert a run into a bitset. The input container is not freed or modified. */
+bitset_container_t *bitset_container_from_run(const run_container_t *arr);
+
+/* Convert a run into an array. The input container is not freed or modified. */
+array_container_t *array_container_from_run(const run_container_t *arr);
+
+/* Convert a bitset into an array. The input container is not freed or modified.
+ */
+array_container_t *array_container_from_bitset(const bitset_container_t *bits);
+
+/* Convert an array into a run. The input container is not freed or modified.
+ */
+run_container_t *run_container_from_array(const array_container_t *c);
+
+/* convert a run into either an array or a bitset
+ * might free the container */
+void *convert_to_bitset_or_array_container(run_container_t *r, int32_t card,
+                                           uint8_t *resulttype);
+
+/* convert containers to and from runcontainers, as is most space efficient.
+ * The container might be freed. */
+void *convert_run_optimize(void *c, uint8_t typecode_original,
+                           uint8_t *typecode_after);
+
+/* converts a run container to either an array or a bitset, IF it saves space.
+ */
+/* If a conversion occurs, the caller is responsible to free the original
+ * container and
+ * becomes responsible for freeing the new one. */
+void *convert_run_to_efficient_container(run_container_t *c,
+                                         uint8_t *typecode_after);
+// like convert_run_to_efficient_container but frees the old result if needed
+void *convert_run_to_efficient_container_and_free(run_container_t *c,
+                                                  uint8_t *typecode_after);
+
+/**
+ * Create new bitset container which is a union of run container and
+ * range [min, max]. Caller is responsible for freeing run container.
+ */
+bitset_container_t *bitset_container_from_run_range(const run_container_t *run,
+                                                    uint32_t min, uint32_t max);
+													
+													
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* INCLUDE_CONTAINERS_CONVERT_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/convert.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_equal.h */
+/*
+ * mixed_equal.h
+ *
+ */
+
+#ifndef CONTAINERS_MIXED_EQUAL_H_
+#define CONTAINERS_MIXED_EQUAL_H_
+
+
+/**
+ * Return true if the two containers have the same content.
+ */
+bool array_container_equal_bitset(const array_container_t* container1,
+                                  const bitset_container_t* container2);
+
+/**
+ * Return true if the two containers have the same content.
+ */
+bool run_container_equals_array(const run_container_t* container1,
+                                const array_container_t* container2);
+/**
+ * Return true if the two containers have the same content.
+ */
+bool run_container_equals_bitset(const run_container_t* container1,
+                                 const bitset_container_t* container2);
+
+#endif /* CONTAINERS_MIXED_EQUAL_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_equal.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_subset.h */
+/*
+ * mixed_subset.h
+ *
+ */
+
+#ifndef CONTAINERS_MIXED_SUBSET_H_
+#define CONTAINERS_MIXED_SUBSET_H_
+
+
+/**
+ * Return true if container1 is a subset of container2.
+ */
+bool array_container_is_subset_bitset(const array_container_t* container1,
+                                      const bitset_container_t* container2);
+
+/**
+* Return true if container1 is a subset of container2.
+ */
+bool run_container_is_subset_array(const run_container_t* container1,
+                                   const array_container_t* container2);
+
+/**
+* Return true if container1 is a subset of container2.
+ */
+bool array_container_is_subset_run(const array_container_t* container1,
+                                   const run_container_t* container2);
+
+/**
+* Return true if container1 is a subset of container2.
+ */
+bool run_container_is_subset_bitset(const run_container_t* container1,
+                                    const bitset_container_t* container2);
+
+/**
+* Return true if container1 is a subset of container2.
+*/
+bool bitset_container_is_subset_run(const bitset_container_t* container1,
+                                    const run_container_t* container2);
+
+#endif /* CONTAINERS_MIXED_SUBSET_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_subset.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_andnot.h */
+/*
+ * mixed_andnot.h
+ */
+#ifndef INCLUDE_CONTAINERS_MIXED_ANDNOT_H_
+#define INCLUDE_CONTAINERS_MIXED_ANDNOT_H_
+
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst, a valid array container that could be the same as src_1.*/
+void array_bitset_container_andnot(const array_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   array_container_t *dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * src_1 */
+
+void array_bitset_container_iandnot(array_container_t *src_1,
+                                    const bitset_container_t *src_2);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst, which does not initially have a valid container.
+ * Return true for a bitset result; false for array
+ */
+
+bool bitset_array_container_andnot(const bitset_container_t *src_1,
+                                   const array_container_t *src_2, void **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_array_container_iandnot(bitset_container_t *src_1,
+                                    const array_container_t *src_2, void **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_andnot(const run_container_t *src_1,
+                                 const bitset_container_t *src_2, void **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_iandnot(run_container_t *src_1,
+                                  const bitset_container_t *src_2, void **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset").  dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool bitset_run_container_andnot(const bitset_container_t *src_1,
+                                 const run_container_t *src_2, void **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_run_container_iandnot(bitset_container_t *src_1,
+                                  const run_container_t *src_2, void **dst);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any type of container.
+ */
+
+int run_array_container_andnot(const run_container_t *src_1,
+                               const array_container_t *src_2, void **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+int run_array_container_iandnot(run_container_t *src_1,
+                                const array_container_t *src_2, void **dst);
+
+/* dst must be a valid array container, allowed to be src_1 */
+
+void array_run_container_andnot(const array_container_t *src_1,
+                                const run_container_t *src_2,
+                                array_container_t *dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * src_1 (in place): this function takes no dst argument.
+ */
+
+void array_run_container_iandnot(array_container_t *src_1,
+                                 const run_container_t *src_2);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int run_run_container_andnot(const run_container_t *src_1,
+                             const run_container_t *src_2, void **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+int run_run_container_iandnot(run_container_t *src_1,
+                              const run_container_t *src_2, void **dst);
+
+/*
+ * dst is a valid array container and may be the same as src_1
+ */
+
+void array_array_container_andnot(const array_container_t *src_1,
+                                  const array_container_t *src_2,
+                                  array_container_t *dst);
+
+/* inplace array-array andnot will always be able to reuse the space of
+ * src_1 */
+void array_array_container_iandnot(array_container_t *src_1,
+                                   const array_container_t *src_2);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially). Return value is
+ * "dst is a bitset"
+ */
+
+bool bitset_bitset_container_andnot(const bitset_container_t *src_1,
+                                    const bitset_container_t *src_2,
+                                    void **dst);
+
+/* Compute the andnot of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_bitset_container_iandnot(bitset_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     void **dst);
+#endif
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_andnot.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_intersection.h */
+/*
+ * mixed_intersection.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_MIXED_INTERSECTION_H_
+#define INCLUDE_CONTAINERS_MIXED_INTERSECTION_H_
+
+/* These functions appear to exclude cases where the
+ * inputs have the same type and the output is guaranteed
+ * to have the same type as the inputs.  Eg, array intersection
+ */
+
+
+/* Compute the intersection of src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be equal to src_1. We assume that dst is a
+ * valid container. */
+void array_bitset_container_intersection(const array_container_t *src_1,
+                                         const bitset_container_t *src_2,
+                                         array_container_t *dst);
+
+/* Compute the size of the intersection of src_1 and src_2. */
+int array_bitset_container_intersection_cardinality(
+    const array_container_t *src_1, const bitset_container_t *src_2);
+
+
+
+/* Checking whether src_1 and src_2 intersect. */
+bool array_bitset_container_intersect(const array_container_t *src_1,
+                                         const bitset_container_t *src_2);
+
+/*
+ * Compute the intersection between src_1 and src_2 and write the result
+ * to *dst. If the return value is true, the result is a bitset_container_t
+ * otherwise is a array_container_t. We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool bitset_bitset_container_intersection(const bitset_container_t *src_1,
+                                          const bitset_container_t *src_2,
+                                          void **dst);
+
+/* Compute the intersection between src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be equal to src_1. We assume that dst is a
+ * valid container. */
+void array_run_container_intersection(const array_container_t *src_1,
+                                      const run_container_t *src_2,
+                                      array_container_t *dst);
+
+/* Compute the intersection between src_1 and src_2 and write the result to
+ * *dst. If the result is true then the result is a bitset_container_t
+ * otherwise is a array_container_t.
+ * If *dst == src_2, then an in-place intersection is attempted
+ **/
+bool run_bitset_container_intersection(const run_container_t *src_1,
+                                       const bitset_container_t *src_2,
+                                       void **dst);
+
+/* Compute the size of the intersection between src_1 and src_2 . */
+int array_run_container_intersection_cardinality(const array_container_t *src_1,
+                                                 const run_container_t *src_2);
+
+/* Compute the size of the intersection  between src_1 and src_2
+ **/
+int run_bitset_container_intersection_cardinality(const run_container_t *src_1,
+                                       const bitset_container_t *src_2);
+
+
+/* Check that src_1 and src_2 intersect. */
+bool array_run_container_intersect(const array_container_t *src_1,
+                                      const run_container_t *src_2);
+
+/* Check that src_1 and src_2 intersect.
+ **/
+bool run_bitset_container_intersect(const run_container_t *src_1,
+                                       const bitset_container_t *src_2);
+
+/*
+ * Same as bitset_bitset_container_intersection except that if the output is to
+ * be a
+ * bitset_container_t, then src_1 is modified and no allocation is made.
+ * If the output is to be an array_container_t, then caller is responsible
+ * to free the container.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_bitset_container_intersection_inplace(
+    bitset_container_t *src_1, const bitset_container_t *src_2, void **dst);
+
+#endif /* INCLUDE_CONTAINERS_MIXED_INTERSECTION_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_intersection.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_negation.h */
+/*
+ * mixed_negation.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_MIXED_NEGATION_H_
+#define INCLUDE_CONTAINERS_MIXED_NEGATION_H_
+
+
+/* Negation across the entire range of the container.
+ * Compute the  negation of src  and write the result
+ * to *dst. The complement of a
+ * sufficiently sparse set will always be dense and hence a bitmap.
+ * We assume that dst is pre-allocated and a valid bitset container
+ * There can be no in-place version.
+ */
+void array_container_negation(const array_container_t *src,
+                              bitset_container_t *dst);
+
+/* Negation across the entire range of the container
+ * Compute the  negation of src  and write the result
+ * to *dst.  A true return value indicates a bitset result,
+ * otherwise the result is an array container.
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool bitset_container_negation(const bitset_container_t *src, void **dst);
+
+/* inplace version */
+/*
+ * Same as bitset_container_negation except that if the output is to
+ * be a
+ * bitset_container_t, then src is modified and no allocation is made.
+ * If the output is to be an array_container_t, then caller is responsible
+ * to free the container.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_container_negation_inplace(bitset_container_t *src, void **dst);
+
+/* Negation across the entire range of container
+ * Compute the  negation of src  and write the result
+ * to *dst.
+ * Return values are the *_TYPE_CODE values defined in containers.h
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+int run_container_negation(const run_container_t *src, void **dst);
+
+/*
+ * Same as run_container_negation except that if the output is to
+ * be a
+ * run_container_t, and has the capacity to hold the result,
+ * then src is modified and no allocation is made.
+ * In all cases, the result is in *dst.
+ */
+int run_container_negation_inplace(run_container_t *src, void **dst);
+
+/* Negation across a range of the container.
+ * Compute the  negation of src  and write the result
+ * to *dst. Returns true if the result is a bitset container
+ * and false for an array container.  *dst is not preallocated.
+ */
+bool array_container_negation_range(const array_container_t *src,
+                                    const int range_start, const int range_end,
+                                    void **dst);
+
+/* Even when the result would fit, it is unclear how to make an
+ * inplace version without inefficient copying.  Thus this routine
+ * may be a wrapper for the non-in-place version
+ */
+bool array_container_negation_range_inplace(array_container_t *src,
+                                            const int range_start,
+                                            const int range_end, void **dst);
+
+/* Negation across a range of the container
+ * Compute the  negation of src  and write the result
+ * to *dst.  A true return value indicates a bitset result,
+ * otherwise the result is an array container.
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool bitset_container_negation_range(const bitset_container_t *src,
+                                     const int range_start, const int range_end,
+                                     void **dst);
+
+/* inplace version */
+/*
+ * Same as bitset_container_negation_range except that if the output is to
+ * be a
+ * bitset_container_t, then src is modified and no allocation is made.
+ * If the output is to be an array_container_t, then caller is responsible
+ * to free the container.
+ * In all cases, the result is in *dst.
+ */
+bool bitset_container_negation_range_inplace(bitset_container_t *src,
+                                             const int range_start,
+                                             const int range_end, void **dst);
+
+/* Negation across a range of container
+ * Compute the  negation of src  and write the result
+ * to *dst.  Return values are the *_TYPE_CODE values defined in containers.h
+ *  We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+int run_container_negation_range(const run_container_t *src,
+                                 const int range_start, const int range_end,
+                                 void **dst);
+
+/*
+ * Same as run_container_negation_range except that if the output is to
+ * be a
+ * run_container_t, and has the capacity to hold the result,
+ * then src is modified and no allocation is made.
+ * In all cases, the result is in *dst.
+ */
+int run_container_negation_range_inplace(run_container_t *src,
+                                         const int range_start,
+                                         const int range_end, void **dst);
+
+#endif /* INCLUDE_CONTAINERS_MIXED_NEGATION_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_negation.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_union.h */
+/*
+ * mixed_union.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_MIXED_UNION_H_
+#define INCLUDE_CONTAINERS_MIXED_UNION_H_
+
+/* These functions appear to exclude cases where the
+ * inputs have the same type and the output is guaranteed
+ * to have the same type as the inputs.  Eg, bitset unions
+ */
+
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. It is allowed for src_2 to be dst.   */
+void array_bitset_container_union(const array_container_t *src_1,
+                                  const bitset_container_t *src_2,
+                                  bitset_container_t *dst);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. It is allowed for src_2 to be dst.  This version does not
+ * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY). */
+void array_bitset_container_lazy_union(const array_container_t *src_1,
+                                       const bitset_container_t *src_2,
+                                       bitset_container_t *dst);
+
+/*
+ * Compute the union between src_1 and src_2 and write the result
+ * to *dst. If the returned value is true, the result is a bitset_container_t,
+ * otherwise an array_container_t. We assume that dst is not pre-allocated. In
+ * case of failure, *dst will be NULL.
+ */
+bool array_array_container_union(const array_container_t *src_1,
+                                 const array_container_t *src_2, void **dst);
+
+/*
+ * Compute the union between src_1 and src_2 and write the result
+ * to *dst if it cannot be written to src_1. If the returned value is true,
+ * the result is a bitset_container_t,
+ * otherwise an array_container_t. When the result is an array_container_t, it
+ * is either written to src_1 (if *dst is null) or to *dst.
+ * If the result is a bitset_container_t and *dst is null, then there was a failure.
+ */
+bool array_array_container_inplace_union(array_container_t *src_1,
+                                 const array_container_t *src_2, void **dst);
+
+/*
+ * Same as array_array_container_union except that it will more eagerly produce
+ * a bitset.
+ */
+bool array_array_container_lazy_union(const array_container_t *src_1,
+                                      const array_container_t *src_2,
+                                      void **dst);
+
+/*
+ * Same as array_array_container_inplace_union except that it will more eagerly produce
+ * a bitset.
+ */
+bool array_array_container_lazy_inplace_union(array_container_t *src_1,
+                                      const array_container_t *src_2,
+                                      void **dst);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. We assume that dst is a
+ * valid container. The result might need to be further converted to array or
+ * bitset container,
+ * the caller is responsible for the eventual conversion. */
+void array_run_container_union(const array_container_t *src_1,
+                               const run_container_t *src_2,
+                               run_container_t *dst);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * src_2. The result might need to be further converted to array or
+ * bitset container,
+ * the caller is responsible for the eventual conversion. */
+void array_run_container_inplace_union(const array_container_t *src_1,
+                                       run_container_t *src_2);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be src_2.
+ * If run_container_is_full(src_1) is true, you must not be calling this
+ * function.
+ **/
+void run_bitset_container_union(const run_container_t *src_1,
+                                const bitset_container_t *src_2,
+                                bitset_container_t *dst);
+
+/* Compute the union of src_1 and src_2 and write the result to
+ * dst. It is allowed for dst to be src_2.  This version does not
+ * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY).
+ * If run_container_is_full(src_1) is true, you must not be calling this
+ * function.
+ * */
+void run_bitset_container_lazy_union(const run_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     bitset_container_t *dst);
+
+#endif /* INCLUDE_CONTAINERS_MIXED_UNION_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_union.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_xor.h */
+/*
+ * mixed_xor.h
+ *
+ */
+
+#ifndef INCLUDE_CONTAINERS_MIXED_XOR_H_
+#define INCLUDE_CONTAINERS_MIXED_XOR_H_
+
+/* These functions appear to exclude cases where the
+ * inputs have the same type and the output is guaranteed
+ * to have the same type as the inputs.  Eg, bitset unions
+ */
+
+/*
+ * Java implementation (as of May 2016) for array_run, run_run
+ * and  bitset_run don't do anything different for inplace.
+ * (They are not truly in place.)
+ */
+
+
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst (which has no container initially).
+ * Result is true iff dst is a bitset  */
+bool array_bitset_container_xor(const array_container_t *src_1,
+                                const bitset_container_t *src_2, void **dst);
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst. It is allowed for src_2 to be dst.  This version does not
+ * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY).
+ */
+
+void array_bitset_container_lazy_xor(const array_container_t *src_1,
+                                     const bitset_container_t *src_2,
+                                     bitset_container_t *dst);
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst (which has no container initially). Return value is
+ * "dst is a bitset"
+ */
+
+bool bitset_bitset_container_xor(const bitset_container_t *src_1,
+                                 const bitset_container_t *src_2, void **dst);
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_xor(const run_container_t *src_1,
+                              const bitset_container_t *src_2, void **dst);
+
+/* lazy xor.  Dst is initialized and may be equal to src_2.
+ *  Result is left as a bitset container, even if actual
+ *  cardinality would dictate an array container.
+ */
+
+void run_bitset_container_lazy_xor(const run_container_t *src_1,
+                                   const bitset_container_t *src_2,
+                                   bitset_container_t *dst);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int array_run_container_xor(const array_container_t *src_1,
+                            const run_container_t *src_2, void **dst);
+
+/* dst does not initially have a valid container.  Creates either
+ * an array or a bitset container, indicated by return code
+ */
+
+bool array_array_container_xor(const array_container_t *src_1,
+                               const array_container_t *src_2, void **dst);
+
+/* dst does not initially have a valid container.  Creates either
+ * an array or a bitset container, indicated by return code.
+ * A bitset container will not have a valid cardinality and the
+ * container type might not be correct for the actual cardinality
+ */
+
+bool array_array_container_lazy_xor(const array_container_t *src_1,
+                                    const array_container_t *src_2, void **dst);
+
+/* Dst is a valid run container. (Can it be src_2? Let's say not.)
+ * Leaves result as run container, even if other options are
+ * smaller.
+ */
+
+void array_run_container_lazy_xor(const array_container_t *src_1,
+                                  const run_container_t *src_2,
+                                  run_container_t *dst);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int run_run_container_xor(const run_container_t *src_1,
+                          const run_container_t *src_2, void **dst);
+
+/* INPLACE versions (initial implementation may not exploit all inplace
+ * opportunities (if any...)
+ */
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst (which has no container initially).  It will modify src_1
+ * to be dst if the result is a bitset.  Otherwise, it will
+ * free src_1 and dst will be a new array container.  In both
+ * cases, the caller is responsible for deallocating dst.
+ * Returns true iff dst is a bitset  */
+
+bool bitset_array_container_ixor(bitset_container_t *src_1,
+                                 const array_container_t *src_2, void **dst);
+
+bool bitset_bitset_container_ixor(bitset_container_t *src_1,
+                                  const bitset_container_t *src_2, void **dst);
+
+bool array_bitset_container_ixor(array_container_t *src_1,
+                                 const bitset_container_t *src_2, void **dst);
+
+/* Compute the xor of src_1 and src_2 and write the result to
+ * dst. Result may be either a bitset or an array container
+ * (returns "result is bitset"). dst does not initially have
+ * any container, but becomes either a bitset container (return
+ * result true) or an array container.
+ */
+
+bool run_bitset_container_ixor(run_container_t *src_1,
+                               const bitset_container_t *src_2, void **dst);
+
+bool bitset_run_container_ixor(bitset_container_t *src_1,
+                               const run_container_t *src_2, void **dst);
+
+/* dst does not indicate a valid container initially.  Eventually it
+ * can become any kind of container.
+ */
+
+int array_run_container_ixor(array_container_t *src_1,
+                             const run_container_t *src_2, void **dst);
+
+int run_array_container_ixor(run_container_t *src_1,
+                             const array_container_t *src_2, void **dst);
+
+bool array_array_container_ixor(array_container_t *src_1,
+                                const array_container_t *src_2, void **dst);
+
+int run_run_container_ixor(run_container_t *src_1, const run_container_t *src_2,
+                           void **dst);
+#endif
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/mixed_xor.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/containers.h */
+#ifndef CONTAINERS_CONTAINERS_H
+#define CONTAINERS_CONTAINERS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+
+// would enum be possible or better?
+
+/**
+ * The switch case statements follow
+ * BITSET_CONTAINER_TYPE_CODE -- ARRAY_CONTAINER_TYPE_CODE --
+ * RUN_CONTAINER_TYPE_CODE
+ * so it makes more sense to number them 1, 2, 3 (in the vague hope that the
+ * compiler might exploit this ordering).
+ */
+
+#define BITSET_CONTAINER_TYPE_CODE 1
+#define ARRAY_CONTAINER_TYPE_CODE 2
+#define RUN_CONTAINER_TYPE_CODE 3
+#define SHARED_CONTAINER_TYPE_CODE 4
+
+// macro for pairing container type codes
+#define CONTAINER_PAIR(c1, c2) (4 * (c1) + (c2))
+
+/**
+ * A shared container is a wrapper around a container
+ * with reference counting.
+ */
+
+struct shared_container_s {
+    void *container;   // the wrapped container; its type is given by typecode
+    uint8_t typecode;  // type code of the wrapped container
+    uint32_t counter;  // reference count; to be managed atomically
+};
+
+typedef struct shared_container_s shared_container_t;
+
+/*
+ * With copy_on_write = true
+ *  Create a new shared container if the typecode is not SHARED_CONTAINER_TYPE_CODE,
+ * otherwise, increase the count
+ * If copy_on_write = false, then clone.
+ * Return NULL in case of failure.
+ **/
+void *get_copy_of_container(void *container, uint8_t *typecode,
+                            bool copy_on_write);
+
+/* Frees a shared container (actually decrement its counter and only frees when
+ * the counter falls to zero). */
+void shared_container_free(shared_container_t *container);
+
+/* extract a copy from the shared container, freeing the shared container if
+there is just one instance left,
+clone instances when the counter is higher than one
+*/
+void *shared_container_extract_copy(shared_container_t *container,
+                                    uint8_t *typecode);
+
+/* access to container underneath (const version); *type is updated to match */
+inline const void *container_unwrap_shared(
+    const void *candidate_shared_container, uint8_t *type) {
+    if (*type == SHARED_CONTAINER_TYPE_CODE) {
+        *type =
+            ((const shared_container_t *)candidate_shared_container)->typecode;
+        assert(*type != SHARED_CONTAINER_TYPE_CODE);  // no nested sharing
+        return ((const shared_container_t *)candidate_shared_container)->container;
+    } else {
+        return candidate_shared_container;  // not shared: nothing to unwrap
+    }
+}
+
+
+/* access to container underneath (mutable version); *type is updated to match */
+inline void *container_mutable_unwrap_shared(
+    void *candidate_shared_container, uint8_t *type) {
+    if (*type == SHARED_CONTAINER_TYPE_CODE) {
+        *type =
+            ((shared_container_t *)candidate_shared_container)->typecode;
+        assert(*type != SHARED_CONTAINER_TYPE_CODE);  // no nested sharing
+        return ((shared_container_t *)candidate_shared_container)->container;
+    } else {
+        return candidate_shared_container;  // not shared: nothing to unwrap
+    }
+}
+
+/* type of the underlying container, looking through a shared wrapper */
+static inline uint8_t get_container_type(const void *container, uint8_t type) {
+    if (type == SHARED_CONTAINER_TYPE_CODE) {
+        return ((const shared_container_t *)container)->typecode;
+    } else {
+        return type;  // not shared: the given type is already the real one
+    }
+}
+
+/**
+ * Copies a container, requires a typecode. This allocates new memory, caller
+ * is responsible for deallocation. If the container is not shared, then it is
+ * physically cloned. Sharable containers are not cloneable.
+ */
+void *container_clone(const void *container, uint8_t typecode);
+
+/* access to container underneath, extracting a copy if it is shared */
+static inline void *get_writable_copy_if_shared(
+    void *candidate_shared_container, uint8_t *type) {
+    if (*type == SHARED_CONTAINER_TYPE_CODE) {
+        return shared_container_extract_copy(
+            (shared_container_t *)candidate_shared_container, type);
+    } else {
+        return candidate_shared_container;  // not shared: returned as is
+    }
+}
+
+/**
+ * End of shared container code
+ */
+
+static const char *container_names[] = {"bitset", "array", "run", "shared"};
+static const char *shared_container_names[] = {
+    "bitset (shared)", "array (shared)", "run (shared)"};  // both indexed by typecode - 1
+
+// Convert a container of any (non-shared) type to a bitset container.
+// A bitset input is returned as is; for an array or run input a new
+// container is produced and the caller is responsible for freeing the
+// previous one. The container should not be a shared container.
+static inline void *container_to_bitset(void *container, uint8_t typecode) {
+    bitset_container_t *result = NULL;
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return container;  // nothing to do
+        case ARRAY_CONTAINER_TYPE_CODE:
+            result =
+                bitset_container_from_array((array_container_t *)container);
+            return result;
+        case RUN_CONTAINER_TYPE_CODE:
+            result = bitset_container_from_run((run_container_t *)container);
+            return result;
+        case SHARED_CONTAINER_TYPE_CODE:
+            assert(false);  // shared containers are not accepted here
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+/**
+ * Get the container name ("bitset", "array", "run" or "shared") from the typecode
+ */
+static inline const char *get_container_name(uint8_t typecode) {
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return container_names[0];
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return container_names[1];
+        case RUN_CONTAINER_TYPE_CODE:
+            return container_names[2];
+        case SHARED_CONTAINER_TYPE_CODE:
+            return container_names[3];
+        default:  // not one of the four known type codes
+            assert(false);
+            __builtin_unreachable();
+            return "unknown";
+    }
+}
+
+static inline const char *get_full_container_name(const void *container,
+                                                  uint8_t typecode) {  // like get_container_name, but names the wrapped type of a shared container
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return container_names[0];
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return container_names[1];
+        case RUN_CONTAINER_TYPE_CODE:
+            return container_names[2];
+        case SHARED_CONTAINER_TYPE_CODE:  // name depends on the wrapped container's type
+            switch (((const shared_container_t *)container)->typecode) {
+                case BITSET_CONTAINER_TYPE_CODE:
+                    return shared_container_names[0];
+                case ARRAY_CONTAINER_TYPE_CODE:
+                    return shared_container_names[1];
+                case RUN_CONTAINER_TYPE_CODE:
+                    return shared_container_names[2];
+                default:  // wrapped typecode is not bitset/array/run
+                    assert(false);
+                    __builtin_unreachable();
+                    return "unknown";
+            }
+            break;  // not reached: every inner case returns
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return "unknown";
+    }
+    __builtin_unreachable();
+    return NULL;
+}
+
+/**
+ * Get the container cardinality (number of elements), requires a typecode
+ */
+static inline int container_get_cardinality(const void *container,
+                                            uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);  // unwrap if shared
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return bitset_container_cardinality(
+                (const bitset_container_t *)container);
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return array_container_cardinality(
+                (const array_container_t *)container);
+        case RUN_CONTAINER_TYPE_CODE:
+            return run_container_cardinality(
+                (const run_container_t *)container);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+
+
+// returns true if a container is known to be full, i.e., its cardinality
+// is (1 << 16). Note that a lazy bitset container
+// might be full without us knowing
+static inline bool container_is_full(const void *container, uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);  // unwrap if shared
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return bitset_container_cardinality(
+                       (const bitset_container_t *)container) == (1 << 16);
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return array_container_cardinality(
+                       (const array_container_t *)container) == (1 << 16);
+        case RUN_CONTAINER_TYPE_CODE:
+            return run_container_is_full((const run_container_t *)container);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+static inline int container_shrink_to_fit(void *container, uint8_t typecode) {
+    container = container_mutable_unwrap_shared(container, &typecode);  // unwrap if shared
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return 0;  // no shrinking possible
+        case ARRAY_CONTAINER_TYPE_CODE:  // result comes from the array-specific routine
+            return array_container_shrink_to_fit(
+                (array_container_t *)container);
+        case RUN_CONTAINER_TYPE_CODE:  // result comes from the run-specific routine
+            return run_container_shrink_to_fit((run_container_t *)container);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+
+/**
+ * make a container with a run of ones
+ */
+/* use an array container when the computed cardinality
+ * (range_end - range_start + 1) is at most 2, and a run container
+ * otherwise; the chosen type code is stored in *result_type */
+static inline void *container_range_of_ones(uint32_t range_start,
+                                            uint32_t range_end,
+                                            uint8_t *result_type) {
+    assert(range_end >= range_start);
+    uint64_t cardinality =  range_end - range_start + 1;
+    if(cardinality <= 2) {
+      *result_type = ARRAY_CONTAINER_TYPE_CODE;
+      return array_container_create_range(range_start, range_end);
+    } else {
+      *result_type = RUN_CONTAINER_TYPE_CODE;
+      return run_container_create_range(range_start, range_end);
+    }
+}
+
+
+/*  Create a container with all the values in [min,max) at a distance
+    k*step from min; *type receives its type code. NULL if step == 0. */
+static inline void *container_from_range(uint8_t *type, uint32_t min,
+                                         uint32_t max, uint16_t step) {
+    if (step == 0) return NULL;  // being paranoid
+    if (step == 1) {
+        return container_range_of_ones(min,max,type);
+        // Note: the result is not always a run (need to check the cardinality)
+        //*type = RUN_CONTAINER_TYPE_CODE;
+        //return run_container_create_range(min, max);
+    }
+    int size = (max - min + step - 1) / step;  // (max - min) / step, rounded up
+    if (size <= DEFAULT_MAX_SIZE) {  // array container
+        *type = ARRAY_CONTAINER_TYPE_CODE;
+        array_container_t *array = array_container_create_given_capacity(size);
+        array_container_add_from_range(array, min, max, step);
+        assert(array->cardinality == size);
+        return array;
+    } else {  // bitset container
+        *type = BITSET_CONTAINER_TYPE_CODE;
+        bitset_container_t *bitset = bitset_container_create();
+        bitset_container_add_from_range(bitset, min, max, step);
+        assert(bitset->cardinality == size);
+        return bitset;
+    }
+}
+
+/**
+ * "repair" the container after lazy operations (may change *typecode).
+ */
+static inline void *container_repair_after_lazy(void *container,
+                                                uint8_t *typecode) {
+    container = get_writable_copy_if_shared(
+        container, typecode);  // TODO: this introduces unnecessary cloning
+    void *result = NULL;
+    switch (*typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:  // recompute cardinality, switch to array if small
+            ((bitset_container_t *)container)->cardinality =
+                bitset_container_compute_cardinality(
+                    (bitset_container_t *)container);
+            if (((bitset_container_t *)container)->cardinality <=
+                DEFAULT_MAX_SIZE) {
+                result = array_container_from_bitset(
+                    (const bitset_container_t *)container);
+                bitset_container_free((bitset_container_t *)container);  // replaced by result
+                *typecode = ARRAY_CONTAINER_TYPE_CODE;
+                return result;
+            }
+            return container;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return container;  // nothing to do
+        case RUN_CONTAINER_TYPE_CODE:  // conversion routine decides the final type
+            return convert_run_to_efficient_container_and_free(
+                (run_container_t *)container, typecode);
+        case SHARED_CONTAINER_TYPE_CODE:
+            assert(false);  // not expected at this point
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+/**
+ * Writes the underlying array to buf, outputs how many bytes were written.
+ * This is meant to be byte-by-byte compatible with the Java and Go versions of
+ * Roaring.
+ * The number of bytes written should be
+ * container_size_in_bytes(container, typecode).
+ *
+ */
+static inline int32_t container_write(const void *container, uint8_t typecode,
+                                      char *buf) {
+    container = container_unwrap_shared(container, &typecode);  // unwrap if shared
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return bitset_container_write((const bitset_container_t *)container, buf);
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return array_container_write((const array_container_t *)container, buf);
+        case RUN_CONTAINER_TYPE_CODE:
+            return run_container_write((const run_container_t *)container, buf);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+/**
+ * Get the container size in bytes under portable serialization (see
+ * container_write), requires a
+ * typecode; a shared container is unwrapped first
+ */
+static inline int32_t container_size_in_bytes(const void *container,
+                                              uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return bitset_container_size_in_bytes(
+                (const bitset_container_t *)container);
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return array_container_size_in_bytes(
+                (const array_container_t *)container);
+        case RUN_CONTAINER_TYPE_CODE:
+            return run_container_size_in_bytes((const run_container_t *)container);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+/**
+ * print the container (useful for debugging), requires a  typecode
+ */
+void container_printf(const void *container, uint8_t typecode);
+
+/**
+ * print the content of the container as a comma-separated list of 32-bit values
+ * starting at base, requires a  typecode
+ */
+void container_printf_as_uint32_array(const void *container, uint8_t typecode,
+                                      uint32_t base);
+
+/**
+ * Checks whether a container is not empty, requires a typecode
+ */
+static inline bool container_nonzero_cardinality(const void *container,
+                                                 uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);  // unwrap if shared
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            return bitset_container_const_nonzero_cardinality(
+                (const bitset_container_t *)container);
+        case ARRAY_CONTAINER_TYPE_CODE:
+            return array_container_nonzero_cardinality(
+                (const array_container_t *)container);
+        case RUN_CONTAINER_TYPE_CODE:
+            return run_container_nonzero_cardinality(
+                (const run_container_t *)container);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+/**
+ * Recover memory from a container; requires a typecode.
+ */
+void container_free(void *container, uint8_t typecode);
+
+/**
+ * Write the values of a container into `output` as 32-bit integers. Requires
+ * a typecode as well as a "base" (most significant values).
+ * Returns number of ints added.
+ */
+static inline int container_to_uint32_array(uint32_t *output,
+                                            const void *container,
+                                            uint8_t typecode, uint32_t base) {
+    container = container_unwrap_shared(container, &typecode);
+    if (typecode == BITSET_CONTAINER_TYPE_CODE) {
+        return bitset_container_to_uint32_array(
+            output, (const bitset_container_t *)container, base);
+    } else if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
+        return array_container_to_uint32_array(
+            output, (const array_container_t *)container, base);
+    } else if (typecode == RUN_CONTAINER_TYPE_CODE) {
+        return run_container_to_uint32_array(
+            output, (const run_container_t *)container, base);
+    }
+    // No known typecode matched.
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+/**
+ * Insert a value into a container. Requires a typecode; writes the typecode of
+ * the returned container to *new_typecode. The returned container may be a
+ * newly allocated one rather than the input, in which case the caller is
+ * responsible for memory deallocation.
+ */
+static inline void *container_add(void *container, uint16_t val,
+                                  uint8_t typecode, uint8_t *new_typecode) {
+    container = get_writable_copy_if_shared(container, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            bitset_container_set((bitset_container_t *)container, val);
+            *new_typecode = BITSET_CONTAINER_TYPE_CODE;
+            return container;
+        case ARRAY_CONTAINER_TYPE_CODE: {
+            array_container_t *array = (array_container_t *)container;
+            if (array_container_try_add(array, val, DEFAULT_MAX_SIZE) != -1) {
+                *new_typecode = ARRAY_CONTAINER_TYPE_CODE;
+                return array;
+            }
+            // try_add reported -1: continue with a bitset made from the array
+            bitset_container_t *grown = bitset_container_from_array(array);
+            bitset_container_add(grown, val);
+            *new_typecode = BITSET_CONTAINER_TYPE_CODE;
+            return grown;
+        } break;
+        case RUN_CONTAINER_TYPE_CODE:
+            // per Java, no container type adjustments are done (revisit?)
+            run_container_add((run_container_t *)container, val);
+            *new_typecode = RUN_CONTAINER_TYPE_CODE;
+            return container;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;
+    }
+}
+
+/**
+ * Delete a value from a container. Requires a typecode; writes the typecode of
+ * the returned container to *new_typecode.
+ * The returned container may differ from the one passed in: this function
+ * may allocate a new container, and caller is responsible for memory
+ * deallocation.
+ */
+static inline void *container_remove(void *container, uint16_t val,
+                                     uint8_t typecode, uint8_t *new_typecode) {
+    container = get_writable_copy_if_shared(container, &typecode);
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE_CODE: {
+            bitset_container_t *bits = (bitset_container_t *)container;
+            // removed and now at most DEFAULT_MAX_SIZE values: return an array
+            if (bitset_container_remove(bits, val) &&
+                bitset_container_cardinality(bits) <= DEFAULT_MAX_SIZE) {
+                *new_typecode = ARRAY_CONTAINER_TYPE_CODE;
+                return array_container_from_bitset(bits);
+            }
+            *new_typecode = typecode;
+            return container;
+        }
+        case ARRAY_CONTAINER_TYPE_CODE:
+            *new_typecode = typecode;
+            array_container_remove((array_container_t *)container, val);
+            return container;
+        case RUN_CONTAINER_TYPE_CODE:
+            // per Java, no container type adjustments are done (revisit?)
+            run_container_remove((run_container_t *)container, val);
+            *new_typecode = RUN_CONTAINER_TYPE_CODE;
+            return container;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;
+    }
+}
+
+/**
+ * Tells whether a value is present in a container; requires a typecode.
+ */
+inline bool container_contains(const void *container, uint16_t val,
+                               uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);
+    if (typecode == BITSET_CONTAINER_TYPE_CODE) {
+        return bitset_container_get((const bitset_container_t *)container,
+                                    val);
+    }
+    if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
+        return array_container_contains(
+            (const array_container_t *)container, val);
+    }
+    if (typecode == RUN_CONTAINER_TYPE_CODE) {
+        return run_container_contains((const run_container_t *)container,
+                                      val);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return false;
+}
+
+/**
+ * Tells whether a range of values, from range_start (included) to range_end
+ * (excluded), is in a container; requires a typecode.
+ */
+static inline bool container_contains_range(const void *container, uint32_t range_start,
+                                            uint32_t range_end, uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);
+    if (typecode == BITSET_CONTAINER_TYPE_CODE) {
+        return bitset_container_get_range(
+            (const bitset_container_t *)container, range_start, range_end);
+    }
+    if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
+        return array_container_contains_range(
+            (const array_container_t *)container, range_start, range_end);
+    }
+    if (typecode == RUN_CONTAINER_TYPE_CODE) {
+        return run_container_contains_range(
+            (const run_container_t *)container, range_start, range_end);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return false;
+}
+
+int32_t container_serialize(const void *container, uint8_t typecode,
+                            char *buf) WARN_UNUSED;
+
+uint32_t container_serialization_len(const void *container, uint8_t typecode);
+
+void *container_deserialize(uint8_t typecode, const char *buf, size_t buf_len);
+
+/**
+ * Tells whether two containers hold the same content. Only content is
+ * compared: containers of different types can be "equal" in this sense.
+ */
+static inline bool container_equals(const void *c1, uint8_t type1,
+                                    const void *c2, uint8_t type2) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (CONTAINER_PAIR(type1, type2)) {
+        // both containers have the same type
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return bitset_container_equals((const bitset_container_t *)c1,
+                                           (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            return array_container_equals((const array_container_t *)c1,
+                                          (const array_container_t *)c2);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            return run_container_equals((const run_container_t *)c1,
+                                        (const run_container_t *)c2);
+        // mixed types (for array vs bitset, java would always return false?)
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            return run_container_equals_bitset((const run_container_t *)c2,
+                                               (const bitset_container_t *)c1);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return run_container_equals_bitset((const run_container_t *)c1,
+                                               (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            return array_container_equal_bitset((const array_container_t *)c2,
+                                                (const bitset_container_t *)c1);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return array_container_equal_bitset((const array_container_t *)c1,
+                                                (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            return run_container_equals_array((const run_container_t *)c2,
+                                              (const array_container_t *)c1);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            return run_container_equals_array((const run_container_t *)c1,
+                                              (const array_container_t *)c2);
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return false;
+    }
+}
+
+/**
+ * Tells whether container c1 is a subset of container c2. The two containers
+ * need not have the same type for c1 to be a subset of c2.
+ */
+static inline bool container_is_subset(const void *c1, uint8_t type1,
+                                       const void *c2, uint8_t type2) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return bitset_container_is_subset((const bitset_container_t *)c1,
+                                              (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            return false;  // by construction, size(c1) > size(c2)
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            return bitset_container_is_subset_run((const bitset_container_t *)c1,
+                                                  (const run_container_t *)c2);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return array_container_is_subset_bitset((const array_container_t *)c1,
+                                                    (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            return array_container_is_subset((const array_container_t *)c1,
+                                             (const array_container_t *)c2);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            return array_container_is_subset_run((const array_container_t *)c1,
+                                                 (const run_container_t *)c2);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return run_container_is_subset_bitset((const run_container_t *)c1,
+                                                  (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            return run_container_is_subset_array((const run_container_t *)c1,
+                                                 (const array_container_t *)c2);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            return run_container_is_subset((const run_container_t *)c1,
+                                           (const run_container_t *)c2);
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return false;
+    }
+}
+
+// macro-izations possibilities for generic non-inplace binary-op dispatch
+
+/**
+ * Compute the intersection of two containers into a new container whose
+ * typecode is written to *result_type; requires the typecode of each input.
+ * This allocates new memory, caller is responsible for deallocation.
+ */
+static inline void *container_and(const void *c1, uint8_t type1, const void *c2,
+                                  uint8_t type2, uint8_t *result_type) {
+    void *answer = NULL;
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type = bitset_bitset_container_intersection(
+                               (const bitset_container_t *)c1,
+                               (const bitset_container_t *)c2, &answer)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            answer = array_container_create();
+            array_container_intersection((const array_container_t *)c1,
+                                         (const array_container_t *)c2,
+                                         (array_container_t *)answer);
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;  // never bitset
+            break;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            answer = run_container_create();
+            run_container_intersection((const run_container_t *)c1,
+                                       (const run_container_t *)c2,
+                                       (run_container_t *)answer);
+            answer = convert_run_to_efficient_container_and_free(
+                (run_container_t *)answer, result_type);
+            break;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            answer = array_container_create();
+            array_bitset_container_intersection((const array_container_t *)c2,
+                                                (const bitset_container_t *)c1,
+                                                (array_container_t *)answer);
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;  // never bitset
+            break;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            answer = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;  // never bitset
+            array_bitset_container_intersection((const array_container_t *)c1,
+                                                (const bitset_container_t *)c2,
+                                                (array_container_t *)answer);
+            break;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            *result_type = run_bitset_container_intersection(
+                               (const run_container_t *)c2,
+                               (const bitset_container_t *)c1, &answer)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type = run_bitset_container_intersection(
+                               (const run_container_t *)c1,
+                               (const bitset_container_t *)c2, &answer)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            answer = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;  // never bitset
+            array_run_container_intersection((const array_container_t *)c1,
+                                             (const run_container_t *)c2,
+                                             (array_container_t *)answer);
+            break;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            answer = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;  // never bitset
+            array_run_container_intersection((const array_container_t *)c2,
+                                             (const run_container_t *)c1,
+                                             (array_container_t *)answer);
+            break;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;
+    }
+    return answer;
+}
+
+/**
+ * Compute the number of values that two containers have in common.
+ */
+static inline int container_and_cardinality(const void *c1, uint8_t type1,
+                                            const void *c2, uint8_t type2) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return bitset_container_and_justcard(
+                (const bitset_container_t *)c1, (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            return array_bitset_container_intersection_cardinality(
+                (const array_container_t *)c2, (const bitset_container_t *)c1);
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            return run_bitset_container_intersection_cardinality(
+                (const run_container_t *)c2, (const bitset_container_t *)c1);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return array_bitset_container_intersection_cardinality(
+                (const array_container_t *)c1, (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            return array_container_intersection_cardinality(
+                (const array_container_t *)c1, (const array_container_t *)c2);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            return array_run_container_intersection_cardinality(
+                (const array_container_t *)c1, (const run_container_t *)c2);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return run_bitset_container_intersection_cardinality(
+                (const run_container_t *)c1, (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            return array_run_container_intersection_cardinality(
+                (const array_container_t *)c2, (const run_container_t *)c1);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            return run_container_intersection_cardinality(
+                (const run_container_t *)c1, (const run_container_t *)c2);
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return 0;
+    }
+}
+
+/**
+ * Tells whether two containers intersect.
+ */
+static inline bool container_intersect(const void *c1, uint8_t type1,
+                                       const void *c2, uint8_t type2) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    switch (CONTAINER_PAIR(type1, type2)) {
+        // c1 is a bitset
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return bitset_container_intersect((const bitset_container_t *)c1,
+                                              (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            return array_bitset_container_intersect(
+                (const array_container_t *)c2, (const bitset_container_t *)c1);
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            return run_bitset_container_intersect((const run_container_t *)c2,
+                                                  (const bitset_container_t *)c1);
+        // c1 is an array
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return array_bitset_container_intersect(
+                (const array_container_t *)c1, (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            return array_container_intersect((const array_container_t *)c1,
+                                             (const array_container_t *)c2);
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            return array_run_container_intersect((const array_container_t *)c1,
+                                                 (const run_container_t *)c2);
+        // c1 is a run
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            return run_bitset_container_intersect((const run_container_t *)c1,
+                                                  (const bitset_container_t *)c2);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            return array_run_container_intersect((const array_container_t *)c2,
+                                                 (const run_container_t *)c1);
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            return run_container_intersect((const run_container_t *)c1,
+                                           (const run_container_t *)c2);
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return false;
+    }
+}
+
+/**
+ * Compute the intersection of two containers, storing the result in the
+ * first container when possible. c1 is first passed through
+ * get_writable_copy_if_shared and c2 through container_unwrap_shared.
+ * If the returned pointer is identical to c1, c1 has been modified; if it
+ * differs, a new container has been created and the caller is responsible
+ * for freeing it. The typecode of the returned container is written to
+ * *result_type; it may differ from type1.
+ */
+static inline void *container_iand(void *c1, uint8_t type1, const void *c2,
+                                   uint8_t type2, uint8_t *result_type) {
+    c1 = get_writable_copy_if_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    void *result = NULL;
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type =
+                bitset_bitset_container_intersection_inplace(
+                    (bitset_container_t *)c1, (const bitset_container_t *)c2, &result)
+                    ? BITSET_CONTAINER_TYPE_CODE
+                    : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            array_container_intersection_inplace((array_container_t *)c1,
+                                                 (const array_container_t *)c2);
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            return c1;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            run_container_intersection((const run_container_t *)c1,
+                                       (const run_container_t *)c2,
+                                       (run_container_t *)result);
+            // as of January 2016, the Java code used a non-in-place
+            // intersection for two run containers, as is done here
+            return convert_run_to_efficient_container_and_free(
+                (run_container_t *)result, result_type);
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            // c1 is a bitmap so no inplace possible: fill a fresh array
+            result = array_container_create();
+            array_bitset_container_intersection((const array_container_t *)c2,
+                                                (const bitset_container_t *)c1,
+                                                (array_container_t *)result);
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;  // never bitset
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;  // never bitset
+            array_bitset_container_intersection(
+                (const array_container_t *)c1, (const bitset_container_t *)c2,
+                (array_container_t *)c1);  // allowed: output aliases c1
+            return c1;
+
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            // will attempt in-place computation (&c1 is the destination)
+            *result_type = run_bitset_container_intersection(
+                               (const run_container_t *)c2,
+                               (const bitset_container_t *)c1, &c1)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return c1;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type = run_bitset_container_intersection(
+                               (const run_container_t *)c1,
+                               (const bitset_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            result = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;  // never bitset
+            array_run_container_intersection((const array_container_t *)c1,
+                                             (const run_container_t *)c2,
+                                             (array_container_t *)result);
+            return result;
+
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            result = array_container_create();
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;  // never bitset
+            array_run_container_intersection((const array_container_t *)c2,
+                                             (const run_container_t *)c1,
+                                             (array_container_t *)result);
+            return result;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;
+    }
+}
+
+/**
+ * Compute union between two containers, generate a new container (having type
+ * result_type), requires a typecode. This allocates new memory, caller
+ * is responsible for deallocation.
+ */
+static inline void *container_or(const void *c1, uint8_t type1, const void *c2,
+                                 uint8_t type2, uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    void *result = NULL;
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            bitset_container_or((const bitset_container_t *)c1,
+                                (const bitset_container_t *)c2,
+                                (bitset_container_t *)result);
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = array_array_container_union(
+                               (const array_container_t *)c1,
+                               (const array_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            run_container_union((const run_container_t *)c1,
+                                (const run_container_t *)c2,
+                                (run_container_t *)result);
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            // todo: could be optimized since will never convert to array
+            result = convert_run_to_efficient_container_and_free(
+                (run_container_t *)result, (uint8_t *)result_type);
+            return result;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            array_bitset_container_union((const array_container_t *)c2,
+                                         (const bitset_container_t *)c1,
+                                         (bitset_container_t *)result);
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            array_bitset_container_union((const array_container_t *)c1,
+                                         (const bitset_container_t *)c2,
+                                         (bitset_container_t *)result);
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c2)) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                run_container_copy((const run_container_t *)c2,
+                                   (run_container_t *)result);
+                return result;
+            }
+            result = bitset_container_create();
+            run_bitset_container_union((const run_container_t *)c2,
+                                       (const bitset_container_t *)c1,
+                                       (bitset_container_t *)result);
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c1)) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                run_container_copy((const run_container_t *)c1,
+                                   (run_container_t *)result);
+                return result;
+            }
+            result = bitset_container_create();
+            run_bitset_container_union((const run_container_t *)c1,
+                                       (const bitset_container_t *)c2,
+                                       (bitset_container_t *)result);
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            array_run_container_union((const array_container_t *)c1,
+                                      (const run_container_t *)c2,
+                                      (run_container_t *)result);
+            result = convert_run_to_efficient_container_and_free(
+                (run_container_t *)result, (uint8_t *)result_type);
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            array_run_container_union((const array_container_t *)c2,
+                                      (const run_container_t *)c1,
+                                      (run_container_t *)result);
+            result = convert_run_to_efficient_container_and_free(
+                (run_container_t *)result, (uint8_t *)result_type);
+            return result;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;  // unreached
+    }
+}
+
+/**
+ * Lazy union of two containers. Shared wrappers around c1/c2 are unwrapped
+ * first and both inputs are only read. Returns a newly allocated container
+ * whose typecode is stored in *result_type; the caller must deallocate it.
+ *
+ * This lazy version delays some operations such as the maintenance of the
+ * cardinality. It requires repair later on the generated containers.
+ */
+static inline void *container_lazy_or(const void *c1, uint8_t type1,
+                                      const void *c2, uint8_t type2,
+                                      uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    void *result = NULL;
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            bitset_container_or_nocard(
+                (const bitset_container_t *)c1, (const bitset_container_t *)c2,
+                (bitset_container_t *)result);  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = array_array_container_lazy_union(
+                               (const array_container_t *)c1,
+                               (const array_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            run_container_union((const run_container_t *)c1,
+                                (const run_container_t *)c2,
+                                (run_container_t *)result);
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            // the temporary run container is owned here: free it if converted
+            result = convert_run_to_efficient_container_and_free(
+                (run_container_t *)result, result_type);
+            return result;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            array_bitset_container_lazy_union(
+                (const array_container_t *)c2, (const bitset_container_t *)c1,
+                (bitset_container_t *)result);  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            array_bitset_container_lazy_union(
+                (const array_container_t *)c1, (const bitset_container_t *)c2,
+                (bitset_container_t *)result);  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c2)) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                run_container_copy((const run_container_t *)c2,
+                                   (run_container_t *)result);
+                return result;
+            }
+            result = bitset_container_create();
+            run_bitset_container_lazy_union(
+                (const run_container_t *)c2, (const bitset_container_t *)c1,
+                (bitset_container_t *)result);  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c1)) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                run_container_copy((const run_container_t *)c1,
+                                   (run_container_t *)result);
+                return result;
+            }
+            result = bitset_container_create();
+            run_bitset_container_lazy_union(
+                (const run_container_t *)c1, (const bitset_container_t *)c2,
+                (bitset_container_t *)result);  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            array_run_container_union((const array_container_t *)c1,
+                                      (const run_container_t *)c2,
+                                      (run_container_t *)result);
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            // lazy: the run container is returned as is; converting it to a
+            // more efficient representation is deliberately skipped here
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            array_run_container_union(
+                (const array_container_t *)c2, (const run_container_t *)c1,
+                (run_container_t *)result);  // TODO make lazy
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            // lazy: the run container is returned as is; converting it to a
+            // more efficient representation is deliberately skipped here
+            return result;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;  // unreached
+    }
+}
+
+/**
+ * Compute the union between two containers, with result in the first container.
+ * c1 first goes through get_writable_copy_if_shared(); c2 is only read.
+ * If the returned pointer is identical to c1, c1 was modified in place.
+ * If the returned pointer is different from c1, then a new container has been
+ * created and the caller is responsible for freeing it.
+ * The type of the first container may change; the typecode of the returned
+ * (modified or new) container is reported through *result_type.
+*/
+static inline void *container_ior(void *c1, uint8_t type1, const void *c2,
+                                  uint8_t type2, uint8_t *result_type) {
+    c1 = get_writable_copy_if_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    void *result = NULL;
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            bitset_container_or((const bitset_container_t *)c1,
+                                (const bitset_container_t *)c2,
+                                (bitset_container_t *)c1);
+#ifdef OR_BITSET_CONVERSION_TO_FULL
+            if (((bitset_container_t *)c1)->cardinality ==
+                (1 << 16)) {  // all 2^16 values set: return a full run instead
+                result = run_container_create_range(0, (1 << 16));
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                return result;
+            }
+#endif
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return c1;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = array_array_container_inplace_union(
+                               (array_container_t *)c1,
+                               (const array_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            if((result == NULL)
+               && (*result_type == ARRAY_CONTAINER_TYPE_CODE)) {
+                 return c1; // the computation was done in-place!
+            }
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            run_container_union_inplace((run_container_t *)c1,
+                                        (const run_container_t *)c2);
+            return convert_run_to_efficient_container((run_container_t *)c1,
+                                                      result_type);
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            array_bitset_container_union((const array_container_t *)c2,
+                                         (const bitset_container_t *)c1,
+                                         (bitset_container_t *)c1);
+            *result_type = BITSET_CONTAINER_TYPE_CODE;  // never array
+            return c1;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            // c1 is an array, so no in-place possible
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            array_bitset_container_union((const array_container_t *)c1,
+                                         (const bitset_container_t *)c2,
+                                         (bitset_container_t *)result);
+            return result;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c2)) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                run_container_copy((const run_container_t *)c2,
+                                   (run_container_t *)result);
+                return result;
+            }
+            run_bitset_container_union((const run_container_t *)c2,
+                                       (const bitset_container_t *)c1,
+                                       (bitset_container_t *)c1);  // allowed
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return c1;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c1)) {
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                // c1 is reported full, so it is returned without merging c2
+                return c1;
+            }
+            result = bitset_container_create();
+            run_bitset_container_union((const run_container_t *)c1,
+                                       (const bitset_container_t *)c2,
+                                       (bitset_container_t *)result);
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            array_run_container_union((const array_container_t *)c1,
+                                      (const run_container_t *)c2,
+                                      (run_container_t *)result);
+            result = convert_run_to_efficient_container_and_free(
+                (run_container_t *)result, result_type);
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            array_run_container_inplace_union((const array_container_t *)c2,
+                                              (run_container_t *)c1);
+            c1 = convert_run_to_efficient_container((run_container_t *)c1,
+                                                    result_type);
+            return c1;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;
+    }
+}
+
+/**
+ * Lazy in-place union between two containers, with result in the first one.
+ * c1 must not be a shared container (asserted); c2 is unwrapped and only read.
+ * If the returned pointer is identical to c1, c1 was modified in place.
+ * If the returned pointer is different from c1, then a new container has been
+ * created and the caller is responsible for freeing it.
+ * The type of the first container may change; the typecode of the returned
+ * (modified or new) container is reported through *result_type.
+ *
+ * This lazy version delays some operations such as the maintenance of the
+ * cardinality. It requires repair later on the generated containers.
+*/
+static inline void *container_lazy_ior(void *c1, uint8_t type1, const void *c2,
+                                       uint8_t type2, uint8_t *result_type) {
+    assert(type1 != SHARED_CONTAINER_TYPE_CODE);
+    // c1 is used as is: it must already be unshared (see the assert above)
+    c2 = container_unwrap_shared(c2, &type2);
+    void *result = NULL;
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+#ifdef LAZY_OR_BITSET_CONVERSION_TO_FULL
+            // if we have two bitsets, we might as well compute the cardinality
+            bitset_container_or((const bitset_container_t *)c1,
+                                (const bitset_container_t *)c2,
+                                (bitset_container_t *)c1);
+            // it is possible that two bitsets can lead to a full container
+            if (((bitset_container_t *)c1)->cardinality ==
+                (1 << 16)) {  // all 2^16 values set: return a full run instead
+                result = run_container_create_range(0, (1 << 16));
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                return result;
+            }
+#else
+            bitset_container_or_nocard((const bitset_container_t *)c1,
+                                       (const bitset_container_t *)c2,
+                                       (bitset_container_t *)c1);
+
+#endif
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return c1;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = array_array_container_lazy_inplace_union(
+                               (array_container_t *)c1,
+                               (const array_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            if((result == NULL)
+               && (*result_type == ARRAY_CONTAINER_TYPE_CODE)) {
+                 return c1; // the computation was done in-place!
+            }
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            run_container_union_inplace((run_container_t *)c1,
+                                        (const run_container_t *)c2);
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            return convert_run_to_efficient_container((run_container_t *)c1,
+                                                      result_type);
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            array_bitset_container_lazy_union(
+                (const array_container_t *)c2, (const bitset_container_t *)c1,
+                (bitset_container_t *)c1);              // is lazy
+            *result_type = BITSET_CONTAINER_TYPE_CODE;  // never array
+            return c1;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            // c1 is an array, so no in-place possible
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            array_bitset_container_lazy_union(
+                (const array_container_t *)c1, (const bitset_container_t *)c2,
+                (bitset_container_t *)result);  // is lazy
+            return result;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c2)) {
+                result = run_container_create();
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                run_container_copy((const run_container_t *)c2,
+                                   (run_container_t *)result);
+                return result;
+            }
+            run_bitset_container_lazy_union(
+                (const run_container_t *)c2, (const bitset_container_t *)c1,
+                (bitset_container_t *)c1);  // allowed //  lazy
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return c1;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c1)) {
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                return c1;
+            }
+            result = bitset_container_create();
+            run_bitset_container_lazy_union(
+                (const run_container_t *)c1, (const bitset_container_t *)c2,
+                (bitset_container_t *)result);  //  lazy
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            array_run_container_union((const array_container_t *)c1,
+                                      (const run_container_t *)c2,
+                                      (run_container_t *)result);
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            // lazy: the freshly built run container is returned as is;
+            // converting it to a more efficient representation is
+            // deliberately skipped here
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            array_run_container_inplace_union((const array_container_t *)c2,
+                                              (run_container_t *)c1);
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            // lazy: c1 was updated in place and stays a run container;
+            // converting it to a more efficient representation is
+            // deliberately skipped here
+            return c1;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;
+    }
+}
+
+/**
+ * Compute symmetric difference (xor) between two containers, generate a new
+ * container (typecode stored in *result_type). Inputs are unwrapped if shared
+ * and only read. This allocates new memory, caller must deallocate it.
+ */
+static inline void *container_xor(const void *c1, uint8_t type1, const void *c2,
+                                  uint8_t type2, uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    void *result = NULL;
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type = bitset_bitset_container_xor(
+                               (const bitset_container_t *)c1,
+                               (const bitset_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = array_array_container_xor(
+                               (const array_container_t *)c1,
+                               (const array_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            *result_type =  // helper's return value is used as the typecode
+                run_run_container_xor((const run_container_t *)c1,
+                                      (const run_container_t *)c2, &result);
+            return result;
+
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = array_bitset_container_xor(
+                               (const array_container_t *)c2,
+                               (const bitset_container_t *)c1, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type = array_bitset_container_xor(
+                               (const array_container_t *)c1,
+                               (const bitset_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            *result_type = run_bitset_container_xor(
+                               (const run_container_t *)c2,
+                               (const bitset_container_t *)c1, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+
+            *result_type = run_bitset_container_xor(
+                               (const run_container_t *)c1,
+                               (const bitset_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            *result_type =  // helper's return value is used as the typecode
+                array_run_container_xor((const array_container_t *)c1,
+                                        (const run_container_t *)c2, &result);
+            return result;
+
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            *result_type =  // helper's return value is used as the typecode
+                array_run_container_xor((const array_container_t *)c2,
+                                        (const run_container_t *)c1, &result);
+            return result;
+
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;  // unreached
+    }
+}
+
+/**
+ * Compute xor between two containers, generate a new container (its typecode
+ * is stored in *result_type). Inputs are unwrapped if shared and only read.
+ * This allocates new memory, caller is responsible for deallocation.
+ *
+ * This lazy version delays some operations such as the maintenance of the
+ * cardinality. It requires repair later on the generated containers.
+ */
+static inline void *container_lazy_xor(const void *c1, uint8_t type1,
+                                       const void *c2, uint8_t type2,
+                                       uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    void *result = NULL;
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            bitset_container_xor_nocard(
+                (const bitset_container_t *)c1, (const bitset_container_t *)c2,
+                (bitset_container_t *)result);  // is lazy
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = array_array_container_lazy_xor(
+                               (const array_container_t *)c1,
+                               (const array_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            // no lazy run/run variant yet: same helper as container_xor
+            *result_type =
+                run_run_container_xor((const run_container_t *)c1,
+                                      (const run_container_t *)c2, &result);
+            return result;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            array_bitset_container_lazy_xor((const array_container_t *)c2,
+                                            (const bitset_container_t *)c1,
+                                            (bitset_container_t *)result);
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            array_bitset_container_lazy_xor((const array_container_t *)c1,
+                                            (const bitset_container_t *)c2,
+                                            (bitset_container_t *)result);
+            return result;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            run_bitset_container_lazy_xor((const run_container_t *)c2,
+                                          (const bitset_container_t *)c1,
+                                          (bitset_container_t *)result);
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            result = bitset_container_create();
+            run_bitset_container_lazy_xor((const run_container_t *)c1,
+                                          (const bitset_container_t *)c2,
+                                          (bitset_container_t *)result);
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return result;
+
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            array_run_container_lazy_xor((const array_container_t *)c1,
+                                         (const run_container_t *)c2,
+                                         (run_container_t *)result);
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            // lazy: the run container is returned as is; converting it to a
+            // more efficient representation is deliberately skipped here
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            result = run_container_create();
+            array_run_container_lazy_xor((const array_container_t *)c2,
+                                         (const run_container_t *)c1,
+                                         (run_container_t *)result);
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            // lazy: the run container is returned as is; converting it to a
+            // more efficient representation is deliberately skipped here
+            return result;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;  // unreached
+    }
+}
+
+/**
+ * Compute the xor between two containers, with result in the first container.
+ * c1 first goes through get_writable_copy_if_shared(); c2 is only read.
+ * Every case returns the pointer its helper stores through the last argument.
+ * If the returned pointer is identical to c1, c1 was modified in place.
+ * If the returned pointer is different from c1, then a new container has been
+ * created and the caller is responsible for freeing it.
+ * The type of the first container may change; see *result_type.
+*/
+static inline void *container_ixor(void *c1, uint8_t type1, const void *c2,
+                                   uint8_t type2, uint8_t *result_type) {
+    c1 = get_writable_copy_if_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    void *result = NULL;
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type = bitset_bitset_container_ixor(
+                               (bitset_container_t *)c1,
+                               (const bitset_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = array_array_container_ixor(
+                               (array_container_t *)c1,
+                               (const array_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            *result_type = run_run_container_ixor(  // return value = typecode
+                (run_container_t *)c1, (const run_container_t *)c2, &result);
+            return result;
+
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = bitset_array_container_ixor(
+                               (bitset_container_t *)c1,
+                               (const array_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+            return result;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type = array_bitset_container_ixor(
+                               (array_container_t *)c1,
+                               (const bitset_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+
+            return result;
+
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            *result_type =
+                bitset_run_container_ixor((bitset_container_t *)c1,
+                                          (const run_container_t *)c2, &result)
+                    ? BITSET_CONTAINER_TYPE_CODE
+                    : ARRAY_CONTAINER_TYPE_CODE;
+
+            return result;
+
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            *result_type = run_bitset_container_ixor(
+                               (run_container_t *)c1,
+                               (const bitset_container_t *)c2, &result)
+                               ? BITSET_CONTAINER_TYPE_CODE
+                               : ARRAY_CONTAINER_TYPE_CODE;
+
+            return result;
+
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            *result_type = array_run_container_ixor(  // return value = typecode
+                (array_container_t *)c1, (const run_container_t *)c2, &result);
+            return result;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = run_array_container_ixor(  // return value = typecode
+                (run_container_t *)c1, (const array_container_t *)c2, &result);
+            return result;
+        default:
+            assert(false);
+            __builtin_unreachable();
+            return NULL;
+    }
+}
+
+/**
+ * Lazy in-place xor of two containers; the result replaces the first one.
+ * If the returned pointer is identical to c1, then c1 itself was modified.
+ * If the returned pointer is different from c1, then a new container has been
+ * created and the caller is responsible for freeing it.
+ * The resulting type is written to *result_type and may differ from type1.
+ *
+ * This lazy version delays some operations such as the maintenance of the
+ * cardinality, so the generated containers require repair later on.
+ * c1 must not be a shared container: this is asserted, not handled.
+*/
+static inline void *container_lazy_ixor(void *c1, uint8_t type1, const void *c2,
+                                        uint8_t type2, uint8_t *result_type) {
+    // Unlike container_ixor, no writable copy of c1 is requested here.
+    assert(type1 != SHARED_CONTAINER_TYPE_CODE);
+    c2 = container_unwrap_shared(c2, &type2);
+    if (CONTAINER_PAIR(type1, type2) ==
+        CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE, BITSET_CONTAINER_TYPE_CODE)) {
+        bitset_container_t *dst = (bitset_container_t *)c1;
+        // is lazy: the _nocard helper writes its output back into c1
+        bitset_container_xor_nocard(dst, (const bitset_container_t *)c2, dst);
+        *result_type = BITSET_CONTAINER_TYPE_CODE;
+        return c1;
+    }
+    // TODO: other cases being lazy, esp. when we know inplace not likely
+    // could see the corresponding code for union
+
+    // c1 may be a dirty bitset (without a precomputed cardinality) and calling
+    // container_ixor on it might be unsafe, so its cardinality is repaired.
+    if (type1 == BITSET_CONTAINER_TYPE_CODE) {
+        bitset_container_t *dirty = (bitset_container_t *)c1;
+        if (dirty->cardinality == BITSET_UNKNOWN_CARDINALITY) {
+            dirty->cardinality = bitset_container_compute_cardinality(dirty);
+        }
+    }
+    return container_ixor(c1, type1, c2, type2, result_type);
+}
+
+/**
+ * Compute the difference (andnot) between two containers and return it as a
+ * new container whose type is written to *result_type. This allocates new
+ * memory; the caller is responsible for deallocating the returned container.
+ */
+static inline void *container_andnot(const void *c1, uint8_t type1,
+                                     const void *c2, uint8_t type2,
+                                     uint8_t *result_type) {
+    c1 = container_unwrap_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+    void *answer = NULL;
+    // Some helpers return a flag rather than a type code: a true flag is
+    // reported as BITSET, a false one as ARRAY.
+    bool is_bitset;
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            is_bitset = bitset_bitset_container_andnot(
+                (const bitset_container_t *)c1, (const bitset_container_t *)c2,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            answer = array_container_create();
+            array_array_container_andnot((const array_container_t *)c1,
+                                         (const array_container_t *)c2,
+                                         (array_container_t *)answer);
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c2)) {
+                // c2 is reported full: hand back a freshly created array
+                answer = array_container_create();
+                *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            } else {
+                *result_type = run_run_container_andnot(
+                    (const run_container_t *)c1, (const run_container_t *)c2,
+                    &answer);
+            }
+            break;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            is_bitset = bitset_array_container_andnot(
+                (const bitset_container_t *)c1, (const array_container_t *)c2,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            answer = array_container_create();
+            array_bitset_container_andnot((const array_container_t *)c1,
+                                          (const bitset_container_t *)c2,
+                                          (array_container_t *)answer);
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            if (run_container_is_full((const run_container_t *)c2)) {
+                answer = array_container_create();
+                *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            } else {
+                is_bitset = bitset_run_container_andnot(
+                    (const bitset_container_t *)c1,
+                    (const run_container_t *)c2, &answer);
+                *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                         : ARRAY_CONTAINER_TYPE_CODE;
+            }
+            break;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            is_bitset = run_bitset_container_andnot(
+                (const run_container_t *)c1, (const bitset_container_t *)c2,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE): {
+            const bool c2_full =
+                run_container_is_full((const run_container_t *)c2);
+            answer = array_container_create();
+            if (!c2_full) {
+                array_run_container_andnot((const array_container_t *)c1,
+                                           (const run_container_t *)c2,
+                                           (array_container_t *)answer);
+            }
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        }
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = run_array_container_andnot(
+                (const run_container_t *)c1, (const array_container_t *)c2,
+                &answer);
+            break;
+        default:
+            assert(false);
+            __builtin_unreachable();
+    }
+    return answer;
+}
+
+/**
+ * Compute the andnot between two containers, with the result stored in the
+ * first container.
+ * If the returned pointer is identical to c1, then c1 itself has been
+ * modified.
+ * If the returned pointer is different from c1, then a new container has been
+ * created and the caller is responsible for freeing it.
+ * The type of the result is written to *result_type and may differ from
+ * type1. Returns the modified (and possibly new) container.
+*/
+static inline void *container_iandnot(void *c1, uint8_t type1, const void *c2,
+                                      uint8_t type2, uint8_t *result_type) {
+    c1 = get_writable_copy_if_shared(c1, &type1);
+    c2 = container_unwrap_shared(c2, &type2);
+
+    // Helpers that take an output pointer fill `answer`; the helpers without
+    // one are followed by a direct return of c1.
+    void *answer = NULL;
+    bool is_bitset;  // helper flag: true -> BITSET, false -> ARRAY
+
+    switch (CONTAINER_PAIR(type1, type2)) {
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            is_bitset = bitset_bitset_container_iandnot(
+                (bitset_container_t *)c1, (const bitset_container_t *)c2,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            // no output pointer: c1 itself is returned
+            array_array_container_iandnot((array_container_t *)c1,
+                                          (const array_container_t *)c2);
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            return c1;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            *result_type = run_run_container_iandnot(
+                (run_container_t *)c1, (const run_container_t *)c2, &answer);
+            break;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            ARRAY_CONTAINER_TYPE_CODE):
+            is_bitset = bitset_array_container_iandnot(
+                (bitset_container_t *)c1, (const array_container_t *)c2,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            // no output pointer: c1 itself is returned
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            array_bitset_container_iandnot((array_container_t *)c1,
+                                           (const bitset_container_t *)c2);
+            return c1;
+        case CONTAINER_PAIR(BITSET_CONTAINER_TYPE_CODE,
+                            RUN_CONTAINER_TYPE_CODE):
+            is_bitset = bitset_run_container_iandnot(
+                (bitset_container_t *)c1, (const run_container_t *)c2,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE,
+                            BITSET_CONTAINER_TYPE_CODE):
+            is_bitset = run_bitset_container_iandnot(
+                (run_container_t *)c1, (const bitset_container_t *)c2,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case CONTAINER_PAIR(ARRAY_CONTAINER_TYPE_CODE, RUN_CONTAINER_TYPE_CODE):
+            // no output pointer: c1 itself is returned
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            array_run_container_iandnot((array_container_t *)c1,
+                                        (const run_container_t *)c2);
+            return c1;
+        case CONTAINER_PAIR(RUN_CONTAINER_TYPE_CODE, ARRAY_CONTAINER_TYPE_CODE):
+            *result_type = run_array_container_iandnot(
+                (run_container_t *)c1, (const array_container_t *)c2, &answer);
+            break;
+        default:
+            assert(false);
+            __builtin_unreachable();
+    }
+    return answer;
+}
+
+/**
+ * Visit all values x of the container once, passing (base+x,ptr) to the
+ * iterator callback. The container and its type code must both be given.
+ * Returns true if the iteration should continue.
+ */
+static inline bool container_iterate(const void *container, uint8_t typecode,
+                                     uint32_t base, roaring_iterator iterator,
+                                     void *ptr) {
+    container = container_unwrap_shared(container, &typecode);
+
+    if (typecode == BITSET_CONTAINER_TYPE_CODE) {
+        const bitset_container_t *bits = (const bitset_container_t *)container;
+        return bitset_container_iterate(bits, base, iterator, ptr);
+    }
+    if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
+        const array_container_t *arr = (const array_container_t *)container;
+        return array_container_iterate(arr, base, iterator, ptr);
+    }
+    if (typecode == RUN_CONTAINER_TYPE_CODE) {
+        const run_container_t *runs = (const run_container_t *)container;
+        return run_container_iterate(runs, base, iterator, ptr);
+    }
+    // no other type code is expected once the shared wrapper is removed
+    assert(false);
+    __builtin_unreachable();
+    return false;
+}
+
+// Same dispatch as container_iterate, for the roaring_iterator64 callback
+// type: high_bits is forwarded unchanged to the per-type helper.
+static inline bool container_iterate64(const void *container, uint8_t typecode,
+                                       uint32_t base,
+                                       roaring_iterator64 iterator,
+                                       uint64_t high_bits, void *ptr) {
+    container = container_unwrap_shared(container, &typecode);
+
+    if (typecode == BITSET_CONTAINER_TYPE_CODE) {
+        const bitset_container_t *bits = (const bitset_container_t *)container;
+        return bitset_container_iterate64(bits, base, iterator, high_bits, ptr);
+    }
+    if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
+        const array_container_t *arr = (const array_container_t *)container;
+        return array_container_iterate64(arr, base, iterator, high_bits, ptr);
+    }
+    if (typecode == RUN_CONTAINER_TYPE_CODE) {
+        const run_container_t *runs = (const run_container_t *)container;
+        return run_container_iterate64(runs, base, iterator, high_bits, ptr);
+    }
+    // no other type code is expected once the shared wrapper is removed
+    assert(false);
+    __builtin_unreachable();
+    return false;
+}
+
+// Negation of a container into a new container; the input is taken as const
+// and the type of the returned container is written to *result_type.
+static inline void *container_not(const void *c, uint8_t typ,
+                                  uint8_t *result_type) {
+    c = container_unwrap_shared(c, &typ);
+    void *answer = NULL;
+    bool is_bitset;  // helper flag: true -> BITSET, false -> ARRAY
+    switch (typ) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            is_bitset = bitset_container_negation(
+                (const bitset_container_t *)c, &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            answer = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            array_container_negation((const array_container_t *)c,
+                                     (bitset_container_t *)answer);
+            break;
+        case RUN_CONTAINER_TYPE_CODE:
+            *result_type =
+                run_container_negation((const run_container_t *)c, &answer);
+            break;
+        default:
+            assert(false);
+            __builtin_unreachable();
+    }
+    return answer;
+}
+
+// Range-restricted negation into a new container: range_start and range_end
+// are forwarded to the helper. NOTE(review): end inclusivity not visible here.
+static inline void *container_not_range(const void *c, uint8_t typ,
+                                        uint32_t range_start,
+                                        uint32_t range_end,
+                                        uint8_t *result_type) {
+    c = container_unwrap_shared(c, &typ);
+    void *answer = NULL;
+    bool is_bitset;  // helper flag: true -> BITSET, false -> ARRAY
+    switch (typ) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            is_bitset = bitset_container_negation_range(
+                (const bitset_container_t *)c, range_start, range_end,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            is_bitset = array_container_negation_range(
+                (const array_container_t *)c, range_start, range_end,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case RUN_CONTAINER_TYPE_CODE:
+            *result_type = run_container_negation_range(
+                (const run_container_t *)c, range_start, range_end, &answer);
+            break;
+        default:
+            assert(false);
+            __builtin_unreachable();
+    }
+    return answer;
+}
+
+// In-place flavour of container_not; note that the ARRAY case frees c itself.
+static inline void *container_inot(void *c, uint8_t typ, uint8_t *result_type) {
+    c = get_writable_copy_if_shared(c, &typ);
+    void *answer = NULL;
+    bool is_bitset;  // helper flag: true -> BITSET, false -> ARRAY
+    switch (typ) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            is_bitset = bitset_container_negation_inplace(
+                (bitset_container_t *)c, &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case ARRAY_CONTAINER_TYPE_CODE: {
+            // will never be inplace: negate into a new bitset, free the array
+            array_container_t *old = (array_container_t *)c;
+            answer = bitset_container_create();
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            array_container_negation(old, (bitset_container_t *)answer);
+            array_container_free(old);
+            break;
+        }
+        case RUN_CONTAINER_TYPE_CODE:
+            *result_type =
+                run_container_negation_inplace((run_container_t *)c, &answer);
+            break;
+        default:
+            assert(false);
+            __builtin_unreachable();
+    }
+    return answer;
+}
+
+// In-place, range-restricted negation: range_start and range_end go straight
+// to the *_inplace helper. NOTE(review): end inclusivity not visible here.
+static inline void *container_inot_range(void *c, uint8_t typ,
+                                         uint32_t range_start,
+                                         uint32_t range_end,
+                                         uint8_t *result_type) {
+    c = get_writable_copy_if_shared(c, &typ);
+    void *answer = NULL;
+    bool is_bitset;  // helper flag: true -> BITSET, false -> ARRAY
+    switch (typ) {
+        case BITSET_CONTAINER_TYPE_CODE:
+            is_bitset = bitset_container_negation_range_inplace(
+                (bitset_container_t *)c, range_start, range_end,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case ARRAY_CONTAINER_TYPE_CODE:
+            is_bitset = array_container_negation_range_inplace(
+                (array_container_t *)c, range_start, range_end,
+                &answer);
+            *result_type = is_bitset ? BITSET_CONTAINER_TYPE_CODE
+                                     : ARRAY_CONTAINER_TYPE_CODE;
+            break;
+        case RUN_CONTAINER_TYPE_CODE:
+            *result_type = run_container_negation_range_inplace(
+                (run_container_t *)c, range_start, range_end, &answer);
+            break;
+        default:
+            assert(false);
+            __builtin_unreachable();
+    }
+    return answer;
+}
+
+/**
+ * If the element of given rank is in this container, supposing that the
+ * first element has rank start_rank, then the function returns true and
+ * sets element accordingly.
+ * Otherwise, it returns false and updates start_rank.
+ * The lookup itself is delegated to the select helper of the container's
+ * type.
+ */
+static inline bool container_select(const void *container, uint8_t typecode,
+                                    uint32_t *start_rank, uint32_t rank,
+                                    uint32_t *element) {
+    container = container_unwrap_shared(container, &typecode);
+
+    if (typecode == BITSET_CONTAINER_TYPE_CODE) {
+        const bitset_container_t *bits = (const bitset_container_t *)container;
+        return bitset_container_select(bits, start_rank, rank, element);
+    }
+    if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
+        const array_container_t *arr = (const array_container_t *)container;
+        return array_container_select(arr, start_rank, rank, element);
+    }
+    if (typecode == RUN_CONTAINER_TYPE_CODE) {
+        const run_container_t *runs = (const run_container_t *)container;
+        return run_container_select(runs, start_rank, rank, element);
+    }
+    // no other type code is expected once the shared wrapper is removed
+    assert(false);
+    __builtin_unreachable();
+    return false;
+}
+
+// Dispatches on the (unwrapped) type code to the matching
+// *_container_maximum helper and returns its result.
+static inline uint16_t container_maximum(const void *container,
+                                         uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);
+    if (typecode == BITSET_CONTAINER_TYPE_CODE) {
+        return bitset_container_maximum((const bitset_container_t *)container);
+    }
+    if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
+        return array_container_maximum((const array_container_t *)container);
+    }
+    if (typecode == RUN_CONTAINER_TYPE_CODE) {
+        return run_container_maximum((const run_container_t *)container);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+// Dispatches on the (unwrapped) type code to the matching
+// *_container_minimum helper and returns its result.
+static inline uint16_t container_minimum(const void *container,
+                                         uint8_t typecode) {
+    container = container_unwrap_shared(container, &typecode);
+    if (typecode == BITSET_CONTAINER_TYPE_CODE) {
+        return bitset_container_minimum((const bitset_container_t *)container);
+    }
+    if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
+        return array_container_minimum((const array_container_t *)container);
+    }
+    if (typecode == RUN_CONTAINER_TYPE_CODE) {
+        return run_container_minimum((const run_container_t *)container);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+// number of values smaller or equal to x, as computed by the rank helper of
+// the container's type
+static inline int container_rank(const void *container, uint8_t typecode,
+                                 uint16_t x) {
+    container = container_unwrap_shared(container, &typecode);
+
+    if (typecode == BITSET_CONTAINER_TYPE_CODE) {
+        return bitset_container_rank((const bitset_container_t *)container, x);
+    }
+    if (typecode == ARRAY_CONTAINER_TYPE_CODE) {
+        return array_container_rank((const array_container_t *)container, x);
+    }
+    if (typecode == RUN_CONTAINER_TYPE_CODE) {
+        return run_container_rank((const run_container_t *)container, x);
+    }
+    assert(false);
+    __builtin_unreachable();
+    return 0;  // unreached
+}
+
+/**
+ * Add all values in range [min, max] to a given container.
+ *
+ * A returned pointer different from $container is a newly created container,
+ * and the caller is responsible for freeing it. The type of the returned
+ * container is written to *result_type and may differ from $type.
+ * Returns the modified (and possibly new) container.
+ */
+static inline void *container_add_range(void *container, uint8_t type,
+                                        uint32_t min, uint32_t max,
+                                        uint8_t *result_type) {
+    // NB: when selecting new container type, we perform only inexpensive checks
+    switch (type) {
+        case BITSET_CONTAINER_TYPE_CODE: {
+            bitset_container_t *bits = (bitset_container_t *) container;
+
+            // |bits U [min,max]| = |bits| + |[min,max]| - |bits in [min,max]|
+            int32_t total = 0;
+            total += bits->cardinality;
+            total += max - min + 1;
+            total -= bitset_lenrange_cardinality(bits->array, min, max-min);
+
+            if (total == INT32_C(0x10000)) {
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                return run_container_create_range(0, INT32_C(0x10000));
+            }
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            bitset_set_lenrange(bits->array, min, max - min);
+            bits->cardinality = total;
+            return bits;
+        }
+        case ARRAY_CONTAINER_TYPE_CODE: {
+            array_container_t *arr = (array_container_t *) container;
+
+            int32_t above = count_greater(arr->array, arr->cardinality, max);
+            int32_t below = count_less(arr->array, arr->cardinality - above, min);
+            int32_t total = below + (max - min + 1) + above;
+
+            if (total == INT32_C(0x10000)) {
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                return run_container_create_range(0, INT32_C(0x10000));
+            }
+            if (total <= DEFAULT_MAX_SIZE) {
+                *result_type = ARRAY_CONTAINER_TYPE_CODE;
+                array_container_add_range_nvals(arr, min, max, below, above);
+                return arr;
+            }
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            bitset_container_t *bits = bitset_container_from_array(arr);
+            bitset_set_lenrange(bits->array, min, max - min);
+            bits->cardinality = total;
+            return bits;
+        }
+        case RUN_CONTAINER_TYPE_CODE: {
+            run_container_t *runs = (run_container_t *) container;
+
+            int32_t above = rle16_count_greater(runs->runs, runs->n_runs, max);
+            int32_t below = rle16_count_less(runs->runs, runs->n_runs - above, min);
+
+            // byte size of the runs after the insertion vs. that of a bitset
+            int32_t run_bytes = (below + 1 + above) * sizeof(rle16_t);
+            int32_t bitset_bytes = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+
+            if (run_bytes > bitset_bytes) {
+                *result_type = BITSET_CONTAINER_TYPE_CODE;
+                return bitset_container_from_run_range(runs, min, max);
+            }
+            run_container_add_range_nruns(runs, min, max, below, above);
+            *result_type = RUN_CONTAINER_TYPE_CODE;
+            return runs;
+        }
+        default:
+            __builtin_unreachable();
+    }
+}
+
+/*
+ * Removes all elements in range [min, max].
+ * Returns one of:
+ *   - NULL if no elements left (*result_type is then left untouched)
+ *   - pointer to the original container
+ *   - pointer to a newly-allocated container (if it is more efficient)
+ *
+ * If the returned pointer is different from $container, then a new container
+ * has been created and the caller is responsible for freeing the original container.
+ */
+static inline void *container_remove_range(void *container, uint8_t type,
+                                           uint32_t min, uint32_t max,
+                                           uint8_t *result_type) {
+     switch (type) {
+        case BITSET_CONTAINER_TYPE_CODE: {
+            bitset_container_t *bitset = (bitset_container_t *) container;
+
+            int32_t result_cardinality = bitset->cardinality -
+                bitset_lenrange_cardinality(bitset->array, min, max-min);
+
+            if (result_cardinality == 0) {
+                return NULL;
+            }
+            // Both remaining outcomes clear the range in the bitset first.
+            bitset_reset_range(bitset->array, min, max+1);
+            bitset->cardinality = result_cardinality;
+            // A result of exactly DEFAULT_MAX_SIZE values must become an
+            // array too: container_add_range keeps arrays up to and
+            // including that size, so `<=` is the matching threshold.
+            if (result_cardinality <= DEFAULT_MAX_SIZE) {
+                *result_type = ARRAY_CONTAINER_TYPE_CODE;
+                return array_container_from_bitset(bitset);
+            }
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return bitset;
+        }
+        case ARRAY_CONTAINER_TYPE_CODE: {
+            array_container_t *array = (array_container_t *) container;
+
+            int32_t nvals_greater = count_greater(array->array, array->cardinality, max);
+            int32_t nvals_less = count_less(array->array, array->cardinality - nvals_greater, min);
+            int32_t result_cardinality = nvals_less + nvals_greater;
+
+            if (result_cardinality == 0) {
+                return NULL;
+            }
+            *result_type = ARRAY_CONTAINER_TYPE_CODE;
+            array_container_remove_range(array, nvals_less,
+                array->cardinality - result_cardinality);
+            return array;
+        }
+        case RUN_CONTAINER_TYPE_CODE: {
+            run_container_t *run = (run_container_t *) container;
+
+            if (run->n_runs == 0) {
+                return NULL;
+            }
+            if (min <= run_container_minimum(run) && max >= run_container_maximum(run)) {
+                return NULL;
+            }
+
+            run_container_remove_range(run, min, max);
+
+            if (run_container_serialized_size_in_bytes(run->n_runs) <=
+                    bitset_container_serialized_size_in_bytes()) {
+                *result_type = RUN_CONTAINER_TYPE_CODE;
+                return run;
+            }
+            *result_type = BITSET_CONTAINER_TYPE_CODE;
+            return bitset_container_from_run(run);
+        }
+        default:
+            __builtin_unreachable();
+     }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CONTAINERS_CONTAINERS_H */
+
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/containers.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/roaring_array.h */
+#ifndef INCLUDE_ROARING_ARRAY_H
+#define INCLUDE_ROARING_ARRAY_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#define MAX_CONTAINERS 65536  // 2^16; presumably one slot per 16-bit key -- TODO confirm
+// Integer tags; NOTE(review): their consumers are not visible in this chunk.
+#define SERIALIZATION_ARRAY_UINT32 1
+#define SERIALIZATION_CONTAINER 2
+// Serialization constants; presumably format cookies plus a threshold -- TODO confirm
+enum {
+    SERIAL_COOKIE_NO_RUNCONTAINER = 12346,
+    SERIAL_COOKIE = 12347,
+    NO_OFFSET_THRESHOLD = 4
+};
+
+/**
+ * Roaring arrays are array-based key-value pairs having containers as values
+ * and 16-bit integer keys. A roaring bitmap  might be implemented as such.
+ */
+
+// parallel arrays.  Element sizes quite different.
+// Alternative is array
+// of structs.  Which would have better
+// cache performance through binary searches?
+
+/*
+ * Three parallel arrays indexed by the same position i: keys[i] is the
+ * 16-bit key, containers[i] the container stored under that key and
+ * typecodes[i] the type code of that container.
+ */
+typedef struct roaring_array_s {
+    int32_t size;             // number of entries currently in use
+    int32_t allocation_size;  // presumably the capacity of the arrays below
+                              // (in entries) -- TODO confirm in extend_array
+    void **containers;        // containers[i]: container for keys[i]
+    uint16_t *keys;           // keys[i]: 16-bit key, looked up by binary search
+    uint8_t *typecodes;       // typecodes[i]: type code of containers[i]
+} roaring_array_t;
+
+/**
+ * Create a new roaring array
+ */
+roaring_array_t *ra_create(void);
+
+/**
+ * Initialize an existing roaring array with the specified capacity (in number
+ * of containers)
+ */
+bool ra_init_with_capacity(roaring_array_t *new_ra, uint32_t cap);
+
+/**
+ * Initialize with default capacity
+ */
+bool ra_init(roaring_array_t *t);
+
+/**
+ * Copies this roaring array, we assume that dest is not initialized
+ */
+bool ra_copy(const roaring_array_t *source, roaring_array_t *dest,
+             bool copy_on_write);
+
+/*
+ * Shrinks the capacity, returns the number of bytes saved.
+ */
+int ra_shrink_to_fit(roaring_array_t *ra);
+
+/**
+ * Copies this roaring array, we assume that dest is initialized
+ */
+bool ra_overwrite(const roaring_array_t *source, roaring_array_t *dest,
+                  bool copy_on_write);
+
+/**
+ * Frees the memory used by a roaring array
+ */
+void ra_clear(roaring_array_t *r);
+
+/**
+ * Frees the memory used by a roaring array, but does not free the containers
+ */
+void ra_clear_without_containers(roaring_array_t *r);
+
+/**
+ * Frees just the containers
+ */
+void ra_clear_containers(roaring_array_t *ra);
+
+/**
+ * Find the position of the 16-bit key x inside ra->keys.
+ *
+ * An empty array yields -1. Otherwise the last slot is probed first and, on
+ * a match, its index is returned directly; in every other case the result
+ * of binarySearch over all ra->size keys is returned unchanged. Callers in
+ * this file treat a negative result as "key not present".
+ */
+inline int32_t ra_get_index(const roaring_array_t *ra, uint16_t x) {
+    const int32_t count = ra->size;
+    if (count == 0) {
+        return count - 1;
+    }
+    const int32_t last = count - 1;
+    if (ra->keys[last] == x) {
+        return last;
+    }
+    return binarySearch(ra->keys, (int32_t)count, x);
+}
+
+/**
+ * Fetch the container stored at position i.
+ *
+ * The type code recorded for that position is written to *typecode and the
+ * container pointer itself is the return value. No bounds check is done.
+ */
+inline void *ra_get_container_at_index(const roaring_array_t *ra, uint16_t i,
+                                       uint8_t *typecode) {
+    const uint8_t *const kinds = ra->typecodes;
+    *typecode = kinds[i];
+    void *const *const slots = ra->containers;
+    return slots[i];
+}
+
+/**
+ * Retrieves the key at index i
+ */
+uint16_t ra_get_key_at_index(const roaring_array_t *ra, uint16_t i);
+
+/**
+ * Add a new key-value pair at index i
+ */
+void ra_insert_new_key_value_at(roaring_array_t *ra, int32_t i, uint16_t key,
+                                void *container, uint8_t typecode);
+
+/**
+ * Append a new key-value pair
+ */
+void ra_append(roaring_array_t *ra, uint16_t s, void *c, uint8_t typecode);
+
+/**
+ * Append a new key-value pair to ra, cloning (in COW sense) a value from sa
+ * at index index
+ */
+void ra_append_copy(roaring_array_t *ra, const roaring_array_t *sa,
+                    uint16_t index, bool copy_on_write);
+
+/**
+ * Append new key-value pairs to ra, cloning (in COW sense)  values from sa
+ * at indexes
+ * [start_index, end_index)
+ */
+void ra_append_copy_range(roaring_array_t *ra, const roaring_array_t *sa,
+                          int32_t start_index, int32_t end_index,
+                          bool copy_on_write);
+
+/** appends from sa to ra, ending with the greatest key that
+ * is less than or equal to stopping_key
+ */
+void ra_append_copies_until(roaring_array_t *ra, const roaring_array_t *sa,
+                            uint16_t stopping_key, bool copy_on_write);
+
+/** appends from sa to ra, starting with the smallest key that
+ * is strictly greater than before_start
+ */
+
+void ra_append_copies_after(roaring_array_t *ra, const roaring_array_t *sa,
+                            uint16_t before_start, bool copy_on_write);
+
+/**
+ * Move the key-value pairs to ra from sa at indexes
+ * [start_index, end_index), old array should not be freed
+ * (use ra_clear_without_containers)
+ **/
+void ra_append_move_range(roaring_array_t *ra, roaring_array_t *sa,
+                          int32_t start_index, int32_t end_index);
+/**
+ * Append new key-value pairs to ra,  from sa at indexes
+ * [start_index, end_index)
+ */
+void ra_append_range(roaring_array_t *ra, roaring_array_t *sa,
+                     int32_t start_index, int32_t end_index,
+                     bool copy_on_write);
+
+/**
+ * Store container c, together with its type code, at position i.
+ *
+ * The key at that position is left untouched. In debug builds an assertion
+ * checks that i is below ra->size.
+ */
+inline void ra_set_container_at_index(const roaring_array_t *ra, int32_t i,
+                                      void *c, uint8_t typecode) {
+    assert(i < ra->size);
+    // The two stores target different arrays, so their order is irrelevant.
+    ra->typecodes[i] = typecode;
+    ra->containers[i] = c;
+}
+
+/**
+ * If needed, increase the capacity of the array so that it can fit k values
+ * (at
+ * least);
+ */
+bool extend_array(roaring_array_t *ra, int32_t k);
+
+/** Returns the size field of ra (number of entries in use). */
+inline int32_t ra_get_size(const roaring_array_t *ra) {
+    const int32_t used = ra->size;
+    return used;
+}
+
+/**
+ * Thin wrapper around advanceUntil: forwards the key array of ra, the
+ * position pos, the current number of entries and the key x, and returns
+ * whatever advanceUntil reports.
+ */
+static inline int32_t ra_advance_until(const roaring_array_t *ra, uint16_t x,
+                                       int32_t pos) {
+    const int32_t n_entries = ra->size;
+    return advanceUntil(ra->keys, pos, n_entries, x);
+}
+
+int32_t ra_advance_until_freeing(roaring_array_t *ra, uint16_t x, int32_t pos);
+
+void ra_downsize(roaring_array_t *ra, int32_t new_length);
+
+/**
+ * Overwrite the whole entry at position i: key, container pointer and type
+ * code. The previous container is not freed here. In debug builds an
+ * assertion checks that i is below ra->size.
+ */
+inline void ra_replace_key_and_container_at_index(roaring_array_t *ra,
+                                                  int32_t i, uint16_t key,
+                                                  void *c, uint8_t typecode) {
+    assert(i < ra->size);
+
+    // Three independent stores, one per parallel array.
+    ra->typecodes[i] = typecode;
+    ra->containers[i] = c;
+    ra->keys[i] = key;
+}
+
+// write set bits to an array
+void ra_to_uint32_array(const roaring_array_t *ra, uint32_t *ans);
+
+bool ra_range_uint32_array(const roaring_array_t *ra, size_t offset, size_t limit, uint32_t *ans);
+
+/**
+ * write a bitmap to a buffer. This is meant to be compatible with
+ * the
+ * Java and Go versions. Return the size in bytes of the serialized
+ * output (which should be ra_portable_size_in_bytes(ra)).
+ */
+size_t ra_portable_serialize(const roaring_array_t *ra, char *buf);
+
+/**
+ * read a bitmap from a serialized version. This is meant to be compatible
+ * with the Java and Go versions.
+ * maxbytes  indicates how many bytes available from buf.
+ * When the function returns true, roaring_array_t is populated with the data
+ * and *readbytes indicates how many bytes were read. In all cases, if the function
+ * returns true, then maxbytes >= *readbytes.
+ */
+bool ra_portable_deserialize(roaring_array_t *ra, const char *buf, const size_t maxbytes, size_t * readbytes);
+
+/**
+ * Quickly checks whether there is a serialized bitmap at the pointer,
+ * not exceeding size "maxbytes" in bytes. This function does not allocate
+ * memory dynamically.
+ *
+ * This function returns 0 if and only if no valid bitmap is found.
+ * Otherwise, it returns how many bytes are occupied by the bitmap data.
+ */
+size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes);
+
+/**
+ * How many bytes are required to serialize this bitmap (meant to be
+ * compatible
+ * with Java and Go versions)
+ */
+size_t ra_portable_size_in_bytes(const roaring_array_t *ra);
+
+/**
+ * return true if it contains at least one run container.
+ */
+bool ra_has_run_container(const roaring_array_t *ra);
+
+/**
+ * Size of the header when serializing (meant to be compatible
+ * with Java and Go versions)
+ */
+uint32_t ra_portable_header_size(const roaring_array_t *ra);
+
+/**
+ * If the container at index i is shared, unshare it (creating a local
+ * copy if needed).
+ *
+ * The slot is replaced by whatever get_writable_copy_if_shared returns; that
+ * helper also receives the address of the slot's type code so it can update
+ * it. In debug builds an assertion checks that i is below ra->size.
+ */
+static inline void ra_unshare_container_at_index(roaring_array_t *ra,
+                                                 uint16_t i) {
+    assert(i < ra->size);
+    void **const slot = &ra->containers[i];
+    uint8_t *const kind = &ra->typecodes[i];
+    *slot = get_writable_copy_if_shared(*slot, kind);
+}
+
+/**
+ * remove at index i, sliding over all entries after i
+ */
+void ra_remove_at_index(roaring_array_t *ra, int32_t i);
+
+
+/**
+* clears all containers, sets the size at 0 and shrinks the memory usage.
+*/
+void ra_reset(roaring_array_t *ra);
+
+/**
+ * remove at index i, sliding over all entries after i. Free removed container.
+ */
+void ra_remove_at_index_and_free(roaring_array_t *ra, int32_t i);
+
+/**
+ * remove a chunk of indices, sliding over entries after it
+ */
+// void ra_remove_index_range(roaring_array_t *ra, int32_t begin, int32_t end);
+
+// used in inplace andNot only, to slide left the containers from
+// the mutated RoaringBitmap that are after the largest container of
+// the argument RoaringBitmap.  It is followed by a call to resize.
+//
+void ra_copy_range(roaring_array_t *ra, uint32_t begin, uint32_t end,
+                   uint32_t new_begin);
+
+/**
+ * Shifts rightmost $count containers to the left (distance < 0) or
+ * to the right (distance > 0).
+ * Allocates memory if necessary.
+ * This function doesn't free or create new containers.
+ * Caller is responsible for that.
+ */
+void ra_shift_tail(roaring_array_t *ra, int32_t count, int32_t distance);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/roaring_array.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/misc/configreport.h */
+/*
+ * configreport.h
+ *
+ */
+
+#ifndef INCLUDE_MISC_CONFIGREPORT_H_
+#define INCLUDE_MISC_CONFIGREPORT_H_
+
+#include <stddef.h>  // for size_t
+#include <stdint.h>
+#include <stdio.h>
+
+
+#ifdef IS_X64
+// Runs the CPUID instruction; useful for basic info (0).
+// Inputs: *eax and *ecx are loaded into the EAX and ECX registers before the
+// instruction executes (the "0" and "2" constraints tie them to the first and
+// third output operands).
+// Outputs: *eax, *ebx, *ecx and *edx receive EAX, EBX, ECX and EDX.
+// When ROARING_INLINE_ASM is not defined the body is empty and all four
+// pointees are left exactly as the caller set them.
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+                                unsigned int *ecx, unsigned int *edx) {
+#ifdef ROARING_INLINE_ASM
+    __asm volatile("cpuid"
+                   : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
+                   : "0"(*eax), "2"(*ecx));
+#endif /* not sure what to do when inline assembly is unavailable*/
+}
+
+// CPUID instruction takes no parameters as CPUID implicitly uses the EAX
+// register.
+// The EAX register should be loaded with a value specifying what information to
+// return
+//
+// code is placed in EAX before the instruction runs; afterwards *eax, *ebx,
+// *ecx and *edx receive the EAX, EBX, ECX and EDX results. Unlike
+// native_cpuid, no ECX input is supplied.
+// When ROARING_INLINE_ASM is not defined the body is empty: code is ignored
+// and the four outputs keep the values the caller initialised them with.
+static inline void cpuinfo(int code, int *eax, int *ebx, int *ecx, int *edx) {
+#ifdef ROARING_INLINE_ASM
+    __asm__ volatile("cpuid;"  //  call cpuid instruction
+                     : "=a"(*eax), "=b"(*ebx), "=c"(*ecx),
+                       "=d"(*edx)  // output equal to "movl  %%eax %1"
+                     : "a"(code)   // input equal to "movl %1, %%eax"
+                     //:"%eax","%ebx","%ecx","%edx"// clobbered register
+                     );
+#endif /* not sure what to do when inline assembly is unavailable*/
+}
+
+// Queries cpuinfo with code 0x80000006 and returns the low 8 bits of the ECX
+// result, which tellmeall reports as the cache line size in bytes.
+// All registers start at zero, so the result is 0 when cpuinfo is a no-op
+// (ROARING_INLINE_ASM undefined).
+static inline int computecacheline() {
+    enum { REG_EAX, REG_EBX, REG_ECX, REG_EDX, REG_COUNT };
+    int regs[REG_COUNT] = {0, 0, 0, 0};
+    cpuinfo((int)0x80000006, &regs[REG_EAX], &regs[REG_EBX], &regs[REG_ECX],
+            &regs[REG_EDX]);
+    return regs[REG_ECX] & 0xFF;
+}
+
+// Best-effort guess of the processor codename; quite imperfect, but handy.
+// native_cpuid is invoked with eax = 1, the returned eax is shifted right by
+// four bits and the result is looked up in a fixed table of known values.
+// Anything that is not in the table is reported as "UNKNOWN".
+static inline const char *guessprocessor() {
+    static const struct {
+        unsigned signature;
+        const char *codename;
+    } known_cpus[] = {
+        {0x506E, "Skylake"},     {0x406C, "CherryTrail"},
+        {0x306D, "Broadwell"},   {0x306C, "Haswell"},
+        {0x306A, "IvyBridge"},   {0x206A, "SandyBridge"},
+        {0x206D, "SandyBridge"}, {0x2065, "Westmere"},
+        {0x206C, "Westmere"},    {0x206F, "Westmere"},
+        {0x106E, "Nehalem"},     {0x106A, "Nehalem"},
+        {0x206E, "Nehalem"},     {0x1067, "Penryn"},
+        {0x106D, "Penryn"},      {0x006F, "Merom"},
+        {0x1066, "Merom"},       {0x0066, "Presler"},
+        {0x0063, "Prescott"},    {0x0064, "Prescott"},
+        {0x006D, "Dothan"},      {0x0366, "Cedarview"},
+        {0x0266, "Lincroft"},    {0x016C, "Pineview"},
+    };
+
+    unsigned eax = 1, ebx = 0, ecx = 0, edx = 0;
+    native_cpuid(&eax, &ebx, &ecx, &edx);
+
+    const unsigned signature = eax >> 4;
+    for (size_t k = 0; k < sizeof(known_cpus) / sizeof(known_cpus[0]); k++) {
+        if (known_cpus[k].signature == signature) {
+            return known_cpus[k].codename;
+        }
+    }
+    return "UNKNOWN";
+}
+
+// Prints a short configuration report to stdout (x64 variant): guessed
+// processor codename, compiler version when __VERSION__ is defined, whether
+// USEAVX / __AVX2__ are set, and a few warnings about unusual type sizes,
+// endianness, char width and cache line size.
+static inline void tellmeall() {
+    printf("Intel processor:  %s\t", guessprocessor());
+
+#ifdef __VERSION__
+    printf(" compiler version: %s\t", __VERSION__);
+#endif
+    printf("\tBuild option USEAVX ");
+#ifdef USEAVX
+    printf("enabled\n");
+#else
+    printf("disabled\n");
+#endif
+#ifndef __AVX2__
+    printf("AVX2 is NOT available.\n");
+#endif
+
+    if ((sizeof(int) != 4) || (sizeof(long) != 8)) {
+        // Print the sizes that the labels (and the condition above) refer
+        // to: int first, then long.
+        printf("number of bytes: int = %lu long = %lu \n",
+               (long unsigned int)sizeof(int),
+               (long unsigned int)sizeof(long));
+    }
+#if __LITTLE_ENDIAN__
+// This is what we expect!
+// printf("you have little endian machine");
+#endif
+#if __BIG_ENDIAN__
+    printf("you have a big endian machine");
+#endif
+#if __CHAR_BIT__
+    if (__CHAR_BIT__ != 8) printf("on your machine, chars don't have 8bits???");
+#endif
+    // Query CPUID once and reuse the value for both the test and the message.
+    const int cacheline = computecacheline();
+    if (cacheline != 64)
+        printf("cache line: %d bytes\n", cacheline);
+}
+#else
+
+// Prints a short configuration report to stdout (non-x64 variant): whether
+// an ARM target was detected, compiler version when __VERSION__ is defined,
+// and a few warnings about unusual type sizes, endianness and char width.
+static inline void tellmeall() {
+    printf("Non-X64  processor\n");
+#ifdef __arm__
+    printf("ARM processor detected\n");
+#endif
+#ifdef __VERSION__
+    printf(" compiler version: %s\t", __VERSION__);
+#endif
+    if ((sizeof(int) != 4) || (sizeof(long) != 8)) {
+        // Print the sizes that the labels (and the condition above) refer
+        // to: int first, then long.
+        printf("number of bytes: int = %lu long = %lu \n",
+               (long unsigned int)sizeof(int),
+               (long unsigned int)sizeof(long));
+    }
+#if __LITTLE_ENDIAN__
+// This is what we expect!
+// printf("you have little endian machine");
+#endif
+#if __BIG_ENDIAN__
+    printf("you have a big endian machine");
+#endif
+#if __CHAR_BIT__
+    if (__CHAR_BIT__ != 8) printf("on your machine, chars don't have 8bits???");
+#endif
+}
+
+#endif
+
+#endif /* INCLUDE_MISC_CONFIGREPORT_H_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/misc/configreport.h */
+/* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/roaring.h */
+/*
+An implementation of Roaring Bitmaps in C.
+*/
+
+#ifndef ROARING_H
+#define ROARING_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+
+/*
+ * A roaring bitmap: an array of containers plus a copy-on-write flag.
+ */
+typedef struct roaring_bitmap_s {
+    /* Containers indexed by the high 16 bits of each value; the low 16 bits
+       are looked up inside the selected container (see
+       roaring_bitmap_contains). */
+    roaring_array_t high_low_container;
+    bool copy_on_write; /* copy_on_write: whether you want to use copy-on-write
+                         (saves memory and avoids
+                         copies but needs more care in a threaded context).
+                         Most users should ignore this flag.
+                         Note: if you do turn this flag to 'true', enabling
+                         COW, then ensure that you do so for all of your bitmaps since
+                         interactions between bitmaps with and without COW are unsafe. */
+} roaring_bitmap_t;
+
+
+void *containerptr_roaring_bitmap_add(roaring_bitmap_t *r,
+                                                    uint32_t val,
+                                                    uint8_t *typecode,
+                                                    int *index);
+/**
+ * Creates a new bitmap (initially empty)
+ */
+roaring_bitmap_t *roaring_bitmap_create(void);
+
+/**
+ * Add all the values between min (included) and max (excluded) that are at a
+ * distance k*step from min.
+*/
+roaring_bitmap_t *roaring_bitmap_from_range(uint64_t min, uint64_t max,
+                                            uint32_t step);
+
+/**
+ * Creates a new bitmap (initially empty) with a provided
+ * container-storage capacity (it is a performance hint).
+ */
+roaring_bitmap_t *roaring_bitmap_create_with_capacity(uint32_t cap);
+
+/**
+ * Creates a new bitmap from a pointer of uint32_t integers
+ */
+roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals);
+
+/**
+ * Describe the inner structure of the bitmap.
+ */
+void roaring_bitmap_printf_describe(const roaring_bitmap_t *ra);
+
+/**
+ * Creates a new bitmap from a list of uint32_t integers
+ */
+roaring_bitmap_t *roaring_bitmap_of(size_t n, ...);
+
+/**
+ * Copies a  bitmap. This does memory allocation. The caller is responsible for
+ * memory management.
+ *
+ */
+roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r);
+
+
+/**
+ * Copies a  bitmap from src to dest. It is assumed that the pointer dest
+ * is to an already allocated bitmap. The content of the dest bitmap is
+ * freed/deleted.
+ *
+ * It might be preferable and simpler to call roaring_bitmap_copy except
+ * that roaring_bitmap_overwrite can save on memory allocations.
+ *
+ */
+bool roaring_bitmap_overwrite(roaring_bitmap_t *dest,
+                                     const roaring_bitmap_t *src);
+
+/**
+ * Print the content of the bitmap.
+ */
+void roaring_bitmap_printf(const roaring_bitmap_t *ra);
+
+/**
+ * Computes the intersection between two bitmaps and returns new bitmap. The
+ * caller is
+ * responsible for memory management.
+ *
+ */
+roaring_bitmap_t *roaring_bitmap_and(const roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2);
+
+/**
+ * Computes the size of the intersection between two bitmaps.
+ *
+ */
+uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2);
+
+
+/**
+ * Check whether two bitmaps intersect.
+ *
+ */
+bool roaring_bitmap_intersect(const roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2);
+
+/**
+ * Computes the Jaccard index between two bitmaps. (Also known as the Tanimoto
+ * distance,
+ * or the Jaccard similarity coefficient)
+ *
+ * The Jaccard index is undefined if both bitmaps are empty.
+ *
+ */
+double roaring_bitmap_jaccard_index(const roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2);
+
+/**
+ * Computes the size of the union between two bitmaps.
+ *
+ */
+uint64_t roaring_bitmap_or_cardinality(const roaring_bitmap_t *x1,
+                                       const roaring_bitmap_t *x2);
+
+/**
+ * Computes the size of the difference (andnot) between two bitmaps.
+ *
+ */
+uint64_t roaring_bitmap_andnot_cardinality(const roaring_bitmap_t *x1,
+                                           const roaring_bitmap_t *x2);
+
+/**
+ * Computes the size of the symmetric difference (xor) between two bitmaps.
+ *
+ */
+uint64_t roaring_bitmap_xor_cardinality(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2);
+
+/**
+ * Inplace version modifies x1, x1 == x2 is allowed
+ */
+void roaring_bitmap_and_inplace(roaring_bitmap_t *x1,
+                                const roaring_bitmap_t *x2);
+
+/**
+ * Computes the union between two bitmaps and returns new bitmap. The caller is
+ * responsible for memory management.
+ */
+roaring_bitmap_t *roaring_bitmap_or(const roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2);
+
+/**
+ * Inplace version of roaring_bitmap_or, modifies x1. TODO: decide whether
+ * x1 == x2 is ok
+ *
+ */
+void roaring_bitmap_or_inplace(roaring_bitmap_t *x1,
+                               const roaring_bitmap_t *x2);
+
+/**
+ * Compute the union of 'number' bitmaps. See also roaring_bitmap_or_many_heap.
+ * Caller is responsible for freeing the
+ * result.
+ *
+ */
+roaring_bitmap_t *roaring_bitmap_or_many(size_t number,
+                                         const roaring_bitmap_t **x);
+
+/**
+ * Compute the union of 'number' bitmaps using a heap. This can
+ * sometimes be faster than roaring_bitmap_or_many which uses
+ * a naive algorithm. Caller is responsible for freeing the
+ * result.
+ *
+ */
+roaring_bitmap_t *roaring_bitmap_or_many_heap(uint32_t number,
+                                              const roaring_bitmap_t **x);
+
+/**
+ * Computes the symmetric difference (xor) between two bitmaps
+ * and returns new bitmap. The caller is responsible for memory management.
+ */
+roaring_bitmap_t *roaring_bitmap_xor(const roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2);
+
+/**
+ * Inplace version of roaring_bitmap_xor, modifies x1. x1 != x2.
+ *
+ */
+void roaring_bitmap_xor_inplace(roaring_bitmap_t *x1,
+                                const roaring_bitmap_t *x2);
+
+/**
+ * Compute the xor of 'number' bitmaps.
+ * Caller is responsible for freeing the
+ * result.
+ *
+ */
+roaring_bitmap_t *roaring_bitmap_xor_many(size_t number,
+                                          const roaring_bitmap_t **x);
+
+/**
+ * Computes the  difference (andnot) between two bitmaps
+ * and returns new bitmap. The caller is responsible for memory management.
+ */
+roaring_bitmap_t *roaring_bitmap_andnot(const roaring_bitmap_t *x1,
+                                        const roaring_bitmap_t *x2);
+
+/**
+ * Inplace version of roaring_bitmap_andnot, modifies x1. x1 != x2.
+ *
+ */
+void roaring_bitmap_andnot_inplace(roaring_bitmap_t *x1,
+                                   const roaring_bitmap_t *x2);
+
+/**
+ * TODO: consider implementing:
+ * Compute the xor of 'number' bitmaps using a heap. This can
+ * sometimes be faster than roaring_bitmap_xor_many which uses
+ * a naive algorithm. Caller is responsible for freeing the
+ * result.
+ *
+ * roaring_bitmap_t *roaring_bitmap_xor_many_heap(uint32_t number,
+ *                                              const roaring_bitmap_t **x);
+ */
+
+/**
+ * Frees the memory.
+ */
+void roaring_bitmap_free(roaring_bitmap_t *r);
+
+/**
+ * Add n_args values from pointer vals, faster than repeatedly calling
+ * roaring_bitmap_add
+ *
+ */
+void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args,
+                             const uint32_t *vals);
+
+/**
+ * Add value x
+ *
+ */
+void roaring_bitmap_add(roaring_bitmap_t *r, uint32_t x);
+
+/**
+ * Add value x
+ * Returns true if a new value was added, false if the value was already existing.
+ */
+bool roaring_bitmap_add_checked(roaring_bitmap_t *r, uint32_t x);
+
+/**
+ * Add all values in range [min, max]
+ */
+void roaring_bitmap_add_range_closed(roaring_bitmap_t *ra, uint32_t min, uint32_t max);
+
+/**
+ * Add all values in range [min, max)
+ *
+ * Only 32-bit values can be stored, so the range is first clamped to
+ * [0, 2^32). An empty or inverted range (max <= min after clamping, which
+ * also covers min >= 2^32) is a no-op. Without these checks max - 1 could
+ * wrap around or the bounds could be truncated by the uint32_t casts,
+ * adding values the caller never asked for.
+ */
+inline void roaring_bitmap_add_range(roaring_bitmap_t *ra, uint64_t min, uint64_t max) {
+  const uint64_t universe_end = (uint64_t)1 << 32;
+  if (max > universe_end) max = universe_end;
+  if (max <= min) return;
+  // Here 0 <= min < max <= 2^32, so both casts below are lossless.
+  roaring_bitmap_add_range_closed(ra, (uint32_t)min, (uint32_t)(max - 1));
+}
+
+/**
+ * Remove value x
+ *
+ */
+void roaring_bitmap_remove(roaring_bitmap_t *r, uint32_t x);
+
+/** Remove all values in range [min, max] */
+void roaring_bitmap_remove_range_closed(roaring_bitmap_t *ra, uint32_t min, uint32_t max);
+
+/**
+ * Remove all values in range [min, max)
+ *
+ * Only 32-bit values can be stored, so the range is first clamped to
+ * [0, 2^32). An empty or inverted range (max <= min after clamping, which
+ * also covers min >= 2^32) is a no-op. Without these checks max - 1 could
+ * wrap around or the bounds could be truncated by the uint32_t casts,
+ * removing values the caller never asked for.
+ */
+inline void roaring_bitmap_remove_range(roaring_bitmap_t *ra, uint64_t min, uint64_t max) {
+    const uint64_t universe_end = (uint64_t)1 << 32;
+    if (max > universe_end) max = universe_end;
+    if (max <= min) return;
+    // Here 0 <= min < max <= 2^32, so both casts below are lossless.
+    roaring_bitmap_remove_range_closed(ra, (uint32_t)min, (uint32_t)(max - 1));
+}
+
+/** Remove multiple values */
+void roaring_bitmap_remove_many(roaring_bitmap_t *r, size_t n_args,
+                                const uint32_t *vals);
+
+/**
+ * Remove value x
+ * Returns true if the value was removed, false if it was not present.
+ */
+bool roaring_bitmap_remove_checked(roaring_bitmap_t *r, uint32_t x);
+
+/**
+ * Check if value val is present in the bitmap r.
+ *
+ * The high 16 bits of val select a container through ra_get_index; if no
+ * container exists for that key the answer is false. Otherwise the low 16
+ * bits are tested against the container with container_contains.
+ */
+inline bool roaring_bitmap_contains(const roaring_bitmap_t *r, uint32_t val) {
+    const roaring_array_t *const chunks = &r->high_low_container;
+    const uint16_t chunk_key = val >> 16;
+
+    // Key lookup: a binary search with a fair amount of branching.
+    const int32_t slot = ra_get_index(chunks, chunk_key);
+    if (slot >= 0) {
+        uint8_t kind;
+        // Cheap: two array reads.
+        void *const chunk = ra_get_container_at_index(chunks, slot, &kind);
+        // Possibly expensive: may involve another round of binary search.
+        return container_contains(chunk, val & 0xFFFF, kind);
+    }
+    return false;
+}
+
+/**
+ * Check whether a range of values from range_start (included) to range_end (excluded) is present
+ */
+bool roaring_bitmap_contains_range(const roaring_bitmap_t *r, uint64_t range_start, uint64_t range_end);
+
+/**
+ * Get the cardinality of the bitmap (number of elements).
+ */
+uint64_t roaring_bitmap_get_cardinality(const roaring_bitmap_t *ra);
+
+/**
+ * Returns number of elements in range [range_start, range_end).
+ */
+uint64_t roaring_bitmap_range_cardinality(const roaring_bitmap_t *ra,
+                                          uint64_t range_start, uint64_t range_end);
+
+/**
+* Returns true if the bitmap is empty (cardinality is zero).
+*/
+bool roaring_bitmap_is_empty(const roaring_bitmap_t *ra);
+
+
+/**
+* Empties the bitmap
+*/
+void roaring_bitmap_clear(roaring_bitmap_t *ra);
+
+/**
+ * Convert the bitmap to an array. Write the output to "ans",
+ * caller is responsible to ensure that there is enough memory
+ * allocated
+ * (e.g., ans = malloc(roaring_bitmap_get_cardinality(mybitmap)
+ *   * sizeof(uint32_t))
+ */
+void roaring_bitmap_to_uint32_array(const roaring_bitmap_t *ra, uint32_t *ans);
+
+
+/**
+ * Convert the bitmap to an array from "offset" by "limit". Write the output to "ans".
+ * so, you can get data in paging.
+ * caller is responsible to ensure that there is enough memory
+ * allocated
+ * (e.g., ans = malloc(limit * sizeof(uint32_t)))
+ * Return false in case of failure (e.g., insufficient memory)
+ */
+bool roaring_bitmap_range_uint32_array(const roaring_bitmap_t *ra, size_t offset, size_t limit, uint32_t *ans);
+
+/**
+ *  Remove run-length encoding even when it is more space efficient
+ *  return whether a change was applied
+ */
+bool roaring_bitmap_remove_run_compression(roaring_bitmap_t *r);
+
+/** convert array and bitmap containers to run containers when it is more
+ * efficient;
+ * also convert from run containers when more space efficient.  Returns
+ * true if the result has at least one run container.
+ * Additional savings might be possible by calling shrinkToFit().
+ */
+bool roaring_bitmap_run_optimize(roaring_bitmap_t *r);
+
+/**
+ * If needed, reallocate memory to shrink the memory usage. Returns
+ * the number of bytes saved.
+*/
+size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r);
+
+/**
+* write the bitmap to an output pointer, this output buffer should refer to
+* at least roaring_bitmap_size_in_bytes(ra) allocated bytes.
+*
+* see roaring_bitmap_portable_serialize if you want a format that's compatible
+* with Java and Go implementations
+*
+* this format has the benefit of being sometimes more space efficient than
+* roaring_bitmap_portable_serialize
+* e.g., when the data is sparse.
+*
+* Returns how many bytes were written which should be
+* roaring_bitmap_size_in_bytes(ra).
+*/
+size_t roaring_bitmap_serialize(const roaring_bitmap_t *ra, char *buf);
+
+/**  use with roaring_bitmap_serialize
+* see roaring_bitmap_portable_deserialize if you want a format that's
+* compatible with Java and Go implementations
+*/
+roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf);
+
+/**
+ * How many bytes are required to serialize this bitmap (NOT compatible
+ * with Java and Go versions)
+ */
+size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *ra);
+
+/**
+ * read a bitmap from a serialized version. This is meant to be compatible with
+ * the Java and Go versions. See format specification at
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ * In case of failure, a null pointer is returned.
+ * This function is unsafe in the sense that if there is no valid serialized
+ * bitmap at the pointer, then many bytes could be read, possibly causing a buffer
+ * overflow. For a safer approach,
+ * call roaring_bitmap_portable_deserialize_safe.
+ */
+roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf);
+
+/**
+ * read a bitmap from a serialized version in a safe manner (reading up to maxbytes).
+ * This is meant to be compatible with
+ * the Java and Go versions. See format specification at
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ * In case of failure, a null pointer is returned.
+ */
+roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, size_t maxbytes);
+
+/**
+ * Check how many bytes would be read (up to maxbytes) at this pointer if there
+ * is a bitmap, returns zero if there is no valid bitmap.
+ * This is meant to be compatible with
+ * the Java and Go versions. See format specification at
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ */
+size_t roaring_bitmap_portable_deserialize_size(const char *buf, size_t maxbytes);
+
+
+/**
+ * How many bytes are required to serialize this bitmap (meant to be compatible
+ * with Java and Go versions).  See format specification at
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ */
+size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *ra);
+
+/**
+ * write a bitmap to a char buffer.  The output buffer should refer to at least
+ *  roaring_bitmap_portable_size_in_bytes(ra) bytes of allocated memory.
+ * This is meant to be compatible with
+ * the
+ * Java and Go versions. Returns how many bytes were written which should be
+ * roaring_bitmap_portable_size_in_bytes(ra).  See format specification at
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ */
+size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *ra, char *buf);
+
+/**
+ * Iterate over the bitmap elements. The function iterator is called once for
+ *  all the values with ptr (can be NULL) as the second parameter of each call.
+ *
+ *  roaring_iterator is simply a pointer to a function that returns bool
+ *  (true means that the iteration should continue while false means that it
+ * should stop),
+ *  and takes (uint32_t,void*) as inputs.
+ *
+ *  Returns true if the roaring_iterator returned true throughout (so that
+ *  all data points were necessarily visited).
+ */
+bool roaring_iterate(const roaring_bitmap_t *ra, roaring_iterator iterator,
+                     void *ptr);
+
+bool roaring_iterate64(const roaring_bitmap_t *ra, roaring_iterator64 iterator,
+                       uint64_t high_bits, void *ptr);
+
+/**
+ * Return true if the two bitmaps contain the same elements.
+ */
+bool roaring_bitmap_equals(const roaring_bitmap_t *ra1,
+                           const roaring_bitmap_t *ra2);
+
+/**
+ * Return true if all the elements of ra1 are also in ra2.
+ */
+bool roaring_bitmap_is_subset(const roaring_bitmap_t *ra1,
+                              const roaring_bitmap_t *ra2);
+
+/**
+ * Return true if all the elements of ra1 are also in ra2 and ra2 is strictly
+ * greater
+ * than ra1.
+ */
+bool roaring_bitmap_is_strict_subset(const roaring_bitmap_t *ra1,
+                                            const roaring_bitmap_t *ra2);
+
+/**
+ * (For expert users who seek high performance.)
+ *
+ * Computes the union between two bitmaps and returns new bitmap. The caller is
+ * responsible for memory management.
+ *
+ * The lazy version defers some computations such as the maintenance of the
+ * cardinality counts. Thus you need
+ * to call roaring_bitmap_repair_after_lazy after executing "lazy" computations.
+ * It is safe to repeatedly call roaring_bitmap_lazy_or_inplace on the result.
+ * The bitsetconversion parameter is a flag which determines
+ * whether container-container operations force a bitset conversion.
+ **/
+roaring_bitmap_t *roaring_bitmap_lazy_or(const roaring_bitmap_t *x1,
+                                         const roaring_bitmap_t *x2,
+                                         const bool bitsetconversion);
+
+/**
+ * (For expert users who seek high performance.)
+ * Inplace version of roaring_bitmap_lazy_or, modifies x1
+ * The bitsetconversion parameter is a flag which determines
+ * whether container-container operations force a bitset conversion.
+ */
+void roaring_bitmap_lazy_or_inplace(roaring_bitmap_t *x1,
+                                    const roaring_bitmap_t *x2,
+                                    const bool bitsetconversion);
+
+/**
+ * (For expert users who seek high performance.)
+ *
+ * Execute maintenance operations on a bitmap created from
+ * roaring_bitmap_lazy_or
+ * or modified with roaring_bitmap_lazy_or_inplace.
+ */
+void roaring_bitmap_repair_after_lazy(roaring_bitmap_t *x1);
+
+/**
+ * Computes the symmetric difference between two bitmaps and returns new bitmap.
+ *The caller is
+ * responsible for memory management.
+ *
+ * The lazy version defers some computations such as the maintenance of the
+ * cardinality counts. Thus you need
+ * to call roaring_bitmap_repair_after_lazy after executing "lazy" computations.
+ * It is safe to repeatedly call roaring_bitmap_lazy_xor_inplace on the result.
+ *
+ */
+roaring_bitmap_t *roaring_bitmap_lazy_xor(const roaring_bitmap_t *x1,
+                                          const roaring_bitmap_t *x2);
+
+/**
+ * (For expert users who seek high performance.)
+ * Inplace version of roaring_bitmap_lazy_xor, modifies x1. x1 != x2
+ *
+ */
+void roaring_bitmap_lazy_xor_inplace(roaring_bitmap_t *x1,
+                                     const roaring_bitmap_t *x2);
+
+/**
+ * compute the negation of the roaring bitmap within a specified
+ * interval: [range_start, range_end). The number of negated values is
+ * range_end - range_start.
+ * Areas outside the range are passed through unchanged.
+ */
+
+roaring_bitmap_t *roaring_bitmap_flip(const roaring_bitmap_t *x1,
+                                      uint64_t range_start, uint64_t range_end);
+
+/**
+ * compute (in place) the negation of the roaring bitmap within a specified
+ * interval: [range_start, range_end). The number of negated values is
+ * range_end - range_start.
+ * Areas outside the range are passed through unchanged.
+ */
+
+void roaring_bitmap_flip_inplace(roaring_bitmap_t *x1, uint64_t range_start,
+                                 uint64_t range_end);
+
+/**
+ * If the size of the roaring bitmap is strictly greater than rank, then this
+   function returns true and set element to the element of given rank.
+   Otherwise, it returns false.
+ */
+bool roaring_bitmap_select(const roaring_bitmap_t *ra, uint32_t rank,
+                           uint32_t *element);
+/**
+* roaring_bitmap_rank returns the number of integers that are smaller or equal
+* to x.
+*/
+uint64_t roaring_bitmap_rank(const roaring_bitmap_t *bm, uint32_t x);
+
+/**
+* roaring_bitmap_minimum returns the smallest value in the set.
+* Returns UINT32_MAX if the set is empty.
+*/
+uint32_t roaring_bitmap_minimum(const roaring_bitmap_t *bm);
+
+/**
+* roaring_bitmap_maximum returns the greatest value in the set.
+* Returns 0 if the set is empty.
+*/
+uint32_t roaring_bitmap_maximum(const roaring_bitmap_t *bm);
+
+/**
+*  (For advanced users.)
+* Collect statistics about the bitmap, see roaring_types.h for
+* a description of roaring_statistics_t
+*/
+void roaring_bitmap_statistics(const roaring_bitmap_t *ra,
+                               roaring_statistics_t *stat);
+
+/*********************
+* What follows is code used to iterate through values in a roaring bitmap
+
+roaring_bitmap_t *ra =...
+roaring_uint32_iterator_t   i;
+roaring_init_iterator(ra, &i);
+while(i.has_value) {
+  printf("value = %d\n", i.current_value);
+  roaring_advance_uint32_iterator(&i);
+}
+
+Obviously, if you modify the underlying bitmap, the iterator
+becomes invalid. So don't.
+*/
+
+typedef struct roaring_uint32_iterator_s {
+    const roaring_bitmap_t *parent;  // owner: the bitmap being iterated
+    int32_t container_index;         // index of the current container
+    int32_t in_container_index;  // for bitset and array container, this is our
+                                 // index
+    int32_t run_index;           // for run container, this points at the run
+    uint32_t in_run_index;  // within a run, this is our index (points at the
+                            // end of the current run)
+
+    uint32_t current_value;  // value at the current position
+    bool has_value;          // true while current_value holds a value
+
+    const void
+        *container;  // should be:
+                     // parent->high_low_container.containers[container_index];
+    uint8_t typecode;  // should be:
+                       // parent->high_low_container.typecodes[container_index];
+    uint32_t highbits;  // should be:
+                        // (parent->high_low_container.keys[container_index]) <<
+                        // 16;
+
+} roaring_uint32_iterator_t;
+
+/**
+* Initialize an iterator object that can be used to iterate through the
+* values.  If there is a  value, then it->has_value is true.
+* The first value is in it->current_value. The iterator traverses the values
+* in increasing order.
+*/
+void roaring_init_iterator(const roaring_bitmap_t *ra,
+                           roaring_uint32_iterator_t *newit);
+
+/**
+* Create an iterator object that can be used to iterate through the
+* values. Caller is responsible for calling roaring_free_uint32_iterator.
+* The iterator is initialized. If there is a  value, then it->has_value is true.
+* The first value is in it->current_value. The iterator traverses the values
+* in increasing order.
+*
+* This function calls roaring_init_iterator.
+*/
+roaring_uint32_iterator_t *roaring_create_iterator(const roaring_bitmap_t *ra);
+
+/**
+* Advance the iterator. If there is a new value, then it->has_value is true.
+* The new value is in it->current_value. Values are traversed in increasing
+* orders. For convenience, returns it->has_value.
+*/
+bool roaring_advance_uint32_iterator(roaring_uint32_iterator_t *it);
+
+/**
+* Move the iterator to the first value >= val. If there is a such a value, then it->has_value is true.
+* The new value is in it->current_value. For convenience, returns it->has_value.
+*/
+bool roaring_move_uint32_iterator_equalorlarger(roaring_uint32_iterator_t *it, uint32_t val) ;
+/**
+* Creates a copy of an iterator.
+* Caller must free it.
+*/
+roaring_uint32_iterator_t *roaring_copy_uint32_iterator(
+    const roaring_uint32_iterator_t *it);
+
+/**
+* Free memory following roaring_create_iterator
+*/
+void roaring_free_uint32_iterator(roaring_uint32_iterator_t *it);
+
+/*
+ * Reads next ${count} values from iterator into user-supplied ${buf}.
+ * Returns the number of read elements.
+ * This number can be smaller than ${count}, which means that iterator is drained.
+ *
+ * This function satisfies semantics of iteration and can be used together with
+ * other iterator functions.
+ *  - first value is copied from ${it}->current_value
+ *  - after function returns, iterator is positioned at the next element
+ */
+uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it, uint32_t* buf, uint32_t count);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+/* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/roaring.h */
diff --git a/contrib/croaring/roaring.hh b/contrib/croaring/roaring.hh
new file mode 100644
index 00000000000..6266fff2758
--- /dev/null
+++ b/contrib/croaring/roaring.hh
@@ -0,0 +1,1732 @@
+/* auto-generated on Tue Dec 18 09:42:59 CST 2018. Do not edit! */
+#include "roaring.h"
+/* begin file /opt/bitmap/CRoaring-0.2.57/cpp/roaring.hh */
+/*
+A C++ header for Roaring Bitmaps.
+*/
+#ifndef INCLUDE_ROARING_HH_
+#define INCLUDE_ROARING_HH_
+
+#include <stdarg.h>
+
+#include <algorithm>
+#include <new>
+#include <stdexcept>
+#include <string>
+
+class RoaringSetBitForwardIterator;
+
+class Roaring {
+   public:
+    /**
+     * Create an empty bitmap
+     */
+    Roaring() {
+        bool is_ok = ra_init(&roaring.high_low_container);
+        if (!is_ok) {
+            throw std::runtime_error("failed memory alloc in constructor");
+        }
+        roaring.copy_on_write = false;
+    }
+
+    /**
+     * Construct a bitmap from a list of integer values.
+     */
+    Roaring(size_t n, const uint32_t *data) : Roaring() {
+        roaring_bitmap_add_many(&roaring, n, data);
+    }
+
+    /**
+     * Copy constructor
+     */
+    Roaring(const Roaring &r) {
+        bool is_ok =
+            ra_copy(&r.roaring.high_low_container, &roaring.high_low_container,
+                    r.roaring.copy_on_write);
+        if (!is_ok) {
+            throw std::runtime_error("failed memory alloc in constructor");
+        }
+        roaring.copy_on_write = r.roaring.copy_on_write;
+    }
+
+    /**
+     * Move constructor. The moved object remains valid, i.e.
+     * all methods can still be called on it.
+     */
+    Roaring(Roaring &&r) {
+        roaring = std::move(r.roaring);
+
+        // left the moved object in a valid state
+        bool is_ok = ra_init_with_capacity(&r.roaring.high_low_container, 1);
+        if (!is_ok) {
+            throw std::runtime_error("failed memory alloc in constructor");
+        }
+    }
+
+    /**
+     * Construct a roaring object from the C struct.
+     *
+     * Passing a NULL pointer is unsafe.
+     * the pointer to the C struct will be invalid after the call.
+     */
+    Roaring(roaring_bitmap_t *s) {
+        // steal the interior struct
+        roaring.high_low_container = s->high_low_container;
+        roaring.copy_on_write = s->copy_on_write;
+        // deallocate the old container
+        free(s);
+    }
+
+    /**
+     * Construct a bitmap from a list of integer values.
+     */
+    static Roaring bitmapOf(size_t n, ...) {
+        Roaring ans;
+        va_list vl;
+        va_start(vl, n);
+        for (size_t i = 0; i < n; i++) {
+            ans.add(va_arg(vl, uint32_t));
+        }
+        va_end(vl);
+        return ans;
+    }
+
+    /**
+     * Add value x
+     *
+     */
+    void add(uint32_t x) { roaring_bitmap_add(&roaring, x); }
+
+    /**
+     * Add value x
+     * Returns true if a new value was added, false if the value was already existing.
+     */
+    bool addChecked(uint32_t x) { 
+        return roaring_bitmap_add_checked(&roaring, x);
+    }
+
+    /**
+    * Add all values from x (included) to y (excluded)
+    */
+    void addRange(const uint64_t x, const uint64_t y)  {
+        return roaring_bitmap_add_range(&roaring, x, y);
+    }
+
+    /**
+     * Add value n_args from pointer vals
+     *
+     */
+    void addMany(size_t n_args, const uint32_t *vals) {
+        roaring_bitmap_add_many(&roaring, n_args, vals);
+    }
+
+    /**
+     * Remove value x
+     *
+     */
+    void remove(uint32_t x) { roaring_bitmap_remove(&roaring, x); }
+
+    /**
+     * Remove value x
+     * Returns true if a new value was removed, false if the value was not existing.
+     */
+    bool removeChecked(uint32_t x) {
+        return roaring_bitmap_remove_checked(&roaring, x);
+    }
+
+    /**
+     * Return the largest value (if not empty)
+     *
+     */
+    uint32_t maximum() const { return roaring_bitmap_maximum(&roaring); }
+
+    /**
+    * Return the smallest value (if not empty)
+    *
+    */
+    uint32_t minimum() const { return roaring_bitmap_minimum(&roaring); }
+
+    /**
+     * Check if value x is present
+     */
+    bool contains(uint32_t x) const {
+        return roaring_bitmap_contains(&roaring, x);
+    }
+
+    /**
+    * Check if all values from x (included) to y (excluded) are present
+    */
+    bool containsRange(const uint64_t x, const uint64_t y) const {
+        return roaring_bitmap_contains_range(&roaring, x, y);
+    }
+
+    /**
+     * Destructor
+     */
+    ~Roaring() { ra_clear(&roaring.high_low_container); }
+
+    /**
+     * Copy assignment: discards the current content and copies the content
+     * of the provided bitmap. Assigning a bitmap to itself is a no-op.
+     * Throws std::runtime_error if ra_copy reports failure.
+     */
+    Roaring &operator=(const Roaring &r) {
+        if (this == &r) return *this;  // ra_clear would wipe the copy source
+        ra_clear(&roaring.high_low_container);
+        if (!ra_copy(&r.roaring.high_low_container,
+                     &roaring.high_low_container, r.roaring.copy_on_write)) {
+            throw std::runtime_error("failed memory alloc in assignment");
+        }
+        roaring.copy_on_write = r.roaring.copy_on_write;
+        return *this;
+    }
+
+    /**
+     * Move assignment: discards the current content and takes over the
+     * content of the provided bitmap, which is re-initialized and remains
+     * usable. Moving a bitmap onto itself leaves it unchanged.
+     */
+    Roaring &operator=(Roaring &&r) {
+        if (this == &r) return *this;  // clearing first would discard the source
+        ra_clear(&roaring.high_low_container);
+        roaring = std::move(r.roaring);
+        // re-initialize the moved-from object so it stays in a valid state
+        if (!ra_init_with_capacity(&r.roaring.high_low_container, 1)) {
+            throw std::runtime_error("failed memory alloc in assignment");
+        }
+        return *this;
+    }
+
+    /**
+     * Compute the intersection between the current bitmap and the provided
+     * bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     */
+    Roaring &operator&=(const Roaring &r) {
+        roaring_bitmap_and_inplace(&roaring, &r.roaring);
+        return *this;
+    }
+
+    /**
+     * Compute the difference between the current bitmap and the provided
+     * bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     */
+    Roaring &operator-=(const Roaring &r) {
+        roaring_bitmap_andnot_inplace(&roaring, &r.roaring);
+        return *this;
+    }
+
+    /**
+     * Compute the union between the current bitmap and the provided bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     *
+     * See also the fastunion function to aggregate many bitmaps more quickly.
+     */
+    Roaring &operator|=(const Roaring &r) {
+        roaring_bitmap_or_inplace(&roaring, &r.roaring);
+        return *this;
+    }
+
+    /**
+     * Compute the symmetric difference between the current bitmap and the provided
+     * bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     */
+    Roaring &operator^=(const Roaring &r) {
+        roaring_bitmap_xor_inplace(&roaring, &r.roaring);
+        return *this;
+    }
+
+    /**
+     * Exchange the content of this bitmap with another.
+     */
+    void swap(Roaring &r) { std::swap(r.roaring, roaring); }
+
+    /**
+     * Get the cardinality of the bitmap (number of elements).
+     */
+    uint64_t cardinality() const {
+        return roaring_bitmap_get_cardinality(&roaring);
+    }
+
+    /**
+    * Returns true if the bitmap is empty (cardinality is zero).
+    */
+    bool isEmpty() const { return roaring_bitmap_is_empty(&roaring); }
+
+    /**
+    * Returns true if the bitmap is subset of the other.
+    */
+    bool isSubset(const Roaring &r) const {
+        return roaring_bitmap_is_subset(&roaring, &r.roaring);
+    }
+
+    /**
+    * Returns true if the bitmap is strict subset of the other.
+    */
+    bool isStrictSubset(const Roaring &r) const {
+        return roaring_bitmap_is_strict_subset(&roaring, &r.roaring);
+    }
+
+    /**
+     * Convert the bitmap to an array. Write the output to "ans",
+     * caller is responsible to ensure that there is enough memory
+     * allocated
+     * (e.g., ans = new uint32[mybitmap.cardinality()];)
+     */
+    void toUint32Array(uint32_t *ans) const {
+        roaring_bitmap_to_uint32_array(&roaring, ans);
+    }
+    /**
+     * to int array with pagination
+     * 
+     */
+    void rangeUint32Array(uint32_t *ans, size_t offset, size_t limit) const {
+        roaring_bitmap_range_uint32_array(&roaring, offset, limit, ans);
+    }
+
+    /**
+     * Return true if the two bitmaps contain the same elements.
+     */
+    bool operator==(const Roaring &r) const {
+        return roaring_bitmap_equals(&roaring, &r.roaring);
+    }
+
+    /**
+     * compute the negation of the roaring bitmap within a specified interval.
+     * areas outside the range are passed through unchanged.
+     */
+    void flip(uint64_t range_start, uint64_t range_end) {
+        roaring_bitmap_flip_inplace(&roaring, range_start, range_end);
+    }
+
+    /**
+     *  Remove run-length encoding even when it is more space efficient
+     *  return whether a change was applied
+     */
+    bool removeRunCompression() {
+        return roaring_bitmap_remove_run_compression(&roaring);
+    }
+
+    /** convert array and bitmap containers to run containers when it is more
+     * efficient;
+     * also convert from run containers when more space efficient.  Returns
+     * true if the result has at least one run container.
+     * Additional savings might be possible by calling shrinkToFit().
+     */
+    bool runOptimize() { return roaring_bitmap_run_optimize(&roaring); }
+
+    /**
+     * If needed, reallocate memory to shrink the memory usage. Returns
+     * the number of bytes saved.
+    */
+    size_t shrinkToFit() { return roaring_bitmap_shrink_to_fit(&roaring); }
+
+    /**
+     * Iterate over the bitmap elements. The function iterator is called once for
+     * all the values with ptr (can be NULL) as the second parameter of each call.
+     *
+     * roaring_iterator is simply a pointer to a function that returns bool
+     * (true means that the iteration should continue while false means that it
+     * should stop), and takes (uint32_t,void*) as inputs.
+     */
+    void iterate(roaring_iterator iterator, void *ptr) const {
+        roaring_iterate(&roaring, iterator, ptr);
+    }
+
+    /**
+     * If the size of the roaring bitmap is strictly greater than rank, then
+     * this function returns true and set element to the element of given rank.
+     *   Otherwise, it returns false.
+     */
+    bool select(uint32_t rnk, uint32_t *element) const {
+        return roaring_bitmap_select(&roaring, rnk, element);
+    }
+
+    /**
+     * Computes the size of the intersection between two bitmaps.
+     *
+     */
+    uint64_t and_cardinality(const Roaring &r) const {
+        return roaring_bitmap_and_cardinality(&roaring, &r.roaring);
+    }
+
+    /**
+     * Check whether the two bitmaps intersect.
+     *
+     */
+    bool intersect(const Roaring &r) const {
+    	 return roaring_bitmap_intersect(&roaring, &r.roaring);
+    }
+
+    /**
+     * Computes the Jaccard index between two bitmaps. (Also known as the
+     * Tanimoto distance,
+     * or the Jaccard similarity coefficient)
+     *
+     * The Jaccard index is undefined if both bitmaps are empty.
+     *
+     */
+    double jaccard_index(const Roaring &r) const {
+        return roaring_bitmap_jaccard_index(&roaring, &r.roaring);
+    }
+
+    /**
+     * Computes the size of the union between two bitmaps.
+     *
+     */
+    uint64_t or_cardinality(const Roaring &r) const {
+        return roaring_bitmap_or_cardinality(&roaring, &r.roaring);
+    }
+
+    /**
+     * Computes the size of the difference (andnot) between two bitmaps.
+     *
+     */
+    uint64_t andnot_cardinality(const Roaring &r) const {
+        return roaring_bitmap_andnot_cardinality(&roaring, &r.roaring);
+    }
+
+    /**
+     * Computes the size of the symmetric difference (xor) between two
+     * bitmaps.
+     *
+     */
+    uint64_t xor_cardinality(const Roaring &r) const {
+        return roaring_bitmap_xor_cardinality(&roaring, &r.roaring);
+    }
+
+    /**
+    * Returns the number of integers that are smaller or equal to x.
+    */
+    uint64_t rank(uint32_t x) const { return roaring_bitmap_rank(&roaring, x); }
+
+    /**
+    * write a bitmap to a char buffer. This is meant to be compatible with
+    * the
+    * Java and Go versions. Returns how many bytes were written which should be
+    * getSizeInBytes().
+    *
+    * Setting the portable flag to false enable a custom format that
+    * can save space compared to the portable format (e.g., for very
+    * sparse bitmaps).
+    *
+    * Boost users can serialize bitmaps in this manner:
+    *
+    *       BOOST_SERIALIZATION_SPLIT_FREE(Roaring)
+    *       namespace boost {
+    *       namespace serialization {
+    *
+    *       template <class Archive>
+    *       void save(Archive& ar, const Roaring& bitmask, 
+    *          const unsigned int version) {
+    *         std::size_t expected_size_in_bytes = bitmask.getSizeInBytes();
+    *         std::vector<char> buffer(expected_size_in_bytes);
+    *         std::size_t       size_in_bytes = bitmask.write(buffer.data());
+    *
+    *         ar& size_in_bytes;
+    *         ar& boost::serialization::make_binary_object(buffer.data(), 
+    *             size_in_bytes);
+    *      }
+    *      template <class Archive>
+    *      void load(Archive& ar, Roaring& bitmask, 
+    *          const unsigned int version) {
+    *         std::size_t size_in_bytes = 0;
+    *         ar& size_in_bytes;
+    *         std::vector<char> buffer(size_in_bytes);
+    *         ar&  boost::serialization::make_binary_object(buffer.data(),
+    *            size_in_bytes);
+    *         bitmask = Roaring::readSafe(buffer.data(), size_in_bytes);
+    *}
+    *}  // namespace serialization
+    *}  // namespace boost
+    */
+    size_t write(char *buf, bool portable = true) const {
+        if (portable)
+            return roaring_bitmap_portable_serialize(&roaring, buf);
+        else
+            return roaring_bitmap_serialize(&roaring, buf);
+    }
+
+    /**
+     * read a bitmap from a serialized version. This is meant to be compatible
+     * with the Java and Go versions.
+     *
+     * Setting the portable flag to false enable a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps).
+     *
+     * This function is unsafe in the sense that if you provide bad data,
+     * many, many bytes could be read. See also readSafe.
+     */
+    static Roaring read(const char *buf, bool portable = true) {
+        roaring_bitmap_t * r = portable ? roaring_bitmap_portable_deserialize(buf) : roaring_bitmap_deserialize(buf);
+        if (r == NULL) {
+            throw std::runtime_error("failed alloc while reading");
+        }
+        return Roaring(r);
+    }
+    /**
+     * read a bitmap from a serialized version, reading no more than maxbytes bytes.
+     * This is meant to be compatible with the Java and Go versions.
+     *
+     */
+    static Roaring readSafe(const char *buf, size_t maxbytes) {
+        roaring_bitmap_t * r = roaring_bitmap_portable_deserialize_safe(buf,maxbytes);
+        if (r == NULL) {
+            throw std::runtime_error("failed alloc while reading");
+        }
+        return Roaring(r);
+    }
+    /**
+     * How many bytes are required to serialize this bitmap (meant to be
+     * compatible
+     * with Java and Go versions)
+     *
+     * Setting the portable flag to false enable a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps).
+     */
+    size_t getSizeInBytes(bool portable = true) const {
+        if (portable)
+            return roaring_bitmap_portable_size_in_bytes(&roaring);
+        else
+            return roaring_bitmap_size_in_bytes(&roaring);
+    }
+
+    /**
+     * Computes the intersection between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring operator&(const Roaring &o) const {
+        roaring_bitmap_t *r = roaring_bitmap_and(&roaring, &o.roaring);
+        if (r == NULL) {
+            throw std::runtime_error("failed materalization in and");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * Computes the difference between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring operator-(const Roaring &o) const {
+        roaring_bitmap_t *r = roaring_bitmap_andnot(&roaring, &o.roaring);
+        if (r == NULL) {
+            throw std::runtime_error("failed materalization in andnot");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * Computes the union between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring operator|(const Roaring &o) const {
+        roaring_bitmap_t *r = roaring_bitmap_or(&roaring, &o.roaring);
+        if (r == NULL) {
+            throw std::runtime_error("failed materalization in or");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * Computes the symmetric difference between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring operator^(const Roaring &o) const {
+        roaring_bitmap_t *r = roaring_bitmap_xor(&roaring, &o.roaring);
+        if (r == NULL) {
+            throw std::runtime_error("failed materalization in xor");
+        }
+        return Roaring(r);
+    }
+
+    /**
+     * Whether or not we apply copy and write.
+     */
+    void setCopyOnWrite(bool val) { roaring.copy_on_write = val; }
+
+    /**
+     * Print the content of the bitmap
+     */
+    void printf() const { roaring_bitmap_printf(&roaring); }
+
+    /**
+     * Print the content of the bitmap into a string
+     */
+    std::string toString() const {
+        struct iter_data {
+            std::string str;
+            char first_char = '{';
+        } outer_iter_data;
+        if (!isEmpty()) {
+            iterate(
+                [](uint32_t value, void *inner_iter_data) -> bool {
+                    ((iter_data *)inner_iter_data)->str +=
+                        ((iter_data *)inner_iter_data)->first_char;
+                    ((iter_data *)inner_iter_data)->str +=
+                        std::to_string(value);
+                    ((iter_data *)inner_iter_data)->first_char = ',';
+                    return true;
+                },
+                (void *)&outer_iter_data);
+        } else
+            outer_iter_data.str = '{';
+        outer_iter_data.str += '}';
+        return outer_iter_data.str;
+    }
+
+    /**
+     * Whether or not copy and write is active.
+     */
+    bool getCopyOnWrite() const { return roaring.copy_on_write; }
+
+    /**
+     * computes the logical or (union) between "n" bitmaps (referenced by a
+     * pointer). n == 0 does not fail spuriously when malloc(0) returns NULL.
+     */
+    static Roaring fastunion(size_t n, const Roaring **inputs) {
+        // malloc(0) may legally return NULL, so always request at least one
+        // slot; otherwise an empty input would be reported as an alloc failure.
+        const roaring_bitmap_t **x = (const roaring_bitmap_t **)malloc(
+            (n > 0 ? n : 1) * sizeof(roaring_bitmap_t *));
+        if (x == NULL) {
+            throw std::runtime_error("failed memory alloc in fastunion");
+        }
+        for (size_t k = 0; k < n; ++k) x[k] = &inputs[k]->roaring;
+
+        roaring_bitmap_t *c_ans = roaring_bitmap_or_many(n, x);
+        free(x);  // the pointer table is not needed past this point
+        if (c_ans == NULL) {
+            throw std::runtime_error("failed memory alloc in fastunion");
+        }
+        return Roaring(c_ans);
+    }
+
+    typedef RoaringSetBitForwardIterator const_iterator;
+
+    /**
+    * Returns an iterator that can be used to access the position of the
+    * set bits. The running time complexity of a full scan is proportional to
+    * the
+    * number
+    * of set bits: be aware that if you have long strings of 1s, this can be
+    * very inefficient.
+    *
+    * It can be much faster to use the toUint32Array method if you want to
+    * retrieve the set bits.
+    */
+    const_iterator begin() const;
+
+    /**
+    * A bogus iterator that can be used together with begin()
+    * for constructions such as for(auto i = b.begin();
+    * i!=b.end(); ++i) {}
+    */
+    const_iterator &end() const;
+
+    roaring_bitmap_t roaring;
+};
+
+/**
+ * Used to go through the set bits. Not optimally fast, but convenient.
+ */
+class RoaringSetBitForwardIterator final {
+   public:
+    typedef std::forward_iterator_tag iterator_category;
+    typedef uint32_t *pointer;
+    typedef uint32_t &reference_type;
+    typedef uint32_t value_type;
+    typedef int32_t difference_type;
+    typedef RoaringSetBitForwardIterator type_of_iterator;
+
+    /**
+     * Provides the location of the set bit.
+     */
+    value_type operator*() const { return i.current_value; }
+
+    bool operator<(const type_of_iterator &o) const {
+        if (!i.has_value) return false;   // exhausted is never less
+        if (!o.i.has_value) return true;  // any value precedes exhausted
+        return i.current_value < *o;
+    }
+
+    bool operator<=(const type_of_iterator &o) const {
+        if (!o.i.has_value) return true;  // everything is <= exhausted
+        if (!i.has_value) return false;
+        return i.current_value <= *o;
+    }
+
+    bool operator>(const type_of_iterator &o) const {
+        if (!o.i.has_value) return false;  // nothing is greater than exhausted
+        if (!i.has_value) return true;
+        return i.current_value > *o;
+    }
+
+    bool operator>=(const type_of_iterator &o) const {
+        if (!i.has_value) return true;  // exhausted is >= everything
+        if (!o.i.has_value) return false;
+        return i.current_value >= *o;
+    }
+
+    /**
+    * Move the iterator to the first value >= val.
+    */
+    void equalorlarger(uint32_t val) {
+      roaring_move_uint32_iterator_equalorlarger(&i,val);
+    }
+
+    type_of_iterator &operator++() {  // ++i, must return the advanced value
+        roaring_advance_uint32_iterator(&i);
+        return *this;
+    }
+
+    type_of_iterator operator++(int) {  // i++, must return orig. value
+        RoaringSetBitForwardIterator orig(*this);
+        roaring_advance_uint32_iterator(&i);
+        return orig;
+    }
+
+    bool operator==(const RoaringSetBitForwardIterator &o) const {
+        return i.current_value == *o && i.has_value == o.i.has_value;
+    }
+
+    bool operator!=(const RoaringSetBitForwardIterator &o) const {
+        return !(*this == o);
+    }
+
+    RoaringSetBitForwardIterator(const Roaring &parent, bool exhausted = false)
+        : i() {  // zero all fields; the exhausted branch sets only some
+        if (exhausted) {
+            i.parent = &parent.roaring;
+            i.container_index = INT32_MAX;
+            i.has_value = false;
+            i.current_value = UINT32_MAX;
+        } else {
+            roaring_init_iterator(&parent.roaring, &i);
+        }
+    }
+
+    RoaringSetBitForwardIterator &operator=(
+        const RoaringSetBitForwardIterator &o) = default;
+    RoaringSetBitForwardIterator &operator=(RoaringSetBitForwardIterator &&o) =
+        default;
+
+    ~RoaringSetBitForwardIterator() = default;
+
+    RoaringSetBitForwardIterator(const RoaringSetBitForwardIterator &o)
+        : i(o.i) {}
+
+    roaring_uint32_iterator_t i;
+};
+
+inline RoaringSetBitForwardIterator Roaring::begin() const {
+    return RoaringSetBitForwardIterator(*this);
+}
+
+inline RoaringSetBitForwardIterator &Roaring::end() const {
+    static RoaringSetBitForwardIterator e(*this, true);  // built on first call only
+    return e;  // NOTE(review): shared sentinel; e.i.parent stays the first caller's bitmap
+}
+
+#endif /* INCLUDE_ROARING_HH_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/cpp/roaring.hh */
+/* begin file /opt/bitmap/CRoaring-0.2.57/cpp/roaring64map.hh */
+/*
+A C++ header for 64-bit Roaring Bitmaps, implemented by way of a map of many
+32-bit Roaring Bitmaps.
+*/
+#ifndef INCLUDE_ROARING_64_MAP_HH_
+#define INCLUDE_ROARING_64_MAP_HH_
+
+#include <algorithm>
+#include <cstdarg>
+#include <cstdio>
+#include <limits>
+#include <map>
+#include <new>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+
+class Roaring64MapSetBitForwardIterator;
+
+class Roaring64Map {
+   public:
+    /**
+     * Create an empty bitmap
+     */
+    Roaring64Map() = default;
+
+    /**
+     * Construct a bitmap from a list of 32-bit integer values.
+     */
+    Roaring64Map(size_t n, const uint32_t *data) { addMany(n, data); }
+
+    /**
+     * Construct a bitmap from a list of 64-bit integer values.
+     */
+    Roaring64Map(size_t n, const uint64_t *data) { addMany(n, data); }
+
+    /**
+     * Copy constructor
+     */
+    Roaring64Map(const Roaring64Map &r) = default;
+
+    /**
+     * Move constructor
+     */
+    Roaring64Map(Roaring64Map &&r) = default;
+
+    /**
+     * Construct a 64-bit map from a 32-bit one
+     */
+    Roaring64Map(const Roaring &r) { emplaceOrInsert(0, r); }
+
+    /**
+     * Construct a roaring object from the C struct.
+     *
+     * Passing a NULL point is unsafe.
+     */
+    Roaring64Map(roaring_bitmap_t *s) { emplaceOrInsert(0, s); }
+
+    /**
+     * Construct a bitmap from a list of integer values.
+     */
+    static Roaring64Map bitmapOf(size_t n...) {
+        Roaring64Map ans;
+        va_list vl;
+        va_start(vl, n);
+        for (size_t i = 0; i < n; i++) {
+            ans.add(va_arg(vl, uint64_t));
+        }
+        va_end(vl);
+        return ans;
+    }
+
+    /**
+     * Add value x
+     *
+     */
+    void add(uint32_t x) {
+        roarings[0].add(x);
+        roarings[0].setCopyOnWrite(copyOnWrite);
+    }
+    void add(uint64_t x) {
+        roarings[highBytes(x)].add(lowBytes(x));
+        roarings[highBytes(x)].setCopyOnWrite(copyOnWrite);
+    }
+
+    /**
+     * Add value x
+     * Returns true if a new value was added, false if the value was already existing.
+     */
+    bool addChecked(uint32_t x) {
+        bool result = roarings[0].addChecked(x);
+        roarings[0].setCopyOnWrite(copyOnWrite);
+        return result;
+    }
+    bool addChecked(uint64_t x) {
+        bool result = roarings[highBytes(x)].addChecked(lowBytes(x));
+        roarings[highBytes(x)].setCopyOnWrite(copyOnWrite);
+        return result;
+    }
+
+    /**
+     * Add value n_args from pointer vals
+     *
+     */
+    void addMany(size_t n_args, const uint32_t *vals) {
+        for (size_t lcv = 0; lcv < n_args; lcv++) {
+            roarings[0].add(vals[lcv]);
+            roarings[0].setCopyOnWrite(copyOnWrite);
+        }
+    }
+    void addMany(size_t n_args, const uint64_t *vals) {
+        for (size_t lcv = 0; lcv < n_args; lcv++) {
+            roarings[highBytes(vals[lcv])].add(lowBytes(vals[lcv]));
+            roarings[highBytes(vals[lcv])].setCopyOnWrite(copyOnWrite);
+        }
+    }
+
+    /**
+     * Remove value x
+     *
+     */
+    void remove(uint32_t x) { roarings[0].remove(x); }
+    void remove(uint64_t x) {
+        auto roaring_iter = roarings.find(highBytes(x));
+        if (roaring_iter != roarings.cend())
+            roaring_iter->second.remove(lowBytes(x));
+    }
+
+    /**
+     * Remove value x
+     * Returns true if a new value was removed, false if the value was not existing.
+     */
+    bool removeChecked(uint32_t x) {
+        return roarings[0].removeChecked(x);
+    }
+    bool removeChecked(uint64_t x) {
+        auto roaring_iter = roarings.find(highBytes(x));
+        if (roaring_iter != roarings.cend())
+            return roaring_iter->second.removeChecked(lowBytes(x));
+        return false;
+    }
+
+    /**
+     * Return the largest value (if not empty)
+     *
+     */
+    uint64_t maximum() const {
+        for (auto roaring_iter = roarings.crbegin();
+             roaring_iter != roarings.crend(); ++roaring_iter) {
+            if (!roaring_iter->second.isEmpty()) {
+                return uniteBytes(roaring_iter->first,
+                                  roaring_iter->second.maximum());
+            }
+        }
+        // we put std::numeric_limits<>::max/min in parenthesis
+        // to avoid a clash with the Windows.h header under Windows
+        return (std::numeric_limits<uint64_t>::min)();
+    }
+
+    /**
+     * Return the smallest value (if not empty)
+     *
+     */
+    uint64_t minimum() const {
+        for (auto roaring_iter = roarings.cbegin();
+             roaring_iter != roarings.cend(); ++roaring_iter) {
+            if (!roaring_iter->second.isEmpty()) {
+                return uniteBytes(roaring_iter->first,
+                                  roaring_iter->second.minimum());
+            }
+        }
+        // we put std::numeric_limits<>::max/min in parenthesis
+        // to avoid a clash with the Windows.h header under Windows
+        return (std::numeric_limits<uint64_t>::max)();
+    }
+
+    /**
+     * Check if value x is present
+     */
+    bool contains(uint32_t x) const {
+        return roarings.count(0) == 0 ? false : roarings.at(0).contains(x);
+    }
+    bool contains(uint64_t x) const {
+        return roarings.count(highBytes(x)) == 0
+                   ? false
+                   : roarings.at(highBytes(x)).contains(lowBytes(x));
+    }
+
+    /**
+     * Destructor
+     */
+    ~Roaring64Map() = default;
+
+    /**
+     * Copies the content of the provided bitmap, and
+     * discards the current content.
+     */
+    Roaring64Map &operator=(const Roaring64Map &r) {
+        roarings = r.roarings;
+        copyOnWrite = r.copyOnWrite;
+        return *this;
+    }
+
+    /**
+     * Moves the content of the provided bitmap, and
+     * discards the current content.
+     */
+    Roaring64Map &operator=(Roaring64Map &&r) {
+        roarings = std::move(r.roarings);
+        copyOnWrite = r.copyOnWrite;
+        return *this;
+    }
+
+    /**
+     * Compute the intersection between the current bitmap and the provided
+     * bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     */
+    Roaring64Map &operator&=(const Roaring64Map &r) {
+        for (auto &map_entry : roarings) {
+            if (r.roarings.count(map_entry.first) == 1)
+                map_entry.second &= r.roarings.at(map_entry.first);
+            else
+                map_entry.second = Roaring();
+        }
+        return *this;
+    }
+
+    /**
+     * Compute the difference between the current bitmap and the provided
+     * bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     */
+    Roaring64Map &operator-=(const Roaring64Map &r) {
+        for (auto &map_entry : roarings) {
+            if (r.roarings.count(map_entry.first) == 1)
+                map_entry.second -= r.roarings.at(map_entry.first);
+        }
+        return *this;
+    }
+
+    /**
+     * Compute the union between the current bitmap and the provided bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     *
+     * See also the fastunion function to aggregate many bitmaps more quickly.
+     */
+    Roaring64Map &operator|=(const Roaring64Map &r) {
+        for (const auto &map_entry : r.roarings) {
+            if (roarings.count(map_entry.first) == 0) {
+                roarings[map_entry.first] = map_entry.second;
+                roarings[map_entry.first].setCopyOnWrite(copyOnWrite);
+            } else
+                roarings[map_entry.first] |= map_entry.second;
+        }
+        return *this;
+    }
+
+    /**
+     * Compute the symmetric union between the current bitmap and the provided
+     * bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     */
+    Roaring64Map &operator^=(const Roaring64Map &r) {
+        for (const auto &map_entry : r.roarings) {
+            if (roarings.count(map_entry.first) == 0) {
+                roarings[map_entry.first] = map_entry.second;
+                roarings[map_entry.first].setCopyOnWrite(copyOnWrite);
+            } else
+                roarings[map_entry.first] ^= map_entry.second;
+        }
+        return *this;
+    }
+
+    /**
+     * Exchange the content of this bitmap with another.
+     */
+    void swap(Roaring64Map &r) { roarings.swap(r.roarings); }
+
+    /**
+     * Get the cardinality of the bitmap (number of elements).
+     * Throws std::length_error in the special case where the bitmap is full
+     * (cardinality() == 2^64). Check isFull() before calling to avoid
+     * exception.
+     */
+    uint64_t cardinality() const {
+        if (isFull()) {
+            throw std::length_error(
+                "bitmap is full, cardinality is 2^64, "
+                "unable to represent in a 64-bit integer");
+        }
+        // iterate by reference: a std::pair<uint32_t, Roaring> parameter is
+        // not the map's value_type and would copy every bucket
+        uint64_t total = 0;
+        for (const auto &map_entry : roarings)
+            total += map_entry.second.cardinality();
+        return total;
+    }
+
+    /**
+    * Returns true if the bitmap is empty (cardinality is zero).
+    */
+    bool isEmpty() const {
+        // by reference: std::pair<uint32_t, Roaring> would copy each bucket
+        for (const auto &map_entry : roarings)
+            if (!map_entry.second.isEmpty()) return false;
+        return true;
+    }
+
+    /**
+    * Returns true if the bitmap is full (cardinality is max uint64_t + 1).
+    */
+    bool isFull() const {
+        // only bother to check if map is fully saturated
+        //
+        // we put std::numeric_limits<>::max/min in parenthesis
+        // to avoid a clash with the Windows.h header under Windows
+        return roarings.size() ==
+                       ((size_t)(std::numeric_limits<uint32_t>::max)()) + 1
+                   ? std::all_of(
+                         roarings.cbegin(), roarings.cend(),
+                         [](const std::pair<uint32_t, Roaring> &roaring_map_entry) {
+                             // roarings within map are saturated if cardinality
+                             // is uint32_t max + 1
+                             return roaring_map_entry.second.cardinality() ==
+                                    ((uint64_t)
+                                         (std::numeric_limits<uint32_t>::max)()) +
+                                        1;
+                         })
+                   : false;
+    }
+
+    /**
+    * Returns true if the bitmap is subset of the other (empty buckets ignored).
+    */
+    bool isSubset(const Roaring64Map &r) const {
+        for (const auto &map_entry : roarings) {
+            if (map_entry.second.isEmpty()) continue;  // nothing to check
+            auto roaring_iter = r.roarings.find(map_entry.first);
+            if (roaring_iter == r.roarings.cend() ||  // end of r's map, not ours
+                !map_entry.second.isSubset(roaring_iter->second))
+                return false;
+        }
+        return true;
+    }
+
+    /**
+    * Returns true if the bitmap is strict subset of the other.
+    * Throws std::length_error in the special case where the bitmap is full
+    * (cardinality() == 2^64). Check isFull() before calling to avoid exception.
+    */
+    bool isStrictSubset(const Roaring64Map &r) const {
+        return isSubset(r) && cardinality() != r.cardinality();
+    }
+
+    /**
+     * Convert the bitmap to an array. Write the output to "ans",
+     * caller is responsible to ensure that there is enough memory
+     * allocated
+     * (e.g., ans = new uint64_t[mybitmap.cardinality()];)
+     * The values are written bucket after bucket, in map key order.
+     */
+    void toUint64Array(uint64_t *ans) const {
+        // range-for binds to the map's own value_type, so no bucket is
+        // copied (a std::pair<uint32_t, Roaring> parameter would copy it)
+        for (const auto &map_entry : roarings) {
+            // rebuild each 64-bit value from the bucket key (high half)
+            // and the stored 32-bit value (low half)
+            for (uint32_t low_bits : map_entry.second)
+                *ans++ = uniteBytes(map_entry.first, low_bits);
+        }
+    }
+
+    /**
+     * Return true if the two bitmaps contain the same elements.
+     *
+     * Buckets holding an empty Roaring are treated as absent, so two maps
+     * compare equal whenever their non-empty buckets have identical keys
+     * and identical contents. The keys matter: the same 32-bit set stored
+     * under different high bits denotes different 64-bit values.
+     */
+    bool operator==(const Roaring64Map &r) const {
+        // we cannot use operator == on the map because either side may contain
+        // empty Roaring Bitmaps
+        auto lhs_iter = roarings.cbegin();
+        const auto lhs_end = roarings.cend();
+        auto rhs_iter = r.roarings.cbegin();
+        const auto rhs_end = r.roarings.cend();
+
+        // walk both maps in key order, pairing up their non-empty buckets
+        while (lhs_iter != lhs_end && rhs_iter != rhs_end) {
+            // skip empty buckets on either side; the loop condition is
+            // re-evaluated after every skip so that an end iterator is
+            // never dereferenced
+            if (lhs_iter->second.isEmpty()) {
+                ++lhs_iter;
+                continue;
+            }
+            if (rhs_iter->second.isEmpty()) {
+                ++rhs_iter;
+                continue;
+            }
+
+            // both buckets are non-empty: the high 32 bits (map key) and
+            // the 32-bit contents must both match
+            if (lhs_iter->first != rhs_iter->first ||
+                !(lhs_iter->second == rhs_iter->second)) {
+                return false;
+            }
+            ++lhs_iter;
+            ++rhs_iter;
+        }
+
+        // one side is exhausted: whatever remains on the other side must
+        // consist of empty buckets only
+        for (; lhs_iter != lhs_end; ++lhs_iter) {
+            if (!lhs_iter->second.isEmpty()) return false;
+        }
+
+        for (; rhs_iter != rhs_end; ++rhs_iter) {
+            if (!rhs_iter->second.isEmpty()) return false;
+        }
+        return true;
+    }
+
+    /**
+     * compute the negation of the roaring bitmap within the half-open
+     * interval [range_start, range_end); areas outside the range are passed
+     * through unchanged. An empty interval leaves the bitmap untouched.
+     */
+    void flip(uint64_t range_start, uint64_t range_end) {
+        // an empty or inverted interval must not reach the bucket loop below
+        if (range_start >= range_end) return;
+        uint32_t start_high = highBytes(range_start);
+        const uint32_t start_low = lowBytes(range_start);
+        const uint32_t end_high = highBytes(range_end);
+        const uint32_t end_low = lowBytes(range_end);
+        // Roaring::flip takes an exclusive upper bound, so a whole bucket
+        // ends at 2^32 (uint32_t max would leave the last value unflipped)
+        const uint64_t bucket_end = uint64_t(1) << 32;
+
+        if (start_high == end_high) {
+            roarings[start_high].flip(start_low, end_low);
+            roarings[start_high].setCopyOnWrite(copyOnWrite);
+            return;
+        }
+        roarings[start_high].flip(start_low, bucket_end);
+        roarings[start_high++].setCopyOnWrite(copyOnWrite);
+        for (; start_high < end_high; ++start_high) {
+            roarings[start_high].flip(0, bucket_end);
+            roarings[start_high].setCopyOnWrite(copyOnWrite);
+        }
+        roarings[start_high].flip(0, end_low);
+        roarings[start_high].setCopyOnWrite(copyOnWrite);
+    }
+
+    /**
+     *  Remove run-length encoding even when it is more space efficient
+     *  return whether a change was applied to at least one bucket
+     */
+    bool removeRunCompression() {
+        bool changed = false;
+        // the call comes first so that no bucket is skipped by short-circuit
+        for (auto &map_entry : roarings)
+            changed = map_entry.second.removeRunCompression() || changed;
+        return changed;
+    }
+
+    /** convert array and bitmap containers to run containers when it is more
+     * efficient;
+     * also convert from run containers when more space efficient.  Returns
+     * true if the result has at least one run container.
+     * Additional savings might be possible by calling shrinkToFit().
+     */
+    bool runOptimize() {
+        bool has_run = false;
+        // the call comes first so that no bucket is skipped by short-circuit
+        for (auto &map_entry : roarings)
+            has_run = map_entry.second.runOptimize() || has_run;
+        return has_run;
+    }
+
+    /**
+     * Shrink every bucket to fit and erase the empty ones from the map.
+     * Returns the number of bytes saved.
+    */
+    size_t shrinkToFit() {
+        size_t savedBytes = 0;
+        auto iter = roarings.begin();
+        while (iter != roarings.cend()) {
+            if (iter->second.isEmpty()) {
+                // NOTE(review): this comment used to say 84 bytes; confirm
+                savedBytes += 88;
+                roarings.erase(iter++);
+            } else {
+                savedBytes += iter->second.shrinkToFit();
+                iter++;
+            }
+        }
+        return savedBytes;
+    }
+
+    /**
+     * Iterate over the bitmap elements. The function iterator is called once
+     * for all the values with ptr (can be NULL) as the second parameter of each
+     * call.
+     *
+     * roaring_iterator64 is a pointer to a function that takes
+     * (uint64_t,void*) and returns bool: true to continue, false to stop.
+     */
+    void iterate(roaring_iterator64 iterator, void *ptr) const {
+        for (const auto &map_entry : roarings) {
+            // a false result means the callback asked to stop: honour it
+            // across buckets too, not only within the current one
+            if (!roaring_iterate64(&map_entry.second.roaring, iterator,
+                                   uint64_t(map_entry.first) << 32, ptr))
+                break;
+        }
+    }
+
+    /**
+     * If the bitmap holds strictly more than rnk values, store the value of
+     * rank rnk (counting from 0) in element and return true; else return false.
+     */
+    bool select(uint64_t rnk, uint64_t *element) const {
+        for (const auto &map_entry : roarings) {
+            uint64_t sub_cardinality = (uint64_t)map_entry.second.cardinality();
+            if (rnk < sub_cardinality) {
+                // use a local: aliasing element as uint32_t* is endian-specific
+                uint32_t low_bits = 0;
+                if (!map_entry.second.select((uint32_t)rnk, &low_bits))
+                    return false;
+                *element = uniteBytes(map_entry.first, low_bits);
+                return true;
+            }
+            rnk -= sub_cardinality;
+        }
+        return false;
+    }
+
+    /**
+    * Returns the number of integers that are smaller or equal to x.
+    */
+    uint64_t rank(uint64_t x) const {
+        uint64_t result = 0;
+        auto roaring_destination = roarings.find(highBytes(x));
+        if (roaring_destination != roarings.cend()) {
+            for (auto roaring_iter = roarings.cbegin();
+                 roaring_iter != roaring_destination; ++roaring_iter) {
+                result += roaring_iter->second.cardinality();
+            }
+            result += roaring_destination->second.rank(lowBytes(x));
+            return result;
+        }
+        roaring_destination = roarings.lower_bound(highBytes(x));
+        for (auto roaring_iter = roarings.cbegin();
+             roaring_iter != roaring_destination; ++roaring_iter) {
+            result += roaring_iter->second.cardinality();
+        }
+        return result;
+    }
+
+    /**
+     * write a bitmap to a char buffer. This is meant to be compatible with
+     * the Java and Go versions. Returns how many bytes were written which
+     * should be getSizeInBytes(portable); buf must be at least that large
+     * but does not need to be aligned.
+     *
+     * Layout: the bucket count as uint64_t, then for every bucket its
+     * uint32_t key (both in host byte order) followed by the bucket's
+     * serialized Roaring.
+     *
+     * Setting the portable flag to false enable a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps).
+     */
+    size_t write(char *buf, bool portable = true) const {
+        const char *orig = buf;
+        // push map size; memcpy because buf may not be aligned for uint64_t
+        const uint64_t map_size = roarings.size();
+        memcpy(buf, &map_size, sizeof(uint64_t));
+        buf += sizeof(uint64_t);
+        for (const auto &map_entry : roarings) {
+            // push map key (memcpy for the same alignment reason)
+            memcpy(buf, &map_entry.first, sizeof(uint32_t));
+            buf += sizeof(uint32_t);
+            // push map value Roaring
+            buf += map_entry.second.write(buf, portable);
+        }
+        return buf - orig;
+    }
+
+    /**
+     * read a bitmap from a serialized version. This is meant to be compatible
+     * with the Java and Go versions. buf does not need to be aligned.
+     * The expected layout is the one produced by write().
+     *
+     * Setting the portable flag to false enable a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps); it must match the flag that was given to write().
+     *
+     * This function is unsafe in the sense that if you provide bad data,
+     * many bytes could be read, possibly causing a buffer overflow. See also
+     * readSafe.
+     */
+    static Roaring64Map read(const char *buf, bool portable = true) {
+        Roaring64Map result;
+        // get map size; memcpy because buf may not be aligned for uint64_t
+        uint64_t map_size;
+        memcpy(&map_size, buf, sizeof(uint64_t));
+        buf += sizeof(uint64_t);
+        for (uint64_t lcv = 0; lcv < map_size; lcv++) {
+            // get map key (memcpy for the same alignment reason)
+            uint32_t key;
+            memcpy(&key, buf, sizeof(uint32_t));
+            buf += sizeof(uint32_t);
+            // read map value Roaring
+            Roaring read = Roaring::read(buf, portable);
+            result.emplaceOrInsert(key, read);
+            // forward buffer past the last Roaring Bitmap
+            buf += read.getSizeInBytes(portable);
+        }
+        return result;
+    }
+
+    /**
+     * read a bitmap from a serialized version, reading no more than maxbytes
+     * bytes (the leading map size included). Throws std::runtime_error when
+     * the budget runs out before the map size or a bucket key can be read.
+     * Meant to be compatible with the Java and Go versions: portable format only.
+     */
+    static Roaring64Map readSafe(const char *buf, size_t maxbytes) {
+        // the leading map size counts against the byte budget as well
+        if (maxbytes < sizeof(uint64_t))
+            throw std::runtime_error("ran out of bytes");
+        Roaring64Map result;
+        uint64_t map_size;
+        memcpy(&map_size, buf, sizeof(uint64_t));  // buf may be unaligned
+        buf += sizeof(uint64_t);
+        maxbytes -= sizeof(uint64_t);
+        for (uint64_t lcv = 0; lcv < map_size; lcv++) {
+            // get map key
+            if (maxbytes < sizeof(uint32_t))
+                throw std::runtime_error("ran out of bytes");
+            uint32_t key;
+            memcpy(&key, buf, sizeof(uint32_t));
+            buf += sizeof(uint32_t);
+            maxbytes -= sizeof(uint32_t);
+            // read map value Roaring
+            Roaring read = Roaring::readSafe(buf, maxbytes);
+            result.emplaceOrInsert(key, read);
+            // forward buffer past the last Roaring Bitmap
+            size_t tz = read.getSizeInBytes(true);
+            buf += tz;
+            maxbytes -= tz;
+        }
+        return result;
+    }
+
+    /**
+     * How many bytes are required to serialize this bitmap (meant to be
+     * compatible
+     * with Java and Go versions)
+     *
+     * Setting the portable flag to false enable a custom format that
+     * can save space compared to the portable format (e.g., for very
+     * sparse bitmaps).
+     */
+    size_t getSizeInBytes(bool portable = true) const {
+        // start with, respectively, map size and size of keys for each map
+        // entry
+        size_t total = sizeof(uint64_t) + roarings.size() * sizeof(uint32_t);
+
+        // add in bytes used by each Roaring; iterating by reference avoids
+        // the bucket copy a std::pair<uint32_t, Roaring> parameter implies
+        for (const auto &map_entry : roarings) {
+            total += map_entry.second.getSizeInBytes(portable);
+        }
+        return total;
+    }
+
+    /**
+     * Computes the intersection between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring64Map operator&(const Roaring64Map &o) const {
+        return Roaring64Map(*this) &= o;
+    }
+
+    /**
+     * Computes the difference between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring64Map operator-(const Roaring64Map &o) const {
+        return Roaring64Map(*this) -= o;
+    }
+
+    /**
+     * Computes the union between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring64Map operator|(const Roaring64Map &o) const {
+        return Roaring64Map(*this) |= o;
+    }
+
+    /**
+     * Computes the symmetric union between two bitmaps and returns new bitmap.
+     * The current bitmap and the provided bitmap are unchanged.
+     */
+    Roaring64Map operator^(const Roaring64Map &o) const {
+        return Roaring64Map(*this) ^= o;
+    }
+
+    /**
+     * Whether or not we apply copy and write.
+     */
+    void setCopyOnWrite(bool val) {
+        if (copyOnWrite == val) return;
+        copyOnWrite = val;
+        std::for_each(roarings.begin(), roarings.end(),
+                      [=](std::pair<const uint32_t, Roaring> &map_entry) {
+                          map_entry.second.setCopyOnWrite(val);
+                      });
+    }
+
+    /**
+     * Print the content of the bitmap on the standard output.
+     *
+     * The values are written as a comma-separated list enclosed in braces
+     * and followed by a newline, e.g. "{1,2,3}"; an empty bitmap gives "{}".
+     *
+     * Buckets without any value are skipped wherever they sit in the map.
+     */
+    void printf() const {
+        // state handed to the C-style callback through its void* argument
+        struct iter_data {
+            uint32_t high_bits;
+            char first_char = '{';
+        } outer_iter_data;
+
+        for (const auto &map_entry : roarings) {
+            // use the key of the bucket being printed: the first bucket of
+            // the map may be empty and carry different high bits
+            outer_iter_data.high_bits = map_entry.first;
+            map_entry.second.iterate(
+                [](uint32_t low_bits, void *inner_iter_data) -> bool {
+                    iter_data *data = (iter_data *)inner_iter_data;
+                    // '{' goes in front of the very first value, ','
+                    // in front of all the following ones
+                    std::printf("%c%llu", data->first_char,
+                                (long long unsigned)uniteBytes(
+                                    data->high_bits, low_bits));
+                    data->first_char = ',';
+                    return true;  // keep iterating
+                },
+                (void *)&outer_iter_data);
+        }
+
+        // no value was printed at all: the opening brace is still due
+        if (outer_iter_data.first_char == '{') {
+            std::printf("{");
+        }
+        std::printf("}\n");
+    }
+
+    /**
+     * Print the content of the bitmap into a string.
+     *
+     * The result has the same shape as the output of printf() without the
+     * trailing newline: the values as a comma-separated list enclosed in
+     * braces, e.g. "{1,2,3}", and "{}" for an empty bitmap.
+     *
+     * Buckets without any value are skipped wherever they sit in the map.
+     *
+     * Every value is rendered in decimal with std::to_string, so the string
+     * grows with the cardinality: prefer the iterators or toUint64Array()
+     * for large bitmaps.
+     */
+    std::string toString() const {
+        // state handed to the C-style callback through its void* argument
+        struct iter_data {
+            std::string str;
+            uint32_t high_bits;
+            char first_char = '{';
+        } outer_iter_data;
+
+        for (const auto &map_entry : roarings) {
+            // use the key of the bucket being printed: the first bucket of
+            // the map may be empty and carry different high bits
+            outer_iter_data.high_bits = map_entry.first;
+            map_entry.second.iterate(
+                [](uint32_t low_bits, void *inner_iter_data) -> bool {
+                    iter_data *data = (iter_data *)inner_iter_data;
+                    // '{' goes in front of the very first value, ','
+                    // in front of all the following ones
+                    data->str += data->first_char;
+                    data->str += std::to_string(
+                        uniteBytes(data->high_bits, low_bits));
+                    data->first_char = ',';
+                    return true;  // keep iterating
+                },
+                (void *)&outer_iter_data);
+        }
+
+        // no value was appended at all: the opening brace is still due
+        if (outer_iter_data.first_char == '{') {
+            outer_iter_data.str = '{';
+        }
+        outer_iter_data.str += '}';
+        return outer_iter_data.str;
+    }
+
+    /**
+     * Whether or not copy and write is active.
+     */
+    bool getCopyOnWrite() const { return copyOnWrite; }
+
+    /**
+     * computes the logical or (union) between "n" bitmaps (referenced by a
+     * pointer).
+     */
+    static Roaring64Map fastunion(size_t n, const Roaring64Map **inputs) {
+        Roaring64Map ans;
+        // not particularly fast
+        for (size_t lcv = 0; lcv < n; ++lcv) {
+            ans |= *(inputs[lcv]);
+        }
+        return ans;
+    }
+
+    friend class Roaring64MapSetBitForwardIterator;
+    typedef Roaring64MapSetBitForwardIterator const_iterator;
+
+    /**
+    * Returns an iterator that can be used to access the position of the
+    * set bits. The running time complexity of a full scan is proportional to
+    * the
+    * number
+    * of set bits: be aware that if you have long strings of 1s, this can be
+    * very inefficient.
+    *
+    * It can be much faster to use the toArray method if you want to
+    * retrieve the set bits.
+    */
+    const_iterator begin() const;
+
+    /**
+    * A bogus iterator that can be used together with begin()
+    * for constructions such as for(auto i = b.begin();
+    * i!=b.end(); ++i) {}
+    */
+    const_iterator end() const;
+
+   private:
+    std::map<uint32_t, Roaring> roarings;
+    bool copyOnWrite = false;
+    static uint32_t highBytes(const uint64_t in) { return uint32_t(in >> 32); }
+    static uint32_t lowBytes(const uint64_t in) { return uint32_t(in); }
+    static uint64_t uniteBytes(const uint32_t highBytes,
+                               const uint32_t lowBytes) {
+        return (uint64_t(highBytes) << 32) | uint64_t(lowBytes);
+    }
+    // this is needed to tolerate gcc's C++11 libstdc++ lacking emplace
+    // prior to version 4.8
+    void emplaceOrInsert(const uint32_t key, const Roaring &value) {
+#if defined(__GLIBCXX__) && __GLIBCXX__ < 20130322
+        roarings.insert(std::make_pair(key, value));
+#else
+        roarings.emplace(std::make_pair(key, value));
+#endif
+    }
+};
+
+/**
+ * Used to go through the set bits. Not optimally fast, but convenient.
+ */
+class Roaring64MapSetBitForwardIterator final {
+   public:
+    typedef std::forward_iterator_tag iterator_category;
+    typedef uint64_t *pointer;
+    typedef uint64_t &reference_type;
+    typedef uint64_t value_type;
+    typedef int64_t difference_type;
+    typedef Roaring64MapSetBitForwardIterator type_of_iterator;
+
+    /**
+     * Provides the location of the set bit.
+     */
+    value_type operator*() const {
+        return Roaring64Map::uniteBytes(map_iter->first, i.current_value);
+    }
+
+    bool operator<(const type_of_iterator &o) {
+        if (map_iter == map_end) return false;
+        if (o.map_iter == o.map_end) return true;
+        return **this < *o;
+    }
+
+    bool operator<=(const type_of_iterator &o) {
+        if (o.map_iter == o.map_end) return true;
+        if (map_iter == map_end) return false;
+        return **this <= *o;
+    }
+
+    bool operator>(const type_of_iterator &o) {
+        if (o.map_iter == o.map_end) return false;
+        if (map_iter == map_end) return true;
+        return **this > *o;
+    }
+
+    bool operator>=(const type_of_iterator &o) {
+        if (map_iter == map_end) return true;
+        if (o.map_iter == o.map_end) return false;
+        return **this >= *o;
+    }
+
+    type_of_iterator &operator++() {  // ++i, must returned inc. value
+        if (i.has_value == true) roaring_advance_uint32_iterator(&i);
+        while (!i.has_value) {
+            map_iter++;
+            if (map_iter == map_end) return *this;
+            roaring_init_iterator(&map_iter->second.roaring, &i);
+        }
+        return *this;
+    }
+
+    type_of_iterator operator++(int) {  // i++, must return orig. value
+        Roaring64MapSetBitForwardIterator orig(*this);
+        roaring_advance_uint32_iterator(&i);
+        while (!i.has_value) {
+            map_iter++;
+            if (map_iter == map_end) return orig;
+            roaring_init_iterator(&map_iter->second.roaring, &i);
+        }
+        return orig;
+    }
+
+    bool operator==(const Roaring64MapSetBitForwardIterator &o) {
+        const bool done = map_iter == map_end, o_done = o.map_iter == o.map_end;
+        if (done || o_done) return done && o_done;  // never dereference an end
+        return **this == *o;
+    }
+
+    bool operator!=(const Roaring64MapSetBitForwardIterator &o) {
+        const bool done = map_iter == map_end, o_done = o.map_iter == o.map_end;
+        if (done || o_done) return done != o_done;  // never dereference an end
+        return **this != *o;
+    }
+
+    Roaring64MapSetBitForwardIterator(const Roaring64Map &parent,
+                                      bool exhausted = false)
+        : map_end(parent.roarings.cend()) {
+        if (exhausted || parent.roarings.empty()) {
+            map_iter = parent.roarings.cend();
+        } else {
+            map_iter = parent.roarings.cbegin();
+            roaring_init_iterator(&map_iter->second.roaring, &i);
+            while (!i.has_value) {
+                map_iter++;
+                if (map_iter == map_end) return;
+                roaring_init_iterator(&map_iter->second.roaring, &i);
+            }
+        }
+    }
+
+    ~Roaring64MapSetBitForwardIterator() = default;
+
+    Roaring64MapSetBitForwardIterator(
+        const Roaring64MapSetBitForwardIterator &o) = default;
+
+   private:
+    std::map<uint32_t, Roaring>::const_iterator map_iter;
+    std::map<uint32_t, Roaring>::const_iterator map_end;
+    roaring_uint32_iterator_t i;
+};
+
+inline Roaring64MapSetBitForwardIterator Roaring64Map::begin() const {
+    return Roaring64MapSetBitForwardIterator(*this);
+}
+
+inline Roaring64MapSetBitForwardIterator Roaring64Map::end() const {
+    return Roaring64MapSetBitForwardIterator(*this, true);
+}
+
+#endif /* INCLUDE_ROARING_64_MAP_HH_ */
+/* end file /opt/bitmap/CRoaring-0.2.57/cpp/roaring64map.hh */
diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
index 3eb84d8eefa..acbe157c18b 100644
--- a/dbms/CMakeLists.txt
+++ b/dbms/CMakeLists.txt
@@ -192,6 +192,7 @@ target_link_libraries (clickhouse_common_io
     ${RE2_LIBRARY}
     ${RE2_ST_LIBRARY}
     ${CITYHASH_LIBRARIES}
+    roaring
         PRIVATE
     ${ZLIB_LIBRARIES}
     ${EXECINFO_LIBRARY}
diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.cpp b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.cpp
new file mode 100644
index 00000000000..269476ce9ad
--- /dev/null
+++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.cpp
@@ -0,0 +1,40 @@
+#include <AggregateFunctions/AggregateFunctionFactory.h>
+#include <AggregateFunctions/AggregateFunctionGroupBitmap.h>
+#include <AggregateFunctions/Helpers.h>
+#include <AggregateFunctions/FactoryHelpers.h>
+
+
+namespace DB
+{
+
+namespace
+{
+
+template <template <typename> class Data>
+AggregateFunctionPtr createAggregateFunctionBitmap(const std::string & name, const DataTypes & argument_types, const Array & parameters)
+{
+    /// Exactly one argument and no parameters are accepted.
+    assertNoParameters(name, parameters);
+    assertUnary(name, argument_types);
+
+    const auto & arg_type = *argument_types[0];
+    if (!arg_type.canBeUsedInBitOperations())
+        throw Exception("The type " + arg_type.getName() + " of argument for aggregate function " + name
+            + " is illegal, because it cannot be used in Bitmap operations", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+    /// A null result means the helper found no instantiation for this argument type.
+    AggregateFunctionPtr res(createWithUnsignedIntegerType<AggregateFunctionBitmap, Data>(arg_type));
+    if (!res)
+        throw Exception("Illegal type " + arg_type.getName() + " of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+    return res;
+}
+
+}
+
+void registerAggregateFunctionsBitmap(AggregateFunctionFactory & factory)
+{
+    /// Registers the creator under the name "groupBitmap", instantiated with the roaring bitmap state.
+    factory.registerFunction("groupBitmap", createAggregateFunctionBitmap<AggregateFunctionGroupBitmapData>);
+}
+
+}
diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.h
new file mode 100644
index 00000000000..38b63b713e7
--- /dev/null
+++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <Columns/ColumnVector.h>
+#include <boost/noncopyable.hpp>
+#include <AggregateFunctions/IAggregateFunction.h>
+#include <AggregateFunctions/AggregateFunctionGroupBitmapData.h>
+#include <DataTypes/DataTypesNumber.h>
+
+namespace DB
+{
+
+/// Aggregate function that feeds every argument value into Data::rbs and reports rbs.size() as its result.
+template <typename T, typename Data>
+class AggregateFunctionBitmap final : public IAggregateFunctionDataHelper<Data, AggregateFunctionBitmap<T, Data>>
+{
+public:
+    String getName() const override { return Data::name(); }
+    /// NOTE(review): the result column has the argument type T, so a count that does not fit into T would be narrowed - confirm this is intended.
+    DataTypePtr getReturnType() const override
+    {
+        return std::make_shared<DataTypeNumber<T>>();
+    }
+    /// Adds the value found at row_num of the first argument column to the state.
+    void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
+    {
+        this->data(place).rbs.add(static_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num]);
+    }
+    /// Folds the state at rhs into the state at place via rbs.merge().
+    void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
+    {
+        this->data(place).rbs.merge(this->data(rhs).rbs);
+    }
+    /// Writes the state in whatever format rbs.write() defines.
+    void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
+    {
+        this->data(place).rbs.write(buf);
+    }
+    /// Fills the state at place through rbs.read(), the counterpart of serialize().
+    void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
+    {
+        this->data(place).rbs.read(buf);
+    }
+    /// Appends rbs.size() to the result column; push_back converts the value to T.
+    void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
+    {
+        static_cast<ColumnVector<T> &>(to).getData().push_back(this->data(place).rbs.size());
+    }
+
+    const char * getHeaderFilePath() const override { return __FILE__; }
+};
+
+
+}
diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
new file mode 100644
index 00000000000..f90625b2a3b
--- /dev/null
+++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
@@ -0,0 +1,492 @@
+#pragma once
+
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+#include <boost/noncopyable.hpp>
+#include <Common/HashTable/SmallTable.h>
+#include <roaring.hh>
+#include <roaring.h>
+
+namespace DB
+{
+
+/**
+  * For a small number of values - an array of fixed size "on the stack".
+  * For large, roaring_bitmap_t is allocated.
+  * For a description of the roaring_bitmap_t, see: https://github.com/RoaringBitmap/CRoaring
+  */
+template <typename T, UInt8 small_set_size>
+class RoaringBitmapWithSmallSet : private boost::noncopyable
+{
+private:
+    using Small = SmallSet<T, small_set_size>;
+    using ValueBuffer = std::vector<T>;
+    Small small;
+    roaring_bitmap_t * rb = nullptr;
+
+    void toLarge()
+    {
+        rb = roaring_bitmap_create(); /// overwrites rb without freeing it, so callers check isSmall() first
+        /// Copy the small-set elements over; `small` itself is not cleared here.
+        for (const auto & x : small)
+            roaring_bitmap_add(rb, x);
+    }
+
+public:
+    bool isLarge() const { return rb != nullptr; } /// true once the roaring bitmap has been allocated
+
+    bool isSmall() const { return rb == nullptr; } /// true while no roaring bitmap is allocated
+
+    ~RoaringBitmapWithSmallSet()
+    {
+        if (isLarge()) /// rb is non-null only in the large representation
+            roaring_bitmap_free(rb);
+    }
+
+    /** Inserts `value`. While small, values live in the inline set; once that set is
+      * full the contents are migrated to a roaring bitmap, which is used from then on.
+      */
+    void add(T value)
+    {
+        if (isLarge())
+            roaring_bitmap_add(rb, value);
+        else if (small.find(value) != small.end())
+            return; /// already stored in the small set
+        else if (!small.full())
+            small.insert(value);
+        else
+        {
+            toLarge(); /// small set exhausted: migrate, then insert into the bitmap
+            roaring_bitmap_add(rb, value);
+        }
+    }
+
+    UInt64 size() const /// element count of whichever representation is active
+    {
+        return isSmall() ? small.size() : roaring_bitmap_get_cardinality( rb );
+    }
+
+    /// Adds every element of r1 to this bitmap (in-place union).
+    void merge(const RoaringBitmapWithSmallSet & r1)
+    {
+        if (r1.isSmall())
+        {
+            /// add() takes care of duplicates and of the small -> large switch.
+            for (const auto & value : r1.small)
+                add(value);
+            return;
+        }
+
+        if (isSmall())
+            toLarge();
+        roaring_bitmap_or_inplace( rb, r1.rb );
+    }
+
+    /// Reads the state produced by write(): a flag, then either the large or the small payload.
+    void read(DB::ReadBuffer & in)
+    {
+        bool is_large;
+        readBinary(is_large, in);
+        if (is_large)
+        {
+            if (isSmall()) toLarge(); /// toLarge() overwrites rb, so calling it on a large state would leak the old bitmap
+            UInt32 cardinality;
+            readBinary( cardinality, in );
+            db_roaring_bitmap_add_many( in, rb, cardinality );
+        }
+        else
+            small.read(in);
+    }
+
+    void write(DB::WriteBuffer & out) const
+    {
+        writeBinary(isLarge(), out);
+        /// Large layout: UInt32 element count, then every element as a UInt32. Small layout: whatever small.write() emits.
+        if (isLarge())
+        {
+            UInt32 cardinality = roaring_bitmap_get_cardinality(rb); /// NOTE(review): narrows the returned count to 32 bits - confirm it cannot overflow
+            writePODBinary( cardinality, out );
+            db_ra_to_uint32_array( out, &rb->high_low_container );
+        } else
+            small.write(out);
+    }
+
+
+    roaring_bitmap_t * getRb() const /// nullptr while the small representation is in use; ownership stays with this object
+    {
+        return rb;
+    }
+
+    const Small & getSmall() const /// read-only view: a const method cannot hand out a mutable reference to a member
+    {
+        return small;
+    }
+
+    /**
+     * Builds a new roaring_bitmap_t from the elements of small. The caller owns it and must call roaring_bitmap_free.
+     */
+    roaring_bitmap_t * getNewRbFromSmall() const
+    {
+        roaring_bitmap_t * smallRb = roaring_bitmap_create();
+        for (const auto & x : small)
+            roaring_bitmap_add(smallRb, x);
+        return smallRb;
+    }
+
+    /**
+     * Computes the intersection between two bitmaps; the result replaces this bitmap.
+     * The representation (small or large) of this bitmap is kept.
+     */
+    void rb_and(const RoaringBitmapWithSmallSet & r1)
+    {
+        if (isLarge())
+        {
+            /// A small r1 is first copied into a temporary roaring bitmap, which is freed right after use.
+            const bool r1_is_small = r1.isSmall();
+            roaring_bitmap_t * other = r1_is_small ? r1.getNewRbFromSmall() : r1.getRb();
+            roaring_bitmap_and_inplace( rb, other );
+            if (r1_is_small)
+                roaring_bitmap_free(other);
+
+            return;
+        }
+
+        /// This bitmap is small: collect the values that r1 holds as well ...
+        ValueBuffer kept;
+        const bool r1_is_small = r1.isSmall();
+
+        for (const auto & value : small)
+        {
+            const bool in_r1 = r1_is_small
+                ? r1.small.find(value) != r1.small.end()
+                : roaring_bitmap_contains( r1.rb, value );
+            if (in_r1)
+                kept.push_back(value);
+        }
+
+        /// ... then rebuild the small set from the survivors.
+        small.clear();
+        for (const auto & value : kept)
+            small.insert(value);
+    }
+
+    /**
+     * Computes the union between two bitmaps in place; simply forwards to merge().
+     */
+    void rb_or(const RoaringBitmapWithSmallSet & r1)
+    {
+        this->merge( r1 );
+    }
+
+    /**
+     * Computes the symmetric difference (xor) between two bitmaps in place. This bitmap becomes large if it was small.
+     */
+    void rb_xor(const RoaringBitmapWithSmallSet & r1)
+    {
+        if( this->isSmall() ) toLarge();
+        roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb(); /// temporary copy when r1 is small
+        roaring_bitmap_xor_inplace( rb, rb1 );
+        if ( r1.isSmall() ) roaring_bitmap_free(rb1); /// only the temporary copy is ours to free
+    }
+
+    /**
+     * Computes the difference (andnot) between two bitmaps; the result replaces this bitmap.
+     * The representation (small or large) of this bitmap is kept.
+     */
+    void rb_andnot(const RoaringBitmapWithSmallSet & r1)
+    {
+        if (isLarge())
+        {
+            /// A small r1 is first copied into a temporary roaring bitmap, which is freed right after use.
+            const bool r1_is_small = r1.isSmall();
+            roaring_bitmap_t * other = r1_is_small ? r1.getNewRbFromSmall() : r1.getRb();
+            roaring_bitmap_andnot_inplace( rb, other );
+            if (r1_is_small)
+                roaring_bitmap_free(other);
+
+            return;
+        }
+
+        /// This bitmap is small: collect the values that r1 does not hold ...
+        ValueBuffer kept;
+        const bool r1_is_small = r1.isSmall();
+
+        for (const auto & value : small)
+        {
+            const bool in_r1 = r1_is_small
+                ? r1.small.find(value) != r1.small.end()
+                : roaring_bitmap_contains( r1.rb, value );
+            if (!in_r1)
+                kept.push_back(value);
+        }
+
+        /// ... then rebuild the small set from the survivors.
+        small.clear();
+        for (const auto & value : kept)
+            small.insert(value);
+    }
+
+    /**
+     * Computes the cardinality of the intersection between two bitmaps.
+     */
+    UInt64 rb_and_cardinality(const RoaringBitmapWithSmallSet & r1) const
+    {
+        if (isLarge())
+        {
+            const bool r1_is_small = r1.isSmall();
+            roaring_bitmap_t * other = r1_is_small ? r1.getNewRbFromSmall() : r1.getRb();
+            const UInt64 common = roaring_bitmap_and_cardinality( rb, other );
+            if (r1_is_small)
+                roaring_bitmap_free(other);
+            return common;
+        }
+        UInt64 common = 0; /// small case: count our values that r1 holds too
+        for (const auto & value : small)
+            common += r1.isSmall() ? (r1.small.find(value) != r1.small.end()) : roaring_bitmap_contains( r1.rb, value );
+        return common;
+    }
+
+    /**
+     * Computes the cardinality of the union between two bitmaps as |A| + |B| - |A and B|.
+     */
+    UInt64 rb_or_cardinality(const RoaringBitmapWithSmallSet & r1) const
+    {
+        UInt64 c1 = this->size();
+        UInt64 c2 = r1.size();
+        UInt64 inter = this->rb_and_cardinality( r1 );
+        return c1 + c2 - inter;
+    }
+
+    /**
+     * Computes the cardinality of the symmetric difference (xor) between two bitmaps as |A| + |B| - 2 * |A and B|.
+     */
+    UInt64 rb_xor_cardinality(const RoaringBitmapWithSmallSet & r1) const
+    {
+        UInt64 c1 = this->size();
+        UInt64 c2 = r1.size();
+        UInt64 inter = this->rb_and_cardinality( r1 );
+        return c1 + c2 - 2 * inter;
+    }
+
+    /**
+     * Computes the cardinality of the difference (andnot) between two bitmaps as |A| - |A and B|.
+     */
+    UInt64 rb_andnot_cardinality(const RoaringBitmapWithSmallSet & r1) const
+    {
+        UInt64 c1 = this->size();
+        UInt64 inter = this->rb_and_cardinality( r1 );
+        return c1 - inter;
+    }
+
+    /// Return 1 if the two bitmaps contain the same elements. Neither bitmap is modified.
+    UInt8 rb_equals(const RoaringBitmapWithSmallSet & r1) const
+    {
+        /// A small operand is compared through a temporary roaring bitmap that is freed before returning.
+        roaring_bitmap_t * rb0 = isSmall() ? getNewRbFromSmall() : rb;
+        roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
+        UInt8 is_true = roaring_bitmap_equals( rb0, rb1 );
+        if ( isSmall() ) roaring_bitmap_free(rb0);
+        if ( r1.isSmall() ) roaring_bitmap_free(rb1);
+        return is_true;
+    }
+
+    /// Return 1 if the two bitmaps share at least one element. Neither bitmap is modified.
+    UInt8 rb_intersect(const RoaringBitmapWithSmallSet & r1) const
+    {
+        /// A small operand is checked through a temporary roaring bitmap that is freed before returning.
+        roaring_bitmap_t * rb0 = isSmall() ? getNewRbFromSmall() : rb;
+        roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
+        UInt8 is_true = roaring_bitmap_intersect( rb0, rb1 );
+        if ( isSmall() ) roaring_bitmap_free(rb0);
+        if ( r1.isSmall() ) roaring_bitmap_free(rb1);
+        return is_true;
+    }
+
+    /**
+     * Remove value offsetid from the bitmap. This bitmap becomes large if it was small.
+     */
+    void rb_remove(UInt64 offsetid)
+    {
+        if( this->isSmall() ) toLarge();
+        roaring_bitmap_remove( rb, offsetid ); /// NOTE(review): the roaring API stores 32-bit values, so offsetid is presumably narrowed here - confirm
+    }
+
+    /**
+     * Compute (in place) the negation of the roaring bitmap within a specified
+     * interval: [offsetstart, offsetend). The number of negated values is
+     * offsetend - offsetstart.
+     * Areas outside the range are passed through unchanged. This bitmap becomes large if it was small.
+     */
+    void rb_flip(UInt64 offsetstart, UInt64 offsetend)
+    {
+        if( this->isSmall() ) toLarge();
+        roaring_bitmap_flip_inplace( rb, offsetstart, offsetend );
+    }
+
+    /// Returns the number of stored integers that are smaller or equal to offsetid. The bitmap is not modified.
+    UInt64 rb_rank(UInt64 offsetid) const
+    {
+        roaring_bitmap_t * rb0 = isSmall() ? getNewRbFromSmall() : rb; /// temporary copy while small
+        UInt64 rank = roaring_bitmap_rank( rb0, offsetid );
+        if ( isSmall() ) roaring_bitmap_free(rb0);
+        return rank;
+    }
+
+    /**
+     * Appends every element to res_data and returns the number of elements appended.
+     */
+    template <typename Element>
+    UInt64 rb_to_array(PaddedPODArray<Element> & res_data) const
+    {
+        UInt64 appended = 0;
+        if (isLarge())
+        {
+            /// Walk the roaring bitmap with its C iterator.
+            roaring_uint32_iterator_t it;
+            for (roaring_init_iterator(rb, &it); it.has_value; roaring_advance_uint32_iterator(&it))
+            {
+                res_data.emplace_back(it.current_value);
+                ++appended;
+            }
+            return appended;
+        }
+
+        for (const auto & x : small)
+        {
+            res_data.emplace_back(x);
+            ++appended;
+        }
+        return appended;
+    }
+
+private:
+    /// To read and write the DB Buffer directly, migrate code from CRoaring
+    /// Reads n_args uint32_t values from dbBuf and adds each of them to r.
+    void db_roaring_bitmap_add_many( DB::ReadBuffer & dbBuf, roaring_bitmap_t * r, size_t n_args ) {
+        void *container = NULL;  // hold value of last container touched
+        uint8_t typecode = 0;    // typecode of last container touched
+        uint32_t prev = 0;       // previous value inserted
+        size_t i = 0;            // index of value
+        int containerindex = 0;
+        if (n_args == 0) return;
+        uint32_t val;
+        readBinary( val, dbBuf );
+        container = containerptr_roaring_bitmap_add(r, val, &typecode, &containerindex);
+        prev = val;
+        i++;
+        for (; i < n_args; i++) {
+            readBinary( val, dbBuf );
+            if (((prev ^ val) >> 16) ==
+                0) {  // same high 16 bits as the previous value: the container is at hand
+                // so we can insert into it directly, bypassing the roaring_bitmap_add call
+                uint8_t newtypecode = typecode;
+                void * container2 =
+                        container_add(container, val & 0xFFFF, typecode, &newtypecode);
+                // rare instance when container_add handed back a different container
+                if (container2 != container)
+                {
+                    // change the container type: drop the old one and register the new one
+                    container_free(container, typecode);
+                    ra_set_container_at_index(&r->high_low_container,
+                                              containerindex, container2,
+                                              newtypecode);
+                    typecode = newtypecode;
+                    container = container2;
+                }
+            } else {
+                container = containerptr_roaring_bitmap_add(r, val, &typecode,
+                                                            &containerindex);
+            }
+            prev = val;
+        }
+    }
+
+    /// Writes every value stored in ra to dbBuf as a UInt32, one container after another.
+    void db_ra_to_uint32_array( DB::WriteBuffer & dbBuf, roaring_array_t * ra ) const
+    {
+        for (Int32 i = 0; i < ra->size; ++i)
+        {
+            const UInt32 base = static_cast<UInt32>(ra->keys[i]) << 16; /// the container key supplies the high 16 bits
+            db_container_to_uint32_array( dbBuf, ra->containers[i], ra->typecodes[i], base );
+        }
+    }
+	
+    UInt32 db_container_to_uint32_array( DB::WriteBuffer & dbBuf, const void * container, UInt8 typecode, UInt32 base) const { /// dispatches on the container type code; returns what the chosen writer returns
+        container = container_unwrap_shared(container, &typecode); /// typecode is passed by pointer, so it may be updated
+        switch (typecode) {
+            case BITSET_CONTAINER_TYPE_CODE:
+                return db_bitset_container_to_uint32_array( dbBuf,
+                (const bitset_container_t *)container, base);
+            case ARRAY_CONTAINER_TYPE_CODE:
+                return db_array_container_to_uint32_array( dbBuf,
+                (const array_container_t *)container, base);
+            case RUN_CONTAINER_TYPE_CODE:
+                return db_run_container_to_uint32_array( dbBuf,
+                (const run_container_t *)container, base);
+        }
+        return 0; /// any other type code: nothing is written
+    }
+	
+    UInt32 db_bitset_container_to_uint32_array( DB::WriteBuffer & dbBuf, const bitset_container_t * cont, UInt32 base) const { /// writes the set bits of a bitset container
+        return (UInt32) db_bitset_extract_setbits( dbBuf, cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, base);
+    }
+
+    /// Writes base + (position of every set bit in bitset[0 .. length)) to dbBuf as UInt32; returns how many were written.
+    size_t db_bitset_extract_setbits( DB::WriteBuffer & dbBuf, UInt64 * bitset, size_t length, UInt32 base) const
+    {
+        UInt32 written = 0;
+        for (size_t word_idx = 0; word_idx < length; ++word_idx)
+        {
+            /// Peel off the lowest set bit of the word until none is left.
+            for (UInt64 word = bitset[word_idx]; word != 0; word &= word - 1)
+            {
+                const UInt32 val = __builtin_ctzll(word) + base;
+                writePODBinary( val, dbBuf );
+                ++written;
+            }
+            base += 64; /// every word covers 64 consecutive values
+        }
+
+        return written;
+    }
+
+    /// Writes base + each element of the array container to dbBuf as UInt32; returns the number of values written.
+    int db_array_container_to_uint32_array( DB::WriteBuffer & dbBuf, const array_container_t * cont, UInt32 base) const {
+        UInt32 written = 0;
+        for (Int32 idx = 0; idx < cont->cardinality; ++idx, ++written)
+        {
+            const UInt32 val = base + cont->array[idx];
+            writePODBinary( val, dbBuf );
+        }
+        return written;
+    }
+
+    /// Expands every run of the container and writes base + element to dbBuf as UInt32; returns the number of values written.
+    int db_run_container_to_uint32_array( DB::WriteBuffer & dbBuf, const run_container_t * cont, UInt32 base) const {
+        UInt32 written = 0;
+        for (Int32 run_idx = 0; run_idx < cont->n_runs; ++run_idx)
+        {
+            const UInt32 first = base + cont->runs[run_idx].value;
+            const UInt16 extra = cont->runs[run_idx].length; /// a run covers length + 1 values
+            for (Int32 step = 0; step <= extra; ++step, ++written)
+            {
+                const UInt32 val = first + step;
+                writePODBinary( val, dbBuf );
+            }
+        }
+        return written;
+    }
+	
+	
+};
+
+template <typename T>
+struct AggregateFunctionGroupBitmapData
+{
+    RoaringBitmapWithSmallSet<T,32> rbs; /// the bitmap state; 32 is the capacity of its inline small set
+    static const char * name() { return "groupBitmap"; } /// name the aggregate function is registered under
+};
+
+
+}
diff --git a/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp b/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp
index f5e15b6a887..89b11d1e6f7 100644
--- a/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp
+++ b/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp
@@ -26,6 +26,7 @@ void registerAggregateFunctionUniqCombined(AggregateFunctionFactory &);
 void registerAggregateFunctionUniqUpTo(AggregateFunctionFactory &);
 void registerAggregateFunctionTopK(AggregateFunctionFactory &);
 void registerAggregateFunctionsBitwise(AggregateFunctionFactory &);
+void registerAggregateFunctionsBitmap(AggregateFunctionFactory &);
 void registerAggregateFunctionsMaxIntersections(AggregateFunctionFactory &);
 
 void registerAggregateFunctionCombinatorIf(AggregateFunctionCombinatorFactory &);
@@ -62,6 +63,7 @@ void registerAggregateFunctions()
         registerAggregateFunctionUniqUpTo(factory);
         registerAggregateFunctionTopK(factory);
         registerAggregateFunctionsBitwise(factory);
+        registerAggregateFunctionsBitmap(factory);
         registerAggregateFunctionsMaxIntersections(factory);
         registerAggregateFunctionHistogram(factory);
         registerAggregateFunctionRetention(factory);
diff --git a/dbms/src/Functions/FunctionsBitmap.cpp b/dbms/src/Functions/FunctionsBitmap.cpp
new file mode 100644
index 00000000000..1b4bcbaaf7b
--- /dev/null
+++ b/dbms/src/Functions/FunctionsBitmap.cpp
@@ -0,0 +1,25 @@
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionsBitmap.h>
+
+
+namespace DB
+{
+
+void registerFunctionsBitmap(FunctionFactory & factory)
+{
+    /// Conversions between arrays and bitmaps.
+    factory.registerFunction<FunctionBitmapBuild>();
+    factory.registerFunction<FunctionBitmapToArray>();
+    /// Cardinality of one bitmap, or of a combination of two bitmaps.
+    factory.registerFunction<FunctionBitmapSelfCardinality>();
+    factory.registerFunction<FunctionBitmapAndCardinality>();
+    factory.registerFunction<FunctionBitmapOrCardinality>();
+    factory.registerFunction<FunctionBitmapXorCardinality>();
+    factory.registerFunction<FunctionBitmapAndnotCardinality>();
+    /// Set operations on two bitmaps.
+    factory.registerFunction<FunctionBitmapAnd>();
+    factory.registerFunction<FunctionBitmapOr>();
+    factory.registerFunction<FunctionBitmapXor>();
+    factory.registerFunction<FunctionBitmapAndnot>();
+}
+}
diff --git a/dbms/src/Functions/FunctionsBitmap.h b/dbms/src/Functions/FunctionsBitmap.h
new file mode 100644
index 00000000000..055607a9038
--- /dev/null
+++ b/dbms/src/Functions/FunctionsBitmap.h
@@ -0,0 +1,499 @@
+#pragma once
+
+#include <Common/typeid_cast.h>
+#include <Columns/ColumnVector.h>
+#include <Columns/ColumnArray.h>
+#include <Columns/ColumnConst.h>
+#include <Columns/ColumnFunction.h>
+#include <DataTypes/DataTypeArray.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <DataTypes/DataTypeAggregateFunction.h>
+#include <Common/typeid_cast.h>
+#include <Columns/ColumnAggregateFunction.h>
+#include <Functions/IFunction.h>
+#include <Functions/FunctionHelpers.h>
+#include <AggregateFunctions/AggregateFunctionGroupBitmapData.h>
+#include <AggregateFunctions/AggregateFunctionFactory.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT; /// used by every function in this header, so declare it here instead of relying on other includes
+    extern const int LOGICAL_ERROR;
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+}
+/** Bitmap functions.
+  * Build a bitmap from integer array:
+  * bitmapBuild: integer[] -> bitmap
+  *
+  * Convert bitmap to integer array:
+  * bitmapToArray:	bitmap -> integer[]
+  *
+  * Two bitmap and calculation:
+  * bitmapAnd:	bitmap,bitmap -> bitmap
+  *
+  * Two bitmap or calculation:
+  * bitmapOr:	bitmap,bitmap -> bitmap
+  *
+  * Two bitmap xor calculation:
+  * bitmapXor:	bitmap,bitmap -> bitmap
+  *
+  * Two bitmap andnot calculation:
+  * bitmapAndnot:	bitmap,bitmap -> bitmap
+  *
+  * Return bitmap cardinality:
+  * bitmapCardinality:	bitmap -> integer
+  *
+  * Two bitmap and calculation, return cardinality:
+  * bitmapAndCardinality:	bitmap,bitmap -> integer
+  *
+  * Two bitmap or calculation, return cardinality:
+  * bitmapOrCardinality:	bitmap,bitmap -> integer
+  *
+  * Two bitmap xor calculation, return cardinality:
+  * bitmapXorCardinality:	bitmap,bitmap -> integer
+  *
+  * Two bitmap andnot calculation, return cardinality:
+  * bitmapAndnotCardinality: bitmap,bitmap -> integer
+  */
+
+template <typename Name>
+class FunctionBitmapBuildImpl : public IFunction /// builds one groupBitmap aggregate state per input array
+{
+public:
+    static constexpr auto name = Name::name;
+
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapBuildImpl>(); }
+
+    String getName() const override { return name; }
+
+    bool isVariadic() const override { return false; }
+
+    size_t getNumberOfArguments() const override { return 1; }
+    /// Result type: AggregateFunction state of groupBitmap over the array's element type.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (arguments[0]->onlyNull())
+            return arguments[0];
+
+        auto array_type = typeid_cast<const DataTypeArray *>(arguments[0].get());
+        if (!array_type)
+            throw Exception("First argument for function " + getName() + " must be an array but it has type "
+                            + arguments[0]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        /// The factory lookup below is made with the nested type as the only argument type.
+        auto nested_type = array_type->getNestedType();
+        DataTypes argument_types = {nested_type};
+        Array params_row;
+        AggregateFunctionPtr bitmap_function = AggregateFunctionFactory::instance().get(AggregateFunctionGroupBitmapData<UInt32>::name(), argument_types, params_row);
+
+        return std::make_shared<DataTypeAggregateFunction>(bitmap_function, argument_types, params_row);
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+    /// Dispatches on the array's element type; only UInt8/16/32/64 are handled, anything else throws.
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /* input_rows_count */) override
+    {
+        const IDataType * from_type = block.getByPosition(arguments[0]).type.get();
+        auto array_type = typeid_cast<const DataTypeArray *>(from_type); /// not null-checked here; getReturnTypeImpl rejects non-array arguments
+        auto nested_type = array_type->getNestedType();
+
+        DataTypes argument_types = {nested_type};
+
+        WhichDataType which(nested_type);
+        if      (which.isUInt8()) executeBitmapData<UInt8>(block, argument_types, arguments, result);
+        else if (which.isUInt16()) executeBitmapData<UInt16>(block, argument_types, arguments, result);
+        else if (which.isUInt32()) executeBitmapData<UInt32>(block, argument_types, arguments, result);
+        else if (which.isUInt64()) executeBitmapData<UInt64>(block, argument_types, arguments, result);
+        else
+            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+    }
+
+private:
+    template <typename T>
+    void executeBitmapData( Block & block, DataTypes & argument_types, const ColumnNumbers & arguments, size_t result)
+    {
+        // input data: the flattened array elements plus the per-row end offsets
+        const ColumnArray * array = typeid_cast<const ColumnArray *>(block.getByPosition(arguments[0]).column.get());
+        ColumnPtr mapped = array->getDataPtr();
+        const ColumnArray::Offsets & offsets = array->getOffsets();
+        const ColumnVector<T> * column = checkAndGetColumn<ColumnVector<T>>(&*mapped);
+        const typename ColumnVector<T>::Container & input_data = column->getData();
+
+        // output data: a column of aggregate states, one per input row
+        Array params_row;
+        AggregateFunctionPtr bitmap_function = AggregateFunctionFactory::instance().get(AggregateFunctionGroupBitmapData<UInt32>::name(), argument_types, params_row);
+        auto col_to = ColumnAggregateFunction::create(bitmap_function);
+        col_to->reserve(offsets.size());
+        /// pos runs over the flattened elements; offsets[i] is the end of row i.
+        size_t pos = 0;
+        for(size_t i = 0; i < offsets.size(); ++i)
+        {
+            col_to->insertDefault();
+            AggregateFunctionGroupBitmapData<T>& bitmap_data = *reinterpret_cast<AggregateFunctionGroupBitmapData<T> *>( col_to->getData()[i] );
+            for (; pos < offsets[i]; ++pos)
+            {
+                bitmap_data.rbs.add(input_data[pos]);
+            }
+        }
+        block.getByPosition(result).column = std::move(col_to);
+    }
+};
+
+template <typename Name>
+class FunctionBitmapToArrayImpl : public IFunction /// expands every groupBitmap state into an array of its elements
+{
+public:
+    static constexpr auto name = Name::name;
+
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapToArrayImpl>(); }
+
+    String getName() const override { return name; }
+
+    bool isVariadic() const override { return false; }
+
+    size_t getNumberOfArguments() const override { return 1; }
+    /// Only groupBitmap states are accepted: executeIntType reinterprets the state memory as bitmap data.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>( arguments[0].get() );
+        if (!aggr_type || aggr_type->getFunctionName() != AggregateFunctionGroupBitmapData<UInt32>::name())
+            throw Exception("First argument for function " + getName() + " must be a bitmap (AggregateFunction(groupBitmap, ...)) but it has type "
+                            + arguments[0]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        const DataTypePtr data_type = aggr_type->getArgumentsDataTypes()[0];
+
+        return std::make_shared<DataTypeArray>(data_type);
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+    /// Dispatches on the bitmap's element type; only UInt8/16/32/64 are handled, anything else throws.
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
+    {
+        // output data: an empty array column of the declared result type
+        const auto & return_type = block.getByPosition(result).type;
+        auto res_ptr = return_type->createColumn();
+        ColumnArray & res = static_cast<ColumnArray &>(*res_ptr);
+
+        IColumn & res_data = res.getData();
+        ColumnArray::Offsets & res_offsets = res.getOffsets();
+
+        const IDataType * from_type = block.getByPosition(arguments[0]).type.get();
+        const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(from_type);
+        WhichDataType which(aggr_type->getArgumentsDataTypes()[0]);
+        if      (which.isUInt8()) executeIntType<UInt8>(block, arguments, input_rows_count, res_data, res_offsets);
+        else if (which.isUInt16()) executeIntType<UInt16>(block, arguments, input_rows_count, res_data, res_offsets);
+        else if (which.isUInt32()) executeIntType<UInt32>(block, arguments, input_rows_count, res_data, res_offsets);
+        else if (which.isUInt64()) executeIntType<UInt64>(block, arguments, input_rows_count, res_data, res_offsets);
+        else
+            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        block.getByPosition(result).column = std::move(res_ptr);
+    }
+private:
+    using ToType = UInt64;
+    /// Appends the elements of every row's bitmap to res_data_col and records the running end offset per row.
+    template <typename T>
+    void executeIntType( Block & block, const ColumnNumbers & arguments, size_t input_rows_count,
+                         IColumn & res_data_col, ColumnArray::Offsets & res_offsets ) const
+    {
+        const ColumnAggregateFunction * column = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[0]).column.get());
+
+        PaddedPODArray<T> & res_data = typeid_cast<ColumnVector<T> &>(res_data_col).getData();
+        ColumnArray::Offset res_offset = 0;
+
+        for(size_t i = 0; i < input_rows_count; ++i)
+        {
+            const AggregateFunctionGroupBitmapData<T>& bd1 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>( column->getData()[i] );
+            UInt64 count = bd1.rbs.rb_to_array( res_data );
+            res_offset += count;
+            res_offsets.emplace_back(res_offset);
+        }
+    }
+};
+
+/** Unary bitmap function returning the number of elements of a bitmap as UInt64.
+  *
+  * The argument must be an aggregate function state whose first argument type is
+  * UInt8, UInt16, UInt32 or UInt64; every state is read as AggregateFunctionGroupBitmapData<T>.
+  */
+template <typename Name>
+class FunctionBitmapSelfCardinalityImpl : public IFunction
+{
+public:
+    static constexpr auto name = Name::name;
+
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapSelfCardinalityImpl>(); }
+
+    String getName() const override { return name; }
+
+    bool isVariadic() const override { return false; }
+
+    size_t getNumberOfArguments() const override { return 1; }
+
+    /// Validates the argument type and returns UInt64.
+    /// Throws ILLEGAL_TYPE_OF_ARGUMENT unless the argument is an aggregate function state with at least one argument.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        /// executeImpl repeats this cast and dereferences the result unconditionally,
+        /// so a non-aggregate argument has to be rejected here instead of crashing there.
+        const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(arguments[0].get());
+        if (!aggr_type || aggr_type->getArgumentsDataTypes().empty())
+            throw Exception("Argument for function " + getName() + " must be an AggregateFunction with an unsigned integer argument but it has type "
+                            + arguments[0]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        /// NOTE(review): the name of the aggregate function is not checked; a state of another aggregate
+        /// function over UInt* would still be reinterpreted as bitmap data below. Confirm whether this
+        /// should be restricted to groupBitmap.
+        return std::make_shared<DataTypeNumber<ToType>>();
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    /// Dispatches on the element type of the bitmap (the first argument type of the aggregate function)
+    /// and stores one UInt64 per input row into the result column.
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
+    {
+        auto col_to = ColumnVector<ToType>::create(input_rows_count);
+        typename ColumnVector<ToType>::Container & vec_to = col_to->getData();
+        const IDataType * from_type = block.getByPosition(arguments[0]).type.get();
+
+        /// Non-null and with a non-empty argument list: checked in getReturnTypeImpl.
+        const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(from_type);
+        WhichDataType which(aggr_type->getArgumentsDataTypes()[0]);
+        if      (which.isUInt8()) executeIntType<UInt8>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt16()) executeIntType<UInt16>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt32()) executeIntType<UInt32>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt64()) executeIntType<UInt64>(block, arguments, input_rows_count, vec_to);
+        else
+            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        block.getByPosition(result).column = std::move(col_to);
+    }
+private:
+    using ToType = UInt64;
+
+    /// Writes rbs.size() of every row's bitmap state into vec_to.
+    template <typename T>
+    void executeIntType( Block & block, const ColumnNumbers & arguments, size_t input_rows_count, typename ColumnVector<ToType>::Container & vec_to)
+    {
+        const ColumnAggregateFunction * column = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[0]).column.get());
+        for (size_t i = 0; i < input_rows_count; ++i)
+        {
+            const AggregateFunctionGroupBitmapData<T> & bd1 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(column->getData()[i]);
+            vec_to[i] = bd1.rbs.size();
+        }
+    }
+};
+
+/// Cardinality of the intersection of two bitmaps. Both inputs are taken by const reference.
+template <typename T>
+struct BitmapAndCardinalityImpl
+{
+    using ReturnType = UInt64;
+
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2)
+    {
+        const auto & left_bitmap = bd1.rbs;
+        return left_bitmap.rb_and_cardinality(bd2.rbs);
+    }
+};
+
+
+/// Cardinality reported by rb_or_cardinality for two bitmaps. Both inputs are taken by const reference.
+template <typename T>
+struct BitmapOrCardinalityImpl
+{
+    using ReturnType = UInt64;
+
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2)
+    {
+        const auto & left_bitmap = bd1.rbs;
+        return left_bitmap.rb_or_cardinality(bd2.rbs);
+    }
+};
+
+/// Cardinality reported by rb_xor_cardinality for two bitmaps. Both inputs are taken by const reference.
+template <typename T>
+struct BitmapXorCardinalityImpl
+{
+    using ReturnType = UInt64;
+
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2)
+    {
+        const auto & left_bitmap = bd1.rbs;
+        return left_bitmap.rb_xor_cardinality(bd2.rbs);
+    }
+};
+
+/// Cardinality reported by rb_andnot_cardinality for two bitmaps (first minus second, per the argument order).
+/// Both inputs are taken by const reference.
+template <typename T>
+struct BitmapAndnotCardinalityImpl
+{
+    using ReturnType = UInt64;
+
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2)
+    {
+        const auto & left_bitmap = bd1.rbs;
+        return left_bitmap.rb_andnot_cardinality(bd2.rbs);
+    }
+};
+
+/** Binary bitmap function returning a UInt64 cardinality computed by Impl<T>::apply(bitmap1, bitmap2).
+  *
+  * Both arguments must be aggregate function states over the same unsigned integer type T,
+  * because both state columns are read as AggregateFunctionGroupBitmapData<T> with T taken
+  * from the first argument.
+  */
+template <template <typename> class Impl, typename Name>
+class FunctionBitmapCardinality : public IFunction
+{
+public:
+    static constexpr auto name = Name::name;
+
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapCardinality>(); }
+
+    String getName() const override { return name; }
+
+    bool isVariadic() const override { return false; }
+
+    size_t getNumberOfArguments() const override { return 2; }
+
+    /// Validates both argument types and returns UInt64.
+    /// Throws ILLEGAL_TYPE_OF_ARGUMENT if either argument is not an aggregate function state with at
+    /// least one argument, or if the two states are built over different argument types.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        const DataTypeAggregateFunction * aggr_types[2];
+        for (size_t i = 0; i < 2; ++i)
+        {
+            /// executeImpl / executeIntType dereference the same casts unconditionally.
+            aggr_types[i] = typeid_cast<const DataTypeAggregateFunction *>(arguments[i].get());
+            if (!aggr_types[i] || aggr_types[i]->getArgumentsDataTypes().empty())
+                throw Exception(String(i == 0 ? "First" : "Second") + " argument for function " + getName()
+                                + " must be an AggregateFunction with an unsigned integer argument but it has type "
+                                + arguments[i]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        }
+
+        /// The second column is reinterpreted with the element type of the first one,
+        /// so bitmaps over different integer types cannot be combined.
+        if (!aggr_types[0]->getArgumentsDataTypes()[0]->equals(*aggr_types[1]->getArgumentsDataTypes()[0]))
+            throw Exception("Arguments of function " + getName() + " must be bitmaps over the same integer type but they have types "
+                            + arguments[0]->getName() + " and " + arguments[1]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        return std::make_shared<DataTypeNumber<ToType>>();
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    /// Dispatches on the element type of the first bitmap and stores one UInt64 per input row.
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
+    {
+        auto col_to = ColumnVector<ToType>::create(input_rows_count);
+        typename ColumnVector<ToType>::Container & vec_to = col_to->getData();
+        const IDataType * from_type = block.getByPosition(arguments[0]).type.get();
+
+        /// Non-null and with a non-empty argument list: checked in getReturnTypeImpl.
+        const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(from_type);
+        WhichDataType which(aggr_type->getArgumentsDataTypes()[0]);
+        if      (which.isUInt8()) executeIntType<UInt8>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt16()) executeIntType<UInt16>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt32()) executeIntType<UInt32>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt64()) executeIntType<UInt64>(block, arguments, input_rows_count, vec_to);
+        else
+            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        block.getByPosition(result).column = std::move(col_to);
+    }
+private:
+    using ToType = UInt64;
+
+    /// For every row applies Impl<T> to the pair of bitmap states and writes the result into vec_to.
+    template <typename T>
+    void executeIntType( Block & block, const ColumnNumbers & arguments, size_t input_rows_count, typename ColumnVector<ToType>::Container & vec_to)
+    {
+        const ColumnAggregateFunction * columns[2];
+        for (size_t i = 0; i < 2; ++i)
+            columns[i] = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[i]).column.get());
+
+        for (size_t i = 0; i < input_rows_count; ++i)
+        {
+            const AggregateFunctionGroupBitmapData<T> & bd1 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(columns[0]->getData()[i]);
+            const AggregateFunctionGroupBitmapData<T> & bd2 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(columns[1]->getData()[i]);
+            vec_to[i] = Impl<T>::apply(bd1, bd2);
+        }
+    }
+};
+
+/// In-place intersection: toBd is modified through rb_and, bd2 is only read.
+template <typename T>
+struct BitmapAndImpl
+{
+    static void apply(AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2)
+    {
+        auto & target = toBd.rbs;
+        target.rb_and(bd2.rbs);
+    }
+};
+
+/// In-place union: toBd is modified through rb_or, bd2 is only read.
+template <typename T>
+struct BitmapOrImpl
+{
+    static void apply(AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2)
+    {
+        auto & target = toBd.rbs;
+        target.rb_or(bd2.rbs);
+    }
+};
+
+/// In-place symmetric difference: toBd is modified through rb_xor, bd2 is only read.
+template <typename T>
+struct BitmapXorImpl
+{
+    static void apply(AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2)
+    {
+        auto & target = toBd.rbs;
+        target.rb_xor(bd2.rbs);
+    }
+};
+
+/// In-place difference (toBd minus bd2): toBd is modified through rb_andnot, bd2 is only read.
+template <typename T>
+struct BitmapAndnotImpl
+{
+    static void apply(AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2)
+    {
+        auto & target = toBd.rbs;
+        target.rb_andnot(bd2.rbs);
+    }
+};
+
+/** Binary bitmap function returning a new bitmap of the same type as the first argument.
+  *
+  * Every result row is seeded from the first argument's state and then modified in place by
+  * Impl<T>::apply using the second argument's state. Both arguments must be aggregate function
+  * states over the same unsigned integer type T, because both state columns are read as
+  * AggregateFunctionGroupBitmapData<T> with T taken from the first argument.
+  */
+template <template <typename> class Impl, typename Name>
+class FunctionBitmap : public IFunction
+{
+public:
+    static constexpr auto name = Name::name;
+
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmap>(); }
+
+    String getName() const override { return name; }
+
+    bool isVariadic() const override { return false; }
+
+    size_t getNumberOfArguments() const override { return 2; }
+
+    /// Validates both argument types and returns the type of the first argument.
+    /// Throws ILLEGAL_TYPE_OF_ARGUMENT if either argument is not an aggregate function state with at
+    /// least one argument, or if the two states are built over different argument types.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(arguments[0].get());
+        if (!aggr_type || aggr_type->getArgumentsDataTypes().empty())
+            throw Exception("First argument for function " + getName() + " must be an AggregateFunction but it has type "
+                            + arguments[0]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        /// The second argument used to go unchecked, although executeBitmapData dereferences
+        /// its column as a ColumnAggregateFunction unconditionally.
+        const DataTypeAggregateFunction * second_aggr_type = typeid_cast<const DataTypeAggregateFunction *>(arguments[1].get());
+        if (!second_aggr_type || second_aggr_type->getArgumentsDataTypes().empty())
+            throw Exception("Second argument for function " + getName() + " must be an AggregateFunction but it has type "
+                            + arguments[1]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        /// The second column is reinterpreted with the element type of the first one,
+        /// so bitmaps over different integer types cannot be combined.
+        if (!aggr_type->getArgumentsDataTypes()[0]->equals(*second_aggr_type->getArgumentsDataTypes()[0]))
+            throw Exception("Arguments of function " + getName() + " must be bitmaps over the same integer type but they have types "
+                            + arguments[0]->getName() + " and " + arguments[1]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        return arguments[0];
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+
+    /// Dispatches on the element type of the first bitmap; the selected executeBitmapData stores the result column.
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
+    {
+        const IDataType * from_type = block.getByPosition(arguments[0]).type.get();
+        /// Non-null and with a non-empty argument list: checked in getReturnTypeImpl.
+        const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(from_type);
+        WhichDataType which(aggr_type->getArgumentsDataTypes()[0]);
+        if      (which.isUInt8()) executeBitmapData<UInt8>(block, arguments, result, input_rows_count);
+        else if (which.isUInt16()) executeBitmapData<UInt16>(block, arguments, result, input_rows_count);
+        else if (which.isUInt32()) executeBitmapData<UInt32>(block, arguments, result, input_rows_count);
+        else if (which.isUInt64()) executeBitmapData<UInt64>(block, arguments, result, input_rows_count);
+        else
+            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+    }
+
+private:
+    /// Builds the result column row by row and stores it at position `result` of the block.
+    template <typename T>
+    void executeBitmapData( Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count)
+    {
+        const ColumnAggregateFunction * columns[2];
+        for (size_t i = 0; i < 2; ++i)
+            columns[i] = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[i]).column.get());
+
+        /// The result column uses the aggregate function of the first argument.
+        auto col_to = ColumnAggregateFunction::create(columns[0]->getAggregateFunction());
+
+        col_to->reserve(input_rows_count);
+
+        for (size_t i = 0; i < input_rows_count; ++i)
+        {
+            /// Seed the result row from the first argument's state, then let Impl modify
+            /// that result row in place using the second argument's state.
+            col_to->insertFrom(columns[0]->getData()[i]);
+            AggregateFunctionGroupBitmapData<T> & toBd = *reinterpret_cast<AggregateFunctionGroupBitmapData<T> *>(col_to->getData()[i]);
+            const AggregateFunctionGroupBitmapData<T> & bd2 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(columns[1]->getData()[i]);
+            Impl<T>::apply(toBd, bd2);
+        }
+        block.getByPosition(result).column = std::move(col_to);
+    }
+};
+
+/// Name tags carry the SQL-visible function name; each alias below binds a tag to its implementation template.
+
+/// Construction of a bitmap from an array.
+struct NameBitmapBuild   { static constexpr auto name = "bitmapBuild"; };
+using FunctionBitmapBuild = FunctionBitmapBuildImpl<NameBitmapBuild>;
+
+/// Conversion of a bitmap back to an array.
+struct NameBitmapToArray   { static constexpr auto name = "bitmapToArray"; };
+using FunctionBitmapToArray = FunctionBitmapToArrayImpl<NameBitmapToArray>;
+
+/// Functions that return a UInt64 cardinality (one unary, four binary).
+struct NameBitmapCardinality         { static constexpr auto name = "bitmapCardinality"; };
+struct NameBitmapAndCardinality      { static constexpr auto name = "bitmapAndCardinality"; };
+struct NameBitmapOrCardinality       { static constexpr auto name = "bitmapOrCardinality"; };
+struct NameBitmapXorCardinality      { static constexpr auto name = "bitmapXorCardinality"; };
+struct NameBitmapAndnotCardinality   { static constexpr auto name = "bitmapAndnotCardinality"; };
+
+using FunctionBitmapSelfCardinality   = FunctionBitmapSelfCardinalityImpl<NameBitmapCardinality>;
+using FunctionBitmapAndCardinality    = FunctionBitmapCardinality<BitmapAndCardinalityImpl, NameBitmapAndCardinality>;
+using FunctionBitmapOrCardinality     = FunctionBitmapCardinality<BitmapOrCardinalityImpl, NameBitmapOrCardinality>;
+using FunctionBitmapXorCardinality    = FunctionBitmapCardinality<BitmapXorCardinalityImpl, NameBitmapXorCardinality>;
+using FunctionBitmapAndnotCardinality = FunctionBitmapCardinality<BitmapAndnotCardinalityImpl, NameBitmapAndnotCardinality>;
+
+/// Binary functions that return a new bitmap.
+struct NameBitmapAnd     { static constexpr auto name = "bitmapAnd"; };
+struct NameBitmapOr      { static constexpr auto name = "bitmapOr"; };
+struct NameBitmapXor     { static constexpr auto name = "bitmapXor"; };
+struct NameBitmapAndnot  { static constexpr auto name = "bitmapAndnot"; };
+using FunctionBitmapAnd    = FunctionBitmap<BitmapAndImpl, NameBitmapAnd>;
+using FunctionBitmapOr     = FunctionBitmap<BitmapOrImpl, NameBitmapOr>;
+using FunctionBitmapXor    = FunctionBitmap<BitmapXorImpl, NameBitmapXor>;
+using FunctionBitmapAndnot = FunctionBitmap<BitmapAndnotImpl, NameBitmapAndnot>;
+
+
+
+}
+
+
diff --git a/dbms/src/Functions/registerFunctions.cpp b/dbms/src/Functions/registerFunctions.cpp
index 41164e5e65e..2cc0d04aea5 100644
--- a/dbms/src/Functions/registerFunctions.cpp
+++ b/dbms/src/Functions/registerFunctions.cpp
@@ -13,6 +13,7 @@ namespace DB
 void registerFunctionsArithmetic(FunctionFactory &);
 void registerFunctionsArray(FunctionFactory &);
 void registerFunctionsTuple(FunctionFactory &);
+void registerFunctionsBitmap(FunctionFactory &);
 void registerFunctionsCoding(FunctionFactory &);
 void registerFunctionsComparison(FunctionFactory &);
 void registerFunctionsConditional(FunctionFactory &);
@@ -52,6 +53,7 @@ void registerFunctions()
     registerFunctionsArithmetic(factory);
     registerFunctionsArray(factory);
     registerFunctionsTuple(factory);
+    registerFunctionsBitmap(factory);
     registerFunctionsCoding(factory);
     registerFunctionsComparison(factory);
     registerFunctionsConditional(factory);
diff --git a/dbms/tests/queries/0_stateless/00834_bitmap_function.reference b/dbms/tests/queries/0_stateless/00834_bitmap_function.reference
new file mode 100644
index 00000000000..ea26ab0d097
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00834_bitmap_function.reference
@@ -0,0 +1,15 @@
+[1,2,3,4,5]
+[3]
+[1,2,3,4,5]
+[1,2,4,5]
+[1,2]
+5
+1
+5
+4
+2
+70
+2019-01-01	50	[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50]
+2019-01-02	60	[11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
+60	50	70	40	20	30
+60	50	70	40	20	30
diff --git a/dbms/tests/queries/0_stateless/00834_bitmap_function.sql b/dbms/tests/queries/0_stateless/00834_bitmap_function.sql
new file mode 100644
index 00000000000..e76a447db25
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00834_bitmap_function.sql
@@ -0,0 +1,56 @@
+-- Bitmaps built from constant arrays: conversion, set operations, cardinalities.
+SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5]));
+SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])));
+SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])));
+SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])));
+SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])));
+SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5]));
+SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]));
+SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]));
+SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]));
+SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]));
+
+DROP TABLE IF EXISTS test.bitmap_test;
+
+-- Two days of data for one city: uids 1..50 on the first day, 11..70 on the second.
+CREATE TABLE test.bitmap_test(pickup_date Date, city_id UInt32, uid UInt32)ENGINE = Memory;
+INSERT INTO test.bitmap_test SELECT '2019-01-01', 1, number FROM numbers(1,50);
+INSERT INTO test.bitmap_test SELECT '2019-01-02', 1, number FROM numbers(11,60);
+
+SELECT groupBitmap( uid ) AS user_num FROM test.bitmap_test;
+
+SELECT pickup_date, groupBitmap( uid ) AS user_num, bitmapToArray(groupBitmapState( uid )) AS users FROM test.bitmap_test GROUP BY pickup_date;
+
+-- Cardinality functions applied directly to two aggregated bitmap states.
+SELECT
+    bitmapCardinality(day_today) AS today_users,
+    bitmapCardinality(day_before) AS before_users,
+    bitmapOrCardinality(day_today, day_before) AS all_users,
+    bitmapAndCardinality(day_today, day_before) AS old_users,
+    bitmapAndnotCardinality(day_today, day_before) AS new_users,
+    bitmapXorCardinality(day_today, day_before) AS diff_users
+FROM
+(
+ SELECT city_id, groupBitmapState( uid ) AS day_today FROM test.bitmap_test WHERE pickup_date = '2019-01-02' GROUP BY city_id
+ )
+ALL LEFT JOIN
+(
+ SELECT city_id, groupBitmapState( uid ) AS day_before FROM test.bitmap_test WHERE pickup_date = '2019-01-01' GROUP BY city_id
+)
+USING city_id;
+
+-- The same figures through bitmapCardinality over the bitmap-returning functions.
+SELECT
+    bitmapCardinality(day_today) AS today_users,
+    bitmapCardinality(day_before) AS before_users,
+    bitmapCardinality(bitmapOr(day_today, day_before)) AS all_users,
+    bitmapCardinality(bitmapAnd(day_today, day_before)) AS old_users,
+    bitmapCardinality(bitmapAndnot(day_today, day_before)) AS new_users,
+    bitmapCardinality(bitmapXor(day_today, day_before)) AS diff_users
+FROM
+(
+ SELECT city_id, groupBitmapState( uid ) AS day_today FROM test.bitmap_test WHERE pickup_date = '2019-01-02' GROUP BY city_id
+ )
+ALL LEFT JOIN
+(
+ SELECT city_id, groupBitmapState( uid ) AS day_before FROM test.bitmap_test WHERE pickup_date = '2019-01-01' GROUP BY city_id
+)
+USING city_id;
+
+DROP TABLE IF EXISTS test.bitmap_test;
diff --git a/docs/en/query_language/agg_functions/reference.md b/docs/en/query_language/agg_functions/reference.md
index b8bd95d376d..e17090fb666 100644
--- a/docs/en/query_language/agg_functions/reference.md
+++ b/docs/en/query_language/agg_functions/reference.md
@@ -179,6 +179,48 @@ binary     decimal
 01101000 = 104
 ```
 
+
+## groupBitmap
+
+Bitmap or aggregate calculation over an unsigned integer column. Returns the cardinality as a value of type `UInt64`; with the `-State` suffix it returns a [bitmap object](../functions/bitmap_functions.md) instead.
+
+```
+groupBitmap(expr)
+```
+
+**Parameters**
+
+`expr` – An expression that results in `UInt*` type.
+
+**Return value**
+
+Value of the `UInt64` type.
+
+**Example**
+
+Test data:
+
+```
+userid
+1
+1
+2
+3
+```
+
+Query:
+
+```
+SELECT groupBitmap(userid) as num FROM t
+```
+
+Result:
+
+```
+num
+3
+```
+
 ## min(x) {#agg_function-min}
 
 Calculates the minimum.
diff --git a/docs/en/query_language/functions/bitmap_functions.md b/docs/en/query_language/functions/bitmap_functions.md
new file mode 100644
index 00000000000..32264db2c9a
--- /dev/null
+++ b/docs/en/query_language/functions/bitmap_functions.md
@@ -0,0 +1,277 @@
+# Bitmap functions
+
+Bitmap functions perform calculations on two bitmap objects and return either a new bitmap or a cardinality, using operations such as and, or, xor, and andnot.
+
+There are two ways to construct a bitmap object: with the aggregate function groupBitmap and the -State suffix, or from an array. A bitmap object can also be converted back to an array.
+
+For the actual storage of bitmap objects, RoaringBitmap is wrapped into a data structure. When the cardinality is less than or equal to 32, a Set object is used. When the cardinality is greater than 32, a RoaringBitmap object is used. That is why storing a low-cardinality set is faster.
+
+For more information on RoaringBitmap, see: [CRoaring](https://github.com/RoaringBitmap/CRoaring).
+
+
+## bitmapBuild
+
+Build a bitmap from unsigned integer array.
+
+```
+bitmapBuild(array)
+```
+
+**Parameters**
+
+- `array` – unsigned integer array.
+
+**Example**
+
+``` sql
+SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res
+```
+
+## bitmapToArray
+
+Convert bitmap to integer array.
+
+```
+bitmapToArray(bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res
+```
+
+```
+┌─res─────────┐
+│ [1,2,3,4,5] │
+└─────────────┘
+```
+
+
+## bitmapAnd
+
+Computes the logical AND (intersection) of two bitmaps; the result is a new bitmap.
+
+```
+bitmapAnd(bitmap,bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res
+```
+
+```
+┌─res─┐
+│ [3] │
+└─────┘
+```
+
+
+## bitmapOr
+
+Computes the logical OR (union) of two bitmaps; the result is a new bitmap.
+
+```
+bitmapOr(bitmap,bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res
+```
+
+```
+┌─res─────────┐
+│ [1,2,3,4,5] │
+└─────────────┘
+```
+
+## bitmapXor
+
+Computes the logical XOR (symmetric difference) of two bitmaps; the result is a new bitmap.
+
+```
+bitmapXor(bitmap,bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res
+```
+
+```
+┌─res───────┐
+│ [1,2,4,5] │
+└───────────┘
+```
+
+## bitmapAndnot
+
+Computes the AND-NOT (difference: elements of the first bitmap that are not in the second) of two bitmaps; the result is a new bitmap.
+
+```
+bitmapAndnot(bitmap,bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res
+```
+
+```
+┌─res───┐
+│ [1,2] │
+└───────┘
+```
+
+## bitmapCardinality
+
+Returns the cardinality of a bitmap as a value of type UInt64.
+
+
+```
+bitmapCardinality(bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res
+```
+
+```
+┌─res─┐
+│   5 │
+└─────┘
+```
+
+## bitmapAndCardinality
+
+Computes the logical AND (intersection) of two bitmaps and returns its cardinality as a value of type UInt64.
+
+
+```
+bitmapAndCardinality(bitmap,bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res;
+```
+
+```
+┌─res─┐
+│   1 │
+└─────┘
+```
+
+
+## bitmapOrCardinality
+
+Computes the logical OR (union) of two bitmaps and returns its cardinality as a value of type UInt64.
+
+```
+bitmapOrCardinality(bitmap,bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res;
+```
+
+```
+┌─res─┐
+│   5 │
+└─────┘
+```
+
+## bitmapXorCardinality
+
+Computes the logical XOR (symmetric difference) of two bitmaps and returns its cardinality as a value of type UInt64.
+
+```
+bitmapXorCardinality(bitmap,bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res;
+```
+
+```
+┌─res─┐
+│   4 │
+└─────┘
+```
+
+
+## bitmapAndnotCardinality
+
+Computes the AND-NOT (difference) of two bitmaps and returns its cardinality as a value of type UInt64.
+
+```
+bitmapAndnotCardinality(bitmap,bitmap)
+```
+
+**Parameters**
+
+- `bitmap` – bitmap object.
+
+**Example**
+
+``` sql
+SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res;
+```
+
+```
+┌─res─┐
+│   2 │
+└─────┘
+```
+
+
+[Original article](https://clickhouse.yandex/docs/en/query_language/functions/bitmap_functions/) <!--hide-->
diff --git a/docs/redirects.txt b/docs/redirects.txt
index be807dd547d..0ff077b660c 100644
--- a/docs/redirects.txt
+++ b/docs/redirects.txt
@@ -86,6 +86,7 @@ functions/arithmetic_functions.md query_language/functions/arithmetic_functions.
 functions/array_functions.md query_language/functions/array_functions.md
 functions/array_join.md query_language/functions/array_join.md
 functions/bit_functions.md query_language/functions/bit_functions.md
+functions/bitmap_functions.md query_language/functions/bitmap_functions.md
 functions/comparison_functions.md query_language/functions/comparison_functions.md
 functions/conditional_functions.md query_language/functions/conditional_functions.md
 functions/date_time_functions.md query_language/functions/date_time_functions.md
diff --git a/docs/toc_en.yml b/docs/toc_en.yml
index dd2218ccb47..57dd22fb223 100644
--- a/docs/toc_en.yml
+++ b/docs/toc_en.yml
@@ -80,6 +80,7 @@ nav:
     - 'Working with Arrays': 'query_language/functions/array_functions.md'
     - 'Splitting and Merging Strings and Arrays': 'query_language/functions/splitting_merging_functions.md'
     - 'Bit': 'query_language/functions/bit_functions.md'
+    - 'Bitmap functions': 'query_language/functions/bitmap_functions.md'    
     - 'Hash': 'query_language/functions/hash_functions.md'
     - 'Generating Pseudo-Random Numbers': 'query_language/functions/random_functions.md'
     - 'Encoding': 'query_language/functions/encoding_functions.md'

From 1a28ba01c103453a3198a2a2ea9cab52adf0ad7f Mon Sep 17 00:00:00 2001
From: Andy Yang <yangzhaohui168@gmail.com>
Date: Sat, 9 Feb 2019 14:33:09 +0800
Subject: [PATCH 02/69] Added bitmap function feature, fixed test errors and
 code styles

---
 contrib/croaring/roaring.h                    |  27 +-
 .../AggregateFunctionGroupBitmapData.h        | 252 ++++++++------
 dbms/src/Functions/FunctionsBitmap.h          | 328 +++++++++++-------
 3 files changed, 373 insertions(+), 234 deletions(-)

diff --git a/contrib/croaring/roaring.h b/contrib/croaring/roaring.h
index 6583188c56e..53413b2a06d 100644
--- a/contrib/croaring/roaring.h
+++ b/contrib/croaring/roaring.h
@@ -17,15 +17,30 @@ enum {
  *
  */
 
+
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wcast-align"
+#pragma clang diagnostic ignored "-Wcast-qual"
+#pragma clang diagnostic ignored "-Wundef"
+#endif
+
 #ifndef INCLUDE_PORTABILITY_H_
 #define INCLUDE_PORTABILITY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #endif
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS 1
-#endif
+//#ifndef __STDC_FORMAT_MACROS
+//#define __STDC_FORMAT_MACROS 1
+//#endif
 
 #if !(defined(_POSIX_C_SOURCE)) || (_POSIX_C_SOURCE < 200809L)
 #define _POSIX_C_SOURCE 200809L
@@ -250,6 +265,10 @@ static inline int hamming(uint64_t x) {
 #define UINT32_C(c) (c##UL)
 #endif
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* INCLUDE_PORTABILITY_H_ */
 /* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/portability.h */
 /* begin file /opt/bitmap/CRoaring-0.2.57/include/roaring/containers/perfparameters.h */
@@ -1534,6 +1553,7 @@ inline bool array_container_contains(const array_container_t *arr,
 
 }
 
+
 //* Check whether a range of values from range_start (included) to range_end (excluded) is present. */
 static inline bool array_container_contains_range(const array_container_t *arr,
                                                     uint32_t range_start, uint32_t range_end) {
@@ -7163,4 +7183,5 @@ uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it, uint32_t* b
 #endif
 
 #endif
+
 /* end file /opt/bitmap/CRoaring-0.2.57/include/roaring/roaring.h */
diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
index f90625b2a3b..8a121b92866 100644
--- a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
+++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
@@ -1,15 +1,14 @@
 #pragma once
 
-#include <IO/WriteHelpers.h>
-#include <IO/ReadHelpers.h>
-#include <boost/noncopyable.hpp>
-#include <Common/HashTable/SmallTable.h>
-#include <roaring.hh>
 #include <roaring.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <boost/noncopyable.hpp>
+#include <roaring.hh>
+#include <Common/HashTable/SmallTable.h>
 
 namespace DB
 {
-
 /**
   * For a small number of values - an array of fixed size "on the stack".
   * For large, roaring_bitmap_t is allocated.
@@ -43,7 +42,8 @@ public:
             roaring_bitmap_free(rb);
     }
 
-    void add(T value) {
+    void add(T value)
+    {
         if (isSmall())
         {
             if (small.find(value) == small.end())
@@ -61,10 +61,7 @@ public:
             roaring_bitmap_add(rb, value);
     }
 
-    UInt64 size() const
-    {
-        return isSmall() ? small.size() : roaring_bitmap_get_cardinality( rb );
-    }
+    UInt64 size() const { return isSmall() ? small.size() : roaring_bitmap_get_cardinality(rb); }
 
     void merge(const RoaringBitmapWithSmallSet & r1)
     {
@@ -73,7 +70,7 @@ public:
             if (isSmall())
                 toLarge();
 
-            roaring_bitmap_or_inplace( rb, r1.rb );
+            roaring_bitmap_or_inplace(rb, r1.rb);
         }
         else
         {
@@ -91,8 +88,8 @@ public:
         {
             toLarge();
             UInt32 cardinality;
-            readBinary( cardinality, in );
-            db_roaring_bitmap_add_many( in, rb, cardinality );
+            readBinary(cardinality, in);
+            db_roaring_bitmap_add_many(in, rb, cardinality);
         }
         else
             small.read(in);
@@ -105,22 +102,17 @@ public:
         if (isLarge())
         {
             UInt32 cardinality = roaring_bitmap_get_cardinality(rb);
-            writePODBinary( cardinality, out );
-            db_ra_to_uint32_array( out, &rb->high_low_container );
-        } else
+            writePODBinary(cardinality, out);
+            db_ra_to_uint32_array(out, &rb->high_low_container);
+        }
+        else
             small.write(out);
     }
 
 
-    roaring_bitmap_t * getRb() const
-    {
-        return rb;
-    }
+    roaring_bitmap_t * getRb() const { return rb; }
 
-    Small& getSmall() const
-    {
-        return small;
-    }
+    Small & getSmall() const { return small; }
 
     /**
      * Get a new roaring_bitmap_t from elements of small
@@ -139,11 +131,12 @@ public:
     void rb_and(const RoaringBitmapWithSmallSet & r1)
     {
         ValueBuffer buffer;
-        if(isSmall() &&  r1.isSmall())
+        if (isSmall() && r1.isSmall())
         {
             // intersect
             for (const auto & value : this->small)
-                if (r1.small.find(value) != r1.small.end())  buffer.push_back(value);
+                if (r1.small.find(value) != r1.small.end())
+                    buffer.push_back(value);
 
             // Clear out the original values
             this->small.clear();
@@ -152,9 +145,12 @@ public:
                 this->small.insert(value);
 
             buffer.clear();
-        } else if(isSmall() &&  r1.isLarge()){
+        }
+        else if (isSmall() && r1.isLarge())
+        {
             for (const auto & value : this->small)
-                if( roaring_bitmap_contains( r1.rb, value ) )  buffer.push_back( value );
+                if (roaring_bitmap_contains(r1.rb, value))
+                    buffer.push_back(value);
 
             // Clear out the original values
             this->small.clear();
@@ -163,30 +159,32 @@ public:
                 this->small.insert(value);
 
             buffer.clear();
-        } else {
+        }
+        else
+        {
             roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
-            roaring_bitmap_and_inplace( rb, rb1 );
-            if ( r1.isSmall() ) roaring_bitmap_free(rb1);
+            roaring_bitmap_and_inplace(rb, rb1);
+            if (r1.isSmall())
+                roaring_bitmap_free(rb1);
         }
     }
 
     /**
      * Computes the union between two bitmaps.
      */
-    void rb_or(const RoaringBitmapWithSmallSet & r1)
-    {
-        this->merge( r1 );
-    }
+    void rb_or(const RoaringBitmapWithSmallSet & r1) { this->merge(r1); }
 
     /**
      * Computes the symmetric difference (xor) between two bitmaps.
      */
     void rb_xor(const RoaringBitmapWithSmallSet & r1)
     {
-        if( this->isSmall() ) toLarge();
+        if (this->isSmall())
+            toLarge();
         roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
-        roaring_bitmap_xor_inplace( rb, rb1 );
-        if ( r1.isSmall() ) roaring_bitmap_free(rb1);
+        roaring_bitmap_xor_inplace(rb, rb1);
+        if (r1.isSmall())
+            roaring_bitmap_free(rb1);
     }
 
     /**
@@ -195,11 +193,12 @@ public:
     void rb_andnot(const RoaringBitmapWithSmallSet & r1)
     {
         ValueBuffer buffer;
-        if(isSmall() &&  r1.isSmall())
+        if (isSmall() && r1.isSmall())
         {
             // subtract
             for (const auto & value : this->small)
-                if (r1.small.find(value) == r1.small.end())  buffer.push_back(value);
+                if (r1.small.find(value) == r1.small.end())
+                    buffer.push_back(value);
 
             // Clear out the original values
             this->small.clear();
@@ -208,9 +207,12 @@ public:
                 this->small.insert(value);
 
             buffer.clear();
-        } else if(isSmall() &&  r1.isLarge()){
+        }
+        else if (isSmall() && r1.isLarge())
+        {
             for (const auto & value : this->small)
-                if( !roaring_bitmap_contains( r1.rb, value ) )  buffer.push_back( value );
+                if (!roaring_bitmap_contains(r1.rb, value))
+                    buffer.push_back(value);
 
             // Clear out the original values
             this->small.clear();
@@ -219,10 +221,13 @@ public:
                 this->small.insert(value);
 
             buffer.clear();
-        } else {
+        }
+        else
+        {
             roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
-            roaring_bitmap_andnot_inplace( rb, rb1 );
-            if ( r1.isSmall() ) roaring_bitmap_free(rb1);
+            roaring_bitmap_andnot_inplace(rb, rb1);
+            if (r1.isSmall())
+                roaring_bitmap_free(rb1);
         }
     }
 
@@ -232,16 +237,24 @@ public:
     UInt64 rb_and_cardinality(const RoaringBitmapWithSmallSet & r1) const
     {
         UInt64 retSize = 0;
-        if( isSmall() &&  r1.isSmall() ){
+        if (isSmall() && r1.isSmall())
+        {
             for (const auto & value : this->small)
-                if (r1.small.find( value ) != r1.small.end()) retSize++;
-        } else if( isSmall() &&  r1.isLarge() ){
+                if (r1.small.find(value) != r1.small.end())
+                    retSize++;
+        }
+        else if (isSmall() && r1.isLarge())
+        {
             for (const auto & value : this->small)
-                if( roaring_bitmap_contains( r1.rb, value ) )   retSize++;
-        } else {
+                if (roaring_bitmap_contains(r1.rb, value))
+                    retSize++;
+        }
+        else
+        {
             roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
-            retSize = roaring_bitmap_and_cardinality( rb, rb1 );
-            if ( r1.isSmall() ) roaring_bitmap_free(rb1);
+            retSize = roaring_bitmap_and_cardinality(rb, rb1);
+            if (r1.isSmall())
+                roaring_bitmap_free(rb1);
         }
         return retSize;
     }
@@ -253,7 +266,7 @@ public:
     {
         UInt64 c1 = this->size();
         UInt64 c2 = r1.size();
-        UInt64 inter = this->rb_and_cardinality( r1 );
+        UInt64 inter = this->rb_and_cardinality(r1);
         return c1 + c2 - inter;
     }
 
@@ -264,7 +277,7 @@ public:
     {
         UInt64 c1 = this->size();
         UInt64 c2 = r1.size();
-        UInt64 inter = this->rb_and_cardinality( r1 );
+        UInt64 inter = this->rb_and_cardinality(r1);
         return c1 + c2 - 2 * inter;
     }
 
@@ -274,7 +287,7 @@ public:
     UInt64 rb_andnot_cardinality(const RoaringBitmapWithSmallSet & r1) const
     {
         UInt64 c1 = this->size();
-        UInt64 inter = this->rb_and_cardinality( r1 );
+        UInt64 inter = this->rb_and_cardinality(r1);
         return c1 - inter;
     }
 
@@ -283,10 +296,12 @@ public:
      */
     UInt8 rb_equals(const RoaringBitmapWithSmallSet & r1)
     {
-        if( this->isSmall() ) toLarge();
+        if (this->isSmall())
+            toLarge();
         roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
-        UInt8 is_true = roaring_bitmap_equals( rb, rb1 );
-        if ( r1.isSmall() ) roaring_bitmap_free(rb1);
+        UInt8 is_true = roaring_bitmap_equals(rb, rb1);
+        if (r1.isSmall())
+            roaring_bitmap_free(rb1);
         return is_true;
     }
 
@@ -295,10 +310,12 @@ public:
      */
     UInt8 rb_intersect(const RoaringBitmapWithSmallSet & r1)
     {
-        if( this->isSmall() ) toLarge();
+        if (this->isSmall())
+            toLarge();
         roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
-        UInt8 is_true = roaring_bitmap_intersect( rb, rb1 );
-        if ( r1.isSmall() ) roaring_bitmap_free(rb1);
+        UInt8 is_true = roaring_bitmap_intersect(rb, rb1);
+        if (r1.isSmall())
+            roaring_bitmap_free(rb1);
         return is_true;
     }
 
@@ -307,8 +324,9 @@ public:
      */
     void rb_remove(UInt64 offsetid)
     {
-        if( this->isSmall() ) toLarge();
-        roaring_bitmap_remove( rb, offsetid );
+        if (this->isSmall())
+            toLarge();
+        roaring_bitmap_remove(rb, offsetid);
     }
 
     /**
@@ -319,8 +337,9 @@ public:
      */
     void rb_flip(UInt64 offsetstart, UInt64 offsetend)
     {
-        if( this->isSmall() ) toLarge();
-        roaring_bitmap_flip_inplace( rb, offsetstart, offsetend );
+        if (this->isSmall())
+            toLarge();
+        roaring_bitmap_flip_inplace(rb, offsetstart, offsetend);
     }
 
     /**
@@ -328,8 +347,9 @@ public:
      */
     UInt64 rb_rank(UInt64 offsetid)
     {
-        if( this->isSmall() ) toLarge();
-        return roaring_bitmap_rank( rb, offsetid );
+        if (this->isSmall())
+            toLarge();
+        return roaring_bitmap_rank(rb, offsetid);
     }
 
     /**
@@ -346,10 +366,12 @@ public:
                 res_data.emplace_back(x);
                 count++;
             }
-        } else {
+        }
+        else
+        {
             roaring_uint32_iterator_t iterator;
             roaring_init_iterator(rb, &iterator);
-            while(iterator.has_value)
+            while (iterator.has_value)
             {
                 res_data.emplace_back(iterator.current_value);
                 roaring_advance_uint32_iterator(&iterator);
@@ -361,78 +383,80 @@ public:
 
 private:
     /// To read and write the DB Buffer directly, migrate code from CRoaring
-    void db_roaring_bitmap_add_many( DB::ReadBuffer & dbBuf, roaring_bitmap_t * r, size_t n_args ) {
-        void *container = NULL;  // hold value of last container touched
-        uint8_t typecode = 0;    // typecode of last container touched
-        uint32_t prev = 0;       // previous valued inserted
-        size_t i = 0;            // index of value
+    void db_roaring_bitmap_add_many(DB::ReadBuffer & dbBuf, roaring_bitmap_t * r, size_t n_args)
+    {
+        void * container = NULL; // hold value of last container touched
+        uint8_t typecode = 0; // typecode of last container touched
+        uint32_t prev = 0; // previous valued inserted
+        size_t i = 0; // index of value
         int containerindex = 0;
-        if (n_args == 0) return;
+        if (n_args == 0)
+            return;
         uint32_t val;
-        readBinary( val, dbBuf );
+        readBinary(val, dbBuf);
         container = containerptr_roaring_bitmap_add(r, val, &typecode, &containerindex);
         prev = val;
         i++;
-        for (; i < n_args; i++) {
-            readBinary( val, dbBuf );
-            if (((prev ^ val) >> 16) ==
-                0) {  // no need to seek the container, it is at hand
+        for (; i < n_args; i++)
+        {
+            readBinary(val, dbBuf);
+            if (((prev ^ val) >> 16) == 0)
+            { // no need to seek the container, it is at hand
                 // because we already have the container at hand, we can do the
                 // insertion
                 // automatically, bypassing the roaring_bitmap_add call
                 uint8_t newtypecode = typecode;
-                void * container2 =
-                        container_add(container, val & 0xFFFF, typecode, &newtypecode);
+                void * container2 = container_add(container, val & 0xFFFF, typecode, &newtypecode);
                 // rare instance when we need to
                 if (container2 != container)
                 {
                     // change the container type
                     container_free(container, typecode);
-                    ra_set_container_at_index(&r->high_low_container,
-                                              containerindex, container2,
-                                              newtypecode);
+                    ra_set_container_at_index(&r->high_low_container, containerindex, container2, newtypecode);
                     typecode = newtypecode;
                     container = container2;
                 }
-            } else {
-                container = containerptr_roaring_bitmap_add(r, val, &typecode,
-                                                            &containerindex);
+            }
+            else
+            {
+                container = containerptr_roaring_bitmap_add(r, val, &typecode, &containerindex);
             }
             prev = val;
         }
     }
 
-    void db_ra_to_uint32_array( DB::WriteBuffer & dbBuf, roaring_array_t * ra ) const {
+    void db_ra_to_uint32_array(DB::WriteBuffer & dbBuf, roaring_array_t * ra) const
+    {
         size_t ctr = 0;
         for (Int32 i = 0; i < ra->size; ++i)
         {
-            Int32 num_added = db_container_to_uint32_array( dbBuf, ra->containers[i], ra->typecodes[i],
-                                                             ((UInt32)ra->keys[i]) << 16);
+            Int32 num_added = db_container_to_uint32_array(dbBuf, ra->containers[i], ra->typecodes[i], ((UInt32)ra->keys[i]) << 16);
             ctr += num_added;
         }
     }
-	
-    UInt32 db_container_to_uint32_array( DB::WriteBuffer & dbBuf, const void * container, UInt8 typecode, UInt32 base) const {
+
+    UInt32 db_container_to_uint32_array(DB::WriteBuffer & dbBuf, const void * container, UInt8 typecode, UInt32 base) const
+    {
         container = container_unwrap_shared(container, &typecode);
-        switch (typecode) {
+        switch (typecode)
+        {
             case BITSET_CONTAINER_TYPE_CODE:
-                return db_bitset_container_to_uint32_array( dbBuf,
-                (const bitset_container_t *)container, base);
+                return db_bitset_container_to_uint32_array(dbBuf, (const bitset_container_t *)container, base);
             case ARRAY_CONTAINER_TYPE_CODE:
-                return db_array_container_to_uint32_array( dbBuf,
-                (const array_container_t *)container, base);
+                return db_array_container_to_uint32_array(dbBuf, (const array_container_t *)container, base);
             case RUN_CONTAINER_TYPE_CODE:
-                return db_run_container_to_uint32_array( dbBuf,
-                (const run_container_t *)container, base);
+                return db_run_container_to_uint32_array(dbBuf, (const run_container_t *)container, base);
         }
         return 0;
     }
-	
-    UInt32 db_bitset_container_to_uint32_array( DB::WriteBuffer & dbBuf, const bitset_container_t * cont, UInt32 base) const {
-        return (UInt32) db_bitset_extract_setbits( dbBuf, cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, base);
+
+    UInt32 db_bitset_container_to_uint32_array(DB::WriteBuffer & dbBuf, const bitset_container_t * cont, UInt32 base) const
+    {
+        return (UInt32)db_bitset_extract_setbits(dbBuf, cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, base);
     }
 
-    size_t db_bitset_extract_setbits( DB::WriteBuffer & dbBuf, UInt64 * bitset, size_t length, UInt32 base) const {
+    size_t db_bitset_extract_setbits(DB::WriteBuffer & dbBuf, UInt64 * bitset, size_t length, UInt32 base) const
+    {
         UInt32 outpos = 0;
         for (size_t i = 0; i < length; ++i)
         {
@@ -442,7 +466,7 @@ private:
                 UInt64 t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
                 UInt32 r = __builtin_ctzll(w); // on x64, should compile to TZCNT
                 UInt32 val = r + base;
-				writePODBinary( val, dbBuf );
+                writePODBinary(val, dbBuf);
                 outpos++;
                 w ^= t;
             }
@@ -451,18 +475,20 @@ private:
         return outpos;
     }
 
-    int db_array_container_to_uint32_array( DB::WriteBuffer & dbBuf, const array_container_t * cont, UInt32 base) const {
+    int db_array_container_to_uint32_array(DB::WriteBuffer & dbBuf, const array_container_t * cont, UInt32 base) const
+    {
         UInt32 outpos = 0;
-        for ( Int32 i = 0; i < cont->cardinality; ++i)
+        for (Int32 i = 0; i < cont->cardinality; ++i)
         {
             const UInt32 val = base + cont->array[i];
-			writePODBinary( val, dbBuf );
+            writePODBinary(val, dbBuf);
             outpos++;
         }
         return outpos;
     }
 
-    int db_run_container_to_uint32_array( DB::WriteBuffer & dbBuf, const run_container_t * cont, UInt32 base) const {
+    int db_run_container_to_uint32_array(DB::WriteBuffer & dbBuf, const run_container_t * cont, UInt32 base) const
+    {
         UInt32 outpos = 0;
         for (Int32 i = 0; i < cont->n_runs; ++i)
         {
@@ -471,20 +497,18 @@ private:
             for (Int32 j = 0; j <= le; ++j)
             {
                 UInt32 val = run_start + j;
-				writePODBinary( val, dbBuf );
+                writePODBinary(val, dbBuf);
                 outpos++;
             }
         }
         return outpos;
     }
-	
-	
 };
 
 template <typename T>
 struct AggregateFunctionGroupBitmapData
 {
-    RoaringBitmapWithSmallSet<T,32> rbs;
+    RoaringBitmapWithSmallSet<T, 32> rbs;
     static const char * name() { return "groupBitmap"; }
 };
 
diff --git a/dbms/src/Functions/FunctionsBitmap.h b/dbms/src/Functions/FunctionsBitmap.h
index 055607a9038..2fefaed9f2b 100644
--- a/dbms/src/Functions/FunctionsBitmap.h
+++ b/dbms/src/Functions/FunctionsBitmap.h
@@ -1,23 +1,21 @@
 #pragma once
 
-#include <Common/typeid_cast.h>
-#include <Columns/ColumnVector.h>
+#include <AggregateFunctions/AggregateFunctionFactory.h>
+#include <AggregateFunctions/AggregateFunctionGroupBitmapData.h>
+#include <Columns/ColumnAggregateFunction.h>
 #include <Columns/ColumnArray.h>
 #include <Columns/ColumnConst.h>
 #include <Columns/ColumnFunction.h>
+#include <Columns/ColumnVector.h>
+#include <DataTypes/DataTypeAggregateFunction.h>
 #include <DataTypes/DataTypeArray.h>
 #include <DataTypes/DataTypesNumber.h>
-#include <DataTypes/DataTypeAggregateFunction.h>
-#include <Common/typeid_cast.h>
-#include <Columns/ColumnAggregateFunction.h>
-#include <Functions/IFunction.h>
 #include <Functions/FunctionHelpers.h>
-#include <AggregateFunctions/AggregateFunctionGroupBitmapData.h>
-#include <AggregateFunctions/AggregateFunctionFactory.h>
+#include <Functions/IFunction.h>
+#include <Common/typeid_cast.h>
 
 namespace DB
 {
-
 namespace ErrorCodes
 {
     extern const int LOGICAL_ERROR;
@@ -80,13 +78,15 @@ public:
 
         auto array_type = typeid_cast<const DataTypeArray *>(arguments[0].get());
         if (!array_type)
-            throw Exception("First argument for function " + getName() + " must be an array but it has type "
-                            + arguments[0]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+            throw Exception(
+                "First argument for function " + getName() + " must be an array but it has type " + arguments[0]->getName() + ".",
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
 
         auto nested_type = array_type->getNestedType();
         DataTypes argument_types = {nested_type};
         Array params_row;
-        AggregateFunctionPtr bitmap_function = AggregateFunctionFactory::instance().get(AggregateFunctionGroupBitmapData<UInt32>::name(), argument_types, params_row);
+        AggregateFunctionPtr bitmap_function
+            = AggregateFunctionFactory::instance().get(AggregateFunctionGroupBitmapData<UInt32>::name(), argument_types, params_row);
 
         return std::make_shared<DataTypeAggregateFunction>(bitmap_function, argument_types, params_row);
     }
@@ -102,18 +102,22 @@ public:
         DataTypes argument_types = {nested_type};
 
         WhichDataType which(nested_type);
-        if      (which.isUInt8()) executeBitmapData<UInt8>(block, argument_types, arguments, result);
-        else if (which.isUInt16()) executeBitmapData<UInt16>(block, argument_types, arguments, result);
-        else if (which.isUInt32()) executeBitmapData<UInt32>(block, argument_types, arguments, result);
-        else if (which.isUInt64()) executeBitmapData<UInt64>(block, argument_types, arguments, result);
+        if (which.isUInt8())
+            executeBitmapData<UInt8>(block, argument_types, arguments, result);
+        else if (which.isUInt16())
+            executeBitmapData<UInt16>(block, argument_types, arguments, result);
+        else if (which.isUInt32())
+            executeBitmapData<UInt32>(block, argument_types, arguments, result);
+        else if (which.isUInt64())
+            executeBitmapData<UInt64>(block, argument_types, arguments, result);
         else
-            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
-                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+            throw Exception(
+                "Unexpected type " + from_type->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
     }
 
 private:
     template <typename T>
-    void executeBitmapData( Block & block, DataTypes & argument_types, const ColumnNumbers & arguments, size_t result)
+    void executeBitmapData(Block & block, DataTypes & argument_types, const ColumnNumbers & arguments, size_t result)
     {
         // input data
         const ColumnArray * array = typeid_cast<const ColumnArray *>(block.getByPosition(arguments[0]).column.get());
@@ -124,15 +128,17 @@ private:
 
         // output data
         Array params_row;
-        AggregateFunctionPtr bitmap_function = AggregateFunctionFactory::instance().get(AggregateFunctionGroupBitmapData<UInt32>::name(), argument_types, params_row);
+        AggregateFunctionPtr bitmap_function
+            = AggregateFunctionFactory::instance().get(AggregateFunctionGroupBitmapData<UInt32>::name(), argument_types, params_row);
         auto col_to = ColumnAggregateFunction::create(bitmap_function);
         col_to->reserve(offsets.size());
 
         size_t pos = 0;
-        for(size_t i = 0; i < offsets.size(); ++i)
+        for (size_t i = 0; i < offsets.size(); ++i)
         {
             col_to->insertDefault();
-            AggregateFunctionGroupBitmapData<T>& bitmap_data = *reinterpret_cast<AggregateFunctionGroupBitmapData<T> *>( col_to->getData()[i] );
+            AggregateFunctionGroupBitmapData<T> & bitmap_data
+                = *reinterpret_cast<AggregateFunctionGroupBitmapData<T> *>(col_to->getData()[i]);
             for (; pos < offsets[i]; ++pos)
             {
                 bitmap_data.rbs.add(input_data[pos]);
@@ -158,11 +164,13 @@ public:
 
     DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
     {
-        const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>( arguments[0].get() );
-        if (!aggr_type)
-            throw Exception("First argument for function " + getName() + " must be an AggregateFunction but it has type "
-                            + arguments[0]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
-        const DataTypePtr data_type = aggr_type->getArgumentsDataTypes()[0];
+        const DataTypeAggregateFunction * bitmap_type = typeid_cast<const DataTypeAggregateFunction *>(arguments[0].get());
+        if (!(bitmap_type && bitmap_type->getFunctionName() == AggregateFunctionGroupBitmapData<UInt32>::name()))
+            throw Exception(
+                "First argument for function " + getName() + " must be an bitmap but it has type " + arguments[0]->getName() + ".",
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        const DataTypePtr data_type = bitmap_type->getArgumentsDataTypes()[0];
 
         return std::make_shared<DataTypeArray>(data_type);
     }
@@ -182,32 +190,40 @@ public:
         const IDataType * from_type = block.getByPosition(arguments[0]).type.get();
         const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(from_type);
         WhichDataType which(aggr_type->getArgumentsDataTypes()[0]);
-        if      (which.isUInt8()) executeIntType<UInt8>(block, arguments, input_rows_count, res_data, res_offsets);
-        else if (which.isUInt16()) executeIntType<UInt16>(block, arguments, input_rows_count, res_data, res_offsets);
-        else if (which.isUInt32()) executeIntType<UInt32>(block, arguments, input_rows_count, res_data, res_offsets);
-        else if (which.isUInt64()) executeIntType<UInt64>(block, arguments, input_rows_count, res_data, res_offsets);
+        if (which.isUInt8())
+            executeIntType<UInt8>(block, arguments, input_rows_count, res_data, res_offsets);
+        else if (which.isUInt16())
+            executeIntType<UInt16>(block, arguments, input_rows_count, res_data, res_offsets);
+        else if (which.isUInt32())
+            executeIntType<UInt32>(block, arguments, input_rows_count, res_data, res_offsets);
+        else if (which.isUInt64())
+            executeIntType<UInt64>(block, arguments, input_rows_count, res_data, res_offsets);
         else
-            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
-                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+            throw Exception(
+                "Unexpected type " + from_type->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
 
         block.getByPosition(result).column = std::move(res_ptr);
     }
+
 private:
     using ToType = UInt64;
 
     template <typename T>
-    void executeIntType( Block & block, const ColumnNumbers & arguments, size_t input_rows_count,
-                         IColumn & res_data_col, ColumnArray::Offsets & res_offsets ) const
+    void executeIntType(
+        Block & block, const ColumnNumbers & arguments, size_t input_rows_count, IColumn & res_data_col, ColumnArray::Offsets & res_offsets)
+        const
     {
-        const ColumnAggregateFunction * column = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[0]).column.get());
+        const ColumnAggregateFunction * column
+            = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[0]).column.get());
 
         PaddedPODArray<T> & res_data = typeid_cast<ColumnVector<T> &>(res_data_col).getData();
         ColumnArray::Offset res_offset = 0;
 
-        for(size_t i = 0; i < input_rows_count; ++i)
+        for (size_t i = 0; i < input_rows_count; ++i)
         {
-            const AggregateFunctionGroupBitmapData<T>& bd1 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>( column->getData()[i] );
-            UInt64 count = bd1.rbs.rb_to_array( res_data );
+            const AggregateFunctionGroupBitmapData<T> & bd1
+                = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(column->getData()[i]);
+            UInt64 count = bd1.rbs.rb_to_array(res_data);
             res_offset += count;
             res_offsets.emplace_back(res_offset);
         }
@@ -228,8 +244,15 @@ public:
 
     size_t getNumberOfArguments() const override { return 1; }
 
-    DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments */) const override
-    { return std::make_shared<DataTypeNumber<ToType>>(); }
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        auto bitmap_type = typeid_cast<const DataTypeAggregateFunction *>(arguments[0].get());
+        if (!(bitmap_type && bitmap_type->getFunctionName() == AggregateFunctionGroupBitmapData<UInt32>::name()))
+            throw Exception(
+                "First argument for function " + getName() + " must be an bitmap but it has type " + arguments[0]->getName() + ".",
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        return std::make_shared<DataTypeNumber<ToType>>();
+    }
 
     bool useDefaultImplementationForConstants() const override { return true; }
 
@@ -241,26 +264,34 @@ public:
 
         const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(from_type);
         WhichDataType which(aggr_type->getArgumentsDataTypes()[0]);
-        if      (which.isUInt8()) executeIntType<UInt8>(block, arguments, input_rows_count, vec_to);
-        else if (which.isUInt16()) executeIntType<UInt16>(block, arguments, input_rows_count, vec_to);
-        else if (which.isUInt32()) executeIntType<UInt32>(block, arguments, input_rows_count, vec_to);
-        else if (which.isUInt64()) executeIntType<UInt64>(block, arguments, input_rows_count, vec_to);
+        if (which.isUInt8())
+            executeIntType<UInt8>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt16())
+            executeIntType<UInt16>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt32())
+            executeIntType<UInt32>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt64())
+            executeIntType<UInt64>(block, arguments, input_rows_count, vec_to);
         else
-            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
-                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+            throw Exception(
+                "Unexpected type " + from_type->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
 
         block.getByPosition(result).column = std::move(col_to);
     }
+
 private:
     using ToType = UInt64;
 
     template <typename T>
-    void executeIntType( Block & block, const ColumnNumbers & arguments, size_t input_rows_count, typename ColumnVector<ToType>::Container & vec_to)
+    void executeIntType(
+        Block & block, const ColumnNumbers & arguments, size_t input_rows_count, typename ColumnVector<ToType>::Container & vec_to)
     {
-        const ColumnAggregateFunction * column = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[0]).column.get());
-        for(size_t i = 0; i < input_rows_count; ++i)
+        const ColumnAggregateFunction * column
+            = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[0]).column.get());
+        for (size_t i = 0; i < input_rows_count; ++i)
         {
-            const AggregateFunctionGroupBitmapData<T>& bd1 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>( column->getData()[i] );
+            const AggregateFunctionGroupBitmapData<T> & bd1
+                = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(column->getData()[i]);
             vec_to[i] = bd1.rbs.size();
         }
     }
@@ -270,10 +301,10 @@ template <typename T>
 struct BitmapAndCardinalityImpl
 {
     using ReturnType = UInt64;
-    static UInt64 apply( const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2 )
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2)
     {
         // roaring_bitmap_and_cardinality( rb1, rb2 );
-        return bd1.rbs.rb_and_cardinality( bd2.rbs );
+        return bd1.rbs.rb_and_cardinality(bd2.rbs);
     }
 };
 
@@ -282,10 +313,10 @@ template <typename T>
 struct BitmapOrCardinalityImpl
 {
     using ReturnType = UInt64;
-    static UInt64 apply( const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2 )
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2)
     {
         // return roaring_bitmap_or_cardinality( rb1, rb2 );
-        return bd1.rbs.rb_or_cardinality( bd2.rbs );
+        return bd1.rbs.rb_or_cardinality(bd2.rbs);
     }
 };
 
@@ -293,10 +324,10 @@ template <typename T>
 struct BitmapXorCardinalityImpl
 {
     using ReturnType = UInt64;
-    static UInt64 apply( const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2 )
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2)
     {
         // return roaring_bitmap_xor_cardinality( rb1, rb2 );
-        return bd1.rbs.rb_xor_cardinality( bd2.rbs );
+        return bd1.rbs.rb_xor_cardinality(bd2.rbs);
     }
 };
 
@@ -304,10 +335,10 @@ template <typename T>
 struct BitmapAndnotCardinalityImpl
 {
     using ReturnType = UInt64;
-    static UInt64 apply( const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2 )
+    static UInt64 apply(const AggregateFunctionGroupBitmapData<T> & bd1, const AggregateFunctionGroupBitmapData<T> & bd2)
     {
         // roaring_bitmap_andnot_cardinality( rb1, rb2 );
-        return bd1.rbs.rb_andnot_cardinality( bd2.rbs );
+        return bd1.rbs.rb_andnot_cardinality(bd2.rbs);
     }
 };
 
@@ -316,7 +347,7 @@ class FunctionBitmapCardinality : public IFunction
 {
 public:
     static constexpr auto name = Name::name;
-	
+
     static FunctionPtr create(const Context &) { return std::make_shared<FunctionBitmapCardinality>(); }
 
     String getName() const override { return name; }
@@ -325,8 +356,21 @@ public:
 
     size_t getNumberOfArguments() const override { return 2; }
 
-    DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments */) const override
-    { return std::make_shared<DataTypeNumber<ToType>>(); }
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        auto bitmap_type0 = typeid_cast<const DataTypeAggregateFunction *>(arguments[0].get());
+        if (!(bitmap_type0 && bitmap_type0->getFunctionName() == AggregateFunctionGroupBitmapData<UInt32>::name()))
+            throw Exception(
+                "First argument for function " + getName() + " must be an bitmap but it has type " + arguments[0]->getName() + ".",
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        auto bitmap_type1 = typeid_cast<const DataTypeAggregateFunction *>(arguments[1].get());
+        if (!(bitmap_type1 && bitmap_type1->getFunctionName() == AggregateFunctionGroupBitmapData<UInt32>::name()))
+            throw Exception(
+                "Second argument for function " + getName() + " must be an bitmap but it has type " + arguments[1]->getName() + ".",
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        return std::make_shared<DataTypeNumber<ToType>>();
+    }
 
     bool useDefaultImplementationForConstants() const override { return true; }
 
@@ -338,31 +382,39 @@ public:
 
         const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(from_type);
         WhichDataType which(aggr_type->getArgumentsDataTypes()[0]);
-        if      (which.isUInt8()) executeIntType<UInt8>(block, arguments, input_rows_count, vec_to);
-        else if (which.isUInt16()) executeIntType<UInt16>(block, arguments, input_rows_count, vec_to);
-        else if (which.isUInt32()) executeIntType<UInt32>(block, arguments, input_rows_count, vec_to);
-        else if (which.isUInt64()) executeIntType<UInt64>(block, arguments, input_rows_count, vec_to);
+        if (which.isUInt8())
+            executeIntType<UInt8>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt16())
+            executeIntType<UInt16>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt32())
+            executeIntType<UInt32>(block, arguments, input_rows_count, vec_to);
+        else if (which.isUInt64())
+            executeIntType<UInt64>(block, arguments, input_rows_count, vec_to);
         else
-            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
-                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+            throw Exception(
+                "Unexpected type " + from_type->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
 
         block.getByPosition(result).column = std::move(col_to);
     }
+
 private:
     using ToType = UInt64;
 
     template <typename T>
-    void executeIntType( Block & block, const ColumnNumbers & arguments, size_t input_rows_count, typename ColumnVector<ToType>::Container & vec_to)
+    void executeIntType(
+        Block & block, const ColumnNumbers & arguments, size_t input_rows_count, typename ColumnVector<ToType>::Container & vec_to)
     {
         const ColumnAggregateFunction * columns[2];
         for (size_t i = 0; i < 2; ++i)
             columns[i] = typeid_cast<const ColumnAggregateFunction *>(block.getByPosition(arguments[i]).column.get());
 
-        for(size_t i = 0; i < input_rows_count; ++i)
+        for (size_t i = 0; i < input_rows_count; ++i)
         {
-            const AggregateFunctionGroupBitmapData<T>& bd1 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>( columns[0]->getData()[i] );
-            const AggregateFunctionGroupBitmapData<T>& bd2 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>( columns[1]->getData()[i] );
-            vec_to[i] = Impl<T>::apply( bd1, bd2 );
+            const AggregateFunctionGroupBitmapData<T> & bd1
+                = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(columns[0]->getData()[i]);
+            const AggregateFunctionGroupBitmapData<T> & bd2
+                = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(columns[1]->getData()[i]);
+            vec_to[i] = Impl<T>::apply(bd1, bd2);
         }
     }
 };
@@ -370,36 +422,36 @@ private:
 template <typename T>
 struct BitmapAndImpl
 {
-    static void apply( AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2 )
+    static void apply(AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2)
     {
-        toBd.rbs.rb_and( bd2.rbs );
+        toBd.rbs.rb_and(bd2.rbs);
     }
 };
 
 template <typename T>
 struct BitmapOrImpl
 {
-    static void apply( AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2 )
+    static void apply(AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2)
     {
-        toBd.rbs.rb_or( bd2.rbs );
+        toBd.rbs.rb_or(bd2.rbs);
     }
 };
 
 template <typename T>
 struct BitmapXorImpl
 {
-    static void apply( AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2 )
+    static void apply(AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2)
     {
-        toBd.rbs.rb_xor( bd2.rbs );
+        toBd.rbs.rb_xor(bd2.rbs);
     }
 };
 
 template <typename T>
 struct BitmapAndnotImpl
 {
-    static void apply( AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2 )
+    static void apply(AggregateFunctionGroupBitmapData<T> & toBd, const AggregateFunctionGroupBitmapData<T> & bd2)
     {
-        toBd.rbs.rb_andnot( bd2.rbs );
+        toBd.rbs.rb_andnot(bd2.rbs);
     }
 };
 
@@ -419,10 +471,17 @@ public:
 
     DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
     {
-        const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>( arguments[0].get() );
-        if (!aggr_type)
-            throw Exception("First argument for function " + getName() + " must be an AggregateFunction but it has type "
-                            + arguments[0]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+        auto bitmap_type0 = typeid_cast<const DataTypeAggregateFunction *>(arguments[0].get());
+        if (!(bitmap_type0 && bitmap_type0->getFunctionName() == AggregateFunctionGroupBitmapData<UInt32>::name()))
+            throw Exception(
+                "First argument for function " + getName() + " must be an bitmap but it has type " + arguments[0]->getName() + ".",
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        auto bitmap_type1 = typeid_cast<const DataTypeAggregateFunction *>(arguments[1].get());
+        if (!(bitmap_type1 && bitmap_type1->getFunctionName() == AggregateFunctionGroupBitmapData<UInt32>::name()))
+            throw Exception(
+                "Second argument for function " + getName() + " must be an bitmap but it has type " + arguments[1]->getName() + ".",
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
         return arguments[0];
     }
 
@@ -433,18 +492,22 @@ public:
         const IDataType * from_type = block.getByPosition(arguments[0]).type.get();
         const DataTypeAggregateFunction * aggr_type = typeid_cast<const DataTypeAggregateFunction *>(from_type);
         WhichDataType which(aggr_type->getArgumentsDataTypes()[0]);
-        if      (which.isUInt8()) executeBitmapData<UInt8>(block, arguments, result, input_rows_count);
-        else if (which.isUInt16()) executeBitmapData<UInt16>(block, arguments, result, input_rows_count);
-        else if (which.isUInt32()) executeBitmapData<UInt32>(block, arguments, result, input_rows_count);
-        else if (which.isUInt64()) executeBitmapData<UInt64>(block, arguments, result, input_rows_count);
+        if (which.isUInt8())
+            executeBitmapData<UInt8>(block, arguments, result, input_rows_count);
+        else if (which.isUInt16())
+            executeBitmapData<UInt16>(block, arguments, result, input_rows_count);
+        else if (which.isUInt32())
+            executeBitmapData<UInt32>(block, arguments, result, input_rows_count);
+        else if (which.isUInt64())
+            executeBitmapData<UInt64>(block, arguments, result, input_rows_count);
         else
-            throw Exception("Unexpected type " + from_type->getName() + " of argument of function " + getName(),
-                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+            throw Exception(
+                "Unexpected type " + from_type->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
     }
 
 private:
     template <typename T>
-    void executeBitmapData( Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count)
+    void executeBitmapData(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count)
     {
         const ColumnAggregateFunction * columns[2];
         for (size_t i = 0; i < 2; ++i)
@@ -454,46 +517,77 @@ private:
 
         col_to->reserve(input_rows_count);
 
-        for(size_t i = 0; i < input_rows_count; ++i)
+        for (size_t i = 0; i < input_rows_count; ++i)
         {
             col_to->insertFrom(columns[0]->getData()[i]);
-            AggregateFunctionGroupBitmapData<T>& toBd = *reinterpret_cast<AggregateFunctionGroupBitmapData<T> *>( col_to->getData()[i] );
-            const AggregateFunctionGroupBitmapData<T>& bd2 = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>( columns[1]->getData()[i] );
-            Impl<T>::apply( toBd, bd2 );
+            AggregateFunctionGroupBitmapData<T> & toBd = *reinterpret_cast<AggregateFunctionGroupBitmapData<T> *>(col_to->getData()[i]);
+            const AggregateFunctionGroupBitmapData<T> & bd2
+                = *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(columns[1]->getData()[i]);
+            Impl<T>::apply(toBd, bd2);
         }
         block.getByPosition(result).column = std::move(col_to);
     }
 };
 
-struct NameBitmapBuild   { static constexpr auto name = "bitmapBuild"; };
+struct NameBitmapBuild
+{
+    static constexpr auto name = "bitmapBuild";
+};
 using FunctionBitmapBuild = FunctionBitmapBuildImpl<NameBitmapBuild>;
 
-struct NameBitmapToArray   { static constexpr auto name = "bitmapToArray"; };
+struct NameBitmapToArray
+{
+    static constexpr auto name = "bitmapToArray";
+};
 using FunctionBitmapToArray = FunctionBitmapToArrayImpl<NameBitmapToArray>;
 
-struct NameBitmapCardinality         { static constexpr auto name = "bitmapCardinality"; };
-struct NameBitmapAndCardinality      { static constexpr auto name = "bitmapAndCardinality"; };
-struct NameBitmapOrCardinality       { static constexpr auto name = "bitmapOrCardinality"; };
-struct NameBitmapXorCardinality      { static constexpr auto name = "bitmapXorCardinality"; };
-struct NameBitmapAndnotCardinality   { static constexpr auto name = "bitmapAndnotCardinality"; };
+struct NameBitmapCardinality
+{
+    static constexpr auto name = "bitmapCardinality";
+};
+struct NameBitmapAndCardinality
+{
+    static constexpr auto name = "bitmapAndCardinality";
+};
+struct NameBitmapOrCardinality
+{
+    static constexpr auto name = "bitmapOrCardinality";
+};
+struct NameBitmapXorCardinality
+{
+    static constexpr auto name = "bitmapXorCardinality";
+};
+struct NameBitmapAndnotCardinality
+{
+    static constexpr auto name = "bitmapAndnotCardinality";
+};
 
-using FunctionBitmapSelfCardinality   = FunctionBitmapSelfCardinalityImpl<NameBitmapCardinality>;
-using FunctionBitmapAndCardinality    = FunctionBitmapCardinality<BitmapAndCardinalityImpl, NameBitmapAndCardinality>;
-using FunctionBitmapOrCardinality     = FunctionBitmapCardinality<BitmapOrCardinalityImpl, NameBitmapOrCardinality>;
-using FunctionBitmapXorCardinality    = FunctionBitmapCardinality<BitmapXorCardinalityImpl, NameBitmapXorCardinality>;
+using FunctionBitmapSelfCardinality = FunctionBitmapSelfCardinalityImpl<NameBitmapCardinality>;
+using FunctionBitmapAndCardinality = FunctionBitmapCardinality<BitmapAndCardinalityImpl, NameBitmapAndCardinality>;
+using FunctionBitmapOrCardinality = FunctionBitmapCardinality<BitmapOrCardinalityImpl, NameBitmapOrCardinality>;
+using FunctionBitmapXorCardinality = FunctionBitmapCardinality<BitmapXorCardinalityImpl, NameBitmapXorCardinality>;
 using FunctionBitmapAndnotCardinality = FunctionBitmapCardinality<BitmapAndnotCardinalityImpl, NameBitmapAndnotCardinality>;
 
-struct NameBitmapAnd     { static constexpr auto name = "bitmapAnd"; };
-struct NameBitmapOr      { static constexpr auto name = "bitmapOr"; };
-struct NameBitmapXor     { static constexpr auto name = "bitmapXor"; };
-struct NameBitmapAndnot  { static constexpr auto name = "bitmapAndnot"; };
-using FunctionBitmapAnd    = FunctionBitmap<BitmapAndImpl, NameBitmapAnd>;
-using FunctionBitmapOr     = FunctionBitmap<BitmapOrImpl, NameBitmapOr>;
-using FunctionBitmapXor    = FunctionBitmap<BitmapXorImpl, NameBitmapXor>;
+struct NameBitmapAnd
+{
+    static constexpr auto name = "bitmapAnd";
+};
+struct NameBitmapOr
+{
+    static constexpr auto name = "bitmapOr";
+};
+struct NameBitmapXor
+{
+    static constexpr auto name = "bitmapXor";
+};
+struct NameBitmapAndnot
+{
+    static constexpr auto name = "bitmapAndnot";
+};
+using FunctionBitmapAnd = FunctionBitmap<BitmapAndImpl, NameBitmapAnd>;
+using FunctionBitmapOr = FunctionBitmap<BitmapOrImpl, NameBitmapOr>;
+using FunctionBitmapXor = FunctionBitmap<BitmapXorImpl, NameBitmapXor>;
 using FunctionBitmapAndnot = FunctionBitmap<BitmapAndnotImpl, NameBitmapAndnot>;
 
 
-
 }
-
-

From b499e2998d25debb437b254aeef38144856bf983 Mon Sep 17 00:00:00 2001
From: Andy Yang <yangzhaohui168@gmail.com>
Date: Sat, 9 Feb 2019 17:27:22 +0800
Subject: [PATCH 03/69] change test id from 00834 to 00829

---
 ..._bitmap_function.reference => 00829_bitmap_function.reference} | 0
 .../{00834_bitmap_function.sql => 00829_bitmap_function.sql}      | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename dbms/tests/queries/0_stateless/{00834_bitmap_function.reference => 00829_bitmap_function.reference} (100%)
 rename dbms/tests/queries/0_stateless/{00834_bitmap_function.sql => 00829_bitmap_function.sql} (100%)

diff --git a/dbms/tests/queries/0_stateless/00834_bitmap_function.reference b/dbms/tests/queries/0_stateless/00829_bitmap_function.reference
similarity index 100%
rename from dbms/tests/queries/0_stateless/00834_bitmap_function.reference
rename to dbms/tests/queries/0_stateless/00829_bitmap_function.reference
diff --git a/dbms/tests/queries/0_stateless/00834_bitmap_function.sql b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
similarity index 100%
rename from dbms/tests/queries/0_stateless/00834_bitmap_function.sql
rename to dbms/tests/queries/0_stateless/00829_bitmap_function.sql

From 897e12e6feab67f844990e67b203c47f1875a9df Mon Sep 17 00:00:00 2001
From: alexey-milovidov <milovidov@yandex-team.ru>
Date: Sun, 10 Feb 2019 01:06:39 +0300
Subject: [PATCH 04/69] Update CMakeLists.txt

---
 dbms/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
index acbe157c18b..dce2a0a1835 100644
--- a/dbms/CMakeLists.txt
+++ b/dbms/CMakeLists.txt
@@ -192,7 +192,7 @@ target_link_libraries (clickhouse_common_io
     ${RE2_LIBRARY}
     ${RE2_ST_LIBRARY}
     ${CITYHASH_LIBRARIES}
-	roaring
+    roaring
         PRIVATE
     ${ZLIB_LIBRARIES}
     ${EXECINFO_LIBRARY}

From ae4a842a1e435ff2f6d2a9623504aba8acb107c5 Mon Sep 17 00:00:00 2001
From: Andy Yang <yangzhaohui168@gmail.com>
Date: Thu, 14 Feb 2019 14:58:29 +0800
Subject: [PATCH 05/69] modify test cases

---
 dbms/tests/queries/0_stateless/00829_bitmap_function.sql | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
index e76a447db25..dd2e7eae12e 100644
--- a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
+++ b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
@@ -10,7 +10,6 @@ SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]));
 SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]));
 
 DROP TABLE IF EXISTS test.bitmap_test;
-
 CREATE TABLE test.bitmap_test(pickup_date Date, city_id UInt32, uid UInt32)ENGINE = Memory;
 INSERT INTO test.bitmap_test SELECT '2019-01-01', 1, number FROM numbers(1,50);
 INSERT INTO test.bitmap_test SELECT '2019-01-02', 1, number FROM numbers(11,60);

From 4d42a297ae78f28e42324161bcd86af0a3ff845d Mon Sep 17 00:00:00 2001
From: Andy Yang <yangzhaohui168@gmail.com>
Date: Thu, 14 Feb 2019 19:51:13 +0800
Subject: [PATCH 06/69] Adjust to the latest IAggregateFunctionDataHelper

---
 dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.cpp | 2 +-
 dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.h   | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.cpp b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.cpp
index 269476ce9ad..22fd583e6d3 100644
--- a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.cpp
+++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.cpp
@@ -21,7 +21,7 @@ AggregateFunctionPtr createAggregateFunctionBitmap(const std::string & name, con
             + " is illegal, because it cannot be used in Bitmap operations",
             ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
 
-    AggregateFunctionPtr res(createWithUnsignedIntegerType<AggregateFunctionBitmap, Data>(*argument_types[0]));
+    AggregateFunctionPtr res(createWithUnsignedIntegerType<AggregateFunctionBitmap, Data>(*argument_types[0], argument_types[0]));
 
     if (!res)
         throw Exception("Illegal type " + argument_types[0]->getName() + " of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.h
index 38b63b713e7..ccc851efbec 100644
--- a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.h
+++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmap.h
@@ -14,6 +14,9 @@ template <typename T, typename Data>
 class AggregateFunctionBitmap final : public IAggregateFunctionDataHelper<Data, AggregateFunctionBitmap<T, Data>>
 {
 public:
+    AggregateFunctionBitmap(const DataTypePtr & type)
+            : IAggregateFunctionDataHelper<Data, AggregateFunctionBitmap<T, Data>>({type}, {}) {}
+
     String getName() const override { return Data::name(); }
 
     DataTypePtr getReturnType() const override

From 7978b508b4cfb56158ee3ef37e7b45e9a2b0b0a0 Mon Sep 17 00:00:00 2001
From: Andy Yang <yangzhaohui168@gmail.com>
Date: Fri, 15 Feb 2019 15:02:08 +0800
Subject: [PATCH 07/69] check 00281_compile_sizeof_packed test case error

---
 dbms/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
index 6bbd8ba408a..1e9159c7ccf 100644
--- a/dbms/CMakeLists.txt
+++ b/dbms/CMakeLists.txt
@@ -191,7 +191,6 @@ target_link_libraries (clickhouse_common_io
     ${RE2_LIBRARY}
     ${RE2_ST_LIBRARY}
     ${CITYHASH_LIBRARIES}
-    roaring
         PRIVATE
     ${ZLIB_LIBRARIES}
     ${EXECINFO_LIBRARY}
@@ -204,6 +203,8 @@ target_link_libraries (clickhouse_common_io
     Threads::Threads
         PRIVATE
     ${CMAKE_DL_LIBS}
+        PUBLIC
+    roaring	
 )
 
 target_include_directories(clickhouse_common_io SYSTEM BEFORE PUBLIC ${PDQSORT_INCLUDE_DIR})

From 956b7a07437e2dd2f703569115a4d33a43f37b2c Mon Sep 17 00:00:00 2001
From: ogorbacheva <ogorbacheva@yandex-team.ru>
Date: Fri, 15 Feb 2019 12:56:34 +0300
Subject: [PATCH 08/69] Doc fix: SAMPLE n

---
 docs/en/query_language/select.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md
index 01240ba8302..db18ada315b 100644
--- a/docs/en/query_language/select.md
+++ b/docs/en/query_language/select.md
@@ -51,7 +51,7 @@ The SAMPLE clause allows for approximated query processing. Approximated query p
 `SAMPLE` has the `SAMPLE k`, where `k` is a decimal number from 0 to 1, or `SAMPLE n`, where 'n' is a sufficiently large integer.
 
 In the first case, the query will be executed on 'k' percent of data. For example, `SAMPLE 0.1` runs the query on 10% of data.
-In the second case, the query will be executed on a sample of no more than 'n' rows. For example, `SAMPLE 10000000` runs the query on a maximum of 10,000,000 rows.
+In the second case, the query will be executed on a sample of at least 'n' rows. For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows (but not significantly more than this).
 
 Example:
 
@@ -74,7 +74,7 @@ ORDER BY PageViews DESC LIMIT 1000
 
 In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value 'count()' is manually multiplied by 10.
 
-When using something like `SAMPLE 10000000`, there isn't any information about which relative percent of data was processed or what the aggregate functions should be multiplied by, so this method of writing is not always appropriate to the situation.
+When using something like `SAMPLE 10000000`, there isn't any information about which relative percent of data was processed or what the aggregate functions should be multiplied by, so this method of writing is not always appropriate to the situation. 
 
 A sample with a relative coefficient is "consistent": if we look at all possible data that could be in the table, a sample (when using a single sampling expression specified during table creation) with the same coefficient always selects the same subset of possible data. In other words, a sample from different tables on different servers at different times is made the same way.
 

From c31785c08227c13cc855d872abb7a94907ada26f Mon Sep 17 00:00:00 2001
From: ogorbacheva <ogorbacheva@yandex-team.ru>
Date: Tue, 19 Feb 2019 10:02:32 +0300
Subject: [PATCH 09/69] Doc fix: change logic of SAMPLE n

---
 docs/en/query_language/select.md | 36 +++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md
index db18ada315b..2c5d3e75f93 100644
--- a/docs/en/query_language/select.md
+++ b/docs/en/query_language/select.md
@@ -48,10 +48,10 @@ The FINAL modifier can be used only for a SELECT from a CollapsingMergeTree tabl
 
 The SAMPLE clause allows for approximated query processing. Approximated query processing is only supported by the tables in the `MergeTree` family, and only if the sampling expression was specified during table creation (see the section [MergeTree engine](../operations/table_engines/mergetree.md)).
 
-`SAMPLE` has the `SAMPLE k`, where `k` is a decimal number from 0 to 1, or `SAMPLE n`, where 'n' is a sufficiently large integer.
+`SAMPLE` has the `SAMPLE k`, where `k` is a decimal number from 0 to 1, or `SAMPLE n`, where `n` is a sufficiently large integer.
 
-In the first case, the query will be executed on 'k' percent of data. For example, `SAMPLE 0.1` runs the query on 10% of data.
-In the second case, the query will be executed on a sample of at least 'n' rows. For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows (but not significantly more than this).
+In the first case, the query will be executed on `k` percent of data. For example, `SAMPLE 0.1` runs the query on 10% of data.
+In the second case, the query will be executed on a sample of at least `n` rows. For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows (but not significantly more than this).
 
 Example:
 
@@ -74,8 +74,30 @@ ORDER BY PageViews DESC LIMIT 1000
 
 In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value 'count()' is manually multiplied by 10.
 
-When using something like `SAMPLE 10000000`, there isn't any information about which relative percent of data was processed or what the aggregate functions should be multiplied by, so this method of writing is not always appropriate to the situation. 
+When using something like `SAMPLE 10000000`, you do not know which relative percent of data was processed and what the aggregate functions should be multiplied by. In this case, you can use the `_sample_factor` virtual column as a relative coefficient. For example:
 
+``` sql
+SELECT sum(Duration * _sample_offset)
+FROM visits
+SAMPLE 10000000
+```   
+
+If you need to get the approximate count of rows in a `SELECT .. SAMPLE n` query, get the sum() of `_sample_offset` column instead of counting `count(column * _sample_column)` value. For example:
+
+``` sql
+SELECT sum(_sample_offset)
+FROM visits
+SAMPLE 10000000
+```  
+
+Note that to get the average value in a `SELECT .. SAMPLE n` query, you do not need to use `_sample_factor` column:
+
+``` sql
+SELECT avg(Duration)
+FROM visits
+SAMPLE 10000000
+```  
+ 
 A sample with a relative coefficient is "consistent": if we look at all possible data that could be in the table, a sample (when using a single sampling expression specified during table creation) with the same coefficient always selects the same subset of possible data. In other words, a sample from different tables on different servers at different times is made the same way.
 
 For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This allows using the sample in subqueries in the IN clause, as well as for manually correlating results of different queries with samples.
@@ -720,10 +742,10 @@ DISTINCT is not supported if SELECT has at least one array column.
 
 ### LIMIT Clause
 
-LIMIT m allows you to select the first 'm' rows from the result.
-LIMIT n, m allows you to select the first 'm' rows from the result after skipping the first 'n' rows.
+`LIMIT m` allows you to select the first `m` rows from the result.
+`LIMIT n, m` allows you to select the first `m` rows from the result after skipping the first `n` rows.
 
-'n' and 'm' must be non-negative integers.
+`n` and `m` must be non-negative integers.
 
 If there isn't an ORDER BY clause that explicitly sorts results, the result may be arbitrary and nondeterministic.
 

From 8e47b15d8ab358505e404fcfeb922dcb51ca9949 Mon Sep 17 00:00:00 2001
From: ogorbacheva <ogorbacheva@yandex-team.ru>
Date: Tue, 19 Feb 2019 16:29:49 +0300
Subject: [PATCH 10/69] Fix SAMPLE desc

---
 docs/en/query_language/select.md | 33 +++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md
index 2c5d3e75f93..a0925b02f8d 100644
--- a/docs/en/query_language/select.md
+++ b/docs/en/query_language/select.md
@@ -48,12 +48,21 @@ The FINAL modifier can be used only for a SELECT from a CollapsingMergeTree tabl
 
 The SAMPLE clause allows for approximated query processing. Approximated query processing is only supported by the tables in the `MergeTree` family, and only if the sampling expression was specified during table creation (see the section [MergeTree engine](../operations/table_engines/mergetree.md)).
 
-`SAMPLE` has the `SAMPLE k`, where `k` is a decimal number from 0 to 1, or `SAMPLE n`, where `n` is a sufficiently large integer.
+The features of data sampling are listed below:
 
-In the first case, the query will be executed on `k` percent of data. For example, `SAMPLE 0.1` runs the query on 10% of data.
-In the second case, the query will be executed on a sample of at least `n` rows. For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows (but not significantly more than this).
+- Data sampling is a deterministic mechanism. The result of the same `SELECT .. SAMPLE` query is always the same.
+- Sampling works consistently for different tables. For tables with a single sampling key, a sample with the same coefficient always selects the same subset of possible data. For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This allows using the sample in subqueries in the IN clause, as well as for manually correlating results of different queries with samples.
+- Sampling allows reading less data from a disk. Note that for this you must specify the sampling key correctly. For more details see the section [Creating a MergeTree Table](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table).
 
-Example:
+The `SAMPLE` clause can be specified in several ways:
+
+- `SAMPLE k`, where `k` is a decimal number from 0 to 1. The query is executed on `k` percent of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Details](#select-sample-k)
+- `SAMPLE n`, where `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Details](#select-sample-n)
+- `SAMPLE k OFFSET m` where `k` and `m` are numbers from 0 to 1. Sample `k` is offset by `m` percent of data. [Details](#select-sample-offset)
+
+#### SAMPLE K {#select-sample-k}
+
+In a `SAMPLE k` clause, `k` is a percent amount of data that the sample is taken from. The example is shown below:
 
 ``` sql
 SELECT
@@ -74,7 +83,13 @@ ORDER BY PageViews DESC LIMIT 1000
 
 In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value 'count()' is manually multiplied by 10.
 
-When using something like `SAMPLE 10000000`, you do not know which relative percent of data was processed and what the aggregate functions should be multiplied by. In this case, you can use the `_sample_factor` virtual column as a relative coefficient. For example:
+#### SAMPLE N {#select-sample-n}
+
+When using the `SAMPLE n` clause, the relative coefficient is calculated dynamically. Since you do not know which relative percent of data was processed, you do not know the coefficient the aggregate functions should be multiplied by (for example, you do not know if the `SAMPLE 1000000` was taken from a set of 10,000,000 rows or from a set of 1,000,000,000 rows).
+
+ClickHouse stores relative coefficients in the `_sample_factor` virtual column. This column is created automatically when you create a table with the specified sampling key. For different rows, the relative coefficient can differ. 
+
+The example of the `_sample_factor` column usage is shown below:
 
 ``` sql
 SELECT sum(Duration * _sample_offset)
@@ -90,7 +105,7 @@ FROM visits
 SAMPLE 10000000
 ```  
 
-Note that to get the average value in a `SELECT .. SAMPLE n` query, you do not need to use `_sample_factor` column:
+Note that to calculate the average in a `SELECT .. SAMPLE n` query, you do not need to use `_sample_factor` column:
 
 ``` sql
 SELECT avg(Duration)
@@ -98,11 +113,7 @@ FROM visits
 SAMPLE 10000000
 ```  
  
-A sample with a relative coefficient is "consistent": if we look at all possible data that could be in the table, a sample (when using a single sampling expression specified during table creation) with the same coefficient always selects the same subset of possible data. In other words, a sample from different tables on different servers at different times is made the same way.
-
-For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This allows using the sample in subqueries in the IN clause, as well as for manually correlating results of different queries with samples.
-
-**SAMPLE OFFSET**
+#### SAMPLE OFFSET {#select-sample-offset}
 
 You can specify the `SAMPLE k OFFSET n` clause, where `k` and `n` are numbers from 0 to 1. Examples are shown below.
 

From a8377af02f2502106d9c2fc130fe09b01bd2ca41 Mon Sep 17 00:00:00 2001
From: Andy Yang <yangzhaohui168@gmail.com>
Date: Wed, 20 Feb 2019 10:12:40 +0800
Subject: [PATCH 11/69] modify bitmap test sql

---
 dbms/tests/queries/0_stateless/00829_bitmap_function.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
index dd2e7eae12e..bc774f0311d 100644
--- a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
+++ b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
@@ -21,7 +21,7 @@ SELECT pickup_date, groupBitmap( uid ) AS user_num, bitmapToArray(groupBitmapSta
 SELECT
     bitmapCardinality(day_today) AS today_users,
     bitmapCardinality(day_before) AS before_users,
-    bitmapOrCardinality(day_today, day_before)ll_users,
+    bitmapOrCardinality(day_today, day_before) AS all_users,
     bitmapAndCardinality(day_today, day_before) AS old_users,
     bitmapAndnotCardinality(day_today, day_before) AS new_users,
     bitmapXorCardinality(day_today, day_before) AS diff_users

From 58a5a9dc0cdf5d1ff563fd33b68cba36f1a99dbe Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Sat, 16 Feb 2019 18:04:55 +0100
Subject: [PATCH 12/69] Fix "only_from_localhost" example

---
 dbms/programs/server/users.d/allow_only_from_localhost.xml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dbms/programs/server/users.d/allow_only_from_localhost.xml b/dbms/programs/server/users.d/allow_only_from_localhost.xml
index aad3a696521..bce5858ad41 100644
--- a/dbms/programs/server/users.d/allow_only_from_localhost.xml
+++ b/dbms/programs/server/users.d/allow_only_from_localhost.xml
@@ -4,7 +4,6 @@
         <default>
             <networks replace="replace">
                 <ip>::1</ip>
-                <ip>0.0.0.0</ip>
                 <ip>127.0.0.1</ip>
             </networks>
         </default>

From 9918f7cd2a6fc0e67132b8124e5381ae14d772f9 Mon Sep 17 00:00:00 2001
From: ogorbacheva <ogorbacheva@yandex-team.ru>
Date: Thu, 21 Feb 2019 16:43:09 +0300
Subject: [PATCH 13/69] fixes

---
 docs/en/query_language/select.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md
index 00ef09177aa..fb6ade02ab2 100644
--- a/docs/en/query_language/select.md
+++ b/docs/en/query_language/select.md
@@ -92,15 +92,15 @@ ClickHouse stores relative coefficients in the `_sample_factor` virtual column.
 The example of the `_sample_factor` column usage is shown below:
 
 ``` sql
-SELECT sum(Duration * _sample_offset)
+SELECT sum(Duration * _sample_factor)
 FROM visits
 SAMPLE 10000000
 ```   
 
-If you need to get the approximate count of rows in a `SELECT .. SAMPLE n` query, get the sum() of `_sample_offset` column instead of counting `count(column * _sample_column)` value. For example:
+If you need to get the approximate count of rows in a `SELECT .. SAMPLE n` query, get the sum() of `_sample_factor` column instead of counting `count(column * _sample_factor)` value. For example:
 
 ``` sql
-SELECT sum(_sample_offset)
+SELECT sum(_sample_factor)
 FROM visits
 SAMPLE 10000000
 ```  

From f62d4717d7c3432adc4d98d54920c9de6a8fb01d Mon Sep 17 00:00:00 2001
From: Andy Yang <yangzhaohui168@gmail.com>
Date: Fri, 22 Feb 2019 10:24:48 +0800
Subject: [PATCH 14/69] modify bitmap test sql

---
 dbms/tests/queries/0_stateless/00829_bitmap_function.sql | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
index bc774f0311d..fc3a6b97aca 100644
--- a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
+++ b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
@@ -52,4 +52,5 @@ ALL LEFT JOIN
 )
 USING city_id;
 
+
 DROP TABLE IF EXISTS test.bitmap_test;

From f1a4330c69e0badd69f56f693679dcd89d5af821 Mon Sep 17 00:00:00 2001
From: Andy Yang <yangzhaohui168@gmail.com>
Date: Fri, 22 Feb 2019 16:14:35 +0800
Subject: [PATCH 15/69] add bitmap state test sql

---
 .../00829_bitmap_function.reference           |  2 ++
 .../0_stateless/00829_bitmap_function.sql     | 21 +++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/dbms/tests/queries/0_stateless/00829_bitmap_function.reference b/dbms/tests/queries/0_stateless/00829_bitmap_function.reference
index ea26ab0d097..cc159d2aaf8 100644
--- a/dbms/tests/queries/0_stateless/00829_bitmap_function.reference
+++ b/dbms/tests/queries/0_stateless/00829_bitmap_function.reference
@@ -13,3 +13,5 @@
 2019-01-02	60	[11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
 60	50	70	40	20	30
 60	50	70	40	20	30
+2019-01-01	50
+2019-01-02	60
diff --git a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
index fc3a6b97aca..55f920c8c55 100644
--- a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
+++ b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql
@@ -14,6 +14,7 @@ CREATE TABLE test.bitmap_test(pickup_date Date, city_id UInt32, uid UInt32)ENGIN
 INSERT INTO test.bitmap_test SELECT '2019-01-01', 1, number FROM numbers(1,50);
 INSERT INTO test.bitmap_test SELECT '2019-01-02', 1, number FROM numbers(11,60);
 
+
 SELECT groupBitmap( uid ) AS user_num FROM test.bitmap_test;
 
 SELECT pickup_date, groupBitmap( uid ) AS user_num, bitmapToArray(groupBitmapState( uid )) AS users FROM test.bitmap_test GROUP BY pickup_date;
@@ -53,4 +54,24 @@ ALL LEFT JOIN
 USING city_id;
 
 
+DROP TABLE IF EXISTS test.bitmap_state_test;
+CREATE TABLE test.bitmap_state_test
+(
+	pickup_date Date,
+	city_id UInt32,
+    uv AggregateFunction( groupBitmap, UInt32 )	
+)
+ENGINE = AggregatingMergeTree( pickup_date, ( pickup_date, city_id ), 8192);
+
+INSERT INTO test.bitmap_state_test SELECT 
+    pickup_date, 
+    city_id,
+    groupBitmapState(uid) AS uv
+FROM test.bitmap_test
+GROUP BY pickup_date, city_id;
+	
+SELECT pickup_date, groupBitmapMerge(uv) AS users from test.bitmap_state_test group by pickup_date;
+
 DROP TABLE IF EXISTS test.bitmap_test;
+DROP TABLE IF EXISTS test.bitmap_state_test;
+

From 218d9d6c63d71a2284fffa7185fde137a344e331 Mon Sep 17 00:00:00 2001
From: ogorbacheva <ogorbacheva@yandex-team.ru>
Date: Fri, 22 Feb 2019 16:28:26 +0300
Subject: [PATCH 16/69] fixes in process

---
 docs/en/query_language/select.md | 33 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md
index fb6ade02ab2..282b9e88a62 100644
--- a/docs/en/query_language/select.md
+++ b/docs/en/query_language/select.md
@@ -46,21 +46,21 @@ The FINAL modifier can be used only for a SELECT from a CollapsingMergeTree tabl
 
 ### SAMPLE Clause {#select-sample-clause}
 
-The SAMPLE clause allows for approximated query processing. Approximated query processing is only supported by the tables in the `MergeTree` family, and only if the sampling expression was specified during table creation (see the section [MergeTree engine](../operations/table_engines/mergetree.md)).
+The `SAMPLE` clause allows for approximated query processing. Approximated query processing is only supported by the tables in the `MergeTree` family, and only if the sampling expression was specified during table creation (see [MergeTree engine](../operations/table_engines/mergetree.md)).
 
 The features of data sampling are listed below:
 
 - Data sampling is a determined mechanism. The result of the same `SELECT .. SAMPLE` query is always the same.
-- Sampling works consistently for different tables. For tables with a single sampling key, a sample with the same coefficient always selects the same subset of possible data. For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This allows using the sample in subqueries in the IN clause, as well as for manually correlating results of different queries with samples.
-- Sampling allows reading fewer data from a disk. Note that for this you must specify the sampling key correctly. For more details see the section [Creating a MergeTree Table](../operations/table_engines/mergetree.md##table_engine-mergetree-creating-a-table).
+- Sampling works consistently for different tables. For tables with a single sampling key, a sample with the same coefficient always selects the same subset of possible data. For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This allows using the sample in subqueries in the `IN` clause, as well as for manually correlating results of different queries with samples.
+- Sampling allows reading fewer data from a disk. Note that for this you must specify the sampling key correctly. For more details see [Creating a MergeTree Table](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table).
 
 The `SAMPLE` clause can be specified in several ways:
 
-- `SAMPLE k`, where `k` is a decimal number from 0 to 1. The query is executed on `k` percent of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Details](#select-sample-k)
-- `SAMPLE n`, where `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Details](#select-sample-n)
-- `SAMPLE k OFFSET m` where `k` and `m` are numbers from 0 to 1. Sample `k` is offset by `m` percent of data. [Details](#select-sample-offset)
+- `SAMPLE k`, where `k` is a decimal number from 0 to 1. The query is executed on `k` percent of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k)
+- `SAMPLE n`, where `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n)
+- `SAMPLE k OFFSET m` where `k` and `m` are numbers from 0 to 1. The query is executed on a sample of `k` percent of the data. The data used for the sample is offset by `m` percent. [Read more](#select-sample-offset)
 
-#### SAMPLE K {#select-sample-k}
+#### SAMPLE k {#select-sample-k}
 
 In a `SAMPLE k` clause, `k` is a percent amount of data that the sample is taken from. The example is shown below:
 
@@ -72,24 +72,21 @@ FROM hits_distributed
 SAMPLE 0.1
 WHERE
     CounterID = 34
-    AND toDate(EventDate) >= toDate('2013-01-29')
-    AND toDate(EventDate) <= toDate('2013-02-04')
-    AND NOT DontCountHits
-    AND NOT Refresh
-    AND Title != ''
 GROUP BY Title
 ORDER BY PageViews DESC LIMIT 1000
 ```
 
 In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value 'count()' is manually multiplied by 10.
 
-#### SAMPLE N {#select-sample-n}
+#### SAMPLE n {#select-sample-n}
 
-When using the `SAMPLE n` clause, the relative coefficient is calculated dynamically. Since you do not know which relative percent of data was processed, you do not know the coefficient the aggregate functions should be multiplied by (for example, you do not know if the `SAMPLE 1000000` was taken from a set of 10,000,000 rows or from a set of 1,000,000,000 rows).
+In this case, the query is executed on a sample of at least `n` rows, where `n` is a sufficiently large integer. For example, `SAMPLE 10000000`.
 
-ClickHouse stores relative coefficients in the `_sample_factor` virtual column. This column is created automatically when you create a table with the specified sampling key. For different rows, the relative coefficient can differ. 
+Since the minimum unit for data reading is one granule (its size is set by the `index_granularity` setting), it makes sense to set a sample that is much larger than the size of the granule.
 
-The example of the `_sample_factor` column usage is shown below:
+When using the `SAMPLE n` clause, the relative coefficient is calculated dynamically. Since you do not know which relative percent of data was processed, you do not know the coefficient the aggregate functions should be multiplied by (for example, you do not know if the `SAMPLE 1000000` was taken from a set of 10,000,000 rows or from a set of 1,000,000,000 rows). In this case, use the `_sample_factor` column to get the approximate result.
+
+The `_sample_factor` is a virtual column that ClickHouse stores relative coefficients in. This column is created automatically when you create a table with the specified sampling key. The usage example is shown below:
 
 ``` sql
 SELECT sum(Duration * _sample_factor)
@@ -113,9 +110,9 @@ FROM visits
 SAMPLE 10000000
 ```  
  
-#### SAMPLE OFFSET {#select-sample-offset}
+#### SAMPLE k OFFSET m {#select-sample-offset}
 
-You can specify the `SAMPLE k OFFSET n` clause, where `k` and `n` are numbers from 0 to 1. Examples are shown below.
+You can specify the `SAMPLE k OFFSET m` clause, where `k` and `m` are numbers from 0 to 1. Examples are shown below.
 
 Example 1.
 

From 55b43d541763f0ffff2a772e258c763bb595fc04 Mon Sep 17 00:00:00 2001
From: ogorbacheva <ogorbacheva@yandex-team.ru>
Date: Mon, 25 Feb 2019 15:54:30 +0300
Subject: [PATCH 17/69] Doc fix: edit some sentences

---
 docs/en/query_language/select.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md
index 282b9e88a62..38cfb3a5e37 100644
--- a/docs/en/query_language/select.md
+++ b/docs/en/query_language/select.md
@@ -50,9 +50,9 @@ The `SAMPLE` clause allows for approximated query processing. Approximated query
 
 The features of data sampling are listed below:
 
-- Data sampling is a determined mechanism. The result of the same `SELECT .. SAMPLE` query is always the same.
+- Data sampling is a deterministic mechanism. The result of the same `SELECT .. SAMPLE` query is always the same.
 - Sampling works consistently for different tables. For tables with a single sampling key, a sample with the same coefficient always selects the same subset of possible data. For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This allows using the sample in subqueries in the `IN` clause, as well as for manually correlating results of different queries with samples.
-- Sampling allows reading fewer data from a disk. Note that for this you must specify the sampling key correctly. For more details see [Creating a MergeTree Table](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table).
+- Sampling allows reading less data from a disk. Note that for this you must specify the sampling key correctly. For more details see [Creating a MergeTree Table](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table).
 
 The `SAMPLE` clause can be specified in several ways:
 

From 88d63a0b3ad43fccb6eb429a995546eee47e6fd9 Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Sun, 17 Feb 2019 21:56:27 +0100
Subject: [PATCH 18/69] Use pair of patterns to make combined RollupRules

---
 .../GraphiteRollupSortedBlockInputStream.cpp  | 90 +++++++++++++++----
 .../GraphiteRollupSortedBlockInputStream.h    | 39 ++++++--
 .../MergeTree/registerStorageMergeTree.cpp    | 29 ++++--
 .../Storages/System/StorageSystemGraphite.cpp | 19 +++-
 4 files changed, 145 insertions(+), 32 deletions(-)

diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp
index dc30a3e7a07..6c1983568bb 100644
--- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp
+++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp
@@ -23,8 +23,11 @@ GraphiteRollupSortedBlockInputStream::GraphiteRollupSortedBlockInputStream(
 
     for (const auto & pattern : params.patterns)
     {
-        max_size_of_aggregate_state = std::max(max_size_of_aggregate_state, pattern.function->sizeOfData());
-        max_alignment_of_aggregate_state = std::max(max_alignment_of_aggregate_state, pattern.function->alignOfData());
+        if (pattern.function)
+        {
+            max_size_of_aggregate_state = std::max(max_size_of_aggregate_state, pattern.function->sizeOfData());
+            max_alignment_of_aggregate_state = std::max(max_alignment_of_aggregate_state, pattern.function->alignOfData());
+        }
     }
 
     place_for_aggregate_state.reset(max_size_of_aggregate_state, max_alignment_of_aggregate_state);
@@ -41,13 +44,61 @@ GraphiteRollupSortedBlockInputStream::GraphiteRollupSortedBlockInputStream(
 }
 
 
-const Graphite::Pattern * GraphiteRollupSortedBlockInputStream::selectPatternForPath(StringRef path) const
+Graphite::RollupRule GraphiteRollupSortedBlockInputStream::selectPatternForPath(StringRef path) const
 {
+    const Graphite::Pattern * first_match = &undef_pattern;
+
     for (const auto & pattern : params.patterns)
         if (!pattern.regexp || pattern.regexp->match(path.data, path.size))
-            return &pattern;
+        {
+            if (!pattern.regexp)
+            {
+                /// Default pattern
+                if (first_match->type == first_match->TypeUndef && pattern.type == pattern.TypeAll)
+                {
+                    /// There is only default pattern for both retention and aggregation
+                    return std::pair(&pattern, &pattern);
+                }
+                if (pattern.type != first_match->type)
+                {
+                    if (first_match->type == first_match->TypeRetention)
+                    {
+                        return std::pair(first_match, &pattern);
+                    }
+                    if (first_match->type == first_match->TypeAggregation)
+                    {
+                        return std::pair(&pattern, first_match);
+                    }
+                }
+            }
+            else
+            {
+                /// General pattern with matched path
+                if (pattern.type == pattern.TypeAll)
+                {
+                   /// Only for not default patterns with both function and retention parameters
+                   return std::pair(&pattern, &pattern);
+                }
+                if (first_match->type == first_match->TypeUndef)
+                {
+                    first_match = &pattern;
+                    continue;
+                }
+                if (pattern.type != first_match->type)
+                {
+                    if (first_match->type == first_match->TypeRetention)
+                    {
+                        return std::pair(first_match, &pattern);
+                    }
+                    if (first_match->type == first_match->TypeAggregation)
+                    {
+                        return std::pair(&pattern, first_match);
+                    }
+                }
+            }
+        }
 
-    return nullptr;
+    return {nullptr, nullptr};
 }
 
 
@@ -142,14 +193,15 @@ void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns
             if (started_rows)
                 accumulateRow(current_subgroup_newest_row);
 
-            const Graphite::Pattern * next_pattern = current_pattern;
+            Graphite::RollupRule next_rule = current_rule;
             if (new_path)
-                next_pattern = selectPatternForPath(next_path);
+                next_rule = selectPatternForPath(next_path);
 
+            const Graphite::RetentionPattern * retention_pattern = std::get<0>(next_rule);
             time_t next_time_rounded;
-            if (next_pattern)
+            if (retention_pattern)
             {
-                UInt32 precision = selectPrecision(next_pattern->retentions, next_row_time);
+                UInt32 precision = selectPrecision(retention_pattern->retentions, next_row_time);
                 next_time_rounded = roundTimeToPrecision(date_lut, next_row_time, precision);
             }
             else
@@ -177,7 +229,7 @@ void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns
                 /// At this point previous row has been fully processed, so we can advance the loop
                 /// (substitute current_* values for next_*, advance the cursor).
 
-                startNextGroup(merged_columns, next_cursor, next_pattern);
+                startNextGroup(merged_columns, next_cursor, next_rule);
                 ++started_rows;
 
                 current_time_rounded = next_time_rounded;
@@ -229,8 +281,10 @@ void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns
 
 template <typename TSortCursor>
 void GraphiteRollupSortedBlockInputStream::startNextGroup(MutableColumns & merged_columns, TSortCursor & cursor,
-                                                          const Graphite::Pattern * next_pattern)
+                                                          Graphite::RollupRule next_rule)
 {
+    const Graphite::AggregationPattern * aggregation_pattern = std::get<1>(next_rule);
+
     /// Copy unmodified column values (including path column).
     for (size_t i = 0, size = unmodified_column_numbers.size(); i < size; ++i)
     {
@@ -238,13 +292,13 @@ void GraphiteRollupSortedBlockInputStream::startNextGroup(MutableColumns & merge
         merged_columns[j]->insertFrom(*cursor->all_columns[j], cursor->pos);
     }
 
-    if (next_pattern)
+    if (aggregation_pattern)
     {
-        next_pattern->function->create(place_for_aggregate_state.data());
+        aggregation_pattern->function->create(place_for_aggregate_state.data());
         aggregate_state_created = true;
     }
 
-    current_pattern = next_pattern;
+    current_rule = next_rule;
 }
 
 
@@ -255,10 +309,11 @@ void GraphiteRollupSortedBlockInputStream::finishCurrentGroup(MutableColumns & m
     merged_columns[version_column_num]->insertFrom(
         *(*current_subgroup_newest_row.columns)[version_column_num], current_subgroup_newest_row.row_num);
 
+    const Graphite::AggregationPattern * aggregation_pattern = std::get<1>(current_rule);
     if (aggregate_state_created)
     {
-        current_pattern->function->insertResultInto(place_for_aggregate_state.data(), *merged_columns[value_column_num]);
-        current_pattern->function->destroy(place_for_aggregate_state.data());
+        aggregation_pattern->function->insertResultInto(place_for_aggregate_state.data(), *merged_columns[value_column_num]);
+        aggregation_pattern->function->destroy(place_for_aggregate_state.data());
         aggregate_state_created = false;
     }
     else
@@ -269,8 +324,9 @@ void GraphiteRollupSortedBlockInputStream::finishCurrentGroup(MutableColumns & m
 
 void GraphiteRollupSortedBlockInputStream::accumulateRow(RowRef & row)
 {
+    const Graphite::AggregationPattern * aggregation_pattern = std::get<1>(current_rule);
     if (aggregate_state_created)
-        current_pattern->function->add(place_for_aggregate_state.data(), &(*row.columns)[value_column_num], row.row_num, nullptr);
+        aggregation_pattern->function->add(place_for_aggregate_state.data(), &(*row.columns)[value_column_num], row.row_num, nullptr);
 }
 
 }
diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h
index e18522d6d25..bb2f81fc81f 100644
--- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h
+++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h
@@ -27,11 +27,24 @@ namespace DB
   *
   * Each row in a table correspond to one value of one sensor.
   *
+  * Pattern should contain function, retention scheme, or both of them. The order of patterns matters as well:
+  *   * Aggregation OR retention patterns should be first
+  *   * Then aggregation AND retention full patterns have to be placed
+  *   * default pattern without regexp must be the last
+  *
   * Rollup rules are specified in the following way:
   *
   * pattern
   *     regexp
   *     function
+  * pattern
+  *     regexp
+  *     age -> precision
+  *     age -> precision
+  *     ...
+  * pattern
+  *     regexp
+  *     function
   *     age -> precision
   *     age -> precision
   *     ...
@@ -54,6 +67,10 @@ namespace DB
   *
   * <graphite_rollup>
   *     <pattern>
+  *         <regexp>\.max$</regexp>
+  *         <function>max</function>
+  *     </pattern>
+  *     <pattern>
   *         <regexp>click_cost</regexp>
   *         <function>any</function>
   *         <retention>
@@ -98,9 +115,12 @@ namespace Graphite
         std::shared_ptr<OptimizedRegularExpression> regexp;
         AggregateFunctionPtr function;
         Retentions retentions;    /// Must be ordered by 'age' descending.
+        enum { TypeUndef, TypeRetention, TypeAggregation, TypeAll } type = TypeAll; /// The type of defined pattern, filled automatically
     };
 
     using Patterns = std::vector<Pattern>;
+    using RetentionPattern = Pattern;
+    using AggregationPattern = Pattern;
 
     struct Params
     {
@@ -110,6 +130,8 @@ namespace Graphite
         String version_column_name;
         Graphite::Patterns patterns;
     };
+
+    using RollupRule = std::pair<const RetentionPattern *, const AggregationPattern *>;
 }
 
 /** Merges several sorted streams into one.
@@ -135,7 +157,7 @@ public:
     ~GraphiteRollupSortedBlockInputStream() override
     {
         if (aggregate_state_created)
-            current_pattern->function->destroy(place_for_aggregate_state.data());
+            std::get<1>(current_rule)->function->destroy(place_for_aggregate_state.data());
     }
 
 protected:
@@ -186,11 +208,18 @@ private:
     time_t current_time = 0;
     time_t current_time_rounded = 0;
 
-    const Graphite::Pattern * current_pattern = nullptr;
+    Graphite::RollupRule current_rule = {nullptr, nullptr};
     AlignedBuffer place_for_aggregate_state;
-    bool aggregate_state_created = false; /// Invariant: if true then current_pattern is not NULL.
+    bool aggregate_state_created = false; /// Invariant: if true then the aggregation pattern of current_rule is not NULL.
 
-    const Graphite::Pattern * selectPatternForPath(StringRef path) const;
+    const Graphite::Pattern undef_pattern =
+    { /// temporary empty pattern for selectPatternForPath
+        nullptr,
+        nullptr,
+        DB::Graphite::Retentions(),
+        undef_pattern.TypeUndef,
+    };
+    Graphite::RollupRule selectPatternForPath(StringRef path) const;
     UInt32 selectPrecision(const Graphite::Retentions & retentions, time_t time) const;
 
 
@@ -198,7 +227,7 @@ private:
 
     /// Insert the values into the resulting columns, which will not be changed in the future.
     template <typename TSortCursor>
-    void startNextGroup(MutableColumns & merged_columns, TSortCursor & cursor, const Graphite::Pattern * next_pattern);
+    void startNextGroup(MutableColumns & merged_columns, TSortCursor & cursor, Graphite::RollupRule next_pattern);
 
     /// Insert the calculated `time`, `value`, `version` values into the resulting columns by the last group of rows.
     void finishCurrentGroup(MutableColumns & merged_columns);
diff --git a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
index 1958b489023..103be508564 100644
--- a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
+++ b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
@@ -126,17 +126,32 @@ static void appendGraphitePattern(
             throw Exception("Unknown element in config: " + key, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
     }
 
-    if (!pattern.function)
-        throw Exception("Aggregate function is mandatory for retention patterns in GraphiteMergeTree",
+    if (!pattern.function && pattern.retentions.empty())
+        throw Exception("At least one of an aggregate function or retention rules is mandatory for rollup patterns in GraphiteMergeTree",
             ErrorCodes::NO_ELEMENTS_IN_CONFIG);
 
-    if (pattern.function->allocatesMemoryInArena())
-        throw Exception("Aggregate function " + pattern.function->getName() + " isn't supported in GraphiteMergeTree",
-                        ErrorCodes::NOT_IMPLEMENTED);
+    if (!pattern.function)
+    {
+        pattern.type = pattern.TypeRetention;
+    }
+    else if (pattern.retentions.empty())
+    {
+        pattern.type = pattern.TypeAggregation;
+    }
+    else
+    {
+        pattern.type = pattern.TypeAll;
+    }
+
+    if (pattern.type & pattern.TypeAggregation) /// TypeAggregation or TypeAll
+        if (pattern.function->allocatesMemoryInArena())
+            throw Exception("Aggregate function " + pattern.function->getName() + " isn't supported in GraphiteMergeTree",
+                            ErrorCodes::NOT_IMPLEMENTED);
 
     /// retention should be in descending order of age.
-    std::sort(pattern.retentions.begin(), pattern.retentions.end(),
-        [] (const Graphite::Retention & a, const Graphite::Retention & b) { return a.age > b.age; });
+    if (pattern.type & pattern.TypeRetention) /// TypeRetention or TypeAll
+        std::sort(pattern.retentions.begin(), pattern.retentions.end(),
+            [] (const Graphite::Retention & a, const Graphite::Retention & b) { return a.age > b.age; });
 
     patterns.emplace_back(pattern);
 }
diff --git a/dbms/src/Storages/System/StorageSystemGraphite.cpp b/dbms/src/Storages/System/StorageSystemGraphite.cpp
index 8cd466c050e..d75eb71841e 100644
--- a/dbms/src/Storages/System/StorageSystemGraphite.cpp
+++ b/dbms/src/Storages/System/StorageSystemGraphite.cpp
@@ -148,13 +148,26 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context
         const auto patterns = readPatterns(config, section);
         for (const auto & pattern : patterns)
         {
-            for (const auto & ret : pattern.retentions)
+            if (!pattern.retentions.empty())
+            {
+                for (const auto & ret : pattern.retentions)
+                {
+                    res_columns[0]->insert(section);
+                    res_columns[1]->insert(pattern.regexp);
+                    res_columns[2]->insert(pattern.function);
+                    res_columns[3]->insert(ret.age);
+                    res_columns[4]->insert(ret.precision);
+                    res_columns[5]->insert(pattern.priority);
+                    res_columns[6]->insert(pattern.is_default);
+                }
+            }
+            else
             {
                 res_columns[0]->insert(section);
                 res_columns[1]->insert(pattern.regexp);
                 res_columns[2]->insert(pattern.function);
-                res_columns[3]->insert(ret.age);
-                res_columns[4]->insert(ret.precision);
+                res_columns[3]->insert(0);
+                res_columns[4]->insert(0);
                 res_columns[5]->insert(pattern.priority);
                 res_columns[6]->insert(pattern.is_default);
             }

From bc00ae383a99e641c46b5da6528775972ba3a4c8 Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Wed, 20 Feb 2019 17:16:46 +0100
Subject: [PATCH 19/69] Update documentation for GraphiteMergeTree rollup

---
 .../table_engines/graphitemergetree.md        |  13 +-
 .../table_engines/graphitemergetree.md        |  19 +-
 .../table_engines/graphitemergetree.md        | 149 +----------
 docs/zh/operations/table_engines/mergetree.md | 236 +-----------------
 4 files changed, 25 insertions(+), 392 deletions(-)
 mode change 100644 => 120000 docs/zh/operations/table_engines/graphitemergetree.md
 mode change 100644 => 120000 docs/zh/operations/table_engines/mergetree.md

diff --git a/docs/en/operations/table_engines/graphitemergetree.md b/docs/en/operations/table_engines/graphitemergetree.md
index 5e30a67c645..4231fb00f43 100644
--- a/docs/en/operations/table_engines/graphitemergetree.md
+++ b/docs/en/operations/table_engines/graphitemergetree.md
@@ -75,6 +75,13 @@ Rollup configuration structure:
 
 ```
 required-columns
+pattern
+    regexp
+    function
+pattern
+    regexp
+    age + precision
+    ...
 pattern
     regexp
     function
@@ -88,15 +95,13 @@ default
     ...
 ```
 
-When processing a row, ClickHouse checks the rules in the `pattern` section. If the metric name matches the `regexp`, the rules from the `pattern`section are applied; otherwise, the rules from the `default` section are used.
-
-The rules are defined with fields `function` and `age + precision`.
+When processing a row, ClickHouse checks the rules in the `pattern` sections. Each `pattern` section can contain a `function` parameter for aggregation, `retention` parameters, or both. If the metric name matches the `regexp`, the rules from the `pattern` section (or sections) are applied; otherwise, the rules from the `default` section are used.
 
 Fields for `pattern` and `default` sections:
 
 - `regexp`– A pattern for the metric name.
 - `age` – The minimum age of the data in seconds.
-- `precision`– How precisely to define the age of the data in seconds.
+- `precision`– How precisely to define the age of the data in seconds. Should be a divisor of 86400 (the number of seconds in a day).
 - `function` – The name of the aggregating function to apply to data whose age falls within the range `[age, age + precision]`.
 
 The `required-columns`:
diff --git a/docs/ru/operations/table_engines/graphitemergetree.md b/docs/ru/operations/table_engines/graphitemergetree.md
index 816fddff7f2..9c9afc9c9a6 100644
--- a/docs/ru/operations/table_engines/graphitemergetree.md
+++ b/docs/ru/operations/table_engines/graphitemergetree.md
@@ -72,12 +72,19 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
 
 ## Конфигурация rollup
 
-Настройки для прореживания данных задаются параметром [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) Имя параметра может быть любым. Можно создать несколько конфигураций и использовать их для разных таблиц.
+Настройки для прореживания данных задаются параметром [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup). Имя параметра может быть любым. Можно создать несколько конфигураций и использовать их для разных таблиц.
 
 Структура конфигурации rollup:
 
 ```
 required-columns
+pattern
+    regexp
+    function
+pattern
+    regexp
+    age + precision
+    ...
 pattern
     regexp
     function
@@ -91,15 +98,13 @@ default
     ...
 ```
 
-При обработке строки ClickHouse проверяет правила в разделе `pattern`. Если имя метрики соответствует шаблону `regexp`, то  применяются правила из раздела `pattern`, в противном случае из раздела `default`.
-
-Правила определяются с помощью полей `function` и `age + precision`.
+При обработке строки ClickHouse проверяет правила в разделах `pattern`. Каждый из разделов `pattern` может содержать параметр `function` для агрегации, правила `retention` для прореживания или оба этих параметра. Если имя метрики соответствует шаблону `regexp`, то применяются правила из раздела (или разделов) `pattern`, в противном случае из раздела `default`.
 
 Поля для разделов `pattenrn` и `default`:
 
 - `regexp` – шаблон имени метрики.
 - `age` – минимальный возраст данных в секундах.
-- `precision` – точность определения возраста данных в секундах.
+- `precision` – точность определения возраста данных в секундах. Значение должно быть делителем числа 86400 (количество секунд в сутках).
 - `function` – имя агрегирующей функции, которую следует применить к данным, чей возраст оказался в интервале `[age, age + precision]`.
 
 `required-columns`:
@@ -117,6 +122,10 @@ default
     <time_column_name>Time</time_column_name>
     <value_column_name>Value</value_column_name>
     <version_column_name>Version</version_column_name>
+    <pattern>
+        <regexp>\.count$</regexp>
+        <function>sum</function>
+    </pattern>
     <pattern>
         <regexp>click_cost</regexp>
         <function>any</function>
diff --git a/docs/zh/operations/table_engines/graphitemergetree.md b/docs/zh/operations/table_engines/graphitemergetree.md
deleted file mode 100644
index fa15ab4daaf..00000000000
--- a/docs/zh/operations/table_engines/graphitemergetree.md
+++ /dev/null
@@ -1,148 +0,0 @@
-
-# GraphiteMergeTree
-
-This engine is designed for rollup (thinning and aggregating/averaging) [Graphite](http://graphite.readthedocs.io/en/latest/index.html) data. It may be helpful to developers who want to use ClickHouse as a data store for Graphite.
-
-You can use any ClickHouse table engine to store the Graphite data if you don't need rollup, but if you need a rollup use `GraphiteMergeTree`. The engine reduces the volume of storage and increases the efficiency of queries from Graphite.
-
-The engine inherits properties from [MergeTree](mergetree.md).
-
-## Creating a Table
-
-```sql
-CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
-(
-    Path String,
-    Time DateTime,
-    Value <Numeric_type>,
-    Version <Numeric_type>
-    ...
-) ENGINE = GraphiteMergeTree(config_section)
-[PARTITION BY expr]
-[ORDER BY expr]
-[SAMPLE BY expr]
-[SETTINGS name=value, ...]
-```
-
-For a description of request parameters, see [request description](../../query_language/create.md).
-
-A table for the Graphite date should have the following columns:
-
-- Column with the metric name (Graphite sensor). Data type: `String`.
-- Column with the time for measuring the metric. Data type: `DateTime`.
-- Column with the value of the metric. Data type: any numeric.
-- Column with the version of the metric with the same name and time of measurement. Data type: any numeric.
-
-    ClickHouse saves the rows with the highest version or the last written if versions are the same. Other rows are deleted during the merge of data parts.
-
-The names of these columns should be set in the rollup configuration.
-
-**GraphiteMergeTree parameters**
-
-- `config_section` — Name of the section in the configuration file, where are the rules of rollup set.
-
-**Query clauses**
-
-When creating a `GraphiteMergeTree` table, the same [clauses](mergetree.md) are required, as when creating a `MergeTree` table.
-
-<details markdown="1"><summary>Deprecated Method for Creating a Table</summary>
-
-!!! attention
-    Do not use this method in new projects and, if possible, switch the old projects to the method described above.
-
-```sql
-CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
-(
-    EventDate Date,
-    Path String,
-    Time DateTime,
-    Value <Numeric_type>,
-    Version <Numeric_type>
-    ...
-) ENGINE [=] GraphiteMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, config_section)
-```
-
-All of the parameters excepting `config_section` have the same meaning as in `MergeTree`.
-
-- `config_section` — Name of the section in the configuration file, where are the rules of rollup set.
-</details>
-
-## Rollup configuration
-
-The settings for rollup are defined by the [graphite_rollup](../server_settings/settings.md) parameter in the server configuration. The name of the parameter could be any. You can create several configurations and use them for different tables.
-
-Rollup configuration structure:
-
-```
-required-columns
-pattern
-    regexp
-    function
-    age + precision
-    ...
-pattern
-    ...
-default
-    function
-    age + precision
-    ...
-```
-
-When processing a row, ClickHouse checks the rules in the `pattern` section. If the metric name matches the `regexp`, the rules from the `pattern`section are applied; otherwise, the rules from the `default` section are used.
-
-The rules are defined with fields `function` and `age + precision`.
-
-Fields for `pattern` and `default` sections:
-
-- `regexp`– A pattern for the metric name.
-- `age` – The minimum age of the data in seconds.
-- `precision`– How precisely to define the age of the data in seconds.
-- `function` – The name of the aggregating function to apply to data whose age falls within the range `[age, age + precision]`.
-
-The `required-columns`:
-
-- `path_column_name` — Column with the metric name (Graphite sensor).
-- `time_column_name` — Column with the time for measuring the metric.
-- `value_column_name` — Column with the value of the metric at the time set in `time_column_name`.
-- `version_column_name` — Column with the version timestamp of the metric with the same name and time remains in the database.
-
-
-Example of settings:
-
-```xml
-<graphite_rollup>
-    <path_column_name>Path</path_column_name>
-    <time_column_name>Time</time_column_name>
-    <value_column_name>Value</value_column_name>
-    <version_column_name>Version</version_column_name>
-    <pattern>
-        <regexp>click_cost</regexp>
-        <function>any</function>
-        <retention>
-            <age>0</age>
-            <precision>5</precision>
-        </retention>
-        <retention>
-            <age>86400</age>
-            <precision>60</precision>
-        </retention>
-    </pattern>
-    <default>
-        <function>max</function>
-        <retention>
-            <age>0</age>
-            <precision>60</precision>
-        </retention>
-        <retention>
-            <age>3600</age>
-            <precision>300</precision>
-        </retention>
-        <retention>
-            <age>86400</age>
-            <precision>3600</precision>
-        </retention>
-    </default>
-</graphite_rollup>
-```
-
-[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/graphitemergetree/) <!--hide-->
diff --git a/docs/zh/operations/table_engines/graphitemergetree.md b/docs/zh/operations/table_engines/graphitemergetree.md
new file mode 120000
index 00000000000..654425d050a
--- /dev/null
+++ b/docs/zh/operations/table_engines/graphitemergetree.md
@@ -0,0 +1 @@
+../../../en/operations/table_engines/graphitemergetree.md
\ No newline at end of file
diff --git a/docs/zh/operations/table_engines/mergetree.md b/docs/zh/operations/table_engines/mergetree.md
deleted file mode 100644
index abac921f9df..00000000000
--- a/docs/zh/operations/table_engines/mergetree.md
+++ /dev/null
@@ -1,235 +0,0 @@
-# MergeTree {#table_engines-mergetree}
-
-The `MergeTree` engine and other engines of this family (`*MergeTree`) are the most robust ClickHousе table engines.
-
-The basic idea for `MergeTree` engines family is the following. When you have tremendous amount of a data that should be inserted into the table, you should write them quickly part by part and then merge parts by some rules in background. This method is much more efficient than constantly rewriting data in the storage at the insert.
-
-Main features:
-
-- Stores data sorted by primary key.
-
-    This allows you to create a small sparse index that helps find data faster.
-
-- This allows you to use partitions if the [partitioning key](custom_partitioning_key.md) is specified.
-
-    ClickHouse supports certain operations with partitions that are more effective than general operations on the same data with the same result. ClickHouse also automatically cuts off the partition data where the partitioning key is specified in the query. This also increases the query performance.
-
-- Data replication support.
-
-    The family of `ReplicatedMergeTree` tables is used for this. For more information, see the [Data replication](replication.md) section.
-
-- Data sampling support.
-
-    If necessary, you can set the data sampling method in the table.
-
-!!! info
-    The [Merge](merge.md) engine does not belong to the `*MergeTree` family.
-
-
-## Creating a Table
-
-```
-CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
-(
-    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
-    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
-    ...
-) ENGINE = MergeTree()
-[PARTITION BY expr]
-[ORDER BY expr]
-[PRIMARY KEY expr]
-[SAMPLE BY expr]
-[SETTINGS name=value, ...]
-```
-
-For a description of request parameters, see [request description](../../query_language/create.md).
-
-**Query clauses**
-
-- `ENGINE` - Name and parameters of the engine. `ENGINE = MergeTree()`. `MergeTree` engine does not have parameters.
-
-- `PARTITION BY` — The [partitioning key](custom_partitioning_key.md).
-
-    For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](../../data_types/date.md). The partition names here have the `"YYYYMM"` format.
-
-- `ORDER BY` — The sorting key.
-
-    A tuple of columns or arbitrary expressions. Example: `ORDER BY (CounterID, EventDate)`.
-
-- `PRIMARY KEY` - The primary key if it [differs from the sorting key](mergetree.md).
-
-    By default the primary key is the same as the sorting key (which is specified by the `ORDER BY` clause).
-    Thus in most cases it is unnecessary to specify a separate `PRIMARY KEY` clause.
-
-- `SAMPLE BY` — An expression for sampling.
-
-    If a sampling expression is used, the primary key must contain it. Example:  
-    `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))`.
-
-- `SETTINGS` — Additional parameters that control the behavior of the `MergeTree`:
-    - `index_granularity` — The granularity of an index. The number of data rows between the "marks" of an index. By default, 8192.
-
-**Example of sections setting**
-
-```
-ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity=8192
-```
-
-In the example, we set partitioning by month.
-
-We also set an expression for sampling as a hash by the user ID. This allows you to pseudorandomize the data in the table for each `CounterID` and `EventDate`. If, when selecting the data, you define a [SAMPLE](../../query_language/select.md#select-sample-clause) clause, ClickHouse will return an evenly pseudorandom data sample for a subset of users.
-
-`index_granularity` could be omitted because 8192 is the default value.
-
-<details markdown="1"><summary>Deprecated Method for Creating a Table</summary>
-
-!!! attention
-    Do not use this method in new projects and, if possible, switch the old projects to the method described above.
-
-```
-CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
-(
-    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
-    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
-    ...
-) ENGINE [=] MergeTree(date-column [, sampling_expression], (primary, key), index_granularity)
-```
-
-**MergeTree() parameters**
-
-- `date-column` — The name of a column of the type [Date](../../data_types/date.md). ClickHouse automatically creates partitions by month on the basis of this column. The partition names are in the `"YYYYMM"` format.
-- `sampling_expression` — an expression for sampling.
-- `(primary, key)` — primary key. Type — [Tuple()](../../data_types/tuple.md- `index_granularity` — The granularity of an index. The number of data rows between the "marks" of an index. The value 8192 is appropriate for most tasks.
-
-**Example**
-
-```
-MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192)
-```
-
-The `MergeTree` engine is configured in the same way as in the example above for the main engine configuration method.
-</details>
-
-## Data Storage
-
-A table consists of data *parts* sorted by primary key.
-
-When data is inserted in a table, separate data parts are created and each of them is lexicographically sorted by primary key. For example, if the primary key is `(CounterID, Date)`, the data in the part is sorted by `CounterID`, and within each `CounterID`, it is ordered by `Date`.
-
-Data belonging to different partitions are separated into different parts. In the background, ClickHouse merges data parts for more efficient storage. Parts belonging to different partitions are not merged. The merge mechanism does not guarantee that all rows with the same primary key will be in the same data part.
-
-For each data part, ClickHouse creates an index file that contains the primary key value for each index row ("mark"). Index row numbers are defined as `n * index_granularity`. The maximum value `n` is equal to the integer part of dividing the total number of rows by the `index_granularity`. For each column, the "marks" are also written for the same index rows as the primary key. These "marks" allow you to find the data directly in the columns.
-
-You can use a single large table and continually add data to it in small chunks – this is what the `MergeTree` engine is intended for.
-
-## Primary Keys and Indexes in Queries
-
-Let's take the `(CounterID, Date)` primary key. In this case, the sorting and index can be illustrated as follows:
-
-```
-Whole data:     [-------------------------------------------------------------------------]
-CounterID:      [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll]
-Date:           [1111111222222233331233211111222222333211111112122222223111112223311122333]
-Marks:           |      |      |      |      |      |      |      |      |      |      |
-                a,1    a,2    a,3    b,3    e,2    e,3    g,1    h,2    i,1    i,3    l,3
-Marks numbers:   0      1      2      3      4      5      6      7      8      9      10
-```
-
-If the data query specifies:
-
-- `CounterID in ('a', 'h')`, the server reads the data in the ranges of marks `[0, 3)` and `[6, 8)`.
-- `CounterID IN ('a', 'h') AND Date = 3`, the server reads the data in the ranges of marks `[1, 3)` and `[7, 8)`.
-- `Date = 3`, the server reads the data in the range of marks `[1, 10]`.
-
-The examples above show that it is always more effective to use an index than a full scan.
-
-A sparse index allows extra strings to be read. When reading a single range of the primary key, up to `index_granularity * 2` extra rows in each data block can be read. In most cases, ClickHouse performance does not degrade when `index_granularity = 8192`.
-
-Sparse indexes allow you to work with a very large number of table rows, because such indexes are always stored in the computer's RAM.
-
-ClickHouse does not require a unique primary key. You can insert multiple rows with the same primary key.
-
-### Selecting the Primary Key
-
-The number of columns in the primary key is not explicitly limited. Depending on the data structure, you can include more or fewer columns in the primary key. This may:
-
-- Improve the performance of an index.
-
-    If the primary key is `(a, b)`, then adding another column `c` will improve the performance if the following conditions are met:
-    - There are queries with a condition on column `c`.
-    - Long data ranges (several times longer than the `index_granularity`) with identical values for `(a, b)` are common. In other words, when adding another column allows you to skip quite long data ranges.
-
-- Improve data compression.
-
-    ClickHouse sorts data by primary key, so the higher the consistency, the better the compression.
-
-- Provide additional logic when data parts merging in the [CollapsingMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) and [SummingMergeTree](summingmergetree.md) engines.
-
-    In this case it makes sense to specify the *sorting key* that is different from the primary key.
-
-A long primary key will negatively affect the insert performance and memory consumption, but extra columns in the primary key do not affect ClickHouse performance during `SELECT` queries.
-
-
-### Choosing the Primary Key that differs from the Sorting Key
-
-It is possible to specify the primary key (the expression, values of which are written into the index file
-for each mark) that is different from the sorting key (the expression for sorting the rows in data parts).
-In this case the primary key expression tuple must be a prefix of the sorting key expression tuple.
-
-This feature is helpful when using the [SummingMergeTree](summingmergetree.md) and
-[AggregatingMergeTree](aggregatingmergetree.md) table engines. In a common case when using these engines the
-table has two types of columns: *dimensions* and *measures*. Typical queries aggregate values of measure
-columns with arbitrary `GROUP BY` and filtering by dimensions. As SummingMergeTree and AggregatingMergeTree
-aggregate rows with the same value of the sorting key, it is natural to add all dimensions to it. As a result
-the key expression consists of a long list of columns and this list must be frequently updated with newly
-added dimensions.
-
-In this case it makes sense to leave only a few columns in the primary key that will provide efficient
-range scans and add the remaining dimension columns to the sorting key tuple.
-
-[ALTER of the sorting key](../../query_language/alter.md) is a
-lightweight operation because when a new column is simultaneously added to the table and to the sorting key
-data parts need not be changed (they remain sorted by the new sorting key expression).
-
-### Use of Indexes and Partitions in Queries
-
-For`SELECT` queries, ClickHouse analyzes whether an index can be used. An index can be used if the `WHERE/PREWHERE` clause has an expression (as one of the conjunction elements, or entirely) that represents an equality or inequality comparison operation, or if it has `IN` or `LIKE` with a fixed prefix on columns or expressions that are in the primary key or partitioning key, or on certain partially repetitive functions of these columns, or logical relationships of these expressions.
-
-Thus, it is possible to quickly run queries on one or many ranges of the primary key. In this example, queries will be fast when run for a specific tracking tag; for a specific tag and date range; for a specific tag and date; for multiple tags with a date range, and so on.
-
-Let's look at the engine configured as follows:
-
-```
-ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192
-```
-
-In this case, in queries:
-
-``` sql
-SELECT count() FROM table WHERE EventDate = toDate(now()) AND CounterID = 34
-SELECT count() FROM table WHERE EventDate = toDate(now()) AND (CounterID = 34 OR CounterID = 42)
-SELECT count() FROM table WHERE ((EventDate >= toDate('2014-01-01') AND EventDate <= toDate('2014-01-31')) OR EventDate = toDate('2014-05-01')) AND CounterID IN (101500, 731962, 160656) AND (CounterID = 101500 OR EventDate != toDate('2014-05-01'))
-```
-
-ClickHouse will use the primary key index to trim improper data and the monthly partitioning key to trim partitions that are in improper date ranges.
-
-The queries above show that the index is used even for complex expressions. Reading from the table is organized so that using the index can't be slower than a full scan.
-
-In the example below, the index can't be used.
-
-``` sql
-SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%'
-```
-
-To check whether ClickHouse can use the index when running a query, use the settings [force_index_by_date](../settings/settings.md#settings-force_index_by_date) and [force_primary_key](../settings/settings.md).
-
-The key for partitioning by month allows reading only those data blocks which contain dates from the proper range. In this case, the data block may contain data for many dates (up to an entire month). Within a block, data is sorted by primary key, which might not contain the date as the first column. Because of this, using a query with only a date condition that does not specify the primary key prefix will cause more data to be read than for a single date.
-
-## Concurrent Data Access
-
-For concurrent table access, we use multi-versioning. In other words, when a table is simultaneously read and updated, data is read from a set of parts that is current at the time of the query. There are no lengthy locks. Inserts do not get in the way of read operations.
-
-Reading from a table is automatically parallelized.
-
-
-[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/mergetree/) <!--hide-->
diff --git a/docs/zh/operations/table_engines/mergetree.md b/docs/zh/operations/table_engines/mergetree.md
new file mode 120000
index 00000000000..cc6ac1e5297
--- /dev/null
+++ b/docs/zh/operations/table_engines/mergetree.md
@@ -0,0 +1 @@
+../../../en/operations/table_engines/mergetree.md
\ No newline at end of file

From a1ed5d8eae70cccde8935ee1b05a16a8a62f094c Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Wed, 20 Feb 2019 17:45:51 +0100
Subject: [PATCH 20/69] Apply pep8 to test_graphite_merge_tree

---
 .../test_graphite_merge_tree/test.py          | 102 +++++++++++-------
 1 file changed, 64 insertions(+), 38 deletions(-)

diff --git a/dbms/tests/integration/test_graphite_merge_tree/test.py b/dbms/tests/integration/test_graphite_merge_tree/test.py
index a2e7b5cc0f7..ff37be9371c 100644
--- a/dbms/tests/integration/test_graphite_merge_tree/test.py
+++ b/dbms/tests/integration/test_graphite_merge_tree/test.py
@@ -8,31 +8,38 @@ from helpers.test_tools import TSV
 
 
 cluster = ClickHouseCluster(__file__)
-instance = cluster.add_instance('instance', main_configs=['configs/graphite_rollup.xml'])
+instance = cluster.add_instance('instance',
+                                main_configs=['configs/graphite_rollup.xml'])
+q = instance.query
+
 
 @pytest.fixture(scope="module")
 def started_cluster():
     try:
         cluster.start()
-        instance.query('CREATE DATABASE test')
+        q('CREATE DATABASE test')
 
         yield cluster
 
     finally:
         cluster.shutdown()
 
+
 @pytest.fixture
 def graphite_table(started_cluster):
-    instance.query('''
+    q('''
 DROP TABLE IF EXISTS test.graphite;
 CREATE TABLE test.graphite
     (metric String, value Float64, timestamp UInt32, date Date, updated UInt32)
-    ENGINE = GraphiteMergeTree(date, (metric, timestamp), 8192, 'graphite_rollup');
+    ENGINE = GraphiteMergeTree('graphite_rollup')
+    PARTITION BY toYYYYMM(date)
+    ORDER BY (metric, timestamp)
+    SETTINGS index_granularity=8192;
 ''')
 
     yield
 
-    instance.query('DROP TABLE test.graphite')
+    q('DROP TABLE test.graphite')
 
 
 def test_rollup_versions(graphite_table):
@@ -40,13 +47,14 @@ def test_rollup_versions(graphite_table):
     rounded_timestamp = timestamp - timestamp % 60
     date = datetime.date.today().isoformat()
 
-    q = instance.query
-
-    # Insert rows with timestamps relative to the current time so that the first retention clause is active.
+    # Insert rows with timestamps relative to the current time so that the
+    # first retention clause is active.
     # Two parts are created.
     q('''
-INSERT INTO test.graphite (metric, value, timestamp, date, updated) VALUES ('one_min.x1', 100, {timestamp}, '{date}', 1);
-INSERT INTO test.graphite (metric, value, timestamp, date, updated) VALUES ('one_min.x1', 200, {timestamp}, '{date}', 2);
+INSERT INTO test.graphite (metric, value, timestamp, date, updated)
+      VALUES ('one_min.x1', 100, {timestamp}, '{date}', 1);
+INSERT INTO test.graphite (metric, value, timestamp, date, updated)
+      VALUES ('one_min.x1', 200, {timestamp}, '{date}', 2);
 '''.format(timestamp=timestamp, date=date))
 
     expected1 = '''\
@@ -54,7 +62,9 @@ one_min.x1	100	{timestamp}	{date}	1
 one_min.x1	200	{timestamp}	{date}	2
 '''.format(timestamp=timestamp, date=date)
 
-    assert TSV(q('SELECT * FROM test.graphite ORDER BY updated')) == TSV(expected1)
+    assert TSV(
+        q('SELECT * FROM test.graphite ORDER BY updated')
+    ) == TSV(expected1)
 
     q('OPTIMIZE TABLE test.graphite')
 
@@ -67,8 +77,6 @@ one_min.x1	200	{timestamp}	{date}	2
 
 
 def test_rollup_aggregation(graphite_table):
-    q = instance.query
-
     # This query essentially emulates what rollup does.
     result1 = q('''
 SELECT avg(v), max(upd)
@@ -91,7 +99,8 @@ FROM (SELECT timestamp,
 '''
     assert TSV(result1) == TSV(expected1)
 
-    # Timestamp 1111111111 is in sufficiently distant past so that the last retention clause is active.
+    # Timestamp 1111111111 is far enough in the past
+    # that the last retention clause is active.
     result2 = q('''
 INSERT INTO test.graphite
     SELECT 'one_min.x' AS metric,
@@ -114,7 +123,7 @@ one_min.x	999634.9918367347	1111444200	2017-02-02	499999
 
 
 def test_rollup_aggregation_2(graphite_table):
-    result = instance.query('''
+    result = q('''
 INSERT INTO test.graphite
     SELECT 'one_min.x' AS metric,
            toFloat64(number) AS value,
@@ -136,7 +145,7 @@ one_min.x	24	1111110600	2017-02-02	100
 
 
 def test_multiple_paths_and_versions(graphite_table):
-    result = instance.query('''
+    result = q('''
 INSERT INTO test.graphite
     SELECT 'one_min.x' AS metric,
            toFloat64(number) AS value,
@@ -163,7 +172,9 @@ OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL;
 SELECT * FROM test.graphite;
 ''')
 
-    with open(p.join(p.dirname(__file__), 'test_multiple_paths_and_versions.reference')) as reference:
+    with open(p.join(p.dirname(__file__),
+                     'test_multiple_paths_and_versions.reference')
+              ) as reference:
         assert TSV(result) == TSV(reference)
 
 
@@ -177,14 +188,18 @@ def test_multiple_output_blocks(graphite_table):
 
         for j in range(3):
             cur_time = rolled_up_time + 100 * j
-            to_insert += 'one_min.x1	{}	{}	2001-09-09	1\n'.format(10 * j, cur_time)
-            to_insert += 'one_min.x1	{}	{}	2001-09-09	2\n'.format(10 * (j + 1), cur_time)
+            to_insert += 'one_min.x1	{}	{}	2001-09-09	1\n'.format(
+                10 * j, cur_time
+            )
+            to_insert += 'one_min.x1	{}	{}	2001-09-09	2\n'.format(
+                10 * (j + 1), cur_time
+            )
 
         expected += 'one_min.x1	20	{}	2001-09-09	2\n'.format(rolled_up_time)
 
-    instance.query('INSERT INTO test.graphite FORMAT TSV', to_insert)
+    q('INSERT INTO test.graphite FORMAT TSV', to_insert)
 
-    result = instance.query('''
+    result = q('''
 OPTIMIZE TABLE test.graphite PARTITION 200109 FINAL;
 
 SELECT * FROM test.graphite;
@@ -200,14 +215,14 @@ zzzzzzzz	100	1000000001	2001-09-09	1
 zzzzzzzz	200	1000000001	2001-09-09	2
 '''
 
-    instance.query('INSERT INTO test.graphite FORMAT TSV', to_insert)
+    q('INSERT INTO test.graphite FORMAT TSV', to_insert)
 
     expected = '''\
 one_min.x1	100	999999600	2001-09-09	1
 zzzzzzzz	200	1000000001	2001-09-09	2
 '''
 
-    result = instance.query('''
+    result = q('''
 OPTIMIZE TABLE test.graphite PARTITION 200109 FINAL;
 
 SELECT * FROM test.graphite;
@@ -215,27 +230,38 @@ SELECT * FROM test.graphite;
 
     assert TSV(result) == TSV(expected)
 
+
 def test_path_dangling_pointer(graphite_table):
-    instance.query('''
+    q('''
 DROP TABLE IF EXISTS test.graphite2;
 CREATE TABLE test.graphite2
-  (metric String, value Float64, timestamp UInt32, date Date, updated UInt32)
-  ENGINE = GraphiteMergeTree(date, (metric, timestamp), 1, 'graphite_rollup');
-  ''')
+    (metric String, value Float64, timestamp UInt32, date Date, updated UInt32)
+    ENGINE = GraphiteMergeTree('graphite_rollup')
+    PARTITION BY toYYYYMM(date)
+    ORDER BY (metric, timestamp)
+    SETTINGS index_granularity=1;
+    ''')
 
-    path = 'abcd' * 4000000 # 16MB
-    instance.query('INSERT INTO test.graphite2 FORMAT TSV', "{}\t0.0\t0\t2018-01-01\t100\n".format(path))
-    instance.query('INSERT INTO test.graphite2 FORMAT TSV', "{}\t0.0\t0\t2018-01-01\t101\n".format(path))
+    path = 'abcd' * 4000000  # 16MB
+    q('INSERT INTO test.graphite2 FORMAT TSV',
+      "{}\t0.0\t0\t2018-01-01\t100\n".format(path))
+    q('INSERT INTO test.graphite2 FORMAT TSV',
+      "{}\t0.0\t0\t2018-01-01\t101\n".format(path))
     for version in range(10):
-        instance.query('INSERT INTO test.graphite2 FORMAT TSV', "{}\t0.0\t0\t2018-01-01\t{}\n".format(path, version))
+        q('INSERT INTO test.graphite2 FORMAT TSV',
+          "{}\t0.0\t0\t2018-01-01\t{}\n".format(path, version))
 
     while True:
-      instance.query('OPTIMIZE TABLE test.graphite2 PARTITION 201801 FINAL')
-      parts = int(instance.query("SELECT count() FROM system.parts WHERE active AND database='test' AND table='graphite2'"))
-      if parts == 1:
-        break
-      print "Parts", parts
+        q('OPTIMIZE TABLE test.graphite2 PARTITION 201801 FINAL')
+        parts = int(q("SELECT count() FROM system.parts "
+                      "WHERE active AND database='test' "
+                      "AND table='graphite2'"))
+        if parts == 1:
+            break
+        print('Parts', parts)
 
-    assert TSV(instance.query("SELECT value, timestamp, date, updated FROM test.graphite2")) == TSV("0\t0\t2018-01-01\t101\n")
+    assert TSV(
+        q("SELECT value, timestamp, date, updated FROM test.graphite2")
+    ) == TSV("0\t0\t2018-01-01\t101\n")
 
-    instance.query('DROP TABLE test.graphite2')
\ No newline at end of file
+    q('DROP TABLE test.graphite2')

From 4cfe93a4d0df5e88f1f41e9a135092dbf2f90879 Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Thu, 21 Feb 2019 22:34:08 +0100
Subject: [PATCH 21/69] Add necessary integration tests

---
 .../configs/graphite_rollup.xml               |  72 ++++++++++
 .../test_graphite_merge_tree/test.py          | 133 ++++++++++++++++++
 2 files changed, 205 insertions(+)

diff --git a/dbms/tests/integration/test_graphite_merge_tree/configs/graphite_rollup.xml b/dbms/tests/integration/test_graphite_merge_tree/configs/graphite_rollup.xml
index 1390d151731..6d1907f3da7 100644
--- a/dbms/tests/integration/test_graphite_merge_tree/configs/graphite_rollup.xml
+++ b/dbms/tests/integration/test_graphite_merge_tree/configs/graphite_rollup.xml
@@ -5,6 +5,29 @@
         <time_column_name>timestamp</time_column_name>
         <value_column_name>value</value_column_name>
         <version_column_name>updated</version_column_name>
+        <pattern>
+            <regexp>\.count$</regexp>
+            <function>sum</function>
+        </pattern>
+        <pattern>
+            <regexp>\.max$</regexp>
+            <function>max</function>
+        </pattern>
+        <pattern>
+          <regexp>^five_min\.</regexp>
+          <retention>
+              <age>0</age>
+              <precision>300</precision>
+          </retention>
+          <retention>
+              <age>5184000</age>
+              <precision>3600</precision>
+          </retention>
+          <retention>
+              <age>31536000</age>
+              <precision>14400</precision>
+          </retention>
+        </pattern>
         <pattern>
             <regexp>^one_min</regexp>
             <function>avg</function>
@@ -22,4 +45,53 @@
             </retention>
         </pattern>
     </graphite_rollup>
+    <graphite_rollup_with_default>
+        <path_column_name>metric</path_column_name>
+        <time_column_name>timestamp</time_column_name>
+        <value_column_name>value</value_column_name>
+        <version_column_name>updated</version_column_name>
+        <pattern>
+            <regexp>\.count$</regexp>
+            <function>sum</function>
+        </pattern>
+        <pattern>
+            <regexp>\.max$</regexp>
+            <function>max</function>
+        </pattern>
+        <default>
+            <function>any</function>
+            <retention>
+                <age>0</age>
+                <precision>60</precision>
+            </retention>
+            <retention>
+                <age>7776000</age>
+                <precision>300</precision>
+            </retention>
+            <retention>
+                <age>31536000</age>
+                <precision>600</precision>
+            </retention>
+        </default>
+    </graphite_rollup_with_default>
+    <graphite_rollup_broken>
+        <path_column_name>metric</path_column_name>
+        <time_column_name>timestamp</time_column_name>
+        <value_column_name>value</value_column_name>
+        <version_column_name>updated</version_column_name>
+        <default>
+            <retention>
+                <age>0</age>
+                <precision>60</precision>
+            </retention>
+            <retention>
+                <age>7776000</age>
+                <precision>300</precision>
+            </retention>
+            <retention>
+                <age>31536000</age>
+                <precision>600</precision>
+            </retention>
+        </default>
+    </graphite_rollup_broken>
 </yandex>
diff --git a/dbms/tests/integration/test_graphite_merge_tree/test.py b/dbms/tests/integration/test_graphite_merge_tree/test.py
index ff37be9371c..8e98c97e077 100644
--- a/dbms/tests/integration/test_graphite_merge_tree/test.py
+++ b/dbms/tests/integration/test_graphite_merge_tree/test.py
@@ -265,3 +265,136 @@ CREATE TABLE test.graphite2
     ) == TSV("0\t0\t2018-01-01\t101\n")
 
     q('DROP TABLE test.graphite2')
+
+
+def test_combined_rules(graphite_table):
+    # 1487970000 ~ Sat 25 Feb 00:00:00 MSK 2017
+    to_insert = 'INSERT INTO test.graphite VALUES '
+    expected_unmerged = ''
+    for i in range(384):
+        to_insert += "('five_min.count', {v}, {t}, toDate({t}), 1), ".format(
+            v=1, t=1487970000+(i*300)
+        )
+        to_insert += "('five_min.max', {v}, {t}, toDate({t}), 1), ".format(
+            v=i, t=1487970000+(i*300)
+        )
+        expected_unmerged += ("five_min.count\t{v1}\t{t}\n"
+                              "five_min.max\t{v2}\t{t}\n").format(
+                                  v1=1, v2=i,
+                                  t=1487970000+(i*300)
+                              )
+
+    q(to_insert)
+    assert TSV(q('SELECT metric, value, timestamp FROM test.graphite'
+               ' ORDER BY (timestamp, metric)')) == TSV(expected_unmerged)
+
+    q('OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL')
+    expected_merged = '''
+        five_min.count	48	1487970000	2017-02-25	1
+        five_min.count	48	1487984400	2017-02-25	1
+        five_min.count	48	1487998800	2017-02-25	1
+        five_min.count	48	1488013200	2017-02-25	1
+        five_min.count	48	1488027600	2017-02-25	1
+        five_min.count	48	1488042000	2017-02-25	1
+        five_min.count	48	1488056400	2017-02-26	1
+        five_min.count	48	1488070800	2017-02-26	1
+        five_min.max	47	1487970000	2017-02-25	1
+        five_min.max	95	1487984400	2017-02-25	1
+        five_min.max	143	1487998800	2017-02-25	1
+        five_min.max	191	1488013200	2017-02-25	1
+        five_min.max	239	1488027600	2017-02-25	1
+        five_min.max	287	1488042000	2017-02-25	1
+        five_min.max	335	1488056400	2017-02-26	1
+        five_min.max	383	1488070800	2017-02-26	1
+    '''
+    assert TSV(q('SELECT * FROM test.graphite'
+                 ' ORDER BY (metric, timestamp)')) == TSV(expected_merged)
+
+
+def test_combined_rules_with_default(graphite_table):
+    q('''
+DROP TABLE IF EXISTS test.graphite;
+CREATE TABLE test.graphite
+    (metric String, value Float64, timestamp UInt32, date Date, updated UInt32)
+    ENGINE = GraphiteMergeTree('graphite_rollup_with_default')
+    PARTITION BY toYYYYMM(date)
+    ORDER BY (metric, timestamp)
+    SETTINGS index_granularity=1;
+      ''')
+    # 1487970000 ~ Sat 25 Feb 00:00:00 MSK 2017
+    to_insert = 'INSERT INTO test.graphite VALUES '
+    expected_unmerged = ''
+    for i in range(100):
+        to_insert += "('top_level.count', {v}, {t}, toDate({t}), 1), ".format(
+            v=1, t=1487970000+(i*60)
+        )
+        to_insert += "('top_level.max', {v}, {t}, toDate({t}), 1), ".format(
+            v=i, t=1487970000+(i*60)
+        )
+        expected_unmerged += ("top_level.count\t{v1}\t{t}\n"
+                              "top_level.max\t{v2}\t{t}\n").format(
+                                  v1=1, v2=i,
+                                  t=1487970000+(i*60)
+                              )
+
+    q(to_insert)
+    assert TSV(q('SELECT metric, value, timestamp FROM test.graphite'
+                 ' ORDER BY (timestamp, metric)')) == TSV(expected_unmerged)
+
+    q('OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL')
+    expected_merged = '''
+        top_level.count	10	1487970000	2017-02-25	1
+        top_level.count	10	1487970600	2017-02-25	1
+        top_level.count	10	1487971200	2017-02-25	1
+        top_level.count	10	1487971800	2017-02-25	1
+        top_level.count	10	1487972400	2017-02-25	1
+        top_level.count	10	1487973000	2017-02-25	1
+        top_level.count	10	1487973600	2017-02-25	1
+        top_level.count	10	1487974200	2017-02-25	1
+        top_level.count	10	1487974800	2017-02-25	1
+        top_level.count	10	1487975400	2017-02-25	1
+        top_level.max	9	1487970000	2017-02-25	1
+        top_level.max	19	1487970600	2017-02-25	1
+        top_level.max	29	1487971200	2017-02-25	1
+        top_level.max	39	1487971800	2017-02-25	1
+        top_level.max	49	1487972400	2017-02-25	1
+        top_level.max	59	1487973000	2017-02-25	1
+        top_level.max	69	1487973600	2017-02-25	1
+        top_level.max	79	1487974200	2017-02-25	1
+        top_level.max	89	1487974800	2017-02-25	1
+        top_level.max	99	1487975400	2017-02-25	1
+    '''
+    assert TSV(q('SELECT * FROM test.graphite'
+                 ' ORDER BY (metric, timestamp)')) == TSV(expected_merged)
+
+
+def test_broken_partial_rollup(graphite_table):
+    q('''
+DROP TABLE IF EXISTS test.graphite;
+CREATE TABLE test.graphite
+    (metric String, value Float64, timestamp UInt32, date Date, updated UInt32)
+    ENGINE = GraphiteMergeTree('graphite_rollup_broken')
+    PARTITION BY toYYYYMM(date)
+    ORDER BY (metric, timestamp)
+    SETTINGS index_granularity=1;
+      ''')
+    to_insert = '''\
+one_min.x1	100	1000000000	2001-09-09	1
+zzzzzzzz	100	1000000001	2001-09-09	1
+zzzzzzzz	200	1000000001	2001-09-09	2
+'''
+
+    q('INSERT INTO test.graphite FORMAT TSV', to_insert)
+
+    expected = '''\
+one_min.x1	100	1000000000	2001-09-09	1
+zzzzzzzz	200	1000000001	2001-09-09	2
+'''
+
+    result = q('''
+OPTIMIZE TABLE test.graphite PARTITION 200109 FINAL;
+
+SELECT * FROM test.graphite;
+''')
+
+    assert TSV(result) == TSV(expected)

From e79235f93257cf08be11f8a128e5fa48c4876933 Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Thu, 28 Feb 2019 09:52:33 +0100
Subject: [PATCH 22/69] Fix typo in parameter name

---
 docs/ru/operations/table_engines/graphitemergetree.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/ru/operations/table_engines/graphitemergetree.md b/docs/ru/operations/table_engines/graphitemergetree.md
index 9c9afc9c9a6..37420f317cd 100644
--- a/docs/ru/operations/table_engines/graphitemergetree.md
+++ b/docs/ru/operations/table_engines/graphitemergetree.md
@@ -100,7 +100,7 @@ default
 
 При обработке строки ClickHouse проверяет правила в разделах `pattern`. Каждый из разделов `pattern` может содержать параметр `function` для аггрегации, правила `retention` для прореживания или оба эти параметра. Если имя метрики соответствует шаблону `regexp`, то применяются правила из раздела (или разделов) `pattern`, в противном случае из раздела `default`.
 
-Поля для разделов `pattenrn` и `default`:
+Поля для разделов `pattern` и `default`:
 
 - `regexp` – шаблон имени метрики.
 - `age` – минимальный возраст данных в секундах.

From ae8e84c532f2dd536498e7b57772a36c704af5e7 Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Thu, 28 Feb 2019 10:44:30 +0100
Subject: [PATCH 23/69] Add note about patterns order to docs

---
 docs/en/operations/table_engines/graphitemergetree.md | 9 ++++++++-
 docs/ru/operations/table_engines/graphitemergetree.md | 8 +++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/docs/en/operations/table_engines/graphitemergetree.md b/docs/en/operations/table_engines/graphitemergetree.md
index 4231fb00f43..cd0e17652ae 100644
--- a/docs/en/operations/table_engines/graphitemergetree.md
+++ b/docs/en/operations/table_engines/graphitemergetree.md
@@ -95,7 +95,14 @@ default
     ...
 ```
 
-When processing a row, ClickHouse checks the rules in the `pattern` sections. Each of `pattern` sections could contain `function` parameter for aggregation, `retention` parameters or both. If the metric name matches the `regexp`, the rules from the `pattern` section (or sections) are applied; otherwise, the rules from the `default` section are used.
+**Important:** Patterns must be listed in the following order:
+
+1. Patterns *without* `function` *or* *without* `retention`.
+1. Patterns *with* both `function` *and* `retention`.
+1. Pattern `default`.
+
+
+When processing a row, ClickHouse checks the rules in the `pattern` sections. Each `pattern` section (including `default`) can contain a `function` parameter for aggregation, `retention` parameters, or both. If the metric name matches the `regexp`, the rules from the `pattern` section (or sections) are applied; otherwise, the rules from the `default` section are used.
 
 Fields for `pattern` and `default` sections:
 
diff --git a/docs/ru/operations/table_engines/graphitemergetree.md b/docs/ru/operations/table_engines/graphitemergetree.md
index 37420f317cd..169df60b0d3 100644
--- a/docs/ru/operations/table_engines/graphitemergetree.md
+++ b/docs/ru/operations/table_engines/graphitemergetree.md
@@ -98,7 +98,13 @@ default
     ...
 ```
 
-При обработке строки ClickHouse проверяет правила в разделах `pattern`. Каждый из разделов `pattern` может содержать параметр `function` для аггрегации, правила `retention` для прореживания или оба эти параметра. Если имя метрики соответствует шаблону `regexp`, то применяются правила из раздела (или разделов) `pattern`, в противном случае из раздела `default`.
+**Важно**: порядок разделов `pattern` должен быть следующим:
+
+1. Разделы *без* параметра `function` *или* `retention`.
+1. Разделы *с* параметрами `function` *и* `retention`.
+1. Раздел `default`.
+
+При обработке строки ClickHouse проверяет правила в разделах `pattern`. Каждый из разделов `pattern` (включая `default`) может содержать параметр `function` для агрегации, правила `retention` для прореживания или оба эти параметра. Если имя метрики соответствует шаблону `regexp`, то применяются правила из раздела (или разделов) `pattern`, в противном случае из раздела `default`.
 
 Поля для разделов `pattern` и `default`:
 

From 315b6f3878182c7950d75de93c842b0646479fe9 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 3 Mar 2019 23:08:39 +0300
Subject: [PATCH 24/69] Better string comparison (development)

---
 dbms/src/Columns/ColumnFixedString.cpp   |   4 +-
 dbms/src/Columns/ColumnFixedString.h     |   5 +-
 dbms/src/Columns/ColumnString.cpp        |  13 +-
 dbms/src/Columns/ColumnString.h          |  12 +-
 dbms/src/Common/memcmpSmall.h            | 219 +++++++++++++++
 dbms/src/Common/memcpySmall.h            |   1 -
 dbms/src/Functions/EmptyImpl.h           |  11 +-
 dbms/src/Functions/FunctionsComparison.h | 342 +++++++++++------------
 dbms/src/Functions/arrayIndex.h          |  36 ++-
 9 files changed, 429 insertions(+), 214 deletions(-)
 create mode 100644 dbms/src/Common/memcmpSmall.h

diff --git a/dbms/src/Columns/ColumnFixedString.cpp b/dbms/src/Columns/ColumnFixedString.cpp
index 955af04ee25..728ae095901 100644
--- a/dbms/src/Columns/ColumnFixedString.cpp
+++ b/dbms/src/Columns/ColumnFixedString.cpp
@@ -4,6 +4,7 @@
 #include <Common/Arena.h>
 #include <Common/SipHash.h>
 #include <Common/memcpySmall.h>
+#include <Common/memcmpSmall.h>
 
 #include <DataStreams/ColumnGathererStream.h>
 
@@ -106,8 +107,7 @@ struct ColumnFixedString::less
     explicit less(const ColumnFixedString & parent_) : parent(parent_) {}
     bool operator()(size_t lhs, size_t rhs) const
     {
-        /// TODO: memcmp slows down.
-        int res = memcmp(&parent.chars[lhs * parent.n], &parent.chars[rhs * parent.n], parent.n);
+        int res = memcmpSmallAllowOverflow15(parent.chars.data() + lhs * parent.n, parent.chars.data() + rhs * parent.n, parent.n);
         return positive ? (res < 0) : (res > 0);
     }
 };
diff --git a/dbms/src/Columns/ColumnFixedString.h b/dbms/src/Columns/ColumnFixedString.h
index 96f9b05189c..941314b8888 100644
--- a/dbms/src/Columns/ColumnFixedString.h
+++ b/dbms/src/Columns/ColumnFixedString.h
@@ -1,8 +1,7 @@
 #pragma once
 
-#include <string.h> // memcmp
-
 #include <Common/PODArray.h>
+#include <Common/memcmpSmall.h>
 #include <Columns/IColumn.h>
 #include <Columns/ColumnVectorHelper.h>
 
@@ -98,7 +97,7 @@ public:
     int compareAt(size_t p1, size_t p2, const IColumn & rhs_, int /*nan_direction_hint*/) const override
     {
         const ColumnFixedString & rhs = static_cast<const ColumnFixedString &>(rhs_);
-        return memcmp(&chars[p1 * n], &rhs.chars[p2 * n], n);
+        return memcmpSmallAllowOverflow15(chars.data() + p1 * n, rhs.chars.data() + p2 * n, n);
     }
 
     void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp
index 86521e3dfb5..5b16aaef75e 100644
--- a/dbms/src/Columns/ColumnString.cpp
+++ b/dbms/src/Columns/ColumnString.cpp
@@ -1,5 +1,6 @@
 #include <Core/Defines.h>
 #include <Common/Arena.h>
+#include <Common/memcmpSmall.h>
 #include <Columns/Collator.h>
 #include <Columns/ColumnString.h>
 #include <Columns/ColumnsCommon.h>
@@ -239,15 +240,11 @@ struct ColumnString::less
     explicit less(const ColumnString & parent_) : parent(parent_) {}
     bool operator()(size_t lhs, size_t rhs) const
     {
-        size_t left_len = parent.sizeAt(lhs);
-        size_t right_len = parent.sizeAt(rhs);
+        int res = memcmpSmallAllowOverflow15(
+            parent.chars.data() + parent.offsetAt(lhs), parent.sizeAt(lhs),
+            parent.chars.data() + parent.offsetAt(rhs), parent.sizeAt(rhs));
 
-        int res = memcmp(&parent.chars[parent.offsetAt(lhs)], &parent.chars[parent.offsetAt(rhs)], std::min(left_len, right_len));
-
-        if (res != 0)
-            return positive ? (res < 0) : (res > 0);
-        else
-            return positive ? (left_len < right_len) : (left_len > right_len);
+        return positive ? (res < 0) : (res > 0);
     }
 };
 
diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h
index 6b2e58d54d1..7f6da068f52 100644
--- a/dbms/src/Columns/ColumnString.h
+++ b/dbms/src/Columns/ColumnString.h
@@ -6,6 +6,7 @@
 #include <Common/PODArray.h>
 #include <Common/SipHash.h>
 #include <Common/memcpySmall.h>
+#include <Common/memcmpSmall.h>
 
 
 class Collator;
@@ -210,16 +211,7 @@ public:
     int compareAt(size_t n, size_t m, const IColumn & rhs_, int /*nan_direction_hint*/) const override
     {
         const ColumnString & rhs = static_cast<const ColumnString &>(rhs_);
-
-        const size_t size = sizeAt(n);
-        const size_t rhs_size = rhs.sizeAt(m);
-
-        int cmp = memcmp(&chars[offsetAt(n)], &rhs.chars[rhs.offsetAt(m)], std::min(size, rhs_size));
-
-        if (cmp != 0)
-            return cmp;
-        else
-            return size > rhs_size ? 1 : (size < rhs_size ? -1 : 0);
+        return memcmpSmallAllowOverflow15(chars.data() + offsetAt(n), sizeAt(n), rhs.chars.data() + rhs.offsetAt(m), rhs.sizeAt(m));
     }
 
     /// Variant of compareAt for string comparison with respect of collation.
diff --git a/dbms/src/Common/memcmpSmall.h b/dbms/src/Common/memcmpSmall.h
new file mode 100644
index 00000000000..8c2c8420412
--- /dev/null
+++ b/dbms/src/Common/memcmpSmall.h
@@ -0,0 +1,261 @@
+#pragma once
+
+#include <string.h>
+#include <algorithm>
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+
+namespace detail
+{
+
+/// Three-way comparison of two values: returns -1, 0 or 1.
+template <typename T>
+inline int cmp(T a, T b)
+{
+    if (a < b)
+        return -1;
+    if (a > b)
+        return 1;
+    return 0;
+}
+
+#ifdef __SSE2__
+
+/** Loads 16 bytes from 'a' and from 'b' (unaligned loads) and returns a mask
+  * where bit i is set iff a[i] != b[i].
+  * _mm_movemask_epi8 fills only the low 16 bits of its int result, so the inversion
+  * must be limited to those bits: a plain '~' would also set the upper 16 bits
+  * and the mask would be non-zero even when all 16 bytes are equal.
+  */
+inline unsigned mismatchMask16(const void * a, const void * b)
+{
+    return 0xFFFFu ^ static_cast<unsigned>(_mm_movemask_epi8(_mm_cmpeq_epi8(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(a)),
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(b)))));
+}
+
+#endif
+
+}
+
+
+#ifdef __SSE2__
+
+/** All functions work under the following assumptions:
+  * - it's possible to read up to 15 excessive bytes after end of 'a' and 'b' region;
+  * - memory regions are relatively small and extra loop unrolling is not worth to do.
+  */
+
+/** Variant when memory regions may have different sizes.
+  * Returns -1, 0 or 1. If the common prefix is equal, the result is the comparison of sizes.
+  */
+template <typename Char>
+inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
+{
+    size_t min_size = std::min(a_size, b_size);
+    size_t size_to_compare_sse = (min_size + 15) / 16 * 16;
+
+    for (size_t offset = 0; offset < size_to_compare_sse; offset += 16)
+    {
+        auto mask = detail::mismatchMask16(a + offset, b + offset);
+
+        if (mask)
+        {
+            offset += __builtin_ctz(mask);
+
+            /// The first difference is in the bytes read past the common prefix: compare sizes.
+            if (offset >= min_size)
+                break;
+
+            return detail::cmp(a[offset], b[offset]);
+        }
+    }
+
+    return detail::cmp(a_size, b_size);
+}
+
+
+/** Variant when memory regions have same size.
+  * TODO Check if the compiler can optimize previous function when the caller pass identical sizes.
+  */
+template <typename Char>
+inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t size)
+{
+    size_t size_to_compare_sse = (size + 15) / 16 * 16;
+
+    for (size_t offset = 0; offset < size_to_compare_sse; offset += 16)
+    {
+        auto mask = detail::mismatchMask16(a + offset, b + offset);
+
+        if (mask)
+        {
+            offset += __builtin_ctz(mask);
+
+            /// Differences in the bytes read past the end of the regions do not count.
+            if (offset >= size)
+                return 0;
+
+            return detail::cmp(a[offset], b[offset]);
+        }
+    }
+
+    return 0;
+}
+
+
+/** Compare memory regions for equality.
+  */
+template <typename Char>
+inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
+{
+    if (a_size != b_size)
+        return false;
+
+    size_t size_to_compare_sse = (a_size + 15) / 16 * 16;
+
+    for (size_t offset = 0; offset < size_to_compare_sse; offset += 16)
+    {
+        auto mask = detail::mismatchMask16(a + offset, b + offset);
+
+        if (mask)
+        {
+            /// Equal only if the first difference is in the bytes read past the end.
+            offset += __builtin_ctz(mask);
+            return offset >= a_size;
+        }
+    }
+
+    return true;
+}
+
+
+/** Variant when the caller know in advance that the size is a multiple of 16.
+  */
+template <typename Char>
+inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size)
+{
+    for (size_t offset = 0; offset < size; offset += 16)
+    {
+        auto mask = detail::mismatchMask16(a + offset, b + offset);
+
+        if (mask)
+        {
+            offset += __builtin_ctz(mask);
+            return detail::cmp(a[offset], b[offset]);
+        }
+    }
+
+    return 0;
+}
+
+
+/** Variant when the size is 16 exactly.
+  */
+template <typename Char>
+inline int memcmp16(const Char * a, const Char * b)
+{
+    auto mask = detail::mismatchMask16(a, b);
+
+    if (mask)
+    {
+        auto offset = __builtin_ctz(mask);
+        return detail::cmp(a[offset], b[offset]);
+    }
+
+    return 0;
+}
+
+
+/** Equality check when the size is 16 exactly.
+  */
+inline bool memequal16(const void * a, const void * b)
+{
+    return 0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(a)),
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(b))));
+}
+
+
+/** Check that all 'size' bytes of the memory region are zero. */
+inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
+{
+    const __m128i zero16 = _mm_setzero_si128();
+
+    for (size_t offset = 0; offset < size; offset += 16)
+    {
+        /// Bit i is set iff byte i of the block is non-zero (inverted within the low 16 bits only).
+        unsigned mask = 0xFFFFu ^ static_cast<unsigned>(_mm_movemask_epi8(_mm_cmpeq_epi8(zero16,
+            _mm_loadu_si128(reinterpret_cast<const __m128i *>(reinterpret_cast<const char *>(data) + offset)))));
+
+        if (mask)
+        {
+            offset += __builtin_ctz(mask);
+            return offset >= size;
+        }
+    }
+
+    return true;
+}
+
+
+#else
+
+/** Portable variants for platforms without SSE2.
+  * Comparison functions return a value with the same sign as the SSE2 variants, but not necessarily -1 or 1.
+  */
+
+template <typename Char>
+inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
+{
+    if (int res = memcmp(a, b, std::min(a_size, b_size)))
+        return res;
+
+    /// Common prefix is equal: the result is the comparison of sizes, as in the SSE2 variant.
+    return detail::cmp(a_size, b_size);
+}
+
+template <typename Char>
+inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t size)
+{
+    return memcmp(a, b, size);
+}
+
+template <typename Char>
+inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
+{
+    return a_size == b_size && 0 == memcmp(a, b, a_size);
+}
+
+template <typename Char>
+inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size)
+{
+    return memcmp(a, b, size);
+}
+
+template <typename Char>
+inline int memcmp16(const Char * a, const Char * b)
+{
+    return memcmp(a, b, 16);
+}
+
+inline bool memequal16(const void * a, const void * b)
+{
+    return 0 == memcmp(a, b, 16);
+}
+
+inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
+{
+    const char * pos = reinterpret_cast<const char *>(data);
+    const char * end = pos + size;
+
+    for (; pos < end; ++pos)
+        if (*pos)
+            return false;
+
+    return true;
+}
+
+#endif
diff --git a/dbms/src/Common/memcpySmall.h b/dbms/src/Common/memcpySmall.h
index 34050f3c57f..aaedfb81fe5 100644
--- a/dbms/src/Common/memcpySmall.h
+++ b/dbms/src/Common/memcpySmall.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <string.h>
-#include <Core/Defines.h>
 
 #ifdef __SSE2__
 #include <emmintrin.h>
diff --git a/dbms/src/Functions/EmptyImpl.h b/dbms/src/Functions/EmptyImpl.h
index abd406e0920..e1abd502988 100644
--- a/dbms/src/Functions/EmptyImpl.h
+++ b/dbms/src/Functions/EmptyImpl.h
@@ -1,5 +1,6 @@
 #pragma once
-#include <cstring>
+
+#include <Common/memcmpSmall.h>
 #include <Columns/ColumnString.h>
 #include <Functions/FunctionFactory.h>
 
@@ -38,11 +39,9 @@ struct EmptyImpl
 
     static void vector_fixed_to_vector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res)
     {
-        std::vector<char> empty_chars(n);
-        size_t size = data.size() / n;
-
-        for (size_t i = 0; i < size; ++i)
-            res[i] = negative ^ (0 == memcmp(&data[i * n], empty_chars.data(), n));
+        size_t size = data.size() / n;  /// Number of rows: every value occupies exactly n bytes.
+        for (size_t i = 0; i < size; ++i)  /// i is the row index; res must be indexed by row, not by byte offset.
+            res[i] = negative ^ memoryIsZeroSmallAllowOverflow15(data.data() + i * n, n);
     }
 
     static void array(const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)
diff --git a/dbms/src/Functions/FunctionsComparison.h b/dbms/src/Functions/FunctionsComparison.h
index 47fa401f926..13db04ed4ba 100644
--- a/dbms/src/Functions/FunctionsComparison.h
+++ b/dbms/src/Functions/FunctionsComparison.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <Common/memcmpSmall.h>
+
 #include <Columns/ColumnsNumber.h>
 #include <Columns/ColumnConst.h>
 #include <Columns/ColumnDecimal.h>
@@ -115,30 +117,6 @@ struct NumComparisonImpl
 };
 
 
-inline int memcmp16(const void * a, const void * b)
-{
-    /// Assuming little endian.
-
-    UInt64 a_hi = __builtin_bswap64(unalignedLoad<UInt64>(a));
-    UInt64 b_hi = __builtin_bswap64(unalignedLoad<UInt64>(b));
-
-    if (a_hi < b_hi)
-        return -1;
-    if (a_hi > b_hi)
-        return 1;
-
-    UInt64 a_lo = __builtin_bswap64(unalignedLoad<UInt64>(reinterpret_cast<const char *>(a) + 8));
-    UInt64 b_lo = __builtin_bswap64(unalignedLoad<UInt64>(reinterpret_cast<const char *>(b) + 8));
-
-    if (a_lo < b_lo)
-        return -1;
-    if (a_lo > b_lo)
-        return 1;
-
-    return 0;
-}
-
-
 template <typename Op>
 struct StringComparisonImpl
 {
@@ -148,27 +126,17 @@ struct StringComparisonImpl
         PaddedPODArray<UInt8> & c)
     {
         size_t size = a_offsets.size();
+        ColumnString::Offset prev_a_offset = 0;
+        ColumnString::Offset prev_b_offset = 0;
 
         for (size_t i = 0; i < size; ++i)
         {
-            /// Trailing zero byte of the smaller string is included in the comparison.
-            size_t a_size;
-            size_t b_size;
-            int res;
-            if (i == 0)
-            {
-                a_size = a_offsets[0];
-                b_size = b_offsets[0];
-                res = memcmp(a_data.data(), b_data.data(), std::min(a_size, b_size));
-            }
-            else
-            {
-                a_size = a_offsets[i] - a_offsets[i - 1];
-                b_size = b_offsets[i] - b_offsets[i - 1];
-                res = memcmp(&a_data[a_offsets[i - 1]], &b_data[b_offsets[i - 1]], std::min(a_size, b_size));
-            }
+            c[i] = Op::apply(memcmpSmallAllowOverflow15(
+                a_data.data() + prev_a_offset, a_offsets[i] - prev_a_offset - 1,
+                b_data.data() + prev_b_offset, b_offsets[i] - prev_b_offset - 1), 0);
 
-            c[i] = Op::apply(res, 0) || (res == 0 && Op::apply(a_size, b_size));
+            prev_a_offset = a_offsets[i];
+            prev_b_offset = b_offsets[i];
         }
     }
 
@@ -178,43 +146,33 @@ struct StringComparisonImpl
         PaddedPODArray<UInt8> & c)
     {
         size_t size = a_offsets.size();
+        ColumnString::Offset prev_a_offset = 0;
+
         for (size_t i = 0; i < size; ++i)
         {
-            if (i == 0)
-            {
-                int res = memcmp(a_data.data(), b_data.data(), std::min(a_offsets[0] - 1, b_n));
-                c[i] = Op::apply(res, 0) || (res == 0 && Op::apply(a_offsets[0], b_n + 1));
-            }
-            else
-            {
-                int res = memcmp(&a_data[a_offsets[i - 1]], &b_data[i * b_n],
-                    std::min(a_offsets[i] - a_offsets[i - 1] - 1, b_n));
-                c[i] = Op::apply(res, 0) || (res == 0 && Op::apply(a_offsets[i] - a_offsets[i - 1], b_n + 1));
-            }
+            c[i] = Op::apply(memcmpSmallAllowOverflow15(
+                a_data.data() + prev_a_offset, a_offsets[i] - prev_a_offset - 1,
+                b_data.data() + i * b_n, b_n), 0);
+
+            prev_a_offset = a_offsets[i];
         }
     }
 
     static void NO_INLINE string_vector_constant(
         const ColumnString::Chars & a_data, const ColumnString::Offsets & a_offsets,
-        const std::string & b,
+        const ColumnString::Chars & b_data, ColumnString::Offset b_size,
         PaddedPODArray<UInt8> & c)
     {
         size_t size = a_offsets.size();
-        ColumnString::Offset b_size = b.size() + 1;
-        const UInt8 * b_data = reinterpret_cast<const UInt8 *>(b.data());
+        ColumnString::Offset prev_a_offset = 0;
+
         for (size_t i = 0; i < size; ++i)
         {
-            /// Trailing zero byte of the smaller string is included in the comparison.
-            if (i == 0)
-            {
-                int res = memcmp(a_data.data(), b_data, std::min(a_offsets[0], b_size));
-                c[i] = Op::apply(res, 0) || (res == 0 && Op::apply(a_offsets[0], b_size));
-            }
-            else
-            {
-                int res = memcmp(&a_data[a_offsets[i - 1]], b_data, std::min(a_offsets[i] - a_offsets[i - 1], b_size));
-                c[i] = Op::apply(res, 0) || (res == 0 && Op::apply(a_offsets[i] - a_offsets[i - 1], b_size));
-            }
+            c[i] = Op::apply(memcmpSmallAllowOverflow15(
+                a_data.data() + prev_a_offset, a_offsets[i] - prev_a_offset - 1,
+                b_data.data(), b_size), 0);
+
+            prev_a_offset = a_offsets[i];
         }
     }
 
@@ -239,13 +197,13 @@ struct StringComparisonImpl
 
     static void NO_INLINE fixed_string_vector_constant_16(
         const ColumnString::Chars & a_data,
-        const std::string & b,
+        const ColumnString::Chars & b_data,
         PaddedPODArray<UInt8> & c)
     {
         size_t size = a_data.size();
 
         for (size_t i = 0, j = 0; i < size; i += 16, ++j)
-            c[j] = Op::apply(memcmp16(&a_data[i], b.data()), 0);
+            c[j] = Op::apply(memcmp16(&a_data[i], &b_data[0]), 0);
     }
 
     static void NO_INLINE fixed_string_vector_fixed_string_vector(
@@ -253,74 +211,73 @@ struct StringComparisonImpl
         const ColumnString::Chars & b_data, ColumnString::Offset b_n,
         PaddedPODArray<UInt8> & c)
     {
-        /** Specialization if both sizes are 16.
-          * To more efficient comparison of IPv6 addresses stored in FixedString(16).
-          */
         if (a_n == 16 && b_n == 16)
         {
+            /** Specialization if both sizes are 16.
+              * To more efficient comparison of IPv6 addresses stored in FixedString(16).
+              */
             fixed_string_vector_fixed_string_vector_16(a_data, b_data, c);
         }
+        else if (a_n == b_n)
+        {
+            size_t size = a_data.size();
+            for (size_t i = 0, j = 0; i < size; i += a_n, ++j)
+                c[j] = Op::apply(memcmpSmallAllowOverflow15(a_data.data() + i, b_data.data() + i, a_n), 0);
+        }
         else
         {
-            /// Generic implementation, less efficient.
-            size_t size = a_data.size();
+            size_t size = a_data.size() / a_n;
 
-            for (size_t i = 0, j = 0; i < size; i += a_n, ++j)
-            {
-                int res = memcmp(&a_data[i], &b_data[i], std::min(a_n, b_n));
-                c[j] = Op::apply(res, 0) || (res == 0 && Op::apply(a_n, b_n));
-            }
+            for (size_t i = 0; i < size; ++i)
+                c[i] = Op::apply(memcmpSmallAllowOverflow15(a_data.data() + i * a_n, a_n, b_data.data() + i * b_n, b_n), 0);
         }
     }
 
     static void NO_INLINE fixed_string_vector_constant(
         const ColumnString::Chars & a_data, ColumnString::Offset a_n,
-        const std::string & b,
+        const ColumnString::Chars & b_data, ColumnString::Offset b_size,
         PaddedPODArray<UInt8> & c)
     {
-        ColumnString::Offset b_n = b.size();
-        if (a_n == 16 && b_n == 16)
+        if (a_n == 16 && b_size == 16)
         {
-            fixed_string_vector_constant_16(a_data, b, c);
+            fixed_string_vector_constant_16(a_data, b_data, c);
+        }
+        else if (a_n == b_size)
+        {
+            size_t size = a_data.size();
+            for (size_t i = 0, j = 0; i < size; i += a_n, ++j)
+                c[j] = Op::apply(memcmpSmallAllowOverflow15(a_data.data() + i, b_data.data(), a_n), 0);
         }
         else
         {
             size_t size = a_data.size();
-            const UInt8 * b_data = reinterpret_cast<const UInt8 *>(b.data());
             for (size_t i = 0, j = 0; i < size; i += a_n, ++j)
-            {
-                int res = memcmp(&a_data[i], b_data, std::min(a_n, b_n));
-                c[j] = Op::apply(res, 0) || (res == 0 && Op::apply(a_n, b_n));
-            }
+                c[j] = Op::apply(memcmpSmallAllowOverflow15(a_data.data() + i, a_n, b_data.data(), b_size), 0);
         }
     }
 
     static void constant_string_vector(
-        const std::string & a,
+        const ColumnString::Chars & a_data, ColumnString::Offset a_size,
         const ColumnString::Chars & b_data, const ColumnString::Offsets & b_offsets,
         PaddedPODArray<UInt8> & c)
     {
-        StringComparisonImpl<typename Op::SymmetricOp>::string_vector_constant(b_data, b_offsets, a, c);
+        StringComparisonImpl<typename Op::SymmetricOp>::string_vector_constant(b_data, b_offsets, a_data, a_size, c);
     }
 
     static void constant_fixed_string_vector(
-        const std::string & a,
+        const ColumnString::Chars & a_data, ColumnString::Offset a_size,
         const ColumnString::Chars & b_data, ColumnString::Offset b_n,
         PaddedPODArray<UInt8> & c)
     {
-        StringComparisonImpl<typename Op::SymmetricOp>::fixed_string_vector_constant(b_data, b_n, a, c);
+        StringComparisonImpl<typename Op::SymmetricOp>::fixed_string_vector_constant(b_data, b_n, a_data, a_size, c);
     }
 
     static void constant_constant(
-        const std::string & a,
-        const std::string & b,
+        const ColumnString::Chars & a_data, ColumnString::Offset a_size,
+        const ColumnString::Chars & b_data, ColumnString::Offset b_size,
         UInt8 & c)
     {
-        size_t a_n = a.size();
-        size_t b_n = b.size();
-
-        int res = memcmp(a.data(), b.data(), std::min(a_n, b_n));
-        c = Op::apply(res, 0) || (res == 0 && Op::apply(a_n, b_n));
+        c = Op::apply(memcmpSmallAllowOverflow15(a_data.data(), a_size, b_data.data(), b_size), 0);
     }
 };
 
@@ -335,11 +292,21 @@ struct StringEqualsImpl
         PaddedPODArray<UInt8> & c)
     {
         size_t size = a_offsets.size();
+        ColumnString::Offset prev_a_offset = 0;
+        ColumnString::Offset prev_b_offset = 0;
+
         for (size_t i = 0; i < size; ++i)
-            c[i] = positive == ((i == 0)
-                ? (a_offsets[0] == b_offsets[0] && !memcmp(a_data.data(), b_data.data(), a_offsets[0] - 1))
-                : (a_offsets[i] - a_offsets[i - 1] == b_offsets[i] - b_offsets[i - 1]
-                    && !memcmp(&a_data[a_offsets[i - 1]], &b_data[b_offsets[i - 1]], a_offsets[i] - a_offsets[i - 1] - 1)));
+        {
+            auto a_size = a_offsets[i] - prev_a_offset - 1;
+            auto b_size = b_offsets[i] - prev_b_offset - 1;
+
+            c[i] = positive == memequalSmallAllowOverflow15(
+                a_data.data() + prev_a_offset, a_size,
+                b_data.data() + prev_b_offset, b_size);
+
+            prev_a_offset = a_offsets[i];
+            prev_b_offset = b_offsets[i];
+        }
     }
 
     static void NO_INLINE string_vector_fixed_string_vector(
@@ -348,76 +315,65 @@ struct StringEqualsImpl
         PaddedPODArray<UInt8> & c)
     {
         size_t size = a_offsets.size();
+        ColumnString::Offset prev_a_offset = 0;
+
         for (size_t i = 0; i < size; ++i)
-            c[i] = positive == ((i == 0)
-                ? (a_offsets[0] == b_n + 1 && !memcmp(a_data.data(), b_data.data(), b_n))
-                : (a_offsets[i] - a_offsets[i - 1] == b_n + 1
-                    && !memcmp(&a_data[a_offsets[i - 1]], &b_data[b_n * i], b_n)));
+        {
+            auto a_size = a_offsets[i] - prev_a_offset - 1;
+
+            c[i] = positive == memequalSmallAllowOverflow15(
+                a_data.data() + prev_a_offset, a_size,
+                b_data.data() + b_n * i, b_n);
+
+            prev_a_offset = a_offsets[i];
+        }
     }
 
     static void NO_INLINE string_vector_constant(
         const ColumnString::Chars & a_data, const ColumnString::Offsets & a_offsets,
-        const std::string & b,
+        const ColumnString::Chars & b_data, ColumnString::Offset b_size,
         PaddedPODArray<UInt8> & c)
     {
         size_t size = a_offsets.size();
-        ColumnString::Offset b_n = b.size();
-        const UInt8 * b_data = reinterpret_cast<const UInt8 *>(b.data());
+        ColumnString::Offset prev_a_offset = 0;
+
         for (size_t i = 0; i < size; ++i)
-            c[i] = positive == ((i == 0)
-                ? (a_offsets[0] == b_n + 1 && !memcmp(a_data.data(), b_data, b_n))
-                : (a_offsets[i] - a_offsets[i - 1] == b_n + 1
-                    && !memcmp(&a_data[a_offsets[i - 1]], b_data, b_n)));
+        {
+            auto a_size = a_offsets[i] - prev_a_offset - 1;
+
+            c[i] = positive == memequalSmallAllowOverflow15(
+                a_data.data() + prev_a_offset, a_size,
+                b_data.data(), b_size);
+
+            prev_a_offset = a_offsets[i];
+        }
     }
 
-#ifdef __SSE2__
     static void NO_INLINE fixed_string_vector_fixed_string_vector_16(
         const ColumnString::Chars & a_data,
         const ColumnString::Chars & b_data,
         PaddedPODArray<UInt8> & c)
     {
-        size_t size = c.size();
+        size_t size = a_data.size() / 16;
 
-        const __m128i * a_pos = reinterpret_cast<const __m128i *>(a_data.data());
-        const __m128i * b_pos = reinterpret_cast<const __m128i *>(b_data.data());
-        UInt8 * c_pos = c.data();
-        UInt8 * c_end = c_pos + size;
-
-        while (c_pos < c_end)
-        {
-            *c_pos = positive == (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(
-                _mm_loadu_si128(a_pos),
-                _mm_loadu_si128(b_pos))));
-
-            ++a_pos;
-            ++b_pos;
-            ++c_pos;
-        }
+        for (size_t i = 0; i < size; ++i)
+            c[i] = positive == memequal16(
+                a_data.data() + i * 16,
+                b_data.data() + i * 16);
     }
 
     static void NO_INLINE fixed_string_vector_constant_16(
         const ColumnString::Chars & a_data,
-        const std::string & b,
+        const ColumnString::Chars & b_data,
         PaddedPODArray<UInt8> & c)
     {
-        size_t size = c.size();
+        size_t size = a_data.size() / 16;
 
-        const __m128i * a_pos = reinterpret_cast<const __m128i *>(a_data.data());
-        const __m128i b_value = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b.data()));
-        UInt8 * c_pos = c.data();
-        UInt8 * c_end = c_pos + size;
-
-        while (c_pos < c_end)
-        {
-            *c_pos = positive == (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(
-                _mm_loadu_si128(a_pos),
-                b_value)));
-
-            ++a_pos;
-            ++c_pos;
-        }
+        for (size_t i = 0; i < size; ++i)
+            c[i] = positive == memequal16(
+                a_data.data() + i * 16,
+                b_data.data());
     }
-#endif
 
     static void NO_INLINE fixed_string_vector_fixed_string_vector(
         const ColumnString::Chars & a_data, ColumnString::Offset a_n,
@@ -427,38 +383,32 @@ struct StringEqualsImpl
         /** Specialization if both sizes are 16.
           * To more efficient comparison of IPv6 addresses stored in FixedString(16).
           */
-#ifdef __SSE2__
         if (a_n == 16 && b_n == 16)
         {
             fixed_string_vector_fixed_string_vector_16(a_data, b_data, c);
         }
         else
-#endif
         {
-            size_t size = a_data.size();
-            for (size_t i = 0, j = 0; i < size; i += a_n, ++j)
-                c[j] = positive == (a_n == b_n && !memcmp(&a_data[i], &b_data[i], a_n));
+            size_t size = a_data.size() / a_n;
+            for (size_t i = 0; i < size; ++i)
+                c[i] = positive == memequalSmallAllowOverflow15(a_data.data() + i * a_n, a_n, b_data.data() + i * b_n, b_n);
         }
     }
 
     static void NO_INLINE fixed_string_vector_constant(
         const ColumnString::Chars & a_data, ColumnString::Offset a_n,
-        const std::string & b,
+        const ColumnString::Chars & b_data, ColumnString::Offset b_size,
         PaddedPODArray<UInt8> & c)
     {
-        ColumnString::Offset b_n = b.size();
-#ifdef __SSE2__
-        if (a_n == 16 && b_n == 16)
+        if (a_n == 16 && b_size == 16)
         {
-            fixed_string_vector_constant_16(a_data, b, c);
+            fixed_string_vector_constant_16(a_data, b_data, c);
         }
         else
-#endif
         {
-            size_t size = a_data.size();
-            const UInt8 * b_data = reinterpret_cast<const UInt8 *>(b.data());
-            for (size_t i = 0, j = 0; i < size; i += a_n, ++j)
-                c[j] = positive == (a_n == b_n && !memcmp(&a_data[i], b_data, a_n));
+            size_t size = a_data.size() / a_n;
+            for (size_t i = 0; i < size; ++i)
+                c[i] = positive == memequalSmallAllowOverflow15(a_data.data() + i * a_n, a_n, b_data.data(), b_size);
         }
     }
 
@@ -471,27 +421,27 @@ struct StringEqualsImpl
     }
 
     static void constant_string_vector(
-        const std::string & a,
+        const ColumnString::Chars & a_data, ColumnString::Offset a_size,
         const ColumnString::Chars & b_data, const ColumnString::Offsets & b_offsets,
         PaddedPODArray<UInt8> & c)
     {
-        string_vector_constant(b_data, b_offsets, a, c);
+        string_vector_constant(b_data, b_offsets, a_data, a_size, c);
     }
 
     static void constant_fixed_string_vector(
-        const std::string & a,
+        const ColumnString::Chars & a_data, ColumnString::Offset a_size,
         const ColumnString::Chars & b_data, ColumnString::Offset b_n,
         PaddedPODArray<UInt8> & c)
     {
-        fixed_string_vector_constant(b_data, b_n, a, c);
+        fixed_string_vector_constant(b_data, b_n, a_data, a_size, c);
     }
 
     static void constant_constant(
-        const std::string & a,
-        const std::string & b,
+        const ColumnString::Chars & a_data, ColumnString::Offset a_size,
+        const ColumnString::Chars & b_data, ColumnString::Offset b_size,
         UInt8 & c)
     {
-        c = positive == (a == b);
+        c = positive == memequalSmallAllowOverflow15(a_data.data(), a_size, b_data.data(), b_size);
     }
 };
 
@@ -744,18 +694,62 @@ private:
         const ColumnString * c1_string = checkAndGetColumn<ColumnString>(c1);
         const ColumnFixedString * c0_fixed_string = checkAndGetColumn<ColumnFixedString>(c0);
         const ColumnFixedString * c1_fixed_string = checkAndGetColumn<ColumnFixedString>(c1);
+
         const ColumnConst * c0_const = checkAndGetColumnConstStringOrFixedString(c0);
         const ColumnConst * c1_const = checkAndGetColumnConstStringOrFixedString(c1);
 
         if (!((c0_string || c0_fixed_string || c0_const) && (c1_string || c1_fixed_string || c1_const)))
             return false;
 
+        const ColumnString::Chars * c0_const_chars = nullptr;
+        const ColumnString::Chars * c1_const_chars = nullptr;
+        ColumnString::Offset c0_const_size = 0;
+        ColumnString::Offset c1_const_size = 0;
+
+        if (c0_const)
+        {
+            const ColumnString * c0_const_string = checkAndGetColumn<ColumnString>(&c0_const->getDataColumn());
+            const ColumnFixedString * c0_const_fixed_string = checkAndGetColumn<ColumnFixedString>(&c0_const->getDataColumn());
+
+            if (c0_const_string)
+            {
+                c0_const_chars = &c0_const_string->getChars();
+                c0_const_size = c0_const_string->getDataAt(0).size;
+            }
+            else if (c0_const_fixed_string)
+            {
+                c0_const_chars = &c0_const_fixed_string->getChars();
+                c0_const_size = c0_const_fixed_string->getN();
+            }
+            else
+                throw Exception("Logical error: ColumnConst contains not String nor FixedString column", ErrorCodes::ILLEGAL_COLUMN);
+        }
+
+        if (c1_const)
+        {
+            const ColumnString * c1_const_string = checkAndGetColumn<ColumnString>(&c1_const->getDataColumn());
+            const ColumnFixedString * c1_const_fixed_string = checkAndGetColumn<ColumnFixedString>(&c1_const->getDataColumn());
+
+            if (c1_const_string)
+            {
+                c1_const_chars = &c1_const_string->getChars();
+                c1_const_size = c1_const_string->getDataAt(0).size;
+            }
+            else if (c1_const_fixed_string)
+            {
+                c1_const_chars = &c1_const_fixed_string->getChars();
+                c1_const_size = c1_const_fixed_string->getN();
+            }
+            else
+                throw Exception("Logical error: ColumnConst contains not String nor FixedString column", ErrorCodes::ILLEGAL_COLUMN);
+        }
+
         using StringImpl = StringComparisonImpl<Op<int, int>>;
 
         if (c0_const && c1_const)
         {
             UInt8 res = 0;
-            StringImpl::constant_constant(c0_const->getValue<String>(), c1_const->getValue<String>(), res);
+            StringImpl::constant_constant(*c0_const_chars, c0_const_size, *c1_const_chars, c1_const_size, res);
             block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(c0_const->size(), toField(res));
             return true;
         }
@@ -778,7 +772,7 @@ private:
             else if (c0_string && c1_const)
                 StringImpl::string_vector_constant(
                     c0_string->getChars(), c0_string->getOffsets(),
-                    c1_const->getValue<String>(),
+                    *c1_const_chars, c1_const_size,
                     c_res->getData());
             else if (c0_fixed_string && c1_string)
                 StringImpl::fixed_string_vector_string_vector(
@@ -793,16 +787,16 @@ private:
             else if (c0_fixed_string && c1_const)
                 StringImpl::fixed_string_vector_constant(
                     c0_fixed_string->getChars(), c0_fixed_string->getN(),
-                    c1_const->getValue<String>(),
+                    *c1_const_chars, c1_const_size,
                     c_res->getData());
             else if (c0_const && c1_string)
                 StringImpl::constant_string_vector(
-                    c0_const->getValue<String>(),
+                    *c0_const_chars, c0_const_size,
                     c1_string->getChars(), c1_string->getOffsets(),
                     c_res->getData());
             else if (c0_const && c1_fixed_string)
                 StringImpl::constant_fixed_string_vector(
-                    c0_const->getValue<String>(),
+                    *c0_const_chars, c0_const_size,
                     c1_fixed_string->getChars(), c1_fixed_string->getN(),
                     c_res->getData());
             else
diff --git a/dbms/src/Functions/arrayIndex.h b/dbms/src/Functions/arrayIndex.h
index fdb5dcc8109..027a7aede63 100644
--- a/dbms/src/Functions/arrayIndex.h
+++ b/dbms/src/Functions/arrayIndex.h
@@ -6,9 +6,11 @@
 #include <DataTypes/DataTypesNumber.h>
 #include <Columns/ColumnArray.h>
 #include <Columns/ColumnString.h>
+#include <Columns/ColumnFixedString.h>
 #include <Columns/ColumnsNumber.h>
 #include <Columns/ColumnNullable.h>
 #include <Common/FieldVisitors.h>
+#include <Common/memcmpSmall.h>
 
 
 namespace DB
@@ -272,8 +274,7 @@ struct ArrayIndexNumNullImpl
     }
 };
 
-/// Implementation for arrays of strings when the 2nd function argument
-/// is a NULL value.
+/// Implementation for arrays of strings when the 2nd function argument is a NULL value.
 template <typename IndexConv>
 struct ArrayIndexStringNullImpl
 {
@@ -311,12 +312,11 @@ struct ArrayIndexStringImpl
 {
     static void vector_const(
         const ColumnString::Chars & data, const ColumnArray::Offsets & offsets, const ColumnString::Offsets & string_offsets,
-        const String & value,
+        const ColumnString::Chars & value, ColumnString::Offset value_size,
         PaddedPODArray<typename IndexConv::ResultType> & result,
         const PaddedPODArray<UInt8> * null_map_data)
     {
         const auto size = offsets.size();
-        const auto value_size = value.size();
         result.resize(size);
 
         ColumnArray::Offset current_offset = 0;
@@ -331,12 +331,12 @@ struct ArrayIndexStringImpl
                     ? 0
                     : string_offsets[current_offset + j - 1];
 
-                ColumnArray::Offset string_size = string_offsets[current_offset + j] - string_pos;
+                ColumnArray::Offset string_size = string_offsets[current_offset + j] - string_pos - 1;
 
                 if (null_map_data && (*null_map_data)[current_offset + j])
                 {
                 }
-                else if (string_size == value_size + 1 && 0 == memcmp(value.data(), &data[string_pos], value_size))
+                else if (memequalSmallAllowOverflow15(value.data(), value_size, &data[string_pos], string_size))
                 {
                     if (!IndexConv::apply(j, current))
                         break;
@@ -381,7 +381,7 @@ struct ArrayIndexStringImpl
                     if (null_map_item && (*null_map_item)[i])
                         hit = true;
                 }
-                else if (string_size == value_size && 0 == memcmp(&item_values[value_pos], &data[string_pos], value_size))
+                else if (memequalSmallAllowOverflow15(&item_values[value_pos], value_size, &data[string_pos], string_size))
                     hit = true;
 
                 if (hit)
@@ -708,16 +708,32 @@ private:
         const auto item_arg = block.getByPosition(arguments[1]).column.get();
 
         if (item_arg->onlyNull())
+        {
             ArrayIndexStringNullImpl<IndexConv>::vector_const(col_nested->getChars(), col_array->getOffsets(),
                 col_nested->getOffsets(), col_res->getData(), null_map_data);
+        }
         else if (const auto item_arg_const = checkAndGetColumnConstStringOrFixedString(item_arg))
-            ArrayIndexStringImpl<IndexConv>::vector_const(col_nested->getChars(), col_array->getOffsets(),
-                col_nested->getOffsets(), item_arg_const->getValue<String>(), col_res->getData(),
-                null_map_data);
+        {   /// The constant is a ColumnConst wrapper: inspect its nested data column, the wrapper itself is never String/FixedString.
+            const ColumnString * item_const_string = checkAndGetColumn<ColumnString>(&item_arg_const->getDataColumn());
+            const ColumnFixedString * item_const_fixedstring = checkAndGetColumn<ColumnFixedString>(&item_arg_const->getDataColumn());
+
+            if (item_const_string)
+                ArrayIndexStringImpl<IndexConv>::vector_const(col_nested->getChars(), col_array->getOffsets(), col_nested->getOffsets(),
+                    item_const_string->getChars(), item_const_string->getDataAt(0).size,
+                    col_res->getData(), null_map_data);
+            else if (item_const_fixedstring)
+                ArrayIndexStringImpl<IndexConv>::vector_const(col_nested->getChars(), col_array->getOffsets(), col_nested->getOffsets(),
+                    item_const_fixedstring->getChars(), item_const_fixedstring->getN(),
+                    col_res->getData(), null_map_data);
+            else
+                throw Exception("Logical error: ColumnConst contains not String nor FixedString column", ErrorCodes::ILLEGAL_COLUMN);
+        }
         else if (const auto item_arg_vector = checkAndGetColumn<ColumnString>(item_arg))
+        {
             ArrayIndexStringImpl<IndexConv>::vector_vector(col_nested->getChars(), col_array->getOffsets(),
                 col_nested->getOffsets(), item_arg_vector->getChars(), item_arg_vector->getOffsets(),
                 col_res->getData(), null_map_data, null_map_item);
+        }
         else
             return false;
 

From efe3d53894edf119f02bf1cdb49a4750c54c54de Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 00:05:55 +0300
Subject: [PATCH 25/69] Fixed build with gcc-8

---
 dbms/src/Common/memcmpSmall.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/src/Common/memcmpSmall.h b/dbms/src/Common/memcmpSmall.h
index 8c2c8420412..c37d361342d 100644
--- a/dbms/src/Common/memcmpSmall.h
+++ b/dbms/src/Common/memcmpSmall.h
@@ -166,7 +166,7 @@ inline bool memequal16(const void * a, const void * b)
 /** Compare memory region to zero */
 inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
 {
-    const __m128 zero16 = _mm_setzero_ps();
+    __m128 zero16 = _mm_setzero_ps();
 
     for (size_t offset = 0; offset < size; offset += 16)
     {

From 084586a9587d7a50d3912ddcc8312fdb85801ed2 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 00:07:38 +0300
Subject: [PATCH 26/69] Fixed build with gcc-8

---
 dbms/src/Common/memcmpSmall.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/src/Common/memcmpSmall.h b/dbms/src/Common/memcmpSmall.h
index c37d361342d..f0e06cda63f 100644
--- a/dbms/src/Common/memcmpSmall.h
+++ b/dbms/src/Common/memcmpSmall.h
@@ -166,7 +166,7 @@ inline bool memequal16(const void * a, const void * b)
 /** Compare memory region to zero */
 inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
 {
-    __m128 zero16 = _mm_setzero_ps();
+    const __m128 zero16 = _mm_setzero_si128();
 
     for (size_t offset = 0; offset < size; offset += 16)
     {

From ee9cfae7f95678847639433988e44cc8aefc53b4 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 00:07:52 +0300
Subject: [PATCH 27/69] Fixed build with gcc-8

---
 dbms/src/Common/memcmpSmall.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/src/Common/memcmpSmall.h b/dbms/src/Common/memcmpSmall.h
index f0e06cda63f..8de0f1ebb0e 100644
--- a/dbms/src/Common/memcmpSmall.h
+++ b/dbms/src/Common/memcmpSmall.h
@@ -166,7 +166,7 @@ inline bool memequal16(const void * a, const void * b)
 /** Compare memory region to zero */
 inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
 {
-    const __m128 zero16 = _mm_setzero_si128();
+    const __m128i zero16 = _mm_setzero_si128();
 
     for (size_t offset = 0; offset < size; offset += 16)
     {

From 82cb9f51b0b7e0b0aabced70d51a77a8a9ca7ef6 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 01:46:46 +0300
Subject: [PATCH 28/69] Fixed error

---
 dbms/src/Common/memcmpSmall.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/dbms/src/Common/memcmpSmall.h b/dbms/src/Common/memcmpSmall.h
index 8de0f1ebb0e..ed20429c825 100644
--- a/dbms/src/Common/memcmpSmall.h
+++ b/dbms/src/Common/memcmpSmall.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <string.h>
+#include <cstdint>
 #include <algorithm>
 
 #ifdef __SSE2__
@@ -37,7 +37,7 @@ inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char
 
     for (size_t offset = 0; offset < size_to_compare_sse; offset += 16)
     {
-        auto mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
 
@@ -66,7 +66,7 @@ inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t siz
 
     for (size_t offset = 0; offset < size_to_compare_sse; offset += 16)
     {
-        auto mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
 
@@ -97,7 +97,7 @@ inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Ch
 
     for (size_t offset = 0; offset < size_to_compare_sse; offset += 16)
     {
-        auto mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
 
@@ -119,7 +119,7 @@ inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size)
 {
     for (size_t offset = 0; offset < size; offset += 16)
     {
-        auto mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
 
@@ -139,7 +139,7 @@ inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size)
 template <typename Char>
 inline int memcmp16(const Char * a, const Char * b)
 {
-    auto mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+    uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
         _mm_loadu_si128(reinterpret_cast<const __m128i *>(a)),
         _mm_loadu_si128(reinterpret_cast<const __m128i *>(b))));
 
@@ -170,7 +170,7 @@ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
 
     for (size_t offset = 0; offset < size; offset += 16)
     {
-        auto mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(zero16,
+        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(zero16,
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(reinterpret_cast<const char *>(data) + offset))));
 
         if (mask)
@@ -186,6 +186,8 @@ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
 
 #else
 
+#include <cstring>
+
 template <typename Char>
 inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
 {

From f0185785956ea1002b2d9a825f0be33ec9bf6674 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 02:12:50 +0300
Subject: [PATCH 29/69] Better code #4564

---
 dbms/src/Common/memcmpSmall.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/dbms/src/Common/memcmpSmall.h b/dbms/src/Common/memcmpSmall.h
index ed20429c825..104ed6d975b 100644
--- a/dbms/src/Common/memcmpSmall.h
+++ b/dbms/src/Common/memcmpSmall.h
@@ -33,9 +33,8 @@ template <typename Char>
 inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
 {
     size_t min_size = std::min(a_size, b_size);
-    size_t size_to_compare_sse = (min_size + 15) / 16 * 16;
 
-    for (size_t offset = 0; offset < size_to_compare_sse; offset += 16)
+    for (size_t offset = 0; offset < min_size; offset += 16)
     {
         uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
@@ -62,9 +61,7 @@ inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char
 template <typename Char>
 inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t size)
 {
-    size_t size_to_compare_sse = (size + 15) / 16 * 16;
-
-    for (size_t offset = 0; offset < size_to_compare_sse; offset += 16)
+    for (size_t offset = 0; offset < size; offset += 16)
     {
         uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
@@ -93,9 +90,7 @@ inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Ch
     if (a_size != b_size)
         return false;
 
-    size_t size_to_compare_sse = (a_size + 15) / 16 * 16;
-
-    for (size_t offset = 0; offset < size_to_compare_sse; offset += 16)
+    for (size_t offset = 0; offset < a_size; offset += 16)
     {
         uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),

From 2612bd892dfec796f62205017fab9ca7cafbe153 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 02:16:38 +0300
Subject: [PATCH 30/69] Additions #4564

---
 dbms/src/Common/memcmpSmall.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/dbms/src/Common/memcmpSmall.h b/dbms/src/Common/memcmpSmall.h
index 104ed6d975b..89e547ffbd4 100644
--- a/dbms/src/Common/memcmpSmall.h
+++ b/dbms/src/Common/memcmpSmall.h
@@ -195,12 +195,29 @@ inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t siz
     return memcmp(a, b, size);
 }
 
+template <typename Char>
+inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
+{
+    return a_size == b_size && 0 == memcmp(a, b, a_size);
+}
+
 template <typename Char>
 inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size)
 {
     return memcmp(a, b, size);
 }
 
+template <typename Char>
+inline int memcmp16(const Char * a, const Char * b)
+{
+    return memcmp(a, b, 16);
+}
+
+inline bool memequal16(const void * a, const void * b)
+{
+    return 0 == memcmp(a, b, 16);
+}
+
 inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
 {
     const char * pos = reinterpret_cast<const char *>(data);

From f0801f37fe50b25a2e2bc91b8a031fbc098cbf52 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 02:44:41 +0300
Subject: [PATCH 31/69] Added a test

---
 .../00912_string_comparison.reference         | 369 ++++++++++++++++++
 .../0_stateless/00912_string_comparison.sql   |  18 +
 2 files changed, 387 insertions(+)
 create mode 100644 dbms/tests/queries/0_stateless/00912_string_comparison.reference
 create mode 100644 dbms/tests/queries/0_stateless/00912_string_comparison.sql

diff --git a/dbms/tests/queries/0_stateless/00912_string_comparison.reference b/dbms/tests/queries/0_stateless/00912_string_comparison.reference
new file mode 100644
index 00000000000..5d6a0c2f3d0
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00912_string_comparison.reference
@@ -0,0 +1,369 @@
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+1	0	0	1	1
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	1	0	1	0
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+1	0	0	1	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	0	1	0	1
+0	1	0	1	0
+0	0	1	0	1
+0	1	0	1	0
+0	0	1	0	1
+0	1	0	1	0
+0	0	1	0	1
+0	1	0	1	0
+0	0	1	0	1
+1
diff --git a/dbms/tests/queries/0_stateless/00912_string_comparison.sql b/dbms/tests/queries/0_stateless/00912_string_comparison.sql
new file mode 100644
index 00000000000..857da41098e
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00912_string_comparison.sql
@@ -0,0 +1,18 @@
+WITH substring('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 1, number) AS prefix, prefix || 'x' AS a, prefix || 'y' AS b SELECT a = b, a < b, a > b, a <= b, a >= b FROM numbers(40);
+WITH substring('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 1, number) AS prefix, prefix || 'y' AS a, prefix || 'x' AS b SELECT a = b, a < b, a > b, a <= b, a >= b FROM numbers(40);
+WITH substring('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 1, number) AS prefix, prefix || 'x' AS a, prefix || 'x' AS b SELECT a = b, a < b, a > b, a <= b, a >= b FROM numbers(40);
+
+WITH substring('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 1, number) AS prefix, prefix || 'x' || prefix AS a, prefix || 'y' || prefix AS b SELECT a = b, a < b, a > b, a <= b, a >= b FROM numbers(40);
+WITH substring('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 1, number) AS prefix, prefix || 'y' || prefix AS a, prefix || 'x' || prefix AS b SELECT a = b, a < b, a > b, a <= b, a >= b FROM numbers(40);
+WITH substring('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 1, number) AS prefix, prefix || 'x' || prefix AS a, prefix || 'x' || prefix AS b SELECT a = b, a < b, a > b, a <= b, a >= b FROM numbers(40);
+
+WITH substring('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 1, number) AS prefix, prefix || 'x' || prefix AS a, prefix || 'y' AS b SELECT a = b, a < b, a > b, a <= b, a >= b FROM numbers(40);
+WITH substring('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 1, number) AS prefix, prefix || 'y' || prefix AS a, prefix || 'x' AS b SELECT a = b, a < b, a > b, a <= b, a >= b FROM numbers(40);
+WITH substring('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 1, number) AS prefix, prefix || 'x' || prefix AS a, prefix || 'x' AS b SELECT a = b, a < b, a > b, a <= b, a >= b FROM numbers(40);
+
+WITH arrayJoin(['aaa', 'bbb']) AS a, 'aaa\0bbb' AS b SELECT a = b, a < b, a > b, a <= b, a >= b;
+WITH arrayJoin(['aaa', 'zzz']) AS a, 'aaa\0bbb' AS b SELECT a = b, a < b, a > b, a <= b, a >= b;
+WITH arrayJoin(['aaa', 'bbb']) AS a, materialize('aaa\0bbb') AS b SELECT a = b, a < b, a > b, a <= b, a >= b;
+WITH arrayJoin(['aaa', 'zzz']) AS a, materialize('aaa\0bbb') AS b SELECT a = b, a < b, a > b, a <= b, a >= b;
+
+SELECT empty(toFixedString('', randConstant() % 100));

From f33cbe90d3b65bba3b46bcfc482191c6eda582d3 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 02:47:15 +0300
Subject: [PATCH 32/69] Fixed test

---
 dbms/tests/queries/0_stateless/00912_string_comparison.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/tests/queries/0_stateless/00912_string_comparison.sql b/dbms/tests/queries/0_stateless/00912_string_comparison.sql
index 857da41098e..089ec4ab3b1 100644
--- a/dbms/tests/queries/0_stateless/00912_string_comparison.sql
+++ b/dbms/tests/queries/0_stateless/00912_string_comparison.sql
@@ -15,4 +15,4 @@ WITH arrayJoin(['aaa', 'zzz']) AS a, 'aaa\0bbb' AS b SELECT a = b, a < b, a > b,
 WITH arrayJoin(['aaa', 'bbb']) AS a, materialize('aaa\0bbb') AS b SELECT a = b, a < b, a > b, a <= b, a >= b;
 WITH arrayJoin(['aaa', 'zzz']) AS a, materialize('aaa\0bbb') AS b SELECT a = b, a < b, a > b, a <= b, a >= b;
 
-SELECT empty(toFixedString('', randConstant() % 100));
+SELECT empty(toFixedString('', 1 + randConstant() % 100));

From c3fabfb93f30328a0d29189ea441379f7cf5b158 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 03:03:20 +0300
Subject: [PATCH 33/69] Improvement #4564

---
 dbms/src/Columns/ColumnString.cpp | 4 ++--
 dbms/src/Columns/ColumnString.h   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp
index 5b16aaef75e..863f39f9a52 100644
--- a/dbms/src/Columns/ColumnString.cpp
+++ b/dbms/src/Columns/ColumnString.cpp
@@ -241,8 +241,8 @@ struct ColumnString::less
     bool operator()(size_t lhs, size_t rhs) const
     {
         int res = memcmpSmallAllowOverflow15(
-            parent.chars.data() + parent.offsetAt(lhs), parent.sizeAt(lhs),
-            parent.chars.data() + parent.offsetAt(rhs), parent.sizeAt(rhs));
+            parent.chars.data() + parent.offsetAt(lhs), parent.sizeAt(lhs) - 1,
+            parent.chars.data() + parent.offsetAt(rhs), parent.sizeAt(rhs) - 1);
 
         return positive ? (res < 0) : (res > 0);
     }
diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h
index 7f6da068f52..7117bab0d05 100644
--- a/dbms/src/Columns/ColumnString.h
+++ b/dbms/src/Columns/ColumnString.h
@@ -211,7 +211,7 @@ public:
     int compareAt(size_t n, size_t m, const IColumn & rhs_, int /*nan_direction_hint*/) const override
     {
         const ColumnString & rhs = static_cast<const ColumnString &>(rhs_);
-        return memcmpSmallAllowOverflow15(chars.data() + offsetAt(n), sizeAt(n), rhs.chars.data() + rhs.offsetAt(m), rhs.sizeAt(m));
+        return memcmpSmallAllowOverflow15(chars.data() + offsetAt(n), sizeAt(n) - 1, rhs.chars.data() + rhs.offsetAt(m), rhs.sizeAt(m) - 1);
     }
 
     /// Variant of compareAt for string comparison with respect of collation.

From a0b6b78c01e4b90d3d7de745846bd81fc43b04dc Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 03:20:08 +0300
Subject: [PATCH 34/69] Added performance test #4564

---
 .../performance/string_sort/string_sort.xml   | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 dbms/tests/performance/string_sort/string_sort.xml

diff --git a/dbms/tests/performance/string_sort/string_sort.xml b/dbms/tests/performance/string_sort/string_sort.xml
new file mode 100644
index 00000000000..d0dce1b210e
--- /dev/null
+++ b/dbms/tests/performance/string_sort/string_sort.xml
@@ -0,0 +1,55 @@
+<test>
+    <name>String sorting</name>
+
+    <preconditions>
+        <table_exists>hits_10m_single</table_exists>
+    </preconditions>
+
+    <type>loop</type>
+
+    <stop_conditions>
+        <all_of>
+            <iterations>5</iterations>
+            <min_time_not_changing_for_ms>10000</min_time_not_changing_for_ms>
+        </all_of>
+        <any_of>
+            <iterations>50</iterations>
+            <total_time_ms>60000</total_time_ms>
+        </any_of>
+    </stop_conditions>
+
+    <substitutions>
+       <substitution>
+           <name>str1</name>
+           <values>
+               <value>URL</value>
+               <value>Referer</value>
+               <value>Title</value>
+               <value>SearchPhrase</value>
+               <value>MobilePhoneModel</value>
+               <value>PageCharset</value>
+           </values>
+       </substitution>
+       <substitution>
+           <name>str2</name>
+           <values>
+               <value>URL</value>
+               <value>Referer</value>
+               <value>Title</value>
+               <value>SearchPhrase</value>
+               <value>MobilePhoneModel</value>
+               <value>PageCharset</value>
+           </values>
+       </substitution>
+    </substitutions>
+
+    <query><![CDATA[SELECT {str1} FROM hits_10m_single ORDER BY {str1} LIMIT 10]]></query>
+    <query><![CDATA[SELECT {str1} FROM hits_10m_single ORDER BY {str1} LIMIT 9000000, 10]]></query>
+
+    <query><![CDATA[SELECT {str1}, {str2} FROM hits_10m_single ORDER BY {str1}, {str2} LIMIT 10]]></query>
+    <query><![CDATA[SELECT {str1}, {str2} FROM hits_10m_single ORDER BY {str1}, {str2} LIMIT 9000000, 10]]></query>
+
+    <main_metric>
+        <min_time/>
+    </main_metric>
+</test>

From b29c24d3d97183bad332ce0ec9685fa32fe1a986 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 18:13:33 +0300
Subject: [PATCH 35/69] Fixed error #4564

---
 dbms/src/Functions/arrayIndex.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dbms/src/Functions/arrayIndex.h b/dbms/src/Functions/arrayIndex.h
index 027a7aede63..9b9a5bbb602 100644
--- a/dbms/src/Functions/arrayIndex.h
+++ b/dbms/src/Functions/arrayIndex.h
@@ -714,8 +714,8 @@ private:
         }
         else if (const auto item_arg_const = checkAndGetColumnConstStringOrFixedString(item_arg))
         {
-            const ColumnString * item_const_string = checkAndGetColumn<ColumnString>(item_arg_const);
-            const ColumnFixedString * item_const_fixedstring = checkAndGetColumn<ColumnFixedString>(item_arg_const);
+            const ColumnString * item_const_string = checkAndGetColumn<ColumnString>(&item_arg_const->getDataColumn());
+            const ColumnFixedString * item_const_fixedstring = checkAndGetColumn<ColumnFixedString>(&item_arg_const->getDataColumn());
 
             if (item_const_string)
                 ArrayIndexStringImpl<IndexConv>::vector_const(col_nested->getChars(), col_array->getOffsets(), col_nested->getOffsets(),

From 3033d329eb6d54a1cbab130dfdbe78a186768a9c Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 18:50:23 +0300
Subject: [PATCH 36/69] Fixed error #4564

---
 dbms/src/Common/memcmpSmall.h  | 18 ++++++++++++------
 dbms/src/Functions/EmptyImpl.h |  6 +++---
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/dbms/src/Common/memcmpSmall.h b/dbms/src/Common/memcmpSmall.h
index 89e547ffbd4..13269a13b29 100644
--- a/dbms/src/Common/memcmpSmall.h
+++ b/dbms/src/Common/memcmpSmall.h
@@ -36,9 +36,10 @@ inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char
 
     for (size_t offset = 0; offset < min_size; offset += 16)
     {
-        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+        uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
+        mask = ~mask;
 
         if (mask)
         {
@@ -63,9 +64,10 @@ inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t siz
 {
     for (size_t offset = 0; offset < size; offset += 16)
     {
-        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+        uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
+        mask = ~mask;
 
         if (mask)
         {
@@ -92,9 +94,10 @@ inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Ch
 
     for (size_t offset = 0; offset < a_size; offset += 16)
     {
-        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+        uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
+        mask = ~mask;
 
         if (mask)
         {
@@ -114,9 +117,10 @@ inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size)
 {
     for (size_t offset = 0; offset < size; offset += 16)
     {
-        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+        uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
+        mask = ~mask;
 
         if (mask)
         {
@@ -134,9 +138,10 @@ inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size)
 template <typename Char>
 inline int memcmp16(const Char * a, const Char * b)
 {
-    uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(
+    uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
         _mm_loadu_si128(reinterpret_cast<const __m128i *>(a)),
         _mm_loadu_si128(reinterpret_cast<const __m128i *>(b))));
+    mask = ~mask;
 
     if (mask)
     {
@@ -165,8 +170,9 @@ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
 
     for (size_t offset = 0; offset < size; offset += 16)
     {
-        uint16_t mask = ~_mm_movemask_epi8(_mm_cmpeq_epi8(zero16,
+        uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(zero16,
             _mm_loadu_si128(reinterpret_cast<const __m128i *>(reinterpret_cast<const char *>(data) + offset))));
+        mask = ~mask;
 
         if (mask)
         {
diff --git a/dbms/src/Functions/EmptyImpl.h b/dbms/src/Functions/EmptyImpl.h
index e1abd502988..3b4d8f294a7 100644
--- a/dbms/src/Functions/EmptyImpl.h
+++ b/dbms/src/Functions/EmptyImpl.h
@@ -39,9 +39,9 @@ struct EmptyImpl
 
     static void vector_fixed_to_vector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res)
     {
-        size_t size = data.size();
-        for (size_t i = 0; i < size; i += n)
-            res[i] = negative ^ memoryIsZeroSmallAllowOverflow15(data.data() + i, n);
+        size_t size = data.size() / n;
+        for (size_t i = 0; i < size; ++i)
+            res[i] = negative ^ memoryIsZeroSmallAllowOverflow15(data.data() + i * n, n);
     }
 
     static void array(const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)

From c3a2e73f323a5cb9b77ad44d7f5d80374d650013 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 18:53:51 +0300
Subject: [PATCH 37/69] Fixed build

---
 .../Interpreters/FindIdentifierBestTableVisitor.cpp  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dbms/src/Interpreters/FindIdentifierBestTableVisitor.cpp b/dbms/src/Interpreters/FindIdentifierBestTableVisitor.cpp
index 8c9f7403898..8173ce3256a 100644
--- a/dbms/src/Interpreters/FindIdentifierBestTableVisitor.cpp
+++ b/dbms/src/Interpreters/FindIdentifierBestTableVisitor.cpp
@@ -16,13 +16,13 @@ void FindIdentifierBestTableData::visit(ASTIdentifier & identifier, ASTPtr &)
 
     if (!identifier.compound())
     {
-        for (const auto & [table, names] : tables)
+        for (const auto & table_names : tables)
         {
-            if (std::find(names.begin(), names.end(), identifier.name) != names.end())
+            if (std::find(table_names.second.begin(), table_names.second.end(), identifier.name) != table_names.second.end())
             {
                 // TODO: make sure no collision ever happens
                 if (!best_table)
-                    best_table = &table;
+                    best_table = &table_names.first;
             }
         }
     }
@@ -30,13 +30,13 @@ void FindIdentifierBestTableData::visit(ASTIdentifier & identifier, ASTPtr &)
     {
         // FIXME: make a better matcher using `names`?
         size_t best_match = 0;
-        for (const auto & [table, names] : tables)
+        for (const auto & table_names : tables)
         {
-            if (size_t match = IdentifierSemantic::canReferColumnToTable(identifier, table))
+            if (size_t match = IdentifierSemantic::canReferColumnToTable(identifier, table_names.first))
                 if (match > best_match)
                 {
                     best_match = match;
-                    best_table = &table;
+                    best_table = &table_names.first;
                 }
         }
     }

From 05f187c122ef5a03d2f02d94add644086cf46667 Mon Sep 17 00:00:00 2001
From: alesapin <alesapin@gmail.com>
Date: Mon, 4 Mar 2019 19:36:52 +0300
Subject: [PATCH 38/69] Disable compile expressions by default

---
 dbms/src/Interpreters/Settings.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h
index 3be282721ff..16bceb90d76 100644
--- a/dbms/src/Interpreters/Settings.h
+++ b/dbms/src/Interpreters/Settings.h
@@ -75,7 +75,7 @@ struct Settings
     M(SettingFloat, totals_auto_threshold, 0.5, "The threshold for totals_mode = 'auto'.") \
     \
     M(SettingBool, compile, false, "Whether query compilation is enabled.") \
-    M(SettingBool, compile_expressions, true, "Compile some scalar functions and operators to native code.") \
+    M(SettingBool, compile_expressions, false, "Compile some scalar functions and operators to native code.") \
     M(SettingUInt64, min_count_to_compile, 3, "The number of structurally identical queries before they are compiled.") \
     M(SettingUInt64, min_count_to_compile_expression, 3, "The number of identical expressions before they are JIT-compiled") \
     M(SettingUInt64, group_by_two_level_threshold, 100000, "From what number of keys, a two-level aggregation starts. 0 - the threshold is not set.") \

From d2e18d47fbc088e0f96052ed5e1436575ef17f56 Mon Sep 17 00:00:00 2001
From: proller <proller@users.noreply.github.com>
Date: Mon, 4 Mar 2019 20:47:31 +0300
Subject: [PATCH 39/69] Fix link in split mode (#4574)

---
 contrib/arrow-cmake/CMakeLists.txt            |  2 +-
 dbms/CMakeLists.txt                           |  4 +-
 dbms/programs/CMakeLists.txt                  | 69 ++++++++++++++++++-
 dbms/programs/benchmark/CMakeLists.txt        | 16 ++---
 dbms/programs/client/CMakeLists.txt           | 14 ++--
 dbms/programs/compressor/CMakeLists.txt       | 14 ++--
 dbms/programs/config_tools.h.in               |  2 +-
 dbms/programs/copier/CMakeLists.txt           | 11 ++-
 .../extract-from-config/CMakeLists.txt        | 11 ++-
 dbms/programs/format/CMakeLists.txt           | 12 ++--
 dbms/programs/local/CMakeLists.txt            | 13 ++--
 dbms/programs/main.cpp                        |  4 +-
 dbms/programs/obfuscator/CMakeLists.txt       | 12 ++--
 dbms/programs/odbc-bridge/CMakeLists.txt      | 44 +++++++-----
 .../programs/odbc-bridge/tests/CMakeLists.txt |  3 +-
 dbms/programs/performance-test/CMakeLists.txt | 37 +++++-----
 dbms/programs/server/CMakeLists.txt           | 40 +++++------
 utils/compressor/CMakeLists.txt               |  6 --
 18 files changed, 176 insertions(+), 138 deletions(-)

diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt
index 2af5765c5d0..bec827354cd 100644
--- a/contrib/arrow-cmake/CMakeLists.txt
+++ b/contrib/arrow-cmake/CMakeLists.txt
@@ -198,7 +198,7 @@ list(APPEND PARQUET_SRCS
 add_library(${PARQUET_LIBRARY} ${LINK_MODE} ${PARQUET_SRCS})
 target_include_directories(${PARQUET_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src)
 include(${ClickHouse_SOURCE_DIR}/contrib/thrift/build/cmake/ConfigureChecks.cmake) # makes config.h
-target_link_libraries(${PARQUET_LIBRARY} PRIVATE ${ARROW_LIBRARY} ${THRIFT_LIBRARY} ${Boost_REGEX_LIBRARY})
+target_link_libraries(${PARQUET_LIBRARY} PUBLIC ${ARROW_LIBRARY} PRIVATE ${THRIFT_LIBRARY} ${Boost_REGEX_LIBRARY})
 target_include_directories(${PARQUET_LIBRARY} PRIVATE ${Boost_INCLUDE_DIRS})
 
 if(SANITIZE STREQUAL "undefined")
diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
index 9eca53c82b7..b9aab4aeb2a 100644
--- a/dbms/CMakeLists.txt
+++ b/dbms/CMakeLists.txt
@@ -137,12 +137,10 @@ endif ()
 add_subdirectory(src/Common/ZooKeeper)
 add_subdirectory(src/Common/Config)
 
-if (MAKE_STATIC_LIBRARIES)
+if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES)
     add_library(dbms ${dbms_headers} ${dbms_sources})
 else ()
     add_library(dbms SHARED ${dbms_headers} ${dbms_sources})
-    set_target_properties (dbms PROPERTIES SOVERSION ${VERSION_MAJOR}.${VERSION_MINOR} VERSION ${VERSION_SO} OUTPUT_NAME clickhouse)
-    install (TARGETS dbms LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT clickhouse)
 endif ()
 
 if (USE_EMBEDDED_COMPILER)
diff --git a/dbms/programs/CMakeLists.txt b/dbms/programs/CMakeLists.txt
index 44befd634f9..57067074527 100644
--- a/dbms/programs/CMakeLists.txt
+++ b/dbms/programs/CMakeLists.txt
@@ -7,7 +7,7 @@ option (ENABLE_CLICKHOUSE_SERVER "Enable clickhouse-server" ${ENABLE_CLICKHOUSE_
 option (ENABLE_CLICKHOUSE_CLIENT "Enable clickhouse-client" ${ENABLE_CLICKHOUSE_ALL})
 option (ENABLE_CLICKHOUSE_LOCAL "Enable clickhouse-local" ${ENABLE_CLICKHOUSE_ALL})
 option (ENABLE_CLICKHOUSE_BENCHMARK "Enable clickhouse-benchmark" ${ENABLE_CLICKHOUSE_ALL})
-option (ENABLE_CLICKHOUSE_PERFORMANCE "Enable clickhouse-performance-test" ${ENABLE_CLICKHOUSE_ALL})
+option (ENABLE_CLICKHOUSE_PERFORMANCE_TEST "Enable clickhouse-performance-test" ${ENABLE_CLICKHOUSE_ALL})
 option (ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG "Enable clickhouse-extract-from-config" ${ENABLE_CLICKHOUSE_ALL})
 option (ENABLE_CLICKHOUSE_COMPRESSOR "Enable clickhouse-compressor" ${ENABLE_CLICKHOUSE_ALL})
 option (ENABLE_CLICKHOUSE_COPIER "Enable clickhouse-copier" ${ENABLE_CLICKHOUSE_ALL})
@@ -15,8 +15,64 @@ option (ENABLE_CLICKHOUSE_FORMAT "Enable clickhouse-format" ${ENABLE_CLICKHOUSE_
 option (ENABLE_CLICKHOUSE_OBFUSCATOR "Enable clickhouse-obfuscator" ${ENABLE_CLICKHOUSE_ALL})
 option (ENABLE_CLICKHOUSE_ODBC_BRIDGE "Enable clickhouse-odbc-bridge" ${ENABLE_CLICKHOUSE_ALL})
 
+if(NOT (MAKE_STATIC_LIBRARIES OR SPLIT_SHARED_LIBRARIES))
+    set(CLICKHOUSE_ONE_SHARED 1)
+endif()
+
 configure_file (config_tools.h.in ${CMAKE_CURRENT_BINARY_DIR}/config_tools.h)
 
+
+
+macro(clickhouse_target_link_split_lib target name)
+    if(NOT CLICKHOUSE_ONE_SHARED)
+        target_link_libraries(${target} PRIVATE clickhouse-${name}-lib)
+    else()
+        target_link_libraries(${target} PRIVATE clickhouse-lib)
+    endif()
+endmacro()
+
+macro(clickhouse_program_link_split_binary name)
+    clickhouse_target_link_split_lib(clickhouse-${name} ${name})
+endmacro()
+
+macro(clickhouse_program_add_library name)
+    string(TOUPPER ${name} name_uc)
+    string(REPLACE "-" "_" name_uc ${name_uc})
+
+    # A macro has no scope of its own: PARENT_SCOPE copies these per-program lists into the caller's parent scope, where clickhouse-lib is assembled from them.
+    set(CLICKHOUSE_${name_uc}_SOURCES ${CLICKHOUSE_${name_uc}_SOURCES} PARENT_SCOPE)
+    set(CLICKHOUSE_${name_uc}_LINK ${CLICKHOUSE_${name_uc}_LINK} PARENT_SCOPE)
+    set(CLICKHOUSE_${name_uc}_INCLUDE ${CLICKHOUSE_${name_uc}_INCLUDE} PARENT_SCOPE)
+
+    if(NOT CLICKHOUSE_ONE_SHARED)
+        add_library(clickhouse-${name}-lib ${LINK_MODE} ${CLICKHOUSE_${name_uc}_SOURCES})
+
+        set(_link ${CLICKHOUSE_${name_uc}_LINK}) # can't use ${} in if()
+        if(_link)
+            target_link_libraries(clickhouse-${name}-lib ${CLICKHOUSE_${name_uc}_LINK})
+        endif()
+
+        set(_include ${CLICKHOUSE_${name_uc}_INCLUDE}) # can't use ${} in if()
+        if (_include)
+            target_include_directories(clickhouse-${name}-lib ${CLICKHOUSE_${name_uc}_INCLUDE})
+        endif()
+    endif()
+endmacro()
+
+macro(clickhouse_program_add_executable name)
+    if(CLICKHOUSE_SPLIT_BINARY)
+        add_executable(clickhouse-${name} clickhouse-${name}.cpp)
+        clickhouse_program_link_split_binary(${name})
+        install(TARGETS clickhouse-${name} ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
+    endif()
+endmacro()
+
+macro(clickhouse_program_add name)
+    clickhouse_program_add_library(${name})
+    clickhouse_program_add_executable(${name})
+endmacro()
+
+
 add_subdirectory (server)
 add_subdirectory (client)
 add_subdirectory (local)
@@ -33,6 +89,13 @@ if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
     add_subdirectory (odbc-bridge)
 endif ()
 
+if (CLICKHOUSE_ONE_SHARED)
+    add_library(clickhouse-lib SHARED ${CLICKHOUSE_SERVER_SOURCES} ${CLICKHOUSE_CLIENT_SOURCES} ${CLICKHOUSE_LOCAL_SOURCES} ${CLICKHOUSE_BENCHMARK_SOURCES} ${CLICKHOUSE_PERFORMANCE_TEST_SOURCES} ${CLICKHOUSE_COPIER_SOURCES} ${CLICKHOUSE_EXTRACT_FROM_CONFIG_SOURCES} ${CLICKHOUSE_COMPRESSOR_SOURCES} ${CLICKHOUSE_FORMAT_SOURCES} ${CLICKHOUSE_OBFUSCATOR_SOURCES} ${CLICKHOUSE_COMPILER_SOURCES} ${CLICKHOUSE_ODBC_BRIDGE_SOURCES})
+    target_link_libraries(clickhouse-lib PUBLIC ${CLICKHOUSE_SERVER_LINK} ${CLICKHOUSE_CLIENT_LINK} ${CLICKHOUSE_LOCAL_LINK} ${CLICKHOUSE_BENCHMARK_LINK} ${CLICKHOUSE_PERFORMANCE_TEST_LINK} ${CLICKHOUSE_COPIER_LINK} ${CLICKHOUSE_EXTRACT_FROM_CONFIG_LINK} ${CLICKHOUSE_COMPRESSOR_LINK} ${CLICKHOUSE_FORMAT_LINK} ${CLICKHOUSE_OBFUSCATOR_LINK} ${CLICKHOUSE_COMPILER_LINK}  ${CLICKHOUSE_ODBC_BRIDGE_LINK})
+    set_target_properties(clickhouse-lib PROPERTIES SOVERSION ${VERSION_MAJOR}.${VERSION_MINOR} VERSION ${VERSION_SO} OUTPUT_NAME clickhouse)
+    target_include_directories(clickhouse-lib ${CLICKHOUSE_SERVER_INCLUDE} ${CLICKHOUSE_CLIENT_INCLUDE} ${CLICKHOUSE_LOCAL_INCLUDE} ${CLICKHOUSE_BENCHMARK_INCLUDE} ${CLICKHOUSE_PERFORMANCE_TEST_INCLUDE} ${CLICKHOUSE_COPIER_INCLUDE} ${CLICKHOUSE_EXTRACT_FROM_CONFIG_INCLUDE} ${CLICKHOUSE_COMPRESSOR_INCLUDE} ${CLICKHOUSE_FORMAT_INCLUDE} ${CLICKHOUSE_OBFUSCATOR_INCLUDE} ${CLICKHOUSE_COMPILER_INCLUDE} ${CLICKHOUSE_ODBC_BRIDGE_INCLUDE})
+endif()
+
 if (CLICKHOUSE_SPLIT_BINARY)
     set (CLICKHOUSE_ALL_TARGETS clickhouse-server clickhouse-client clickhouse-local clickhouse-benchmark clickhouse-performance-test
             clickhouse-extract-from-config clickhouse-compressor clickhouse-format clickhouse-copier)
@@ -71,7 +134,7 @@ else ()
     if (ENABLE_CLICKHOUSE_BENCHMARK)
         target_link_libraries (clickhouse PRIVATE clickhouse-benchmark-lib)
     endif ()
-    if (ENABLE_CLICKHOUSE_PERFORMANCE)
+    if (ENABLE_CLICKHOUSE_PERFORMANCE_TEST)
         target_link_libraries (clickhouse PRIVATE clickhouse-performance-test-lib)
     endif ()
     if (ENABLE_CLICKHOUSE_COPIER)
@@ -114,7 +177,7 @@ else ()
         install (FILES ${CMAKE_CURRENT_BINARY_DIR}/clickhouse-benchmark DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
         list(APPEND CLICKHOUSE_BUNDLE clickhouse-benchmark)
     endif ()
-    if (ENABLE_CLICKHOUSE_PERFORMANCE)
+    if (ENABLE_CLICKHOUSE_PERFORMANCE_TEST)
         add_custom_target (clickhouse-performance-test ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-performance-test DEPENDS clickhouse)
         install (FILES ${CMAKE_CURRENT_BINARY_DIR}/clickhouse-performance-test DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
         list(APPEND CLICKHOUSE_BUNDLE clickhouse-performance-test)
diff --git a/dbms/programs/benchmark/CMakeLists.txt b/dbms/programs/benchmark/CMakeLists.txt
index 9814fac9875..ccbefc0453a 100644
--- a/dbms/programs/benchmark/CMakeLists.txt
+++ b/dbms/programs/benchmark/CMakeLists.txt
@@ -1,9 +1,9 @@
-add_library (clickhouse-benchmark-lib ${LINK_MODE} Benchmark.cpp)
-target_link_libraries (clickhouse-benchmark-lib PRIVATE clickhouse_aggregate_functions clickhouse-client-lib clickhouse_common_config clickhouse_common_io ${Boost_PROGRAM_OPTIONS_LIBRARY})
-target_include_directories (clickhouse-benchmark-lib SYSTEM PRIVATE ${PCG_RANDOM_INCLUDE_DIR})
+set(CLICKHOUSE_BENCHMARK_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/Benchmark.cpp)
+set(CLICKHOUSE_BENCHMARK_LINK PRIVATE clickhouse_aggregate_functions clickhouse_common_config clickhouse_common_io ${Boost_PROGRAM_OPTIONS_LIBRARY})
+set(CLICKHOUSE_BENCHMARK_INCLUDE SYSTEM PRIVATE ${PCG_RANDOM_INCLUDE_DIR})
 
-if (CLICKHOUSE_SPLIT_BINARY)
-    add_executable (clickhouse-benchmark clickhouse-benchmark.cpp)
-    target_link_libraries (clickhouse-benchmark PRIVATE clickhouse-benchmark-lib clickhouse_aggregate_functions)
-    install (TARGETS clickhouse-benchmark ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
-endif ()
+clickhouse_program_add(benchmark)
+
+if(NOT CLICKHOUSE_ONE_SHARED)
+    target_link_libraries (clickhouse-benchmark-lib PRIVATE clickhouse-client-lib)
+endif()
diff --git a/dbms/programs/client/CMakeLists.txt b/dbms/programs/client/CMakeLists.txt
index 462720dea0e..ce02d45d0e0 100644
--- a/dbms/programs/client/CMakeLists.txt
+++ b/dbms/programs/client/CMakeLists.txt
@@ -1,13 +1,7 @@
-add_library (clickhouse-client-lib ${LINK_MODE} Client.cpp)
-target_link_libraries (clickhouse-client-lib PRIVATE clickhouse_common_config clickhouse_functions clickhouse_aggregate_functions clickhouse_common_io ${LINE_EDITING_LIBS} ${Boost_PROGRAM_OPTIONS_LIBRARY})
-if (READLINE_INCLUDE_DIR)
-    target_include_directories (clickhouse-client-lib SYSTEM PRIVATE ${READLINE_INCLUDE_DIR})
-endif ()
+set(CLICKHOUSE_CLIENT_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/Client.cpp)
+set(CLICKHOUSE_CLIENT_LINK PRIVATE clickhouse_common_config clickhouse_functions clickhouse_aggregate_functions clickhouse_common_io ${LINE_EDITING_LIBS} ${Boost_PROGRAM_OPTIONS_LIBRARY})
+set(CLICKHOUSE_CLIENT_INCLUDE SYSTEM PRIVATE ${READLINE_INCLUDE_DIR})
 
-if (CLICKHOUSE_SPLIT_BINARY)
-    add_executable (clickhouse-client clickhouse-client.cpp)
-    target_link_libraries (clickhouse-client PRIVATE clickhouse-client-lib)
-    install (TARGETS clickhouse-client ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
-endif ()
+clickhouse_program_add(client)
 
 install (FILES clickhouse-client.xml DESTINATION ${CLICKHOUSE_ETC_DIR}/clickhouse-client COMPONENT clickhouse-client RENAME config.xml)
diff --git a/dbms/programs/compressor/CMakeLists.txt b/dbms/programs/compressor/CMakeLists.txt
index a76986173a5..46fd4816ba2 100644
--- a/dbms/programs/compressor/CMakeLists.txt
+++ b/dbms/programs/compressor/CMakeLists.txt
@@ -1,9 +1,7 @@
-add_library (clickhouse-compressor-lib ${LINK_MODE} Compressor.cpp)
-target_link_libraries (clickhouse-compressor-lib PRIVATE clickhouse_compression clickhouse_common_io ${Boost_PROGRAM_OPTIONS_LIBRARY})
+# Also in utils
 
-if (CLICKHOUSE_SPLIT_BINARY)
-    # Also in utils
-    add_executable (clickhouse-compressor clickhouse-compressor.cpp)
-    target_link_libraries (clickhouse-compressor PRIVATE clickhouse-compressor-lib)
-    install (TARGETS clickhouse-compressor ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
-endif ()
+set(CLICKHOUSE_COMPRESSOR_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/Compressor.cpp)
+set(CLICKHOUSE_COMPRESSOR_LINK PRIVATE clickhouse_compression clickhouse_common_io ${Boost_PROGRAM_OPTIONS_LIBRARY})
+#set(CLICKHOUSE_COMPRESSOR_INCLUDE SYSTEM PRIVATE ...)
+
+clickhouse_program_add(compressor)
diff --git a/dbms/programs/config_tools.h.in b/dbms/programs/config_tools.h.in
index 13aebfd3c83..ff0a62d8171 100644
--- a/dbms/programs/config_tools.h.in
+++ b/dbms/programs/config_tools.h.in
@@ -6,7 +6,7 @@
 #cmakedefine01 ENABLE_CLICKHOUSE_CLIENT
 #cmakedefine01 ENABLE_CLICKHOUSE_LOCAL
 #cmakedefine01 ENABLE_CLICKHOUSE_BENCHMARK
-#cmakedefine01 ENABLE_CLICKHOUSE_PERFORMANCE
+#cmakedefine01 ENABLE_CLICKHOUSE_PERFORMANCE_TEST
 #cmakedefine01 ENABLE_CLICKHOUSE_COPIER
 #cmakedefine01 ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG
 #cmakedefine01 ENABLE_CLICKHOUSE_COMPRESSOR
diff --git a/dbms/programs/copier/CMakeLists.txt b/dbms/programs/copier/CMakeLists.txt
index 158080ffce6..c9f8e44bce8 100644
--- a/dbms/programs/copier/CMakeLists.txt
+++ b/dbms/programs/copier/CMakeLists.txt
@@ -1,8 +1,5 @@
-add_library (clickhouse-copier-lib ${LINK_MODE} ClusterCopier.cpp)
-target_link_libraries (clickhouse-copier-lib PRIVATE clickhouse-server-lib clickhouse_functions clickhouse_aggregate_functions daemon)
+set(CLICKHOUSE_COPIER_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/ClusterCopier.cpp)
+set(CLICKHOUSE_COPIER_LINK PRIVATE clickhouse_functions clickhouse_aggregate_functions daemon)
+#set(CLICKHOUSE_COPIER_INCLUDE SYSTEM PRIVATE ...)
 
-if (CLICKHOUSE_SPLIT_BINARY)
-    add_executable (clickhouse-copier clickhouse-copier.cpp)
-    target_link_libraries (clickhouse-copier clickhouse-copier-lib)
-    install (TARGETS clickhouse-copier ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
-endif ()
+clickhouse_program_add(copier)
diff --git a/dbms/programs/extract-from-config/CMakeLists.txt b/dbms/programs/extract-from-config/CMakeLists.txt
index 9d2ddcd7c2a..4c01cd9c999 100644
--- a/dbms/programs/extract-from-config/CMakeLists.txt
+++ b/dbms/programs/extract-from-config/CMakeLists.txt
@@ -1,8 +1,5 @@
-add_library (clickhouse-extract-from-config-lib ${LINK_MODE} ExtractFromConfig.cpp)
-target_link_libraries (clickhouse-extract-from-config-lib PRIVATE clickhouse_common_config clickhouse_common_io ${Boost_PROGRAM_OPTIONS_LIBRARY})
+set(CLICKHOUSE_EXTRACT_FROM_CONFIG_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/ExtractFromConfig.cpp)
+set(CLICKHOUSE_EXTRACT_FROM_CONFIG_LINK PRIVATE clickhouse_common_config clickhouse_common_io ${Boost_PROGRAM_OPTIONS_LIBRARY})
+#set(CLICKHOUSE_EXTRACT_FROM_CONFIG_INCLUDE SYSTEM PRIVATE ...)
 
-if (CLICKHOUSE_SPLIT_BINARY)
-    add_executable (clickhouse-extract-from-config clickhouse-extract-from-config.cpp)
-    target_link_libraries (clickhouse-extract-from-config PRIVATE clickhouse-extract-from-config-lib)
-    install (TARGETS clickhouse-extract-from-config ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
-endif ()
+clickhouse_program_add(extract-from-config)
diff --git a/dbms/programs/format/CMakeLists.txt b/dbms/programs/format/CMakeLists.txt
index 67033730b07..aac72d641e6 100644
--- a/dbms/programs/format/CMakeLists.txt
+++ b/dbms/programs/format/CMakeLists.txt
@@ -1,7 +1,5 @@
-add_library (clickhouse-format-lib ${LINK_MODE} Format.cpp)
-target_link_libraries (clickhouse-format-lib PRIVATE dbms clickhouse_common_io clickhouse_parsers ${Boost_PROGRAM_OPTIONS_LIBRARY})
-if (CLICKHOUSE_SPLIT_BINARY)
-    add_executable (clickhouse-format clickhouse-format.cpp)
-    target_link_libraries (clickhouse-format PRIVATE clickhouse-format-lib)
-    install (TARGETS clickhouse-format ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
-endif ()
+set(CLICKHOUSE_FORMAT_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/Format.cpp)
+set(CLICKHOUSE_FORMAT_LINK PRIVATE dbms clickhouse_common_io clickhouse_parsers ${Boost_PROGRAM_OPTIONS_LIBRARY})
+#set(CLICKHOUSE_FORMAT_INCLUDE SYSTEM PRIVATE ...)
+
+clickhouse_program_add(format)
diff --git a/dbms/programs/local/CMakeLists.txt b/dbms/programs/local/CMakeLists.txt
index 70abc32a737..299458ef913 100644
--- a/dbms/programs/local/CMakeLists.txt
+++ b/dbms/programs/local/CMakeLists.txt
@@ -1,8 +1,9 @@
-add_library (clickhouse-local-lib ${LINK_MODE} LocalServer.cpp)
-target_link_libraries (clickhouse-local-lib PRIVATE clickhouse_dictionaries clickhouse_common_io clickhouse-server-lib clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Boost_PROGRAM_OPTIONS_LIBRARY})
+set(CLICKHOUSE_LOCAL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/LocalServer.cpp)
+set(CLICKHOUSE_LOCAL_LINK PRIVATE clickhouse_dictionaries clickhouse_common_io clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Boost_PROGRAM_OPTIONS_LIBRARY})
+#set(CLICKHOUSE_LOCAL_INCLUDE SYSTEM PRIVATE ...)
 
-if (CLICKHOUSE_SPLIT_BINARY)
-    add_executable (clickhouse-local clickhouse-local.cpp)
-    target_link_libraries (clickhouse-local PRIVATE clickhouse-local-lib)
-    install (TARGETS clickhouse-local ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
+clickhouse_program_add(local)
+
+if(NOT CLICKHOUSE_ONE_SHARED)
+    target_link_libraries(clickhouse-local-lib PRIVATE clickhouse-server-lib)
 endif ()
diff --git a/dbms/programs/main.cpp b/dbms/programs/main.cpp
index 2b88a5b7b0f..9ee2df0fab6 100644
--- a/dbms/programs/main.cpp
+++ b/dbms/programs/main.cpp
@@ -38,7 +38,7 @@ int mainEntryClickHouseLocal(int argc, char ** argv);
 #if ENABLE_CLICKHOUSE_BENCHMARK || !defined(ENABLE_CLICKHOUSE_BENCHMARK)
 int mainEntryClickHouseBenchmark(int argc, char ** argv);
 #endif
-#if ENABLE_CLICKHOUSE_PERFORMANCE || !defined(ENABLE_CLICKHOUSE_PERFORMANCE)
+#if ENABLE_CLICKHOUSE_PERFORMANCE_TEST || !defined(ENABLE_CLICKHOUSE_PERFORMANCE_TEST)
 int mainEntryClickHousePerformanceTest(int argc, char ** argv);
 #endif
 #if ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG || !defined(ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG)
@@ -84,7 +84,7 @@ std::pair<const char *, MainFunc> clickhouse_applications[] =
 #if ENABLE_CLICKHOUSE_SERVER || !defined(ENABLE_CLICKHOUSE_SERVER)
     {"server", mainEntryClickHouseServer},
 #endif
-#if ENABLE_CLICKHOUSE_PERFORMANCE || !defined(ENABLE_CLICKHOUSE_PERFORMANCE)
+#if ENABLE_CLICKHOUSE_PERFORMANCE_TEST || !defined(ENABLE_CLICKHOUSE_PERFORMANCE_TEST)
     {"performance-test", mainEntryClickHousePerformanceTest},
 #endif
 #if ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG || !defined(ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG)
diff --git a/dbms/programs/obfuscator/CMakeLists.txt b/dbms/programs/obfuscator/CMakeLists.txt
index 77096c2a169..19dba2be95c 100644
--- a/dbms/programs/obfuscator/CMakeLists.txt
+++ b/dbms/programs/obfuscator/CMakeLists.txt
@@ -1,9 +1,5 @@
-add_library (clickhouse-obfuscator-lib ${LINK_MODE} Obfuscator.cpp)
-target_link_libraries (clickhouse-obfuscator-lib PRIVATE dbms ${Boost_PROGRAM_OPTIONS_LIBRARY})
+set(CLICKHOUSE_OBFUSCATOR_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/Obfuscator.cpp)
+set(CLICKHOUSE_OBFUSCATOR_LINK PRIVATE dbms ${Boost_PROGRAM_OPTIONS_LIBRARY})
+#set(CLICKHOUSE_OBFUSCATOR_INCLUDE SYSTEM PRIVATE ...)
 
-if (CLICKHOUSE_SPLIT_BINARY)
-    add_executable (clickhouse-obfuscator clickhouse-obfuscator.cpp)
-    set_target_properties(clickhouse-obfuscator PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..)
-    target_link_libraries (clickhouse-obfuscator PRIVATE clickhouse-obfuscator-lib)
-    install (TARGETS clickhouse-obfuscator ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
-endif ()
+clickhouse_program_add(obfuscator)
diff --git a/dbms/programs/odbc-bridge/CMakeLists.txt b/dbms/programs/odbc-bridge/CMakeLists.txt
index 3b06e0bc395..b32fe363b73 100644
--- a/dbms/programs/odbc-bridge/CMakeLists.txt
+++ b/dbms/programs/odbc-bridge/CMakeLists.txt
@@ -1,30 +1,36 @@
-add_headers_and_sources(clickhouse_odbc_bridge .)
+set(CLICKHOUSE_ODBC_BRIDGE_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/ColumnInfoHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/getIdentifierQuote.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/HandlerFactory.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/IdentifierQuoteHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/MainHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ODBCBlockInputStream.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/odbc-bridge.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ODBCBridge.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/PingHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/validateODBCConnectionString.cpp
+)
 
-add_library (clickhouse-odbc-bridge-lib ${LINK_MODE} ${clickhouse_odbc_bridge_sources})
-
-target_link_libraries (clickhouse-odbc-bridge-lib PRIVATE daemon dbms clickhouse_common_io)
-target_include_directories (clickhouse-odbc-bridge-lib PUBLIC ${ClickHouse_SOURCE_DIR}/libs/libdaemon/include)
+set(CLICKHOUSE_ODBC_BRIDGE_LINK PRIVATE daemon dbms clickhouse_common_io)
+set(CLICKHOUSE_ODBC_BRIDGE_INCLUDE PUBLIC ${ClickHouse_SOURCE_DIR}/libs/libdaemon/include)
 
 if (USE_POCO_SQLODBC)
-    target_link_libraries (clickhouse-odbc-bridge-lib PRIVATE ${Poco_SQLODBC_LIBRARY})
-    target_include_directories (clickhouse-odbc-bridge-lib SYSTEM PRIVATE ${ODBC_INCLUDE_DIRECTORIES} ${Poco_SQLODBC_INCLUDE_DIR})
+    set(CLICKHOUSE_ODBC_BRIDGE_LINK ${CLICKHOUSE_ODBC_BRIDGE_LINK} PRIVATE ${Poco_SQLODBC_LIBRARY})
+    set(CLICKHOUSE_ODBC_BRIDGE_INCLUDE ${CLICKHOUSE_ODBC_BRIDGE_INCLUDE} SYSTEM PRIVATE ${ODBC_INCLUDE_DIRECTORIES} ${Poco_SQLODBC_INCLUDE_DIR})
 endif ()
 if (Poco_SQL_FOUND)
-    target_link_libraries (clickhouse-odbc-bridge-lib PRIVATE ${Poco_SQL_LIBRARY})
+    set(CLICKHOUSE_ODBC_BRIDGE_LINK ${CLICKHOUSE_ODBC_BRIDGE_LINK} PRIVATE ${Poco_SQL_LIBRARY})
 endif ()
 
 if (USE_POCO_DATAODBC)
-    target_link_libraries (clickhouse-odbc-bridge-lib PRIVATE ${Poco_DataODBC_LIBRARY})
-    target_include_directories (clickhouse-odbc-bridge-lib SYSTEM PRIVATE ${ODBC_INCLUDE_DIRECTORIES} ${Poco_DataODBC_INCLUDE_DIR})
+    set(CLICKHOUSE_ODBC_BRIDGE_LINK ${CLICKHOUSE_ODBC_BRIDGE_LINK} PRIVATE ${Poco_DataODBC_LIBRARY})
+    set(CLICKHOUSE_ODBC_BRIDGE_INCLUDE ${CLICKHOUSE_ODBC_BRIDGE_INCLUDE} SYSTEM PRIVATE ${ODBC_INCLUDE_DIRECTORIES} ${Poco_DataODBC_INCLUDE_DIR})
 endif()
 if (Poco_Data_FOUND)
-    target_link_libraries (clickhouse-odbc-bridge-lib PRIVATE ${Poco_Data_LIBRARY})
+    set(CLICKHOUSE_ODBC_BRIDGE_LINK ${CLICKHOUSE_ODBC_BRIDGE_LINK} PRIVATE ${Poco_Data_LIBRARY})
 endif ()
 
-
-if (ENABLE_TESTS)
-    add_subdirectory (tests)
-endif ()
+clickhouse_program_add_library(odbc-bridge)
 
 # clickhouse-odbc-bridge is always a separate binary.
 # Reason: it must not export symbols from SSL, mariadb-client, etc. to not break ABI compatibility with ODBC drivers.
@@ -32,5 +38,11 @@ endif ()
 SET(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "")
 
 add_executable (clickhouse-odbc-bridge odbc-bridge.cpp)
-target_link_libraries (clickhouse-odbc-bridge PRIVATE clickhouse-odbc-bridge-lib)
+
+clickhouse_program_link_split_binary(odbc-bridge)
+
 install (TARGETS clickhouse-odbc-bridge RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
+
+if (ENABLE_TESTS)
+    add_subdirectory (tests)
+endif ()
diff --git a/dbms/programs/odbc-bridge/tests/CMakeLists.txt b/dbms/programs/odbc-bridge/tests/CMakeLists.txt
index 5211c39d111..60e7afab969 100644
--- a/dbms/programs/odbc-bridge/tests/CMakeLists.txt
+++ b/dbms/programs/odbc-bridge/tests/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_executable (validate-odbc-connection-string validate-odbc-connection-string.cpp)
-target_link_libraries (validate-odbc-connection-string PRIVATE clickhouse-odbc-bridge-lib clickhouse_common_io)
+clickhouse_target_link_split_lib(validate-odbc-connection-string odbc-bridge)
+target_link_libraries (validate-odbc-connection-string PRIVATE clickhouse_common_io)
diff --git a/dbms/programs/performance-test/CMakeLists.txt b/dbms/programs/performance-test/CMakeLists.txt
index 974c64ef859..c7eeaa45ab3 100644
--- a/dbms/programs/performance-test/CMakeLists.txt
+++ b/dbms/programs/performance-test/CMakeLists.txt
@@ -1,21 +1,18 @@
-add_library (clickhouse-performance-test-lib ${LINK_MODE}
-  JSONString.cpp
-  StopConditionsSet.cpp
-  TestStopConditions.cpp
-  TestStats.cpp
-  ConfigPreprocessor.cpp
-  PerformanceTest.cpp
-  PerformanceTestInfo.cpp
-  executeQuery.cpp
-  applySubstitutions.cpp
-  ReportBuilder.cpp
-  PerformanceTestSuite.cpp
-)
-target_link_libraries (clickhouse-performance-test-lib PRIVATE dbms clickhouse_common_io clickhouse_common_config ${Boost_PROGRAM_OPTIONS_LIBRARY})
-target_include_directories (clickhouse-performance-test-lib SYSTEM PRIVATE ${PCG_RANDOM_INCLUDE_DIR})
+set(CLICKHOUSE_PERFORMANCE_TEST_SOURCES 
+  ${CMAKE_CURRENT_SOURCE_DIR}/JSONString.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/StopConditionsSet.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/TestStopConditions.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/TestStats.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ConfigPreprocessor.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceTest.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceTestInfo.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/executeQuery.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/applySubstitutions.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ReportBuilder.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceTestSuite.cpp
+ )
 
-if (CLICKHOUSE_SPLIT_BINARY)
-    add_executable (clickhouse-performance-test clickhouse-performance-test.cpp)
-    target_link_libraries (clickhouse-performance-test PRIVATE clickhouse-performance-test-lib)
-    install (TARGETS clickhouse-performance-test ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
-endif ()
+set(CLICKHOUSE_PERFORMANCE_TEST_LINK PRIVATE dbms clickhouse_common_io clickhouse_common_config ${Boost_PROGRAM_OPTIONS_LIBRARY})
+set(CLICKHOUSE_PERFORMANCE_TEST_INCLUDE SYSTEM PRIVATE ${PCG_RANDOM_INCLUDE_DIR})
+
+clickhouse_program_add(performance-test)
diff --git a/dbms/programs/server/CMakeLists.txt b/dbms/programs/server/CMakeLists.txt
index 9cbfde2b1d5..217447413d5 100644
--- a/dbms/programs/server/CMakeLists.txt
+++ b/dbms/programs/server/CMakeLists.txt
@@ -1,27 +1,22 @@
-add_library (clickhouse-server-lib ${LINK_MODE}
-    HTTPHandler.cpp
-    InterserverIOHTTPHandler.cpp
-    MetricsTransmitter.cpp
-    NotFoundHandler.cpp
-    PingRequestHandler.cpp
-    ReplicasStatusHandler.cpp
-    RootRequestHandler.cpp
-    Server.cpp
-    TCPHandler.cpp
-    )
+set(CLICKHOUSE_SERVER_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/HTTPHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/InterserverIOHTTPHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/MetricsTransmitter.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/NotFoundHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/PingRequestHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ReplicasStatusHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/RootRequestHandler.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/Server.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/TCPHandler.cpp
+   )
 
-target_link_libraries (clickhouse-server-lib PRIVATE clickhouse_dictionaries clickhouse_common_io daemon clickhouse_storages_system clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Poco_Net_LIBRARY})
+set(CLICKHOUSE_SERVER_LINK PRIVATE clickhouse_dictionaries clickhouse_common_io daemon clickhouse_storages_system clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Poco_Net_LIBRARY})
 if (USE_POCO_NETSSL)
-    target_link_libraries (clickhouse-server-lib PRIVATE ${Poco_NetSSL_LIBRARY} ${Poco_Crypto_LIBRARY})
+    set(CLICKHOUSE_SERVER_LINK ${CLICKHOUSE_SERVER_LINK} PRIVATE ${Poco_NetSSL_LIBRARY} ${Poco_Crypto_LIBRARY})
 endif ()
+set(CLICKHOUSE_SERVER_INCLUDE PUBLIC ${ClickHouse_SOURCE_DIR}/libs/libdaemon/include)
 
-target_include_directories (clickhouse-server-lib PUBLIC ${ClickHouse_SOURCE_DIR}/libs/libdaemon/include)
-
-if (CLICKHOUSE_SPLIT_BINARY)
-    add_executable (clickhouse-server clickhouse-server.cpp)
-    target_link_libraries (clickhouse-server PRIVATE clickhouse-server-lib)
-    install (TARGETS clickhouse-server ${CLICKHOUSE_ALL_TARGETS} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
-endif ()
+clickhouse_program_add(server)
 
 if (GLIBC_COMPATIBILITY)
     set (GLIBC_MAX_REQUIRED 2.4 CACHE INTERNAL "")
@@ -31,7 +26,4 @@ if (GLIBC_COMPATIBILITY)
     #add_test(NAME GLIBC_required_version COMMAND bash -c "readelf -s ${CMAKE_CURRENT_BINARY_DIR}/../clickhouse-server | grep '@GLIBC' | grep -oP 'GLIBC_[\\d\\.]+' | sort | uniq | sort --version-sort --reverse | perl -lnE 'warn($_), exit 1 if $_ gt q{GLIBC_${GLIBC_MAX_REQUIRED}}'") # old
 endif ()
 
-install (
-    FILES config.xml users.xml
-    DESTINATION  ${CLICKHOUSE_ETC_DIR}/clickhouse-server
-    COMPONENT clickhouse)
+install(FILES config.xml users.xml DESTINATION ${CLICKHOUSE_ETC_DIR}/clickhouse-server COMPONENT clickhouse)
diff --git a/utils/compressor/CMakeLists.txt b/utils/compressor/CMakeLists.txt
index 2dec2117943..5af551f8d03 100644
--- a/utils/compressor/CMakeLists.txt
+++ b/utils/compressor/CMakeLists.txt
@@ -1,11 +1,5 @@
 find_package (Threads)
 
-add_executable (util-clickhouse-compressor main.cpp)
-target_link_libraries (util-clickhouse-compressor PRIVATE clickhouse-compressor-lib)
-set_target_properties(util-clickhouse-compressor PROPERTIES OUTPUT_NAME "clickhouse-compressor")
-
-#install (TARGETS util-clickhouse-compressor RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse-compressor)
-
 add_executable (zstd_test zstd_test.cpp)
 target_link_libraries (zstd_test PRIVATE ${ZSTD_LIBRARY} common Threads::Threads)
 

From d019ac7ec5fedefc9dfad16b13f32f3dc72b4000 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 21:19:48 +0300
Subject: [PATCH 40/69] Avoid std::terminate when invalidate_query returned
 wrong resultset #4580

---
 dbms/src/Interpreters/ExternalLoader.cpp  | 44 ++++++++++++++---------
 dbms/src/Interpreters/IExternalLoadable.h |  2 +-
 2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/dbms/src/Interpreters/ExternalLoader.cpp b/dbms/src/Interpreters/ExternalLoader.cpp
index 947a19c5204..8d7318bafdb 100644
--- a/dbms/src/Interpreters/ExternalLoader.cpp
+++ b/dbms/src/Interpreters/ExternalLoader.cpp
@@ -161,30 +161,40 @@ void ExternalLoader::reloadAndUpdate(bool throw_on_error)
 
         for (auto & loadable_object : loadable_objects)
         {
-            /// If the loadable objects failed to load or even failed to initialize from the config.
-            if (!loadable_object.second.loadable)
-                continue;
+            try
+            {
+                /// If the loadable objects failed to load or even failed to initialize from the config.
+                if (!loadable_object.second.loadable)
+                    continue;
 
-            const LoadablePtr & current = loadable_object.second.loadable;
-            const auto & lifetime = current->getLifetime();
+                const LoadablePtr & current = loadable_object.second.loadable;
+                const auto & lifetime = current->getLifetime();
 
-            /// do not update loadable objects with zero as lifetime
-            if (lifetime.min_sec == 0 || lifetime.max_sec == 0)
-                continue;
+                /// do not update loadable objects with zero as lifetime
+                if (lifetime.min_sec == 0 || lifetime.max_sec == 0)
+                    continue;
 
-            if (!current->supportUpdates())
-                continue;
+                if (!current->supportUpdates())
+                    continue;
 
-            auto update_time = update_times[current->getName()];
+                auto update_time = update_times[current->getName()];
 
-            /// check that timeout has passed
-            if (std::chrono::system_clock::now() < update_time)
-                continue;
+                /// check that timeout has passed
+                if (std::chrono::system_clock::now() < update_time)
+                    continue;
 
-            if (!current->isModified())
-                continue;
+                if (!current->isModified())
+                    continue;
 
-            objects_to_update.emplace_back(loadable_object.first, current);
+                objects_to_update.emplace_back(loadable_object.first, current);
+            }
+            catch (...)
+            {
+                tryLogCurrentException(log, "Cannot check if the '" + loadable_object.first + "' " + object_name + " need to be updated");
+
+                if (throw_on_error)
+                    throw;
+            }
         }
     }
 
diff --git a/dbms/src/Interpreters/IExternalLoadable.h b/dbms/src/Interpreters/IExternalLoadable.h
index c94d8d97a49..33d40088f53 100644
--- a/dbms/src/Interpreters/IExternalLoadable.h
+++ b/dbms/src/Interpreters/IExternalLoadable.h
@@ -37,7 +37,7 @@ public:
     virtual std::string getName() const = 0;
     /// True if object can be updated when lifetime exceeded.
     virtual bool supportUpdates() const = 0;
-    /// If lifetime exceeded and isModified() ExternalLoader replace current object with the result of clone().
+    /// If lifetime exceeded and isModified(), ExternalLoader replaces the current object with the result of clone().
     virtual bool isModified() const = 0;
     /// Returns new object with the same configuration. Is used to update modified object when lifetime exceeded.
     virtual std::unique_ptr<IExternalLoadable> clone() const = 0;

From e418e4da3310300c1d3567417b8baec8d0f1a939 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 21:20:35 +0300
Subject: [PATCH 41/69] Added logging for invalidate query

---
 dbms/src/Dictionaries/ClickHouseDictionarySource.cpp | 3 +++
 dbms/src/Dictionaries/ClickHouseDictionarySource.h   | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp b/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp
index b797dd5815b..2c1fad33c02 100644
--- a/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp
+++ b/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp
@@ -6,6 +6,7 @@
 #include <Interpreters/executeQuery.h>
 #include <Common/isLocalAddress.h>
 #include <ext/range.h>
+#include <common/logger_useful.h>
 #include "DictionarySourceFactory.h"
 #include "DictionaryStructure.h"
 #include "ExternalQueryBuilder.h"
@@ -155,6 +156,7 @@ bool ClickHouseDictionarySource::isModified() const
     if (!invalidate_query.empty())
     {
         auto response = doInvalidateQuery(invalidate_query);
+        LOG_TRACE(log, "Invalidate query has returned: '" << response << "', previous value: '" << invalidate_query_response << "'");
         if (invalidate_query_response == response)
             return false;
         invalidate_query_response = response;
@@ -182,6 +184,7 @@ BlockInputStreamPtr ClickHouseDictionarySource::createStreamForSelectiveLoad(con
 
 std::string ClickHouseDictionarySource::doInvalidateQuery(const std::string & request) const
 {
+    LOG_TRACE(log, "Performing invalidate query: " << request);
     if (is_local)
     {
         Context query_context = context;
diff --git a/dbms/src/Dictionaries/ClickHouseDictionarySource.h b/dbms/src/Dictionaries/ClickHouseDictionarySource.h
index e468b642d37..2603f24fa0f 100644
--- a/dbms/src/Dictionaries/ClickHouseDictionarySource.h
+++ b/dbms/src/Dictionaries/ClickHouseDictionarySource.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <memory>
+#include <Poco/Logger.h>
 #include <Client/ConnectionPoolWithFailover.h>
 #include <Interpreters/Context.h>
 #include "DictionaryStructure.h"
@@ -70,6 +71,7 @@ private:
     const bool is_local;
     ConnectionPoolWithFailoverPtr pool;
     const std::string load_all_query;
+    Poco::Logger * log = &Poco::Logger::get("ClickHouseDictionarySource");
 };
 
 }

From 5bac476eb1de99e5686497b8a5e77a83bcdb4a2f Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 21:21:07 +0300
Subject: [PATCH 42/69] Make the value of invalidate_query human readable

---
 dbms/src/Dictionaries/readInvalidateQuery.cpp | 10 ++++++----
 dbms/src/Dictionaries/readInvalidateQuery.h   |  4 ++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/dbms/src/Dictionaries/readInvalidateQuery.cpp b/dbms/src/Dictionaries/readInvalidateQuery.cpp
index 5b9433d9a3b..dbd7c2e5f3e 100644
--- a/dbms/src/Dictionaries/readInvalidateQuery.cpp
+++ b/dbms/src/Dictionaries/readInvalidateQuery.cpp
@@ -1,8 +1,10 @@
 #include "readInvalidateQuery.h"
 #include <DataStreams/IBlockInputStream.h>
+#include <IO/WriteBufferFromString.h>
 
 namespace DB
 {
+
 namespace ErrorCodes
 {
     extern const int TOO_MANY_COLUMNS;
@@ -13,7 +15,6 @@ namespace ErrorCodes
 std::string readInvalidateQuery(IBlockInputStream & block_input_stream)
 {
     block_input_stream.readPrefix();
-    std::string response;
 
     Block block = block_input_stream.read();
     if (!block)
@@ -29,8 +30,9 @@ std::string readInvalidateQuery(IBlockInputStream & block_input_stream)
     if (rows > 1)
         throw Exception("Expected single row in resultset, got at least " + std::to_string(rows), ErrorCodes::TOO_MANY_ROWS);
 
-    auto column = block.getByPosition(0).column;
-    response = column->getDataAt(0).toString();
+    WriteBufferFromOwnString out;
+    auto & column_type = block.getByPosition(0);
+    column_type.type->serializeAsText(*column_type.column, 0, out, FormatSettings());
 
     while ((block = block_input_stream.read()))
     {
@@ -40,7 +42,7 @@ std::string readInvalidateQuery(IBlockInputStream & block_input_stream)
 
     block_input_stream.readSuffix();
 
-    return response;
+    return out.str();
 }
 
 }
diff --git a/dbms/src/Dictionaries/readInvalidateQuery.h b/dbms/src/Dictionaries/readInvalidateQuery.h
index 8810c291391..a906c8f887f 100644
--- a/dbms/src/Dictionaries/readInvalidateQuery.h
+++ b/dbms/src/Dictionaries/readInvalidateQuery.h
@@ -5,8 +5,8 @@ class IBlockInputStream;
 
 namespace DB
 {
-// Using in MySQLDictionarySource and XDBCDictionarySource after processing invalidate_query
+
+/// Used in MySQLDictionarySource and XDBCDictionarySource after processing invalidate_query.
 std::string readInvalidateQuery(IBlockInputStream & block_input_stream);
 
-
 }

From af5041532a422bc3d7fd26d40d5cec230be2d6a2 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 21:28:42 +0300
Subject: [PATCH 43/69] Logging of internal queries

---
 .../ClickHouseDictionarySource.cpp            |  2 +-
 dbms/src/Interpreters/executeQuery.cpp        | 42 ++++++++++---------
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp b/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp
index 2c1fad33c02..bc2bd8d9bd0 100644
--- a/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp
+++ b/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp
@@ -184,7 +184,7 @@ BlockInputStreamPtr ClickHouseDictionarySource::createStreamForSelectiveLoad(con
 
 std::string ClickHouseDictionarySource::doInvalidateQuery(const std::string & request) const
 {
-    LOG_TRACE(log, "Performing invalidate query: " << request);
+    LOG_TRACE(log, "Performing invalidate query");
     if (is_local)
     {
         Context query_context = context;
diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp
index a5856fb6173..bba4202e7c0 100644
--- a/dbms/src/Interpreters/executeQuery.cpp
+++ b/dbms/src/Interpreters/executeQuery.cpp
@@ -63,18 +63,24 @@ static String joinLines(const String & query)
 
 
 /// Log query into text log (not into system table).
-static void logQuery(const String & query, const Context & context)
+static void logQuery(const String & query, const Context & context, bool internal)
 {
-    const auto & current_query_id = context.getClientInfo().current_query_id;
-    const auto & initial_query_id = context.getClientInfo().initial_query_id;
-    const auto & current_user = context.getClientInfo().current_user;
+    if (internal)
+    {
+        LOG_DEBUG(&Logger::get("executeQuery"), "(internal) " << joinLines(query));
+    }
+    else
+    {
+        const auto & current_query_id = context.getClientInfo().current_query_id;
+        const auto & initial_query_id = context.getClientInfo().initial_query_id;
+        const auto & current_user = context.getClientInfo().current_user;
 
-    LOG_DEBUG(&Logger::get("executeQuery"), "(from " << context.getClientInfo().current_address.toString()
-    << (current_user != "default" ? ", user: " + context.getClientInfo().current_user : "")
-    << (!initial_query_id.empty() && current_query_id != initial_query_id ? ", initial_query_id: " + initial_query_id : std::string())
-    << ") "
-    << joinLines(query)
-    );
+        LOG_DEBUG(&Logger::get("executeQuery"), "(from " << context.getClientInfo().current_address.toString()
+            << (current_user != "default" ? ", user: " + context.getClientInfo().current_user : "")
+            << (!initial_query_id.empty() && current_query_id != initial_query_id ? ", initial_query_id: " + initial_query_id : std::string())
+            << ") "
+            << joinLines(query));
+    }
 }
 
 
@@ -176,13 +182,12 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
     }
     catch (...)
     {
+        /// Log the query anyway.
+        String query = String(begin, begin + std::min(end - begin, static_cast<ptrdiff_t>(max_query_size)));
+        logQuery(query.substr(0, settings.log_queries_cut_to_length), context, internal);
+
         if (!internal)
-        {
-            /// Anyway log the query.
-            String query = String(begin, begin + std::min(end - begin, static_cast<ptrdiff_t>(max_query_size)));
-            logQuery(query.substr(0, settings.log_queries_cut_to_length), context);
             onExceptionBeforeStart(query, context, current_time);
-        }
 
         throw;
     }
@@ -193,15 +198,14 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
 
     try
     {
-        if (!internal)
-            logQuery(query.substr(0, settings.log_queries_cut_to_length), context);
+        logQuery(query.substr(0, settings.log_queries_cut_to_length), context, internal);
 
         if (!internal && settings.allow_experimental_multiple_joins_emulation)
         {
             JoinToSubqueryTransformVisitor::Data join_to_subs_data;
             JoinToSubqueryTransformVisitor(join_to_subs_data).visit(ast);
             if (join_to_subs_data.done)
-                logQuery(queryToString(*ast), context);
+                logQuery(queryToString(*ast), context, internal);
         }
 
         if (!internal && settings.allow_experimental_cross_to_join_conversion)
@@ -209,7 +213,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
             CrossToInnerJoinVisitor::Data cross_to_inner;
             CrossToInnerJoinVisitor(cross_to_inner).visit(ast);
             if (cross_to_inner.done)
-                logQuery(queryToString(*ast), context);
+                logQuery(queryToString(*ast), context, internal);
         }
 
         /// Check the limits.

From bbaece69000fd24dad283784f5b78a068ce98a04 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 22:04:35 +0300
Subject: [PATCH 44/69] Fixed error

---
 dbms/src/Dictionaries/ClickHouseDictionarySource.cpp | 2 +-
 dbms/src/Dictionaries/readInvalidateQuery.cpp        | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp b/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp
index bc2bd8d9bd0..cd609bfc70a 100644
--- a/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp
+++ b/dbms/src/Dictionaries/ClickHouseDictionarySource.cpp
@@ -156,7 +156,7 @@ bool ClickHouseDictionarySource::isModified() const
     if (!invalidate_query.empty())
     {
         auto response = doInvalidateQuery(invalidate_query);
-        LOG_TRACE(log, "Invalidate query has returned: '" << response << "', previous value: '" << invalidate_query_response << "'");
+        LOG_TRACE(log, "Invalidate query has returned: " << response << ", previous value: " << invalidate_query_response);
         if (invalidate_query_response == response)
             return false;
         invalidate_query_response = response;
diff --git a/dbms/src/Dictionaries/readInvalidateQuery.cpp b/dbms/src/Dictionaries/readInvalidateQuery.cpp
index dbd7c2e5f3e..26f543af0b9 100644
--- a/dbms/src/Dictionaries/readInvalidateQuery.cpp
+++ b/dbms/src/Dictionaries/readInvalidateQuery.cpp
@@ -2,6 +2,7 @@
 #include <DataStreams/IBlockInputStream.h>
 #include <IO/WriteBufferFromString.h>
 
+
 namespace DB
 {
 
@@ -32,16 +33,13 @@ std::string readInvalidateQuery(IBlockInputStream & block_input_stream)
 
     WriteBufferFromOwnString out;
     auto & column_type = block.getByPosition(0);
-    column_type.type->serializeAsText(*column_type.column, 0, out, FormatSettings());
+    column_type.type->serializeAsTextQuoted(*column_type.column->convertToFullColumnIfConst(), 0, out, FormatSettings());
 
     while ((block = block_input_stream.read()))
-    {
         if (block.rows() > 0)
             throw Exception("Expected single row in resultset, got at least " + std::to_string(rows + 1), ErrorCodes::TOO_MANY_ROWS);
-    }
 
     block_input_stream.readSuffix();
-
     return out.str();
 }
 

From 2e7db37cde7dc1dcbf77101d84c55010ed40f4d3 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 22:13:05 +0300
Subject: [PATCH 45/69] Fixed issue with update time of dictionaries after
 checking that the dictionary is not modified #4581

---
 dbms/src/Interpreters/ExternalLoader.cpp | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/dbms/src/Interpreters/ExternalLoader.cpp b/dbms/src/Interpreters/ExternalLoader.cpp
index 8d7318bafdb..4e0f12ea254 100644
--- a/dbms/src/Interpreters/ExternalLoader.cpp
+++ b/dbms/src/Interpreters/ExternalLoader.cpp
@@ -155,6 +155,14 @@ void ExternalLoader::reloadAndUpdate(bool throw_on_error)
     /// periodic update
     std::vector<std::pair<std::string, LoadablePtr>> objects_to_update;
 
+    auto getNextUpdateTime = [this](const LoadablePtr & current)
+    {
+        /// calculate next update time
+        const auto & lifetime = current->getLifetime();
+        std::uniform_int_distribution<UInt64> distribution{lifetime.min_sec, lifetime.max_sec};
+        return std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)};
+    };
+
     /// Collect objects that needs to be updated under lock. Then create new versions without lock, and assign under lock.
     {
         std::lock_guard lock{map_mutex};
@@ -177,14 +185,17 @@ void ExternalLoader::reloadAndUpdate(bool throw_on_error)
                 if (!current->supportUpdates())
                     continue;
 
-                auto update_time = update_times[current->getName()];
+                auto & update_time = update_times[current->getName()];
 
                 /// check that timeout has passed
                 if (std::chrono::system_clock::now() < update_time)
                     continue;
 
                 if (!current->isModified())
+                {
+                    update_time = getNextUpdateTime(current);
                     continue;
+                }
 
                 objects_to_update.emplace_back(loadable_object.first, current);
             }
@@ -219,10 +230,7 @@ void ExternalLoader::reloadAndUpdate(bool throw_on_error)
 
             if (auto it = loadable_objects.find(name); it != loadable_objects.end())
             {
-                /// calculate next update time
-                const auto & lifetime = current->getLifetime();
-                std::uniform_int_distribution<UInt64> distribution{lifetime.min_sec, lifetime.max_sec};
-                update_times[name] = std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)};
+                update_times[name] = getNextUpdateTime(current);
 
                 it->second.exception = exception;
                 if (!exception)

From f15762a96ed705c277318ceb652f3988f2e80415 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Mon, 4 Mar 2019 22:29:23 +0300
Subject: [PATCH 46/69] Added a test for ThreadPool #4572

---
 .../0_stateless/00913_many_threads.reference        |  4 ++++
 .../queries/0_stateless/00913_many_threads.sql      | 13 +++++++++++++
 2 files changed, 17 insertions(+)
 create mode 100644 dbms/tests/queries/0_stateless/00913_many_threads.reference
 create mode 100644 dbms/tests/queries/0_stateless/00913_many_threads.sql

diff --git a/dbms/tests/queries/0_stateless/00913_many_threads.reference b/dbms/tests/queries/0_stateless/00913_many_threads.reference
new file mode 100644
index 00000000000..ddc9df5d339
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00913_many_threads.reference
@@ -0,0 +1,4 @@
+1
+1500
+0
+Ok.
diff --git a/dbms/tests/queries/0_stateless/00913_many_threads.sql b/dbms/tests/queries/0_stateless/00913_many_threads.sql
new file mode 100644
index 00000000000..fa567582b21
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00913_many_threads.sql
@@ -0,0 +1,13 @@
+-- This test creates many threads to test a case when ThreadPool will remove some threads from pool after job is done.
+SET max_block_size = 1, min_insert_block_size_rows = 0, min_insert_block_size_bytes = 0;
+
+CREATE TEMPORARY TABLE t (x UInt64);
+INSERT INTO t SELECT * FROM system.numbers LIMIT 1500;
+
+SELECT DISTINCT blockSize() FROM t;
+
+SET max_threads = 1500;
+
+SELECT count() FROM t;
+SELECT sum(sleep(0.1)) FROM t; -- All threads have time to be created.
+SELECT 'Ok.';

From 154ea471565a0108a2c686826eb95212af098773 Mon Sep 17 00:00:00 2001
From: proller <proller@users.noreply.github.com>
Date: Mon, 4 Mar 2019 22:37:50 +0300
Subject: [PATCH 47/69] Build fixes (#4582)

* Fix link in split mode

* clean

* Fix link validate-odbc-connection-string

* Fix includes
---
 dbms/src/Compression/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbms/src/Compression/CMakeLists.txt b/dbms/src/Compression/CMakeLists.txt
index 288d452bebf..10ef3993899 100644
--- a/dbms/src/Compression/CMakeLists.txt
+++ b/dbms/src/Compression/CMakeLists.txt
@@ -3,6 +3,7 @@ add_headers_and_sources(clickhouse_compression .)
 add_library(clickhouse_compression ${LINK_MODE} ${clickhouse_compression_headers} ${clickhouse_compression_sources})
 target_link_libraries(clickhouse_compression PRIVATE clickhouse_parsers clickhouse_common_io ${ZSTD_LIBRARY} ${LZ4_LIBRARY})
 target_include_directories(clickhouse_compression PUBLIC ${DBMS_INCLUDE_DIR})
+target_include_directories(clickhouse_compression SYSTEM PUBLIC ${PCG_RANDOM_INCLUDE_DIR})
 
 if (NOT USE_INTERNAL_LZ4_LIBRARY)
     target_include_directories(clickhouse_compression SYSTEM BEFORE PRIVATE ${LZ4_INCLUDE_DIR})

From 339047fc40bdd24272ca78b330b3f71ff2b87145 Mon Sep 17 00:00:00 2001
From: chertus <chertus@gmail.com>
Date: Mon, 4 Mar 2019 22:40:58 +0300
Subject: [PATCH 48/69] fix multiple joins asterisks qualification

---
 .../DatabaseAndTableWithAlias.cpp             |  4 +-
 .../Interpreters/DatabaseAndTableWithAlias.h  |  2 +-
 dbms/src/Interpreters/SyntaxAnalyzer.cpp      |  3 +-
 .../TranslateQualifiedNamesVisitor.cpp        | 34 +++++--------
 dbms/src/Parsers/ASTIdentifier.cpp            |  4 ++
 dbms/src/Parsers/ASTIdentifier.h              |  1 +
 .../00820_multiple_joins.reference            |  4 --
 .../0_stateless/00820_multiple_joins.sql      | 10 ++--
 .../00847_multiple_join_same_column.reference | 37 ++++++++++++++
 .../00847_multiple_join_same_column.sql       | 48 +++++++++++++++++++
 10 files changed, 111 insertions(+), 36 deletions(-)
 create mode 100644 dbms/tests/queries/0_stateless/00847_multiple_join_same_column.reference
 create mode 100644 dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql

diff --git a/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp b/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp
index c6fbfaad088..52b05fc5933 100644
--- a/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp
+++ b/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp
@@ -62,11 +62,11 @@ bool DatabaseAndTableWithAlias::satisfies(const DatabaseAndTableWithAlias & db_t
     return database == db_table.database && table == db_table.table;
 }
 
-String DatabaseAndTableWithAlias::getQualifiedNamePrefix() const
+String DatabaseAndTableWithAlias::getQualifiedNamePrefix(bool with_dot) const
 {
     if (alias.empty() && table.empty())
         return "";
-    return (!alias.empty() ? alias : table) + '.';
+    return (!alias.empty() ? alias : table) + (with_dot ? "." : "");
 }
 
 std::vector<const ASTTableExpression *> getSelectTablesExpression(const ASTSelectQuery & select_query)
diff --git a/dbms/src/Interpreters/DatabaseAndTableWithAlias.h b/dbms/src/Interpreters/DatabaseAndTableWithAlias.h
index e9d8ee409a6..0f1cbe8bbc7 100644
--- a/dbms/src/Interpreters/DatabaseAndTableWithAlias.h
+++ b/dbms/src/Interpreters/DatabaseAndTableWithAlias.h
@@ -32,7 +32,7 @@ struct DatabaseAndTableWithAlias
     DatabaseAndTableWithAlias(const ASTTableExpression & table_expression, const String & current_database = "");
 
     /// "alias." or "table." if alias is empty
-    String getQualifiedNamePrefix() const;
+    String getQualifiedNamePrefix(bool with_dot = true) const;
 
     /// Check if it satisfies another db_table name. @note opterion is not symmetric.
     bool satisfies(const DatabaseAndTableWithAlias & table, bool table_may_be_an_alias);
diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.cpp b/dbms/src/Interpreters/SyntaxAnalyzer.cpp
index d3b42c57926..6d274e326b4 100644
--- a/dbms/src/Interpreters/SyntaxAnalyzer.cpp
+++ b/dbms/src/Interpreters/SyntaxAnalyzer.cpp
@@ -652,7 +652,8 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(
     {
         if (const ASTTablesInSelectQueryElement * node = select_query->join())
         {
-            replaceJoinedTable(node);
+            if (settings.enable_optimize_predicate_expression)
+                replaceJoinedTable(node);
 
             const auto & joined_expression = static_cast<const ASTTableExpression &>(*node->table_expression);
             DatabaseAndTableWithAlias table(joined_expression, context.getCurrentDatabase());
diff --git a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
index 07a823e3452..a0b5aed0af3 100644
--- a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
+++ b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
@@ -143,21 +143,14 @@ void TranslateQualifiedNamesMatcher::visit(ASTSelectQuery & select, const ASTPtr
         Visitor(data).visit(*add_node);
 }
 
-/// qualifed names for duplicates
-static std::shared_ptr<ASTIdentifier> makeIdentifier(const String & short_name, const String & long_name, bool need_long_name)
+static void addIdentifier(ASTs & nodes, const String & table_name, const String & column_name, AsteriskSemantic::RevertedAliasesPtr aliases)
 {
-    if (need_long_name)
-        return std::make_shared<ASTIdentifier>(long_name);
-    return std::make_shared<ASTIdentifier>(short_name);
-}
+    auto identifier = std::make_shared<ASTIdentifier>(std::vector<String>{table_name, column_name});
 
-static void addIdentifier(ASTs & nodes, std::shared_ptr<ASTIdentifier> identifier, const String & long_name,
-                          AsteriskSemantic::RevertedAliasesPtr aliases)
-{
     bool added = false;
-    if (aliases && aliases->count(long_name))
+    if (aliases && aliases->count(identifier->name))
     {
-        for (const String & alias : (*aliases)[long_name])
+        for (const String & alias : (*aliases)[identifier->name])
         {
             nodes.push_back(identifier->clone());
             nodes.back()->setAlias(alias);
@@ -173,7 +166,6 @@ static void addIdentifier(ASTs & nodes, std::shared_ptr<ASTIdentifier> identifie
 void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPtr &, Data & data)
 {
     const auto & tables_with_columns = data.tables;
-    const auto & source_columns = data.source_columns;
 
     ASTs old_children;
     if (data.processAsterisks())
@@ -208,16 +200,14 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
         if (const auto * asterisk = typeid_cast<const ASTAsterisk *>(child.get()))
         {
             bool first_table = true;
-            for (const auto & [table_name, table_columns] : tables_with_columns)
+            for (const auto & [table, table_columns] : tables_with_columns)
             {
                 for (const auto & column_name : table_columns)
                 {
                     if (first_table || !data.join_using_columns.count(column_name))
                     {
-                        bool need_prefix = !first_table && source_columns.count(column_name);
-                        String long_name = table_name.getQualifiedNamePrefix() + column_name;
-                        auto identifier = makeIdentifier(column_name, long_name, need_prefix);
-                        addIdentifier(node.children, identifier, long_name, AsteriskSemantic::getAliases(*asterisk));
+                        String table_name = table.getQualifiedNamePrefix(false);
+                        addIdentifier(node.children, table_name, column_name, AsteriskSemantic::getAliases(*asterisk));
                     }
                 }
 
@@ -229,16 +219,14 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
             DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->children[0]);
 
             bool first_table = true;
-            for (const auto & [table_name, table_columns] : tables_with_columns)
+            for (const auto & [table, table_columns] : tables_with_columns)
             {
-                if (ident_db_and_name.satisfies(table_name, true))
+                if (ident_db_and_name.satisfies(table, true))
                 {
                     for (const auto & column_name : table_columns)
                     {
-                        bool need_prefix = !first_table && source_columns.count(column_name);
-                        String long_name = table_name.getQualifiedNamePrefix() + column_name;
-                        auto identifier = makeIdentifier(column_name, long_name, need_prefix);
-                        addIdentifier(node.children, identifier, long_name, AsteriskSemantic::getAliases(*qualified_asterisk));
+                        String table_name = table.getQualifiedNamePrefix(false);
+                        addIdentifier(node.children, table_name, column_name, AsteriskSemantic::getAliases(*qualified_asterisk));
                     }
                     break;
                 }
diff --git a/dbms/src/Parsers/ASTIdentifier.cpp b/dbms/src/Parsers/ASTIdentifier.cpp
index 406a405b02c..c8b3d719b3b 100644
--- a/dbms/src/Parsers/ASTIdentifier.cpp
+++ b/dbms/src/Parsers/ASTIdentifier.cpp
@@ -29,6 +29,10 @@ ASTIdentifier::ASTIdentifier(const String & name_, std::vector<String> && name_p
 {
 }
 
+ASTIdentifier::ASTIdentifier(std::vector<String> && name_parts_)
+    : ASTIdentifier(name_parts_.at(0) + '.' + name_parts_.at(1), std::move(name_parts_))
+{}
+
 void ASTIdentifier::setShortName(const String & new_name)
 {
     name = new_name;
diff --git a/dbms/src/Parsers/ASTIdentifier.h b/dbms/src/Parsers/ASTIdentifier.h
index 995b725185c..1439ab2dcbd 100644
--- a/dbms/src/Parsers/ASTIdentifier.h
+++ b/dbms/src/Parsers/ASTIdentifier.h
@@ -22,6 +22,7 @@ public:
     String name;
 
     ASTIdentifier(const String & name_, std::vector<String> && name_parts_ = {});
+    ASTIdentifier(std::vector<String> && name_parts_);
 
     /** Get the text that identifies this element. */
     String getID(char delim) const override { return "Identifier" + (delim + name); }
diff --git a/dbms/tests/queries/0_stateless/00820_multiple_joins.reference b/dbms/tests/queries/0_stateless/00820_multiple_joins.reference
index 2e7d8660562..93744e2c46d 100644
--- a/dbms/tests/queries/0_stateless/00820_multiple_joins.reference
+++ b/dbms/tests/queries/0_stateless/00820_multiple_joins.reference
@@ -32,7 +32,3 @@
 6	6	60	60
 12	12	120	120
 18	18	180	180
-0	0	0	0	0	0	0
-6	6	60	60	66	66	120
-12	12	120	120	132	132	240
-18	18	180	180	198	198	360
diff --git a/dbms/tests/queries/0_stateless/00820_multiple_joins.sql b/dbms/tests/queries/0_stateless/00820_multiple_joins.sql
index c19f4467934..b61777419d9 100644
--- a/dbms/tests/queries/0_stateless/00820_multiple_joins.sql
+++ b/dbms/tests/queries/0_stateless/00820_multiple_joins.sql
@@ -69,11 +69,11 @@ from table1 as t1
 join table2 as t2 on t1.a = t2.a
 join table3 as t3 on t2.b = t3.b; -- { serverError 48 }
 
-select t1.a as t1_a, t2.a as t2_a, t2.b as t2_b, t3.b as t3_b,
-    (t1.a + table2.b) as t1_t2_x, (table1.a + table3.b) as t1_t3_x, (t2.b + t3.b) as t2_t3_x
-from table1 as t1
-join table2 as t2 on t1_a = t2_a
-join table3 as t3 on t2_b = t3_b;
+--select t1.a as t1_a, t2.a as t2_a, t2.b as t2_b, t3.b as t3_b,
+--    (t1.a + table2.b) as t1_t2_x, (table1.a + table3.b) as t1_t3_x, (t2.b + t3.b) as t2_t3_x
+--from table1 as t1
+--join table2 as t2 on t1_a = t2_a
+--join table3 as t3 on t2_b = t3_b;
 
 --select (t1.a + table2.b) as t1_t2_x, (table1.a + table3.b) as t1_t3_x, (t2.b + t3.b) as t2_t3_x
 --from table1 as t1
diff --git a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.reference b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.reference
new file mode 100644
index 00000000000..d4d045a7796
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.reference
@@ -0,0 +1,37 @@
+Row 1:
+──────
+t.a: 1
+s.b: 1
+s.a: 1
+s.b: 1
+y.a: 1
+y.b: 1
+
+Row 2:
+──────
+t.a: 2
+s.b: 0
+s.a: 0
+s.b: 0
+y.a: 0
+y.b: 0
+┌─t.a─┬─s.b─┬─s.a─┬─s.b─┬─y.a─┬─y.b─┐
+│   1 │   1 │   1 │   1 │   1 │   1 │
+│   2 │   0 │   0 │   0 │   0 │   0 │
+└─────┴─────┴─────┴─────┴─────┴─────┘
+┌─t_a─┐
+│   1 │
+│   2 │
+└─────┘
+┌─t.a─┬─s_a─┐
+│   1 │   1 │
+│   2 │   0 │
+└─────┴─────┘
+┌─t.a─┬─t.a─┬─t_b─┐
+│   1 │   1 │   1 │
+│   2 │   2 │   2 │
+└─────┴─────┴─────┘
+┌─s.a─┬─s.a─┐
+│   1 │   1 │
+│   0 │   0 │
+└─────┴─────┘
diff --git a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql
new file mode 100644
index 00000000000..08aca31f99e
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql
@@ -0,0 +1,48 @@
+use test;
+
+drop table if exists t;
+drop table if exists s;
+drop table if exists y;
+
+create table t(a Int64, b Int64) engine = TinyLog;
+create table s(a Int64, b Int64) engine = TinyLog;
+create table y(a Int64, b Int64) engine = TinyLog;
+
+insert into t values (1,1), (2,2);
+insert into s values (1,1);
+insert into y values (1,1);
+
+select t.a, s.b, s.a, s.b, y.a, y.b from t
+left join s on (t.a = s.a and t.b = s.b)
+left join y on (y.a = s.a and y.b = s.b) format Vertical;
+
+select t.a, s.b, s.a, s.b, y.a, y.b from t
+left join s on (t.a = s.a and s.b = t.b)
+left join y on (y.a = s.a and y.b = s.b) format PrettyCompactNoEscapes;
+
+select t.a as t_a from t
+left join s on s.a = t_a format PrettyCompactNoEscapes;
+
+select t.a, s.a as s_a from t
+left join s on s.a = t.a
+left join y on y.b = s.b format PrettyCompactNoEscapes;
+
+select t.a, t.a, t.b as t_b from t
+left join s on s.a = t.a
+left join y on y.b = s.b format PrettyCompactNoEscapes;
+
+select s.a, s.a from t
+left join s on s.a = t.a
+left join y on y.b = s.b format PrettyCompactNoEscapes;
+
+--select t.a, t.a, t.b as t_b, t.b from t
+--left join s on s.a = t.a
+--left join y on y.b = s.b format PrettyCompactNoEscapes;
+
+--select t.a, t.a, s.b as s_b, s.b from t
+--left join s on s.a = t.a
+--left join y on y.b = s.b format PrettyCompactNoEscapes;
+
+drop table t;
+drop table s;
+drop table y;

From 405a747ddc57e03c16ef6fe9c0e5576cc3a139c3 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov <vitbar@yandex-team.ru>
Date: Sat, 2 Mar 2019 13:21:11 +0300
Subject: [PATCH 49/69] Fix compilation after changing SmallTable.

---
 .../AggregateFunctionGroupBitmapData.h        | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
index 8a121b92866..bd3c2d63b77 100644
--- a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
+++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
@@ -28,7 +28,7 @@ private:
         rb = roaring_bitmap_create();
 
         for (const auto & x : small)
-            roaring_bitmap_add(rb, x);
+            roaring_bitmap_add(rb, x.getValue());
     }
 
 public:
@@ -75,7 +75,7 @@ public:
         else
         {
             for (const auto & x : r1.small)
-                add(x);
+                add(x.getValue());
         }
     }
 
@@ -121,7 +121,7 @@ public:
     {
         roaring_bitmap_t * smallRb = roaring_bitmap_create();
         for (const auto & x : small)
-            roaring_bitmap_add(smallRb, x);
+            roaring_bitmap_add(smallRb, x.getValue());
         return smallRb;
     }
 
@@ -134,9 +134,9 @@ public:
         if (isSmall() && r1.isSmall())
         {
             // intersect
-            for (const auto & value : this->small)
-                if (r1.small.find(value) != r1.small.end())
-                    buffer.push_back(value);
+            for (const auto & x : this->small)
+                if (r1.small.find(x.getValue()) != r1.small.end())
+                    buffer.push_back(x.getValue());
 
             // Clear out the original values
             this->small.clear();
@@ -148,9 +148,9 @@ public:
         }
         else if (isSmall() && r1.isLarge())
         {
-            for (const auto & value : this->small)
-                if (roaring_bitmap_contains(r1.rb, value))
-                    buffer.push_back(value);
+            for (const auto & x : this->small)
+                if (roaring_bitmap_contains(r1.rb, x.getValue()))
+                    buffer.push_back(x.getValue());
 
             // Clear out the original values
             this->small.clear();
@@ -196,9 +196,9 @@ public:
         if (isSmall() && r1.isSmall())
         {
             // subtract
-            for (const auto & value : this->small)
-                if (r1.small.find(value) == r1.small.end())
-                    buffer.push_back(value);
+            for (const auto & x : this->small)
+                if (r1.small.find(x.getValue()) == r1.small.end())
+                    buffer.push_back(x.getValue());
 
             // Clear out the original values
             this->small.clear();
@@ -210,9 +210,9 @@ public:
         }
         else if (isSmall() && r1.isLarge())
         {
-            for (const auto & value : this->small)
-                if (!roaring_bitmap_contains(r1.rb, value))
-                    buffer.push_back(value);
+            for (const auto & x : this->small)
+                if (!roaring_bitmap_contains(r1.rb, x.getValue()))
+                    buffer.push_back(x.getValue());
 
             // Clear out the original values
             this->small.clear();
@@ -239,14 +239,14 @@ public:
         UInt64 retSize = 0;
         if (isSmall() && r1.isSmall())
         {
-            for (const auto & value : this->small)
-                if (r1.small.find(value) != r1.small.end())
+            for (const auto & x : this->small)
+                if (r1.small.find(x.getValue()) != r1.small.end())
                     retSize++;
         }
         else if (isSmall() && r1.isLarge())
         {
-            for (const auto & value : this->small)
-                if (roaring_bitmap_contains(r1.rb, value))
+            for (const auto & x : this->small)
+                if (roaring_bitmap_contains(r1.rb, x.getValue()))
                     retSize++;
         }
         else
@@ -363,7 +363,7 @@ public:
         {
             for (const auto & x : small)
             {
-                res_data.emplace_back(x);
+                res_data.emplace_back(x.getValue());
                 count++;
             }
         }

From 6edec1c63fbdd28687512fb350bf50b7cd7f6a05 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov <vitbar@yandex-team.ru>
Date: Mon, 4 Mar 2019 03:39:04 +0300
Subject: [PATCH 50/69] Remove unnecessary 'this->'.

---
 .../AggregateFunctionGroupBitmapData.h        | 56 +++++++++----------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
index bd3c2d63b77..ea7907b4d60 100644
--- a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
+++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h
@@ -134,29 +134,29 @@ public:
         if (isSmall() && r1.isSmall())
         {
             // intersect
-            for (const auto & x : this->small)
+            for (const auto & x : small)
                 if (r1.small.find(x.getValue()) != r1.small.end())
                     buffer.push_back(x.getValue());
 
             // Clear out the original values
-            this->small.clear();
+            small.clear();
 
             for (const auto & value : buffer)
-                this->small.insert(value);
+                small.insert(value);
 
             buffer.clear();
         }
         else if (isSmall() && r1.isLarge())
         {
-            for (const auto & x : this->small)
+            for (const auto & x : small)
                 if (roaring_bitmap_contains(r1.rb, x.getValue()))
                     buffer.push_back(x.getValue());
 
             // Clear out the original values
-            this->small.clear();
+            small.clear();
 
             for (const auto & value : buffer)
-                this->small.insert(value);
+                small.insert(value);
 
             buffer.clear();
         }
@@ -172,14 +172,14 @@ public:
     /**
      * Computes the union between two bitmaps.
      */
-    void rb_or(const RoaringBitmapWithSmallSet & r1) { this->merge(r1); }
+    void rb_or(const RoaringBitmapWithSmallSet & r1) { merge(r1); }
 
     /**
      * Computes the symmetric difference (xor) between two bitmaps.
      */
     void rb_xor(const RoaringBitmapWithSmallSet & r1)
     {
-        if (this->isSmall())
+        if (isSmall())
             toLarge();
         roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
         roaring_bitmap_xor_inplace(rb, rb1);
@@ -196,29 +196,29 @@ public:
         if (isSmall() && r1.isSmall())
         {
             // subtract
-            for (const auto & x : this->small)
+            for (const auto & x : small)
                 if (r1.small.find(x.getValue()) == r1.small.end())
                     buffer.push_back(x.getValue());
 
             // Clear out the original values
-            this->small.clear();
+            small.clear();
 
             for (const auto & value : buffer)
-                this->small.insert(value);
+                small.insert(value);
 
             buffer.clear();
         }
         else if (isSmall() && r1.isLarge())
         {
-            for (const auto & x : this->small)
+            for (const auto & x : small)
                 if (!roaring_bitmap_contains(r1.rb, x.getValue()))
                     buffer.push_back(x.getValue());
 
             // Clear out the original values
-            this->small.clear();
+            small.clear();
 
             for (const auto & value : buffer)
-                this->small.insert(value);
+                small.insert(value);
 
             buffer.clear();
         }
@@ -239,13 +239,13 @@ public:
         UInt64 retSize = 0;
         if (isSmall() && r1.isSmall())
         {
-            for (const auto & x : this->small)
+            for (const auto & x : small)
                 if (r1.small.find(x.getValue()) != r1.small.end())
                     retSize++;
         }
         else if (isSmall() && r1.isLarge())
         {
-            for (const auto & x : this->small)
+            for (const auto & x : small)
                 if (roaring_bitmap_contains(r1.rb, x.getValue()))
                     retSize++;
         }
@@ -264,9 +264,9 @@ public:
     */
     UInt64 rb_or_cardinality(const RoaringBitmapWithSmallSet & r1) const
     {
-        UInt64 c1 = this->size();
+        UInt64 c1 = size();
         UInt64 c2 = r1.size();
-        UInt64 inter = this->rb_and_cardinality(r1);
+        UInt64 inter = rb_and_cardinality(r1);
         return c1 + c2 - inter;
     }
 
@@ -275,9 +275,9 @@ public:
     */
     UInt64 rb_xor_cardinality(const RoaringBitmapWithSmallSet & r1) const
     {
-        UInt64 c1 = this->size();
+        UInt64 c1 = size();
         UInt64 c2 = r1.size();
-        UInt64 inter = this->rb_and_cardinality(r1);
+        UInt64 inter = rb_and_cardinality(r1);
         return c1 + c2 - 2 * inter;
     }
 
@@ -286,8 +286,8 @@ public:
      */
     UInt64 rb_andnot_cardinality(const RoaringBitmapWithSmallSet & r1) const
     {
-        UInt64 c1 = this->size();
-        UInt64 inter = this->rb_and_cardinality(r1);
+        UInt64 c1 = size();
+        UInt64 inter = rb_and_cardinality(r1);
         return c1 - inter;
     }
 
@@ -296,7 +296,7 @@ public:
      */
     UInt8 rb_equals(const RoaringBitmapWithSmallSet & r1)
     {
-        if (this->isSmall())
+        if (isSmall())
             toLarge();
         roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
         UInt8 is_true = roaring_bitmap_equals(rb, rb1);
@@ -310,7 +310,7 @@ public:
      */
     UInt8 rb_intersect(const RoaringBitmapWithSmallSet & r1)
     {
-        if (this->isSmall())
+        if (isSmall())
             toLarge();
         roaring_bitmap_t * rb1 = r1.isSmall() ? r1.getNewRbFromSmall() : r1.getRb();
         UInt8 is_true = roaring_bitmap_intersect(rb, rb1);
@@ -324,7 +324,7 @@ public:
      */
     void rb_remove(UInt64 offsetid)
     {
-        if (this->isSmall())
+        if (isSmall())
             toLarge();
         roaring_bitmap_remove(rb, offsetid);
     }
@@ -337,7 +337,7 @@ public:
      */
     void rb_flip(UInt64 offsetstart, UInt64 offsetend)
     {
-        if (this->isSmall())
+        if (isSmall())
             toLarge();
         roaring_bitmap_flip_inplace(rb, offsetstart, offsetend);
     }
@@ -347,7 +347,7 @@ public:
      */
     UInt64 rb_rank(UInt64 offsetid)
     {
-        if (this->isSmall())
+        if (isSmall())
             toLarge();
         return roaring_bitmap_rank(rb, offsetid);
     }
@@ -359,7 +359,7 @@ public:
     UInt64 rb_to_array(PaddedPODArray<Element> & res_data) const
     {
         UInt64 count = 0;
-        if (this->isSmall())
+        if (isSmall())
         {
             for (const auto & x : small)
             {

From 36add97932a2001f987e9c14d386c2cd71b4a738 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov <vitbar@yandex-team.ru>
Date: Mon, 4 Mar 2019 20:36:22 +0300
Subject: [PATCH 51/69] Fix print_include_directories.cmake

---
 cmake/print_include_directories.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmake/print_include_directories.cmake b/cmake/print_include_directories.cmake
index c4c5d00c54f..05be8f909ee 100644
--- a/cmake/print_include_directories.cmake
+++ b/cmake/print_include_directories.cmake
@@ -13,6 +13,9 @@ list(APPEND dirs ${dirs1})
 get_property (dirs1 TARGET cityhash PROPERTY INCLUDE_DIRECTORIES)
 list(APPEND dirs ${dirs1})
 
+get_property (dirs1 TARGET roaring PROPERTY INCLUDE_DIRECTORIES)
+list(APPEND dirs ${dirs1})
+
 if (USE_INTERNAL_BOOST_LIBRARY)
     get_property (dirs1 TARGET ${Boost_PROGRAM_OPTIONS_LIBRARY} PROPERTY INCLUDE_DIRECTORIES)
     list(APPEND dirs ${dirs1})

From 9ffb59bc0846fc39cc754ad115661ef136166e6f Mon Sep 17 00:00:00 2001
From: BanyRule <banyrule@gmail.com>
Date: Tue, 5 Mar 2019 14:13:33 +0700
Subject: [PATCH 52/69] remove paragraph duplication in DISTINCT clause
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The description of how `DISTINCT` works with NULL values was duplicated.
---
 docs/ru/query_language/select.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/ru/query_language/select.md b/docs/ru/query_language/select.md
index dc0c2d27803..b9c899f6532 100644
--- a/docs/ru/query_language/select.md
+++ b/docs/ru/query_language/select.md
@@ -712,8 +712,6 @@ WHERE и HAVING отличаются тем, что WHERE выполняется
 
 `DISTINCT` работает с [NULL](syntax.md) как если бы `NULL` был конкретным значением, причём `NULL=NULL`. Т.е. в результате `DISTINCT` разные комбинации с `NULL` встретятся только по одному разу.
 
-`DISTINCT` работает с [NULL](syntax.md) как если бы `NULL` был конкретным значением, причём `NULL=NULL`. Т.е. в результате `DISTINCT` разные комбинации с `NULL` встретятся только по одному разу.
-
 ### Секция LIMIT
 
 LIMIT m позволяет выбрать из результата первые m строк.

From 89014b5480a5bd692eb8a0fb81d4070b3706c050 Mon Sep 17 00:00:00 2001
From: proller <proller@users.noreply.github.com>
Date: Tue, 5 Mar 2019 13:15:47 +0300
Subject: [PATCH 53/69]  Build fixes (#4591)

---
 dbms/src/AggregateFunctions/CMakeLists.txt | 2 +-
 dbms/src/Compression/CMakeLists.txt        | 2 +-
 dbms/src/Functions/CMakeLists.txt          | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/dbms/src/AggregateFunctions/CMakeLists.txt b/dbms/src/AggregateFunctions/CMakeLists.txt
index 77748500d5e..148bf90446c 100644
--- a/dbms/src/AggregateFunctions/CMakeLists.txt
+++ b/dbms/src/AggregateFunctions/CMakeLists.txt
@@ -20,7 +20,7 @@ list(REMOVE_ITEM clickhouse_aggregate_functions_headers
 )
 
 add_library(clickhouse_aggregate_functions ${LINK_MODE} ${clickhouse_aggregate_functions_sources})
-target_link_libraries(clickhouse_aggregate_functions PRIVATE dbms)
+target_link_libraries(clickhouse_aggregate_functions PRIVATE dbms PUBLIC ${CITYHASH_LIBRARIES})
 target_include_directories (clickhouse_aggregate_functions BEFORE PRIVATE ${COMMON_INCLUDE_DIR})
 
 if (ENABLE_TESTS)
diff --git a/dbms/src/Compression/CMakeLists.txt b/dbms/src/Compression/CMakeLists.txt
index 10ef3993899..1369493cb7a 100644
--- a/dbms/src/Compression/CMakeLists.txt
+++ b/dbms/src/Compression/CMakeLists.txt
@@ -1,7 +1,7 @@
 include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
 add_headers_and_sources(clickhouse_compression .)
 add_library(clickhouse_compression ${LINK_MODE} ${clickhouse_compression_headers} ${clickhouse_compression_sources})
-target_link_libraries(clickhouse_compression PRIVATE clickhouse_parsers clickhouse_common_io ${ZSTD_LIBRARY} ${LZ4_LIBRARY})
+target_link_libraries(clickhouse_compression PRIVATE clickhouse_parsers clickhouse_common_io ${ZSTD_LIBRARY} ${LZ4_LIBRARY} ${CITYHASH_LIBRARIES})
 target_include_directories(clickhouse_compression PUBLIC ${DBMS_INCLUDE_DIR})
 target_include_directories(clickhouse_compression SYSTEM PUBLIC ${PCG_RANDOM_INCLUDE_DIR})
 
diff --git a/dbms/src/Functions/CMakeLists.txt b/dbms/src/Functions/CMakeLists.txt
index e767c17de92..6b4cfab15c1 100644
--- a/dbms/src/Functions/CMakeLists.txt
+++ b/dbms/src/Functions/CMakeLists.txt
@@ -11,11 +11,11 @@ add_library(clickhouse_functions ${LINK_MODE} ${clickhouse_functions_sources})
 
 target_link_libraries(clickhouse_functions
     PUBLIC
-        dbms
-    PRIVATE
         clickhouse_dictionaries
+        dbms
         ${CONSISTENT_HASHING_LIBRARY}
         consistent-hashing-sumbur
+        ${CITYHASH_LIBRARIES}
         ${FARMHASH_LIBRARIES}
         ${METROHASH_LIBRARIES}
         murmurhash

From a8106360bd00297b18bd8a238afa0905d807150b Mon Sep 17 00:00:00 2001
From: chertus <chertus@gmail.com>
Date: Tue, 5 Mar 2019 13:57:05 +0300
Subject: [PATCH 54/69] fix ASTIdentifier print (compound could be short)

---
 dbms/src/Parsers/ASTIdentifier.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/dbms/src/Parsers/ASTIdentifier.cpp b/dbms/src/Parsers/ASTIdentifier.cpp
index c8b3d719b3b..1a9db37391a 100644
--- a/dbms/src/Parsers/ASTIdentifier.cpp
+++ b/dbms/src/Parsers/ASTIdentifier.cpp
@@ -52,9 +52,8 @@ void ASTIdentifier::formatImplWithoutAlias(const FormatSettings & settings, Form
         settings.ostr << (settings.hilite ? hilite_none : "");
     };
 
-    /// A simple or compound identifier?
-
-    if (name_parts.size() > 1)
+    /// It could be compound but short
+    if (!isShort())
     {
         for (size_t i = 0, size = name_parts.size(); i < size; ++i)
         {

From 0afb7dfc7f87fa62c5769ceefb646a11856273b9 Mon Sep 17 00:00:00 2001
From: proller <proller@github.com>
Date: Tue, 5 Mar 2019 14:13:31 +0300
Subject: [PATCH 55/69] Fix include

---
 dbms/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
index e6072f65b5f..76d4ebd7dbf 100644
--- a/dbms/CMakeLists.txt
+++ b/dbms/CMakeLists.txt
@@ -203,7 +203,6 @@ target_link_libraries (clickhouse_common_io
     roaring	
 )
 
-target_include_directories(clickhouse_common_io SYSTEM BEFORE PUBLIC ${PDQSORT_INCLUDE_DIR})
 
 target_include_directories(clickhouse_common_io SYSTEM BEFORE PUBLIC ${RE2_INCLUDE_DIR})
 
@@ -235,6 +234,7 @@ target_link_libraries (dbms
     Threads::Threads
 )
 
+target_include_directories(dbms SYSTEM BEFORE PUBLIC ${PDQSORT_INCLUDE_DIR})
 
 if (NOT USE_INTERNAL_BOOST_LIBRARY)
     target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${Boost_INCLUDE_DIRS})

From ea4d3ec66178dbd4beeab556ce86d28f90909ebe Mon Sep 17 00:00:00 2001
From: chertus <chertus@gmail.com>
Date: Tue, 5 Mar 2019 15:34:48 +0300
Subject: [PATCH 56/69] some multiple join fixes

---
 .../JoinToSubqueryTransformVisitor.cpp        | 25 +++++++++++++------
 .../00820_multiple_joins.reference            |  4 +++
 .../0_stateless/00820_multiple_joins.sql      | 10 ++++----
 .../00847_multiple_join_same_column.reference | 16 +++++++++---
 .../00847_multiple_join_same_column.sql       | 20 +++++++--------
 5 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
index 55a036f7d74..5a1f7260a4f 100644
--- a/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
+++ b/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
@@ -36,9 +36,10 @@ struct ColumnAliasesMatcher
     {
         const std::vector<DatabaseAndTableWithAlias> tables;
         bool public_names;
-        AsteriskSemantic::RevertedAliases rev_aliases;
-        std::unordered_map<String, String> aliases;
+        AsteriskSemantic::RevertedAliases rev_aliases;  /// long_name -> aliases
+        std::unordered_map<String, String> aliases;     /// alias -> long_name
         std::vector<std::pair<ASTIdentifier *, bool>> compound_identifiers;
+        std::set<String> allowed_long_names;            /// original names allowed as aliases '--t.x as t.x' (select expressions only).
 
         Data(std::vector<DatabaseAndTableWithAlias> && tables_)
             : tables(tables_)
@@ -51,29 +52,37 @@ struct ColumnAliasesMatcher
 
             for (auto & [identifier, is_public] : compound_identifiers)
             {
-                auto it = rev_aliases.find(identifier->name);
+                String long_name = identifier->name;
+
+                auto it = rev_aliases.find(long_name);
                 if (it == rev_aliases.end())
                 {
                     bool last_table = IdentifierSemantic::canReferColumnToTable(*identifier, tables.back());
                     if (!last_table)
                     {
-                        String long_name = identifier->name;
                         String alias = hide_prefix + long_name;
                         aliases[alias] = long_name;
                         rev_aliases[long_name].push_back(alias);
 
                         identifier->setShortName(alias);
                         if (is_public)
+                        {
                             identifier->setAlias(long_name);
+                            allowed_long_names.insert(long_name);
+                        }
                     }
                     else if (is_public)
-                        identifier->setAlias(identifier->name); /// prevent crop long to short name
+                        identifier->setAlias(long_name); /// prevent crop long to short name
                 }
                 else
                 {
                     if (it->second.empty())
-                        throw Exception("No alias for '" + identifier->name + "'", ErrorCodes::LOGICAL_ERROR);
-                    identifier->setShortName(it->second[0]);
+                        throw Exception("No alias for '" + long_name + "'", ErrorCodes::LOGICAL_ERROR);
+
+                    if (is_public && allowed_long_names.count(long_name))
+                        ; /// leave original name unchanged for correct output
+                    else
+                        identifier->setShortName(it->second[0]);
                 }
             }
         }
@@ -131,7 +140,7 @@ struct ColumnAliasesMatcher
                 node.setAlias("");
             }
         }
-        else
+        else if (node.compound())
             data.compound_identifiers.emplace_back(&node, data.public_names);
     }
 };
diff --git a/dbms/tests/queries/0_stateless/00820_multiple_joins.reference b/dbms/tests/queries/0_stateless/00820_multiple_joins.reference
index 93744e2c46d..2e7d8660562 100644
--- a/dbms/tests/queries/0_stateless/00820_multiple_joins.reference
+++ b/dbms/tests/queries/0_stateless/00820_multiple_joins.reference
@@ -32,3 +32,7 @@
 6	6	60	60
 12	12	120	120
 18	18	180	180
+0	0	0	0	0	0	0
+6	6	60	60	66	66	120
+12	12	120	120	132	132	240
+18	18	180	180	198	198	360
diff --git a/dbms/tests/queries/0_stateless/00820_multiple_joins.sql b/dbms/tests/queries/0_stateless/00820_multiple_joins.sql
index b61777419d9..c19f4467934 100644
--- a/dbms/tests/queries/0_stateless/00820_multiple_joins.sql
+++ b/dbms/tests/queries/0_stateless/00820_multiple_joins.sql
@@ -69,11 +69,11 @@ from table1 as t1
 join table2 as t2 on t1.a = t2.a
 join table3 as t3 on t2.b = t3.b; -- { serverError 48 }
 
---select t1.a as t1_a, t2.a as t2_a, t2.b as t2_b, t3.b as t3_b,
---    (t1.a + table2.b) as t1_t2_x, (table1.a + table3.b) as t1_t3_x, (t2.b + t3.b) as t2_t3_x
---from table1 as t1
---join table2 as t2 on t1_a = t2_a
---join table3 as t3 on t2_b = t3_b;
+select t1.a as t1_a, t2.a as t2_a, t2.b as t2_b, t3.b as t3_b,
+    (t1.a + table2.b) as t1_t2_x, (table1.a + table3.b) as t1_t3_x, (t2.b + t3.b) as t2_t3_x
+from table1 as t1
+join table2 as t2 on t1_a = t2_a
+join table3 as t3 on t2_b = t3_b;
 
 --select (t1.a + table2.b) as t1_t2_x, (table1.a + table3.b) as t1_t3_x, (t2.b + t3.b) as t2_t3_x
 --from table1 as t1
diff --git a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.reference b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.reference
index d4d045a7796..1685a298042 100644
--- a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.reference
+++ b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.reference
@@ -31,7 +31,15 @@ y.b: 0
 │   1 │   1 │   1 │
 │   2 │   2 │   2 │
 └─────┴─────┴─────┘
-┌─s.a─┬─s.a─┐
-│   1 │   1 │
-│   0 │   0 │
-└─────┴─────┘
+┌─s.a─┬─s.a─┬─s_b─┬─s_b─┐
+│   1 │   1 │   1 │   1 │
+│   0 │   0 │   0 │   0 │
+└─────┴─────┴─────┴─────┘
+┌─y.a─┬─y.a─┬─y_b─┬─y_b─┐
+│   1 │   1 │   1 │   1 │
+│   0 │   0 │   0 │   0 │
+└─────┴─────┴─────┴─────┘
+┌─t_a─┬─t_a─┬─s_a─┬─s_a─┬─y_a─┬─y_a─┐
+│   1 │   1 │   1 │   1 │   1 │   1 │
+│   2 │   2 │   0 │   0 │   0 │   0 │
+└─────┴─────┴─────┴─────┴─────┴─────┘
diff --git a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql
index 08aca31f99e..b4ce61e3ee7 100644
--- a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql
+++ b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql
@@ -28,21 +28,21 @@ left join s on s.a = t.a
 left join y on y.b = s.b format PrettyCompactNoEscapes;
 
 select t.a, t.a, t.b as t_b from t
+left join s on t.a = s.a
+left join y on y.b = s.b format PrettyCompactNoEscapes;
+
+select s.a, s.a, s.b as s_b, s.b from t
+left join s on s.a = t.a
+left join y on s.b = y.b format PrettyCompactNoEscapes;
+
+select y.a, y.a, y.b as y_b, y.b from t
 left join s on s.a = t.a
 left join y on y.b = s.b format PrettyCompactNoEscapes;
 
-select s.a, s.a from t
-left join s on s.a = t.a
+select t.a, t.a as t_a, s.a, s.a as s_a, y.a, y.a as y_a from t
+left join s on t.a = s.a
 left join y on y.b = s.b format PrettyCompactNoEscapes;
 
---select t.a, t.a, t.b as t_b, t.b from t
---left join s on s.a = t.a
---left join y on y.b = s.b format PrettyCompactNoEscapes;
-
---select t.a, t.a, s.b as s_b, s.b from t
---left join s on s.a = t.a
---left join y on y.b = s.b format PrettyCompactNoEscapes;
-
 drop table t;
 drop table s;
 drop table y;

From 0c0d9343d46d20db0e403d2364de460a666020d0 Mon Sep 17 00:00:00 2001
From: chertus <chertus@gmail.com>
Date: Tue, 5 Mar 2019 15:49:00 +0300
Subject: [PATCH 57/69] update ASTIdentifier prints for push down predicate
 test reference (long vs short names)

---
 .../0_stateless/00597_push_down_predicate.reference    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dbms/tests/queries/0_stateless/00597_push_down_predicate.reference b/dbms/tests/queries/0_stateless/00597_push_down_predicate.reference
index ff97111f6b7..94adaffd52f 100644
--- a/dbms/tests/queries/0_stateless/00597_push_down_predicate.reference
+++ b/dbms/tests/queries/0_stateless/00597_push_down_predicate.reference
@@ -20,7 +20,7 @@ SELECT \n    a, \n    b\nFROM \n(\n    SELECT \n        toUInt64(sum(id) AS b) A
 3	3
 SELECT \n    date, \n    id, \n    name, \n    value\nFROM \n(\n    SELECT \n        date, \n        name, \n        value, \n        min(id) AS id\n    FROM test.test \n    GROUP BY \n        date, \n        name, \n        value\n    HAVING id = 1\n) \nWHERE id = 1
 2000-01-01	1	test string 1	1
-SELECT \n    a, \n    b\nFROM \n(\n    SELECT \n        toUInt64(sum(id) AS b) AS a, \n        b\n    FROM test.test AS table_alias \n    HAVING b = 3\n) AS outer_table_alias \nWHERE outer_table_alias.b = 3
+SELECT \n    a, \n    b\nFROM \n(\n    SELECT \n        toUInt64(sum(id) AS b) AS a, \n        b\n    FROM test.test AS table_alias \n    HAVING b = 3\n) AS outer_table_alias \nWHERE b = 3
 3	3
 SELECT \n    date, \n    id, \n    name, \n    value\nFROM \n(\n    SELECT \n        date, \n        id, \n        name, \n        value\n    FROM test.test \n    WHERE id = 1\n) \nWHERE id = 1
 2000-01-01	1	test string 1	1
@@ -32,9 +32,9 @@ SELECT \n    date, \n    id, \n    name, \n    value\nFROM \n(\n    SELECT \n
 2000-01-01	1	test string 1	1
 SELECT \n    date, \n    id, \n    name, \n    value\nFROM \n(\n    SELECT \n        date, \n        id, \n        name, \n        value\n    FROM \n    (\n        SELECT \n            date, \n            id, \n            name, \n            value\n        FROM test.test \n        WHERE id = 1\n    ) \n    WHERE id = 1\n) \nWHERE id = 1
 2000-01-01	1	test string 1	1
-SELECT \n    date, \n    id, \n    name, \n    value\nFROM \n(\n    SELECT \n        date, \n        id, \n        name, \n        value\n    FROM test.test \n    WHERE id = 1\n) AS b \nWHERE b.id = 1
+SELECT \n    date, \n    id, \n    name, \n    value\nFROM \n(\n    SELECT \n        date, \n        id, \n        name, \n        value\n    FROM test.test \n    WHERE id = 1\n) AS b \nWHERE id = 1
 2000-01-01	1	test string 1	1
-SELECT \n    date, \n    id, \n    name, \n    value\nFROM \n(\n    SELECT \n        date, \n        id, \n        name, \n        value\n    FROM \n    (\n        SELECT \n            date, \n            id, \n            name, \n            value\n        FROM test.test \n        WHERE id = 1\n    ) AS a \n    WHERE id = 1\n) AS b \nWHERE b.id = 1
+SELECT \n    date, \n    id, \n    name, \n    value\nFROM \n(\n    SELECT \n        date, \n        id, \n        name, \n        value\n    FROM \n    (\n        SELECT \n            date, \n            id, \n            name, \n            value\n        FROM test.test \n        WHERE id = 1\n    ) AS a \n    WHERE id = 1\n) AS b \nWHERE id = 1
 2000-01-01	1	test string 1	1
 SELECT \n    id, \n    date, \n    value\nFROM \n(\n    SELECT \n        id, \n        date, \n        min(value) AS value\n    FROM test.test \n    WHERE id = 1\n    GROUP BY \n        id, \n        date\n) \nWHERE id = 1
 1	2000-01-01	1
@@ -45,11 +45,11 @@ SELECT \n    date, \n    id, \n    name, \n    value, \n    date, \n    name, \n
 2000-01-01	1	test string 1	1	2000-01-01	test string 1	1
 SELECT \n    id, \n    date, \n    name, \n    value\nFROM \n(\n    SELECT toInt8(1) AS id\n) \nANY LEFT JOIN test.test USING (id)\nWHERE value = 1
 1	2000-01-01	test string 1	1
-SELECT b.value\nFROM \n(\n    SELECT toInt8(1) AS id\n) \nANY LEFT JOIN test.test AS b USING (id)\nWHERE value = 1
+SELECT value\nFROM \n(\n    SELECT toInt8(1) AS id\n) \nANY LEFT JOIN test.test AS b USING (id)\nWHERE value = 1
 1
 SELECT \n    date, \n    id, \n    name, \n    value\nFROM \n(\n    SELECT \n        date, \n        id, \n        name, \n        value, \n        date, \n        name, \n        value\n    FROM \n    (\n        SELECT \n            date, \n            id, \n            name, \n            value\n        FROM test.test \n        WHERE id = 1\n    ) \n    ANY LEFT JOIN \n    (\n        SELECT *\n        FROM test.test \n        WHERE id = 1\n    ) USING (id)\n    WHERE id = 1\n) \nWHERE id = 1
 2000-01-01	1	test string 1	1
-SELECT \n    date, \n    id, \n    name, \n    value, \n    `b.date`, \n    `b.name`, \n    `b.value`\nFROM \n(\n    SELECT \n        date, \n        id, \n        name, \n        value\n    FROM test.test \n) \nANY LEFT JOIN \n(\n    SELECT *\n    FROM test.test \n    WHERE id = 1\n) AS b USING (id)\nWHERE b.id = 1
+SELECT \n    date, \n    id, \n    name, \n    value, \n    b.date, \n    b.name, \n    b.value\nFROM \n(\n    SELECT \n        date, \n        id, \n        name, \n        value\n    FROM test.test \n) \nANY LEFT JOIN \n(\n    SELECT *\n    FROM test.test \n    WHERE id = 1\n) AS b USING (id)\nWHERE b.id = 1
 2000-01-01	1	test string 1	1	2000-01-01	test string 1	1
 SELECT \n    id, \n    date, \n    name, \n    value\nFROM \n(\n    SELECT \n        toInt8(1) AS id, \n        toDate(\'2000-01-01\') AS date\n    FROM system.numbers \n    LIMIT 1\n) \nANY LEFT JOIN \n(\n    SELECT *\n    FROM test.test \n    WHERE date = toDate(\'2000-01-01\')\n) AS b USING (date, id)\nWHERE b.date = toDate(\'2000-01-01\')
 1	2000-01-01	test string 1	1

From 718fd5b4ca01a1807d3cb2781f807168e366a5e3 Mon Sep 17 00:00:00 2001
From: proller <proller@github.com>
Date: Tue, 5 Mar 2019 16:03:11 +0300
Subject: [PATCH 58/69] Add exception code METRIKA_OTHER_ERROR

---
 dbms/src/Common/ErrorCodes.cpp | 1 +
 dbms/src/Common/Exception.h    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp
index 7d04c6b25c2..d3401427037 100644
--- a/dbms/src/Common/ErrorCodes.cpp
+++ b/dbms/src/Common/ErrorCodes.cpp
@@ -424,6 +424,7 @@ namespace ErrorCodes
     extern const int POCO_EXCEPTION = 1000;
     extern const int STD_EXCEPTION = 1001;
     extern const int UNKNOWN_EXCEPTION = 1002;
+    extern const int METRIKA_OTHER_ERROR = 1003;
 
     extern const int CONDITIONAL_TREE_PARENT_NOT_FOUND = 2001;
     extern const int ILLEGAL_PROJECTION_MANIPULATOR = 2002;
diff --git a/dbms/src/Common/Exception.h b/dbms/src/Common/Exception.h
index 43cb9e597c5..6b0656f4828 100644
--- a/dbms/src/Common/Exception.h
+++ b/dbms/src/Common/Exception.h
@@ -17,6 +17,7 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int POCO_EXCEPTION;
+    extern const int METRIKA_OTHER_ERROR;
 }
 
 class Exception : public Poco::Exception

From 72fe0115d1aabebf891719a44afd634121d17461 Mon Sep 17 00:00:00 2001
From: chertus <chertus@gmail.com>
Date: Tue, 5 Mar 2019 16:53:39 +0300
Subject: [PATCH 59/69] move join conversions to InterpreterSelectQuery

---
 .../Interpreters/InterpreterSelectQuery.cpp    | 14 ++++++++++++++
 dbms/src/Interpreters/executeQuery.cpp         | 18 ------------------
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp
index f2a76a20d0a..ebbfcf6d133 100644
--- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp
+++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp
@@ -38,6 +38,8 @@
 #include <Interpreters/convertFieldToType.h>
 #include <Interpreters/ExpressionAnalyzer.h>
 #include <Interpreters/DatabaseAndTableWithAlias.h>
+#include <Interpreters/JoinToSubqueryTransformVisitor.h>
+#include <Interpreters/CrossToInnerJoinVisitor.h>
 
 #include <Storages/MergeTree/MergeTreeWhereOptimizer.h>
 #include <Storages/IStorage.h>
@@ -155,6 +157,18 @@ InterpreterSelectQuery::InterpreterSelectQuery(
         throw Exception("Too deep subqueries. Maximum: " + settings.max_subquery_depth.toString(),
             ErrorCodes::TOO_DEEP_SUBQUERIES);
 
+    if (settings.allow_experimental_multiple_joins_emulation)
+    {
+        JoinToSubqueryTransformVisitor::Data join_to_subs_data;
+        JoinToSubqueryTransformVisitor(join_to_subs_data).visit(query_ptr);
+    }
+
+    if (settings.allow_experimental_cross_to_join_conversion)
+    {
+        CrossToInnerJoinVisitor::Data cross_to_inner;
+        CrossToInnerJoinVisitor(cross_to_inner).visit(query_ptr);
+    }
+
     max_streams = settings.max_threads;
 
     ASTPtr table_expression = extractTableExpression(query, 0);
diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp
index a5856fb6173..d8b4ff4ebe0 100644
--- a/dbms/src/Interpreters/executeQuery.cpp
+++ b/dbms/src/Interpreters/executeQuery.cpp
@@ -21,8 +21,6 @@
 #include <Parsers/parseQuery.h>
 #include <Parsers/queryToString.h>
 
-#include <Interpreters/JoinToSubqueryTransformVisitor.h>
-#include <Interpreters/CrossToInnerJoinVisitor.h>
 #include <Interpreters/Quota.h>
 #include <Interpreters/InterpreterFactory.h>
 #include <Interpreters/ProcessList.h>
@@ -196,22 +194,6 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
         if (!internal)
             logQuery(query.substr(0, settings.log_queries_cut_to_length), context);
 
-        if (!internal && settings.allow_experimental_multiple_joins_emulation)
-        {
-            JoinToSubqueryTransformVisitor::Data join_to_subs_data;
-            JoinToSubqueryTransformVisitor(join_to_subs_data).visit(ast);
-            if (join_to_subs_data.done)
-                logQuery(queryToString(*ast), context);
-        }
-
-        if (!internal && settings.allow_experimental_cross_to_join_conversion)
-        {
-            CrossToInnerJoinVisitor::Data cross_to_inner;
-            CrossToInnerJoinVisitor(cross_to_inner).visit(ast);
-            if (cross_to_inner.done)
-                logQuery(queryToString(*ast), context);
-        }
-
         /// Check the limits.
         checkASTSizeLimits(*ast, settings);
 

From 22f699c8a0a278623142c13d23ba88293d4a4d52 Mon Sep 17 00:00:00 2001
From: chertus <chertus@gmail.com>
Date: Tue, 5 Mar 2019 18:16:59 +0300
Subject: [PATCH 60/69] fix CrossToInnerJoin (empty where crash & where for one
 table wrong result)

---
 dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp        | 9 ++++++---
 .../0_stateless/00826_cross_to_inner_join.reference      | 4 +++-
 .../queries/0_stateless/00826_cross_to_inner_join.sql    | 5 +++++
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp b/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp
index 43f29046e9b..8c74ddf699a 100644
--- a/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp
+++ b/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp
@@ -192,6 +192,9 @@ void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr & ast, Data
     using CheckExpressionMatcher = OneTypeMatcher<CheckExpressionVisitorData, false>;
     using CheckExpressionVisitor = InDepthNodeVisitor<CheckExpressionMatcher, true>;
 
+    if (!select.where_expression)
+        return;
+
     std::vector<DatabaseAndTableWithAlias> table_names;
     ASTPtr ast_join = getCrossJoin(select, table_names);
     if (!ast_join)
@@ -215,10 +218,10 @@ void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr & ast, Data
             select.where_expression.reset();
 
         join.children.push_back(join.on_expression);
+
+        ast = ast->clone(); /// rewrite AST in right manner
+        data.done = true;
     }
-
-    ast = ast->clone(); /// rewrite AST in right manner
-    data.done = true;
 }
 
 }
diff --git a/dbms/tests/queries/0_stateless/00826_cross_to_inner_join.reference b/dbms/tests/queries/0_stateless/00826_cross_to_inner_join.reference
index 73c8a9f9ce4..93cdf438a0f 100644
--- a/dbms/tests/queries/0_stateless/00826_cross_to_inner_join.reference
+++ b/dbms/tests/queries/0_stateless/00826_cross_to_inner_join.reference
@@ -1,3 +1,5 @@
+0	0
+0	0
 cross
 1	1	1	1
 1	1	1	2
@@ -67,7 +69,7 @@ Explain ParsedAST (children 1)\n SelectWithUnionQuery (children 1)\n  Expression
 Explain ParsedAST (children 1)\n SelectWithUnionQuery (children 1)\n  ExpressionList (children 1)\n   SelectQuery (children 2)\n    ExpressionList (children 1)\n     Asterisk\n    TablesInSelectQuery (children 2)\n     TablesInSelectQueryElement (children 1)\n      TableExpression (children 1)\n       Identifier t1 (alias x)\n     TablesInSelectQueryElement (children 2)\n      TableJoin (children 1)\n       Function and (children 1)\n        ExpressionList (children 2)\n         Function equals (children 1)\n          ExpressionList (children 2)\n           Identifier x.a\n           Identifier y.a\n         Function equals (children 1)\n          ExpressionList (children 2)\n           Identifier x.b\n           Identifier y.b\n      TableExpression (children 1)\n       Identifier t1 (alias y)\n
 cross one table expr
 Explain ParsedAST (children 1)\n SelectWithUnionQuery (children 1)\n  ExpressionList (children 1)\n   SelectQuery (children 3)\n    ExpressionList (children 1)\n     Asterisk\n    TablesInSelectQuery (children 2)\n     TablesInSelectQueryElement (children 1)\n      TableExpression (children 1)\n       Identifier t1\n     TablesInSelectQueryElement (children 2)\n      TableExpression (children 1)\n       Identifier t2\n      TableJoin\n    Function equals (children 1)\n     ExpressionList (children 2)\n      Identifier t1.a\n      Identifier t1.b\n
-Explain ParsedAST (children 1)\n SelectWithUnionQuery (children 1)\n  ExpressionList (children 1)\n   SelectQuery (children 3)\n    ExpressionList (children 1)\n     Asterisk\n    TablesInSelectQuery (children 2)\n     TablesInSelectQueryElement (children 1)\n      TableExpression (children 1)\n       Identifier t1\n     TablesInSelectQueryElement (children 2)\n      TableJoin\n      TableExpression (children 1)\n       Identifier t2\n    Function equals (children 1)\n     ExpressionList (children 2)\n      Identifier t1.a\n      Identifier t1.b\n
+Explain ParsedAST (children 1)\n SelectWithUnionQuery (children 1)\n  ExpressionList (children 1)\n   SelectQuery (children 3)\n    ExpressionList (children 1)\n     Asterisk\n    TablesInSelectQuery (children 2)\n     TablesInSelectQueryElement (children 1)\n      TableExpression (children 1)\n       Identifier t1\n     TablesInSelectQueryElement (children 2)\n      TableExpression (children 1)\n       Identifier t2\n      TableJoin\n    Function equals (children 1)\n     ExpressionList (children 2)\n      Identifier t1.a\n      Identifier t1.b\n
 cross multiple ands
 Explain ParsedAST (children 1)\n SelectWithUnionQuery (children 1)\n  ExpressionList (children 1)\n   SelectQuery (children 3)\n    ExpressionList (children 1)\n     Asterisk\n    TablesInSelectQuery (children 2)\n     TablesInSelectQueryElement (children 1)\n      TableExpression (children 1)\n       Identifier t1\n     TablesInSelectQueryElement (children 2)\n      TableExpression (children 1)\n       Identifier t2\n      TableJoin\n    Function and (children 1)\n     ExpressionList (children 2)\n      Function equals (children 1)\n       ExpressionList (children 2)\n        Identifier t1.a\n        Identifier t2.a\n      Function equals (children 1)\n       ExpressionList (children 2)\n        Identifier t1.b\n        Identifier t2.b\n
 Explain ParsedAST (children 1)\n SelectWithUnionQuery (children 1)\n  ExpressionList (children 1)\n   SelectQuery (children 2)\n    ExpressionList (children 1)\n     Asterisk\n    TablesInSelectQuery (children 2)\n     TablesInSelectQueryElement (children 1)\n      TableExpression (children 1)\n       Identifier t1\n     TablesInSelectQueryElement (children 2)\n      TableJoin (children 1)\n       Function and (children 1)\n        ExpressionList (children 2)\n         Function equals (children 1)\n          ExpressionList (children 2)\n           Identifier t1.a\n           Identifier t2.a\n         Function equals (children 1)\n          ExpressionList (children 2)\n           Identifier t1.b\n           Identifier t2.b\n      TableExpression (children 1)\n       Identifier t2\n
diff --git a/dbms/tests/queries/0_stateless/00826_cross_to_inner_join.sql b/dbms/tests/queries/0_stateless/00826_cross_to_inner_join.sql
index 26d8d5abd57..218ea1f1e45 100644
--- a/dbms/tests/queries/0_stateless/00826_cross_to_inner_join.sql
+++ b/dbms/tests/queries/0_stateless/00826_cross_to_inner_join.sql
@@ -1,6 +1,11 @@
 SET enable_debug_queries = 1;
 USE test;
 
+set allow_experimental_cross_to_join_conversion = 0;
+select * from system.one cross join system.one;
+set allow_experimental_cross_to_join_conversion = 1;
+select * from system.one cross join system.one;
+
 DROP TABLE IF EXISTS t1;
 DROP TABLE IF EXISTS t2;
 

From 049d49333dc12c5004f9e8a6618f2d850c0f869f Mon Sep 17 00:00:00 2001
From: chertus <chertus@gmail.com>
Date: Tue, 5 Mar 2019 18:21:52 +0300
Subject: [PATCH 61/69] undo last change

---
 .../Interpreters/InterpreterSelectQuery.cpp    | 14 --------------
 dbms/src/Interpreters/executeQuery.cpp         | 18 ++++++++++++++++++
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp
index ebbfcf6d133..f2a76a20d0a 100644
--- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp
+++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp
@@ -38,8 +38,6 @@
 #include <Interpreters/convertFieldToType.h>
 #include <Interpreters/ExpressionAnalyzer.h>
 #include <Interpreters/DatabaseAndTableWithAlias.h>
-#include <Interpreters/JoinToSubqueryTransformVisitor.h>
-#include <Interpreters/CrossToInnerJoinVisitor.h>
 
 #include <Storages/MergeTree/MergeTreeWhereOptimizer.h>
 #include <Storages/IStorage.h>
@@ -157,18 +155,6 @@ InterpreterSelectQuery::InterpreterSelectQuery(
         throw Exception("Too deep subqueries. Maximum: " + settings.max_subquery_depth.toString(),
             ErrorCodes::TOO_DEEP_SUBQUERIES);
 
-    if (settings.allow_experimental_multiple_joins_emulation)
-    {
-        JoinToSubqueryTransformVisitor::Data join_to_subs_data;
-        JoinToSubqueryTransformVisitor(join_to_subs_data).visit(query_ptr);
-    }
-
-    if (settings.allow_experimental_cross_to_join_conversion)
-    {
-        CrossToInnerJoinVisitor::Data cross_to_inner;
-        CrossToInnerJoinVisitor(cross_to_inner).visit(query_ptr);
-    }
-
     max_streams = settings.max_threads;
 
     ASTPtr table_expression = extractTableExpression(query, 0);
diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp
index d8b4ff4ebe0..a5856fb6173 100644
--- a/dbms/src/Interpreters/executeQuery.cpp
+++ b/dbms/src/Interpreters/executeQuery.cpp
@@ -21,6 +21,8 @@
 #include <Parsers/parseQuery.h>
 #include <Parsers/queryToString.h>
 
+#include <Interpreters/JoinToSubqueryTransformVisitor.h>
+#include <Interpreters/CrossToInnerJoinVisitor.h>
 #include <Interpreters/Quota.h>
 #include <Interpreters/InterpreterFactory.h>
 #include <Interpreters/ProcessList.h>
@@ -194,6 +196,22 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
         if (!internal)
             logQuery(query.substr(0, settings.log_queries_cut_to_length), context);
 
+        if (!internal && settings.allow_experimental_multiple_joins_emulation)
+        {
+            JoinToSubqueryTransformVisitor::Data join_to_subs_data;
+            JoinToSubqueryTransformVisitor(join_to_subs_data).visit(ast);
+            if (join_to_subs_data.done)
+                logQuery(queryToString(*ast), context);
+        }
+
+        if (!internal && settings.allow_experimental_cross_to_join_conversion)
+        {
+            CrossToInnerJoinVisitor::Data cross_to_inner;
+            CrossToInnerJoinVisitor(cross_to_inner).visit(ast);
+            if (cross_to_inner.done)
+                logQuery(queryToString(*ast), context);
+        }
+
         /// Check the limits.
         checkASTSizeLimits(*ast, settings);
 

From 4f80afb158575480c196af057d49e80ba1f05108 Mon Sep 17 00:00:00 2001
From: chertus <chertus@gmail.com>
Date: Tue, 5 Mar 2019 19:34:10 +0300
Subject: [PATCH 62/69] fix gcc build

---
 dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
index a0b5aed0af3..3d8a67ae766 100644
--- a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
+++ b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
@@ -218,7 +218,6 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
         {
             DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->children[0]);
 
-            bool first_table = true;
             for (const auto & [table, table_columns] : tables_with_columns)
             {
                 if (ident_db_and_name.satisfies(table, true))
@@ -230,8 +229,6 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
                     }
                     break;
                 }
-
-                first_table = false;
             }
         }
         else

From fae0b054f91bd1c69479fa4afb621cbf9242b7f7 Mon Sep 17 00:00:00 2001
From: proller <proller@github.com>
Date: Tue, 5 Mar 2019 20:51:04 +0300
Subject: [PATCH 63/69] Arcadia fix

---
 dbms/src/DataTypes/IDataType.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dbms/src/DataTypes/IDataType.h b/dbms/src/DataTypes/IDataType.h
index c1c621eb19e..aa253fbdc08 100644
--- a/dbms/src/DataTypes/IDataType.h
+++ b/dbms/src/DataTypes/IDataType.h
@@ -262,8 +262,10 @@ protected:
 
     /** Text serialization with escaping but without quoting.
       */
+public: // Public rather than protected because it is called directly by code in Arcadia.
     virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;
 
+protected:
     virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0;
 
     /** Text serialization as a literal that may be inserted into a query.

From ddbada664602c21cfe17b1e0cfb9d98b3b42303c Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Tue, 5 Mar 2019 23:34:37 +0300
Subject: [PATCH 64/69] Fixed build #4583

---
 dbms/src/Dictionaries/readInvalidateQuery.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dbms/src/Dictionaries/readInvalidateQuery.h b/dbms/src/Dictionaries/readInvalidateQuery.h
index a906c8f887f..48fbfa03629 100644
--- a/dbms/src/Dictionaries/readInvalidateQuery.h
+++ b/dbms/src/Dictionaries/readInvalidateQuery.h
@@ -1,11 +1,11 @@
 #pragma once
 #include <string>
 
-class IBlockInputStream;
-
 namespace DB
 {
 
+class IBlockInputStream;
+
 /// Using in MySQLDictionarySource and XDBCDictionarySource after processing invalidate_query.
 std::string readInvalidateQuery(IBlockInputStream & block_input_stream);
 

From 899b2548c77ba527df51c79306d6d81918a6efe0 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Tue, 5 Mar 2019 23:49:21 +0300
Subject: [PATCH 65/69] Added comment #4572

---
 dbms/src/Common/ThreadPool.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbms/src/Common/ThreadPool.cpp b/dbms/src/Common/ThreadPool.cpp
index a985e0486be..6ed350240c6 100644
--- a/dbms/src/Common/ThreadPool.cpp
+++ b/dbms/src/Common/ThreadPool.cpp
@@ -157,6 +157,7 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
             }
             else
             {
+                /// shutdown is true, simply finish the thread.
                 return;
             }
         }

From ffc9fbf769492c4f10e43722326060d401ba0981 Mon Sep 17 00:00:00 2001
From: proller <proller@users.noreply.github.com>
Date: Wed, 6 Mar 2019 02:53:35 +0300
Subject: [PATCH 66/69] Build fixes (#4600)

* Add Y_IGNORE

* Build fixes
---
 dbms/src/Formats/ProtobufWriter.h | 46 +++++++++++++++----------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/dbms/src/Formats/ProtobufWriter.h b/dbms/src/Formats/ProtobufWriter.h
index aaa9b9a2f9b..aba3a2b2dc6 100644
--- a/dbms/src/Formats/ProtobufWriter.h
+++ b/dbms/src/Formats/ProtobufWriter.h
@@ -238,29 +238,29 @@ using ConstAggregateDataPtr = const char *;
 class ProtobufWriter
 {
 public:
-    bool writeNumber(Int8 value) { return false; }
-    bool writeNumber(UInt8 value) { return false; }
-    bool writeNumber(Int16 value) { return false; }
-    bool writeNumber(UInt16 value) { return false; }
-    bool writeNumber(Int32 value) { return false; }
-    bool writeNumber(UInt32 value) { return false; }
-    bool writeNumber(Int64 value) { return false; }
-    bool writeNumber(UInt64 value) { return false; }
-    bool writeNumber(UInt128 value) { return false; }
-    bool writeNumber(Float32 value) { return false; }
-    bool writeNumber(Float64 value) { return false; }
-    bool writeString(const StringRef & value) { return false; }
-    void prepareEnumMapping(const std::vector<std::pair<std::string, Int8>> & name_value_pairs) {}
-    void prepareEnumMapping(const std::vector<std::pair<std::string, Int16>> & name_value_pairs) {}
-    bool writeEnum(Int8 value) { return false; }
-    bool writeEnum(Int16 value) { return false; }
-    bool writeUUID(const UUID & value) { return false; }
-    bool writeDate(DayNum date) { return false; }
-    bool writeDateTime(time_t tm) { return false; }
-    bool writeDecimal(Decimal32 decimal, UInt32 scale) { return false; }
-    bool writeDecimal(Decimal64 decimal, UInt32 scale) { return false; }
-    bool writeDecimal(const Decimal128 & decimal, UInt32 scale) { return false; }
-    bool writeAggregateFunction(const AggregateFunctionPtr & function, ConstAggregateDataPtr place) { return false; }
+    bool writeNumber(Int8 /* value */) { return false; }
+    bool writeNumber(UInt8 /* value */) { return false; }
+    bool writeNumber(Int16 /* value */) { return false; }
+    bool writeNumber(UInt16 /* value */) { return false; }
+    bool writeNumber(Int32 /* value */) { return false; }
+    bool writeNumber(UInt32 /* value */) { return false; }
+    bool writeNumber(Int64 /* value */) { return false; }
+    bool writeNumber(UInt64 /* value */) { return false; }
+    bool writeNumber(UInt128 /* value */) { return false; }
+    bool writeNumber(Float32 /* value */) { return false; }
+    bool writeNumber(Float64 /* value */) { return false; }
+    bool writeString(const StringRef & /* value */) { return false; }
+    void prepareEnumMapping(const std::vector<std::pair<std::string, Int8>> & /* name_value_pairs */) {}
+    void prepareEnumMapping(const std::vector<std::pair<std::string, Int16>> & /* name_value_pairs */) {}
+    bool writeEnum(Int8 /* value */) { return false; }
+    bool writeEnum(Int16 /* value */) { return false; }
+    bool writeUUID(const UUID & /* value */) { return false; }
+    bool writeDate(DayNum /* date */) { return false; }
+    bool writeDateTime(time_t /* tm */) { return false; }
+    bool writeDecimal(Decimal32 /* decimal */, UInt32 /* scale */) { return false; }
+    bool writeDecimal(Decimal64 /* decimal */, UInt32 /* scale */) { return false; }
+    bool writeDecimal(const Decimal128 & /* decimal */, UInt32 /* scale */) { return false; }
+    bool writeAggregateFunction(const AggregateFunctionPtr & /* function */, ConstAggregateDataPtr /* place */) { return false; }
 };
 
 }

From 2e6dde45b4ca812118ab1f8a6f3e50a64ddecae9 Mon Sep 17 00:00:00 2001
From: alexey-milovidov <milovidov@yandex-team.ru>
Date: Wed, 6 Mar 2019 03:09:49 +0300
Subject: [PATCH 67/69] Update GraphiteRollupSortedBlockInputStream.cpp

---
 .../GraphiteRollupSortedBlockInputStream.cpp  | 75 +++++++++----------
 1 file changed, 37 insertions(+), 38 deletions(-)

diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp
index 6c1983568bb..fb24d8c37a4 100644
--- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp
+++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp
@@ -49,54 +49,53 @@ Graphite::RollupRule GraphiteRollupSortedBlockInputStream::selectPatternForPath(
     const Graphite::Pattern * first_match = &undef_pattern;
 
     for (const auto & pattern : params.patterns)
-        if (!pattern.regexp || pattern.regexp->match(path.data, path.size))
+    {
+        if (!pattern.regexp)
         {
-            if (!pattern.regexp)
+            /// Default pattern
+            if (first_match->type == first_match->TypeUndef && pattern.type == pattern.TypeAll)
             {
-                /// Default pattern
-                if (first_match->type == first_match->TypeUndef && pattern.type == pattern.TypeAll)
-                {
-                    /// There is only default pattern for both retention and aggregation
-                    return std::pair(&pattern, &pattern);
-                }
-                if (pattern.type != first_match->type)
-                {
-                    if (first_match->type == first_match->TypeRetention)
-                    {
-                        return std::pair(first_match, &pattern);
-                    }
-                    if (first_match->type == first_match->TypeAggregation)
-                    {
-                        return std::pair(&pattern, first_match);
-                    }
-                }
+                /// There is only default pattern for both retention and aggregation
+                return std::pair(&pattern, &pattern);
             }
-            else
+            if (pattern.type != first_match->type)
             {
-                /// General pattern with matched path
-                if (pattern.type == pattern.TypeAll)
+                if (first_match->type == first_match->TypeRetention)
                 {
-                   /// Only for not default patterns with both function and retention parameters
-                   return std::pair(&pattern, &pattern);
+                    return std::pair(first_match, &pattern);
                 }
-                if (first_match->type == first_match->TypeUndef)
+                if (first_match->type == first_match->TypeAggregation)
                 {
-                    first_match = &pattern;
-                    continue;
-                }
-                if (pattern.type != first_match->type)
-                {
-                    if (first_match->type == first_match->TypeRetention)
-                    {
-                        return std::pair(first_match, &pattern);
-                    }
-                    if (first_match->type == first_match->TypeAggregation)
-                    {
-                        return std::pair(&pattern, first_match);
-                    }
+                    return std::pair(&pattern, first_match);
                 }
             }
         }
+        else if (pattern.regexp->match(path.data, path.size))
+        {
+            /// General pattern with matched path
+            if (pattern.type == pattern.TypeAll)
+            {
+               /// Only for not default patterns with both function and retention parameters
+               return std::pair(&pattern, &pattern);
+            }
+            if (first_match->type == first_match->TypeUndef)
+            {
+                first_match = &pattern;
+                continue;
+            }
+            if (pattern.type != first_match->type)
+            {
+                if (first_match->type == first_match->TypeRetention)
+                {
+                    return std::pair(first_match, &pattern);
+                }
+                if (first_match->type == first_match->TypeAggregation)
+                {
+                    return std::pair(&pattern, first_match);
+                }
+            }
+        }
+    }
 
     return {nullptr, nullptr};
 }

From 01f7efe61534388f9d6cf693a11c2b2136176531 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Wed, 6 Mar 2019 03:44:25 +0300
Subject: [PATCH 68/69] Added support for clang-9

---
 dbms/CMakeLists.txt | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
index 76d4ebd7dbf..8de06ff6ac0 100644
--- a/dbms/CMakeLists.txt
+++ b/dbms/CMakeLists.txt
@@ -59,6 +59,14 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
     if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)
         set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra-semi-stmt -Wshadow-field -Wstring-plus-int")
     endif ()
+
+    if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9)
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra-semi-stmt -Wshadow-field -Wstring-plus-int")
+
+        if (WEVERYTHING)
+            set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ctad-maybe-unsupported")
+        endif ()
+    endif ()
 endif ()
 
 if (USE_DEBUG_HELPERS)
@@ -200,7 +208,7 @@ target_link_libraries (clickhouse_common_io
         PRIVATE
     ${CMAKE_DL_LIBS}
         PUBLIC
-    roaring	
+    roaring
 )
 
 

From 98fe1a6e8bbd0a781d3cb90198cd545ef4f261ae Mon Sep 17 00:00:00 2001
From: alexey-milovidov <milovidov@yandex-team.ru>
Date: Wed, 6 Mar 2019 04:01:53 +0300
Subject: [PATCH 69/69] Update CMakeLists.txt

---
 dbms/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt
index 8de06ff6ac0..900b1e0a650 100644
--- a/dbms/CMakeLists.txt
+++ b/dbms/CMakeLists.txt
@@ -61,8 +61,6 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
     endif ()
 
     if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9)
-        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra-semi-stmt -Wshadow-field -Wstring-plus-int")
-
         if (WEVERYTHING)
             set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ctad-maybe-unsupported")
         endif ()