Mirror of https://github.com/ClickHouse/ClickHouse.git (synced 2024-11-10 09:32:06 +00:00)

Merge pull request #21715 from ClickHouse/memcpy-bench
Add more variants for memcpy benchmark

Commit: 6a455fe71d
utils/memcpy-bench/CMakeLists.txt
@@ -1,5 +1,22 @@
 enable_language(ASM)

-add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S)
-#target_compile_options(memcpy-bench PRIVATE -mavx)
-target_link_libraries(memcpy-bench PRIVATE dbms)
+add_executable (memcpy-bench
+    memcpy-bench.cpp
+    FastMemcpy.cpp
+    FastMemcpy_Avx.cpp
+    memcpy_jart.S
+    glibc/memcpy-ssse3.S
+    glibc/memcpy-ssse3-back.S
+    glibc/memmove-sse2-unaligned-erms.S
+    glibc/memmove-avx-unaligned-erms.S
+    glibc/memmove-avx512-unaligned-erms.S
+    glibc/memmove-avx512-no-vzeroupper.S
+    )
+
+add_compile_options(memcpy-bench PRIVATE -fno-tree-loop-distribute-patterns)
+
+set_source_files_properties(FastMemcpy.cpp PROPERTIES COMPILE_FLAGS "-Wno-old-style-cast")
+set_source_files_properties(FastMemcpy_Avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -Wno-old-style-cast -Wno-cast-qual -Wno-cast-align")
+
+target_link_libraries(memcpy-bench PRIVATE dbms boost::program_options)
utils/memcpy-bench/FastMemcpy.cpp (new file, 1 line)
@@ -0,0 +1 @@
#include "FastMemcpy.h"
@@ -93,7 +93,7 @@ static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src)
 /// Attribute is used to avoid an error with undefined behaviour sanitizer
 /// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
 /// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
-__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
+__attribute__((__no_sanitize__("undefined"))) inline void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
 {
     unsigned char *dd = ((unsigned char*)dst) + size;
     const unsigned char *ss = ((const unsigned char*)src) + size;
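The end pointers dd and ss above (destination and source plus size) are what let memcpy_tiny copy fixed-size chunks anchored at the end of the buffer, so the chunks for odd sizes may overlap instead of requiring a byte loop. A minimal C++ sketch of that overlapping-chunk idea (illustrative only; copy_8_to_16 is a made-up name, and this is not the benchmark's code, which dispatches on every size up to 128 bytes):

    #include <cstring>
    #include <cstdint>

    // Copy n bytes, 8 <= n <= 16, with two 8-byte moves; the second move may
    // overlap the first, which removes the need for a per-size branch chain.
    static inline void * copy_8_to_16(void * dst, const void * src, std::size_t n)
    {
        std::uint64_t head;
        std::uint64_t tail;
        std::memcpy(&head, src, 8);
        std::memcpy(&tail, static_cast<const char *>(src) + n - 8, 8);
        std::memcpy(dst, &head, 8);
        std::memcpy(static_cast<char *>(dst) + n - 8, &tail, 8);
        return dst;
    }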
utils/memcpy-bench/FastMemcpy_Avx.cpp (new file, 1 line)
@@ -0,0 +1 @@
#include "FastMemcpy_Avx.h"
utils/memcpy-bench/glibc/asm-syntax.h (new file, 26 lines)
@@ -0,0 +1,26 @@
#pragma once

/* Definitions for x86 syntax variations.
   Copyright (C) 1992-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library. Its master source is NOT part of
   the C library, however. The master source lives in the GNU MP Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#undef ALIGN
#define ALIGN(log) .align 1<<log

#undef L
#define L(body) .L##body
utils/memcpy-bench/glibc/dwarf2.h (new file, 592 lines)
@@ -0,0 +1,592 @@
|
||||
#pragma once
|
||||
|
||||
/* Declarations and definitions of codes relating to the DWARF2 symbolic
|
||||
debugging information format.
|
||||
Copyright (C) 1992-2020 Free Software Foundation, Inc.
|
||||
Contributed by Gary Funck (gary@intrepid.com). Derived from the
|
||||
DWARF 1 implementation written by Ron Guilmette (rfg@monkeys.com).
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef _DWARF2_H
|
||||
#define _DWARF2_H 1
|
||||
|
||||
/* This file is derived from the DWARF specification (a public document)
|
||||
Revision 2.0.0 (July 27, 1993) developed by the UNIX International
|
||||
Programming Languages Special Interest Group (UI/PLSIG) and distributed
|
||||
by UNIX International. Copies of this specification are available from
|
||||
UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */
|
||||
|
||||
/* This file is shared between GCC and GDB, and should not contain
|
||||
prototypes. */
|
||||
|
||||
#ifndef __ASSEMBLER__
|
||||
/* Tag names and codes. */
|
||||
|
||||
enum dwarf_tag
|
||||
{
|
||||
DW_TAG_padding = 0x00,
|
||||
DW_TAG_array_type = 0x01,
|
||||
DW_TAG_class_type = 0x02,
|
||||
DW_TAG_entry_point = 0x03,
|
||||
DW_TAG_enumeration_type = 0x04,
|
||||
DW_TAG_formal_parameter = 0x05,
|
||||
DW_TAG_imported_declaration = 0x08,
|
||||
DW_TAG_label = 0x0a,
|
||||
DW_TAG_lexical_block = 0x0b,
|
||||
DW_TAG_member = 0x0d,
|
||||
DW_TAG_pointer_type = 0x0f,
|
||||
DW_TAG_reference_type = 0x10,
|
||||
DW_TAG_compile_unit = 0x11,
|
||||
DW_TAG_string_type = 0x12,
|
||||
DW_TAG_structure_type = 0x13,
|
||||
DW_TAG_subroutine_type = 0x15,
|
||||
DW_TAG_typedef = 0x16,
|
||||
DW_TAG_union_type = 0x17,
|
||||
DW_TAG_unspecified_parameters = 0x18,
|
||||
DW_TAG_variant = 0x19,
|
||||
DW_TAG_common_block = 0x1a,
|
||||
DW_TAG_common_inclusion = 0x1b,
|
||||
DW_TAG_inheritance = 0x1c,
|
||||
DW_TAG_inlined_subroutine = 0x1d,
|
||||
DW_TAG_module = 0x1e,
|
||||
DW_TAG_ptr_to_member_type = 0x1f,
|
||||
DW_TAG_set_type = 0x20,
|
||||
DW_TAG_subrange_type = 0x21,
|
||||
DW_TAG_with_stmt = 0x22,
|
||||
DW_TAG_access_declaration = 0x23,
|
||||
DW_TAG_base_type = 0x24,
|
||||
DW_TAG_catch_block = 0x25,
|
||||
DW_TAG_const_type = 0x26,
|
||||
DW_TAG_constant = 0x27,
|
||||
DW_TAG_enumerator = 0x28,
|
||||
DW_TAG_file_type = 0x29,
|
||||
DW_TAG_friend = 0x2a,
|
||||
DW_TAG_namelist = 0x2b,
|
||||
DW_TAG_namelist_item = 0x2c,
|
||||
DW_TAG_packed_type = 0x2d,
|
||||
DW_TAG_subprogram = 0x2e,
|
||||
DW_TAG_template_type_param = 0x2f,
|
||||
DW_TAG_template_value_param = 0x30,
|
||||
DW_TAG_thrown_type = 0x31,
|
||||
DW_TAG_try_block = 0x32,
|
||||
DW_TAG_variant_part = 0x33,
|
||||
DW_TAG_variable = 0x34,
|
||||
DW_TAG_volatile_type = 0x35,
|
||||
/* SGI/MIPS Extensions */
|
||||
DW_TAG_MIPS_loop = 0x4081,
|
||||
/* GNU extensions */
|
||||
DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */
|
||||
DW_TAG_function_template = 0x4102, /* for C++ */
|
||||
DW_TAG_class_template = 0x4103, /* for C++ */
|
||||
DW_TAG_GNU_BINCL = 0x4104,
|
||||
DW_TAG_GNU_EINCL = 0x4105
|
||||
};
|
||||
|
||||
#define DW_TAG_lo_user 0x4080
|
||||
#define DW_TAG_hi_user 0xffff
|
||||
|
||||
/* flag that tells whether entry has a child or not */
|
||||
#define DW_children_no 0
|
||||
#define DW_children_yes 1
|
||||
|
||||
/* Form names and codes. */
|
||||
enum dwarf_form
|
||||
{
|
||||
DW_FORM_addr = 0x01,
|
||||
DW_FORM_block2 = 0x03,
|
||||
DW_FORM_block4 = 0x04,
|
||||
DW_FORM_data2 = 0x05,
|
||||
DW_FORM_data4 = 0x06,
|
||||
DW_FORM_data8 = 0x07,
|
||||
DW_FORM_string = 0x08,
|
||||
DW_FORM_block = 0x09,
|
||||
DW_FORM_block1 = 0x0a,
|
||||
DW_FORM_data1 = 0x0b,
|
||||
DW_FORM_flag = 0x0c,
|
||||
DW_FORM_sdata = 0x0d,
|
||||
DW_FORM_strp = 0x0e,
|
||||
DW_FORM_udata = 0x0f,
|
||||
DW_FORM_ref_addr = 0x10,
|
||||
DW_FORM_ref1 = 0x11,
|
||||
DW_FORM_ref2 = 0x12,
|
||||
DW_FORM_ref4 = 0x13,
|
||||
DW_FORM_ref8 = 0x14,
|
||||
DW_FORM_ref_udata = 0x15,
|
||||
DW_FORM_indirect = 0x16
|
||||
};
|
||||
|
||||
/* Attribute names and codes. */
|
||||
|
||||
enum dwarf_attribute
|
||||
{
|
||||
DW_AT_sibling = 0x01,
|
||||
DW_AT_location = 0x02,
|
||||
DW_AT_name = 0x03,
|
||||
DW_AT_ordering = 0x09,
|
||||
DW_AT_subscr_data = 0x0a,
|
||||
DW_AT_byte_size = 0x0b,
|
||||
DW_AT_bit_offset = 0x0c,
|
||||
DW_AT_bit_size = 0x0d,
|
||||
DW_AT_element_list = 0x0f,
|
||||
DW_AT_stmt_list = 0x10,
|
||||
DW_AT_low_pc = 0x11,
|
||||
DW_AT_high_pc = 0x12,
|
||||
DW_AT_language = 0x13,
|
||||
DW_AT_member = 0x14,
|
||||
DW_AT_discr = 0x15,
|
||||
DW_AT_discr_value = 0x16,
|
||||
DW_AT_visibility = 0x17,
|
||||
DW_AT_import = 0x18,
|
||||
DW_AT_string_length = 0x19,
|
||||
DW_AT_common_reference = 0x1a,
|
||||
DW_AT_comp_dir = 0x1b,
|
||||
DW_AT_const_value = 0x1c,
|
||||
DW_AT_containing_type = 0x1d,
|
||||
DW_AT_default_value = 0x1e,
|
||||
DW_AT_inline = 0x20,
|
||||
DW_AT_is_optional = 0x21,
|
||||
DW_AT_lower_bound = 0x22,
|
||||
DW_AT_producer = 0x25,
|
||||
DW_AT_prototyped = 0x27,
|
||||
DW_AT_return_addr = 0x2a,
|
||||
DW_AT_start_scope = 0x2c,
|
||||
DW_AT_stride_size = 0x2e,
|
||||
DW_AT_upper_bound = 0x2f,
|
||||
DW_AT_abstract_origin = 0x31,
|
||||
DW_AT_accessibility = 0x32,
|
||||
DW_AT_address_class = 0x33,
|
||||
DW_AT_artificial = 0x34,
|
||||
DW_AT_base_types = 0x35,
|
||||
DW_AT_calling_convention = 0x36,
|
||||
DW_AT_count = 0x37,
|
||||
DW_AT_data_member_location = 0x38,
|
||||
DW_AT_decl_column = 0x39,
|
||||
DW_AT_decl_file = 0x3a,
|
||||
DW_AT_decl_line = 0x3b,
|
||||
DW_AT_declaration = 0x3c,
|
||||
DW_AT_discr_list = 0x3d,
|
||||
DW_AT_encoding = 0x3e,
|
||||
DW_AT_external = 0x3f,
|
||||
DW_AT_frame_base = 0x40,
|
||||
DW_AT_friend = 0x41,
|
||||
DW_AT_identifier_case = 0x42,
|
||||
DW_AT_macro_info = 0x43,
|
||||
DW_AT_namelist_items = 0x44,
|
||||
DW_AT_priority = 0x45,
|
||||
DW_AT_segment = 0x46,
|
||||
DW_AT_specification = 0x47,
|
||||
DW_AT_static_link = 0x48,
|
||||
DW_AT_type = 0x49,
|
||||
DW_AT_use_location = 0x4a,
|
||||
DW_AT_variable_parameter = 0x4b,
|
||||
DW_AT_virtuality = 0x4c,
|
||||
DW_AT_vtable_elem_location = 0x4d,
|
||||
/* SGI/MIPS Extensions */
|
||||
DW_AT_MIPS_fde = 0x2001,
|
||||
DW_AT_MIPS_loop_begin = 0x2002,
|
||||
DW_AT_MIPS_tail_loop_begin = 0x2003,
|
||||
DW_AT_MIPS_epilog_begin = 0x2004,
|
||||
DW_AT_MIPS_loop_unroll_factor = 0x2005,
|
||||
DW_AT_MIPS_software_pipeline_depth = 0x2006,
|
||||
DW_AT_MIPS_linkage_name = 0x2007,
|
||||
DW_AT_MIPS_stride = 0x2008,
|
||||
DW_AT_MIPS_abstract_name = 0x2009,
|
||||
DW_AT_MIPS_clone_origin = 0x200a,
|
||||
DW_AT_MIPS_has_inlines = 0x200b,
|
||||
/* GNU extensions. */
|
||||
DW_AT_sf_names = 0x2101,
|
||||
DW_AT_src_info = 0x2102,
|
||||
DW_AT_mac_info = 0x2103,
|
||||
DW_AT_src_coords = 0x2104,
|
||||
DW_AT_body_begin = 0x2105,
|
||||
DW_AT_body_end = 0x2106
|
||||
};
|
||||
|
||||
#define DW_AT_lo_user 0x2000 /* implementation-defined range start */
|
||||
#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */
|
||||
|
||||
/* Location atom names and codes. */
|
||||
|
||||
enum dwarf_location_atom
|
||||
{
|
||||
DW_OP_addr = 0x03,
|
||||
DW_OP_deref = 0x06,
|
||||
DW_OP_const1u = 0x08,
|
||||
DW_OP_const1s = 0x09,
|
||||
DW_OP_const2u = 0x0a,
|
||||
DW_OP_const2s = 0x0b,
|
||||
DW_OP_const4u = 0x0c,
|
||||
DW_OP_const4s = 0x0d,
|
||||
DW_OP_const8u = 0x0e,
|
||||
DW_OP_const8s = 0x0f,
|
||||
DW_OP_constu = 0x10,
|
||||
DW_OP_consts = 0x11,
|
||||
DW_OP_dup = 0x12,
|
||||
DW_OP_drop = 0x13,
|
||||
DW_OP_over = 0x14,
|
||||
DW_OP_pick = 0x15,
|
||||
DW_OP_swap = 0x16,
|
||||
DW_OP_rot = 0x17,
|
||||
DW_OP_xderef = 0x18,
|
||||
DW_OP_abs = 0x19,
|
||||
DW_OP_and = 0x1a,
|
||||
DW_OP_div = 0x1b,
|
||||
DW_OP_minus = 0x1c,
|
||||
DW_OP_mod = 0x1d,
|
||||
DW_OP_mul = 0x1e,
|
||||
DW_OP_neg = 0x1f,
|
||||
DW_OP_not = 0x20,
|
||||
DW_OP_or = 0x21,
|
||||
DW_OP_plus = 0x22,
|
||||
DW_OP_plus_uconst = 0x23,
|
||||
DW_OP_shl = 0x24,
|
||||
DW_OP_shr = 0x25,
|
||||
DW_OP_shra = 0x26,
|
||||
DW_OP_xor = 0x27,
|
||||
DW_OP_bra = 0x28,
|
||||
DW_OP_eq = 0x29,
|
||||
DW_OP_ge = 0x2a,
|
||||
DW_OP_gt = 0x2b,
|
||||
DW_OP_le = 0x2c,
|
||||
DW_OP_lt = 0x2d,
|
||||
DW_OP_ne = 0x2e,
|
||||
DW_OP_skip = 0x2f,
|
||||
DW_OP_lit0 = 0x30,
|
||||
DW_OP_lit1 = 0x31,
|
||||
DW_OP_lit2 = 0x32,
|
||||
DW_OP_lit3 = 0x33,
|
||||
DW_OP_lit4 = 0x34,
|
||||
DW_OP_lit5 = 0x35,
|
||||
DW_OP_lit6 = 0x36,
|
||||
DW_OP_lit7 = 0x37,
|
||||
DW_OP_lit8 = 0x38,
|
||||
DW_OP_lit9 = 0x39,
|
||||
DW_OP_lit10 = 0x3a,
|
||||
DW_OP_lit11 = 0x3b,
|
||||
DW_OP_lit12 = 0x3c,
|
||||
DW_OP_lit13 = 0x3d,
|
||||
DW_OP_lit14 = 0x3e,
|
||||
DW_OP_lit15 = 0x3f,
|
||||
DW_OP_lit16 = 0x40,
|
||||
DW_OP_lit17 = 0x41,
|
||||
DW_OP_lit18 = 0x42,
|
||||
DW_OP_lit19 = 0x43,
|
||||
DW_OP_lit20 = 0x44,
|
||||
DW_OP_lit21 = 0x45,
|
||||
DW_OP_lit22 = 0x46,
|
||||
DW_OP_lit23 = 0x47,
|
||||
DW_OP_lit24 = 0x48,
|
||||
DW_OP_lit25 = 0x49,
|
||||
DW_OP_lit26 = 0x4a,
|
||||
DW_OP_lit27 = 0x4b,
|
||||
DW_OP_lit28 = 0x4c,
|
||||
DW_OP_lit29 = 0x4d,
|
||||
DW_OP_lit30 = 0x4e,
|
||||
DW_OP_lit31 = 0x4f,
|
||||
DW_OP_reg0 = 0x50,
|
||||
DW_OP_reg1 = 0x51,
|
||||
DW_OP_reg2 = 0x52,
|
||||
DW_OP_reg3 = 0x53,
|
||||
DW_OP_reg4 = 0x54,
|
||||
DW_OP_reg5 = 0x55,
|
||||
DW_OP_reg6 = 0x56,
|
||||
DW_OP_reg7 = 0x57,
|
||||
DW_OP_reg8 = 0x58,
|
||||
DW_OP_reg9 = 0x59,
|
||||
DW_OP_reg10 = 0x5a,
|
||||
DW_OP_reg11 = 0x5b,
|
||||
DW_OP_reg12 = 0x5c,
|
||||
DW_OP_reg13 = 0x5d,
|
||||
DW_OP_reg14 = 0x5e,
|
||||
DW_OP_reg15 = 0x5f,
|
||||
DW_OP_reg16 = 0x60,
|
||||
DW_OP_reg17 = 0x61,
|
||||
DW_OP_reg18 = 0x62,
|
||||
DW_OP_reg19 = 0x63,
|
||||
DW_OP_reg20 = 0x64,
|
||||
DW_OP_reg21 = 0x65,
|
||||
DW_OP_reg22 = 0x66,
|
||||
DW_OP_reg23 = 0x67,
|
||||
DW_OP_reg24 = 0x68,
|
||||
DW_OP_reg25 = 0x69,
|
||||
DW_OP_reg26 = 0x6a,
|
||||
DW_OP_reg27 = 0x6b,
|
||||
DW_OP_reg28 = 0x6c,
|
||||
DW_OP_reg29 = 0x6d,
|
||||
DW_OP_reg30 = 0x6e,
|
||||
DW_OP_reg31 = 0x6f,
|
||||
DW_OP_breg0 = 0x70,
|
||||
DW_OP_breg1 = 0x71,
|
||||
DW_OP_breg2 = 0x72,
|
||||
DW_OP_breg3 = 0x73,
|
||||
DW_OP_breg4 = 0x74,
|
||||
DW_OP_breg5 = 0x75,
|
||||
DW_OP_breg6 = 0x76,
|
||||
DW_OP_breg7 = 0x77,
|
||||
DW_OP_breg8 = 0x78,
|
||||
DW_OP_breg9 = 0x79,
|
||||
DW_OP_breg10 = 0x7a,
|
||||
DW_OP_breg11 = 0x7b,
|
||||
DW_OP_breg12 = 0x7c,
|
||||
DW_OP_breg13 = 0x7d,
|
||||
DW_OP_breg14 = 0x7e,
|
||||
DW_OP_breg15 = 0x7f,
|
||||
DW_OP_breg16 = 0x80,
|
||||
DW_OP_breg17 = 0x81,
|
||||
DW_OP_breg18 = 0x82,
|
||||
DW_OP_breg19 = 0x83,
|
||||
DW_OP_breg20 = 0x84,
|
||||
DW_OP_breg21 = 0x85,
|
||||
DW_OP_breg22 = 0x86,
|
||||
DW_OP_breg23 = 0x87,
|
||||
DW_OP_breg24 = 0x88,
|
||||
DW_OP_breg25 = 0x89,
|
||||
DW_OP_breg26 = 0x8a,
|
||||
DW_OP_breg27 = 0x8b,
|
||||
DW_OP_breg28 = 0x8c,
|
||||
DW_OP_breg29 = 0x8d,
|
||||
DW_OP_breg30 = 0x8e,
|
||||
DW_OP_breg31 = 0x8f,
|
||||
DW_OP_regx = 0x90,
|
||||
DW_OP_fbreg = 0x91,
|
||||
DW_OP_bregx = 0x92,
|
||||
DW_OP_piece = 0x93,
|
||||
DW_OP_deref_size = 0x94,
|
||||
DW_OP_xderef_size = 0x95,
|
||||
DW_OP_nop = 0x96
|
||||
};
|
||||
|
||||
#define DW_OP_lo_user 0x80 /* implementation-defined range start */
|
||||
#define DW_OP_hi_user 0xff /* implementation-defined range end */
|
||||
|
||||
/* Type encodings. */
|
||||
|
||||
enum dwarf_type
|
||||
{
|
||||
DW_ATE_void = 0x0,
|
||||
DW_ATE_address = 0x1,
|
||||
DW_ATE_boolean = 0x2,
|
||||
DW_ATE_complex_float = 0x3,
|
||||
DW_ATE_float = 0x4,
|
||||
DW_ATE_signed = 0x5,
|
||||
DW_ATE_signed_char = 0x6,
|
||||
DW_ATE_unsigned = 0x7,
|
||||
DW_ATE_unsigned_char = 0x8
|
||||
};
|
||||
|
||||
#define DW_ATE_lo_user 0x80
|
||||
#define DW_ATE_hi_user 0xff
|
||||
|
||||
/* Array ordering names and codes. */
|
||||
enum dwarf_array_dim_ordering
|
||||
{
|
||||
DW_ORD_row_major = 0,
|
||||
DW_ORD_col_major = 1
|
||||
};
|
||||
|
||||
/* access attribute */
|
||||
enum dwarf_access_attribute
|
||||
{
|
||||
DW_ACCESS_public = 1,
|
||||
DW_ACCESS_protected = 2,
|
||||
DW_ACCESS_private = 3
|
||||
};
|
||||
|
||||
/* visibility */
|
||||
enum dwarf_visibility_attribute
|
||||
{
|
||||
DW_VIS_local = 1,
|
||||
DW_VIS_exported = 2,
|
||||
DW_VIS_qualified = 3
|
||||
};
|
||||
|
||||
/* virtuality */
|
||||
enum dwarf_virtuality_attribute
|
||||
{
|
||||
DW_VIRTUALITY_none = 0,
|
||||
DW_VIRTUALITY_virtual = 1,
|
||||
DW_VIRTUALITY_pure_virtual = 2
|
||||
};
|
||||
|
||||
/* case sensitivity */
|
||||
enum dwarf_id_case
|
||||
{
|
||||
DW_ID_case_sensitive = 0,
|
||||
DW_ID_up_case = 1,
|
||||
DW_ID_down_case = 2,
|
||||
DW_ID_case_insensitive = 3
|
||||
};
|
||||
|
||||
/* calling convention */
|
||||
enum dwarf_calling_convention
|
||||
{
|
||||
DW_CC_normal = 0x1,
|
||||
DW_CC_program = 0x2,
|
||||
DW_CC_nocall = 0x3
|
||||
};
|
||||
|
||||
#define DW_CC_lo_user 0x40
|
||||
#define DW_CC_hi_user 0xff
|
||||
|
||||
/* inline attribute */
|
||||
enum dwarf_inline_attribute
|
||||
{
|
||||
DW_INL_not_inlined = 0,
|
||||
DW_INL_inlined = 1,
|
||||
DW_INL_declared_not_inlined = 2,
|
||||
DW_INL_declared_inlined = 3
|
||||
};
|
||||
|
||||
/* discriminant lists */
|
||||
enum dwarf_discrim_list
|
||||
{
|
||||
DW_DSC_label = 0,
|
||||
DW_DSC_range = 1
|
||||
};
|
||||
|
||||
/* line number opcodes */
|
||||
enum dwarf_line_number_ops
|
||||
{
|
||||
DW_LNS_extended_op = 0,
|
||||
DW_LNS_copy = 1,
|
||||
DW_LNS_advance_pc = 2,
|
||||
DW_LNS_advance_line = 3,
|
||||
DW_LNS_set_file = 4,
|
||||
DW_LNS_set_column = 5,
|
||||
DW_LNS_negate_stmt = 6,
|
||||
DW_LNS_set_basic_block = 7,
|
||||
DW_LNS_const_add_pc = 8,
|
||||
DW_LNS_fixed_advance_pc = 9
|
||||
};
|
||||
|
||||
/* line number extended opcodes */
|
||||
enum dwarf_line_number_x_ops
|
||||
{
|
||||
DW_LNE_end_sequence = 1,
|
||||
DW_LNE_set_address = 2,
|
||||
DW_LNE_define_file = 3
|
||||
};
|
||||
|
||||
/* call frame information */
|
||||
enum dwarf_call_frame_info
|
||||
{
|
||||
DW_CFA_advance_loc = 0x40,
|
||||
DW_CFA_offset = 0x80,
|
||||
DW_CFA_restore = 0xc0,
|
||||
DW_CFA_nop = 0x00,
|
||||
DW_CFA_set_loc = 0x01,
|
||||
DW_CFA_advance_loc1 = 0x02,
|
||||
DW_CFA_advance_loc2 = 0x03,
|
||||
DW_CFA_advance_loc4 = 0x04,
|
||||
DW_CFA_offset_extended = 0x05,
|
||||
DW_CFA_restore_extended = 0x06,
|
||||
DW_CFA_undefined = 0x07,
|
||||
DW_CFA_same_value = 0x08,
|
||||
DW_CFA_register = 0x09,
|
||||
DW_CFA_remember_state = 0x0a,
|
||||
DW_CFA_restore_state = 0x0b,
|
||||
DW_CFA_def_cfa = 0x0c,
|
||||
DW_CFA_def_cfa_register = 0x0d,
|
||||
DW_CFA_def_cfa_offset = 0x0e,
|
||||
DW_CFA_def_cfa_expression = 0x0f,
|
||||
DW_CFA_expression = 0x10,
|
||||
/* Dwarf 2.1 */
|
||||
DW_CFA_offset_extended_sf = 0x11,
|
||||
DW_CFA_def_cfa_sf = 0x12,
|
||||
DW_CFA_def_cfa_offset_sf = 0x13,
|
||||
|
||||
/* SGI/MIPS specific */
|
||||
DW_CFA_MIPS_advance_loc8 = 0x1d,
|
||||
|
||||
/* GNU extensions */
|
||||
DW_CFA_GNU_window_save = 0x2d,
|
||||
DW_CFA_GNU_args_size = 0x2e,
|
||||
DW_CFA_GNU_negative_offset_extended = 0x2f
|
||||
};
|
||||
|
||||
#define DW_CIE_ID 0xffffffff
|
||||
#define DW_CIE_VERSION 1
|
||||
|
||||
#define DW_CFA_extended 0
|
||||
#define DW_CFA_low_user 0x1c
|
||||
#define DW_CFA_high_user 0x3f
|
||||
|
||||
#define DW_CHILDREN_no 0x00
|
||||
#define DW_CHILDREN_yes 0x01
|
||||
|
||||
#define DW_ADDR_none 0
|
||||
|
||||
/* Source language names and codes. */
|
||||
|
||||
enum dwarf_source_language
|
||||
{
|
||||
DW_LANG_C89 = 0x0001,
|
||||
DW_LANG_C = 0x0002,
|
||||
DW_LANG_Ada83 = 0x0003,
|
||||
DW_LANG_C_plus_plus = 0x0004,
|
||||
DW_LANG_Cobol74 = 0x0005,
|
||||
DW_LANG_Cobol85 = 0x0006,
|
||||
DW_LANG_Fortran77 = 0x0007,
|
||||
DW_LANG_Fortran90 = 0x0008,
|
||||
DW_LANG_Pascal83 = 0x0009,
|
||||
DW_LANG_Modula2 = 0x000a,
|
||||
DW_LANG_Java = 0x000b,
|
||||
DW_LANG_Mips_Assembler = 0x8001
|
||||
};
|
||||
|
||||
|
||||
#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */
|
||||
#define DW_LANG_hi_user 0xffff /* implementation-defined range start */
|
||||
|
||||
/* Names and codes for macro information. */
|
||||
|
||||
enum dwarf_macinfo_record_type
|
||||
{
|
||||
DW_MACINFO_define = 1,
|
||||
DW_MACINFO_undef = 2,
|
||||
DW_MACINFO_start_file = 3,
|
||||
DW_MACINFO_end_file = 4,
|
||||
DW_MACINFO_vendor_ext = 255
|
||||
};
|
||||
|
||||
#endif /* !ASSEMBLER */
|
||||
|
||||
/* @@@ For use with GNU frame unwind information. */
|
||||
|
||||
#define DW_EH_PE_absptr 0x00
|
||||
#define DW_EH_PE_omit 0xff
|
||||
|
||||
#define DW_EH_PE_uleb128 0x01
|
||||
#define DW_EH_PE_udata2 0x02
|
||||
#define DW_EH_PE_udata4 0x03
|
||||
#define DW_EH_PE_udata8 0x04
|
||||
#define DW_EH_PE_sleb128 0x09
|
||||
#define DW_EH_PE_sdata2 0x0A
|
||||
#define DW_EH_PE_sdata4 0x0B
|
||||
#define DW_EH_PE_sdata8 0x0C
|
||||
#define DW_EH_PE_signed 0x08
|
||||
|
||||
#define DW_EH_PE_pcrel 0x10
|
||||
#define DW_EH_PE_textrel 0x20
|
||||
#define DW_EH_PE_datarel 0x30
|
||||
#define DW_EH_PE_funcrel 0x40
|
||||
#define DW_EH_PE_aligned 0x50
|
||||
|
||||
#define DW_EH_PE_indirect 0x80
|
||||
|
||||
#endif /* dwarf2.h */
|
utils/memcpy-bench/glibc/memcpy-ssse3-back.S (new file, 3182 lines; diff suppressed because it is too large)
utils/memcpy-bench/glibc/memcpy-ssse3.S (new file, 3152 lines; diff suppressed because it is too large)
utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S (new file, 12 lines)
@@ -0,0 +1,12 @@
#if 1
# define VEC_SIZE 32
# define VEC(i) ymm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa

# define SECTION(p) p##.avx
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s

# include "memmove-vec-unaligned-erms.S"
#endif
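This wrapper, like the avx512 and sse2 counterparts further down, only pins down the vector width, the register family and the symbol suffix, then pulls in the shared body from memmove-vec-unaligned-erms.S. A rough C++ analogy of that one-body/many-instantiations pattern (illustrative only; copy_vec_chunks is a made-up name and nothing below comes from glibc):

    #include <cstddef>
    #include <cstring>

    // Simplified stand-in for the shared implementation: copy whole VecSize
    // chunks, then a tail chunk anchored at the end that may overlap the last
    // full chunk (assumes n >= VecSize).
    template <std::size_t VecSize>
    void copy_vec_chunks(char * dst, const char * src, std::size_t n)
    {
        std::size_t i = 0;
        for (; i + VecSize <= n; i += VecSize)
            std::memcpy(dst + i, src + i, VecSize);
        if (i < n)
            std::memcpy(dst + n - VecSize, src + n - VecSize, VecSize);
    }

    // Explicit instantiations, analogous to the _sse2_/_avx_/_avx512_ symbol variants.
    template void copy_vec_chunks<16>(char *, const char *, std::size_t);
    template void copy_vec_chunks<32>(char *, const char *, std::size_t);
    template void copy_vec_chunks<64>(char *, const char *, std::size_t);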
utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S (new file, 419 lines)
@@ -0,0 +1,419 @@
|
||||
/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
|
||||
Copyright (C) 2016-2020 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sysdep.h"
|
||||
|
||||
#if 1
|
||||
|
||||
# include "asm-syntax.h"
|
||||
|
||||
.section .text.avx512,"ax",@progbits
|
||||
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__mempcpy_chk_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__mempcpy_avx512_no_vzeroupper)
|
||||
mov %RDI_LP, %RAX_LP
|
||||
add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (__mempcpy_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__memmove_chk_avx512_no_vzeroupper)
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__memmove_chk_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__memmove_avx512_no_vzeroupper)
|
||||
mov %RDI_LP, %RAX_LP
|
||||
# ifdef USE_AS_MEMPCPY
|
||||
add %RDX_LP, %RAX_LP
|
||||
# endif
|
||||
L(start):
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
mov %edx, %edx
|
||||
# endif
|
||||
lea (%rsi, %rdx), %rcx
|
||||
lea (%rdi, %rdx), %r9
|
||||
cmp $512, %rdx
|
||||
ja L(512bytesormore)
|
||||
|
||||
L(check):
|
||||
cmp $16, %rdx
|
||||
jbe L(less_16bytes)
|
||||
cmp $256, %rdx
|
||||
jb L(less_256bytes)
|
||||
vmovups (%rsi), %zmm0
|
||||
vmovups 0x40(%rsi), %zmm1
|
||||
vmovups 0x80(%rsi), %zmm2
|
||||
vmovups 0xC0(%rsi), %zmm3
|
||||
vmovups -0x100(%rcx), %zmm4
|
||||
vmovups -0xC0(%rcx), %zmm5
|
||||
vmovups -0x80(%rcx), %zmm6
|
||||
vmovups -0x40(%rcx), %zmm7
|
||||
vmovups %zmm0, (%rdi)
|
||||
vmovups %zmm1, 0x40(%rdi)
|
||||
vmovups %zmm2, 0x80(%rdi)
|
||||
vmovups %zmm3, 0xC0(%rdi)
|
||||
vmovups %zmm4, -0x100(%r9)
|
||||
vmovups %zmm5, -0xC0(%r9)
|
||||
vmovups %zmm6, -0x80(%r9)
|
||||
vmovups %zmm7, -0x40(%r9)
|
||||
ret
|
||||
|
||||
L(less_256bytes):
|
||||
cmp $128, %dl
|
||||
jb L(less_128bytes)
|
||||
vmovups (%rsi), %zmm0
|
||||
vmovups 0x40(%rsi), %zmm1
|
||||
vmovups -0x80(%rcx), %zmm2
|
||||
vmovups -0x40(%rcx), %zmm3
|
||||
vmovups %zmm0, (%rdi)
|
||||
vmovups %zmm1, 0x40(%rdi)
|
||||
vmovups %zmm2, -0x80(%r9)
|
||||
vmovups %zmm3, -0x40(%r9)
|
||||
ret
|
||||
|
||||
L(less_128bytes):
|
||||
cmp $64, %dl
|
||||
jb L(less_64bytes)
|
||||
vmovdqu (%rsi), %ymm0
|
||||
vmovdqu 0x20(%rsi), %ymm1
|
||||
vmovdqu -0x40(%rcx), %ymm2
|
||||
vmovdqu -0x20(%rcx), %ymm3
|
||||
vmovdqu %ymm0, (%rdi)
|
||||
vmovdqu %ymm1, 0x20(%rdi)
|
||||
vmovdqu %ymm2, -0x40(%r9)
|
||||
vmovdqu %ymm3, -0x20(%r9)
|
||||
ret
|
||||
|
||||
L(less_64bytes):
|
||||
cmp $32, %dl
|
||||
jb L(less_32bytes)
|
||||
vmovdqu (%rsi), %ymm0
|
||||
vmovdqu -0x20(%rcx), %ymm1
|
||||
vmovdqu %ymm0, (%rdi)
|
||||
vmovdqu %ymm1, -0x20(%r9)
|
||||
ret
|
||||
|
||||
L(less_32bytes):
|
||||
vmovdqu (%rsi), %xmm0
|
||||
vmovdqu -0x10(%rcx), %xmm1
|
||||
vmovdqu %xmm0, (%rdi)
|
||||
vmovdqu %xmm1, -0x10(%r9)
|
||||
ret
|
||||
|
||||
L(less_16bytes):
|
||||
cmp $8, %dl
|
||||
jb L(less_8bytes)
|
||||
movq (%rsi), %rsi
|
||||
movq -0x8(%rcx), %rcx
|
||||
movq %rsi, (%rdi)
|
||||
movq %rcx, -0x8(%r9)
|
||||
ret
|
||||
|
||||
L(less_8bytes):
|
||||
cmp $4, %dl
|
||||
jb L(less_4bytes)
|
||||
mov (%rsi), %esi
|
||||
mov -0x4(%rcx), %ecx
|
||||
mov %esi, (%rdi)
|
||||
mov %ecx, -0x4(%r9)
|
||||
ret
|
||||
|
||||
L(less_4bytes):
|
||||
cmp $2, %dl
|
||||
jb L(less_2bytes)
|
||||
mov (%rsi), %si
|
||||
mov -0x2(%rcx), %cx
|
||||
mov %si, (%rdi)
|
||||
mov %cx, -0x2(%r9)
|
||||
ret
|
||||
|
||||
L(less_2bytes):
|
||||
cmp $1, %dl
|
||||
jb L(less_1bytes)
|
||||
mov (%rsi), %cl
|
||||
mov %cl, (%rdi)
|
||||
L(less_1bytes):
|
||||
ret
|
||||
|
||||
L(512bytesormore):
|
||||
# ifdef SHARED_CACHE_SIZE_HALF
|
||||
mov $SHARED_CACHE_SIZE_HALF, %r8
|
||||
# else
|
||||
mov __x86_shared_cache_size_half(%rip), %r8
|
||||
# endif
|
||||
cmp %r8, %rdx
|
||||
jae L(preloop_large)
|
||||
cmp $1024, %rdx
|
||||
ja L(1024bytesormore)
|
||||
prefetcht1 (%rsi)
|
||||
prefetcht1 0x40(%rsi)
|
||||
prefetcht1 0x80(%rsi)
|
||||
prefetcht1 0xC0(%rsi)
|
||||
prefetcht1 0x100(%rsi)
|
||||
prefetcht1 0x140(%rsi)
|
||||
prefetcht1 0x180(%rsi)
|
||||
prefetcht1 0x1C0(%rsi)
|
||||
prefetcht1 -0x200(%rcx)
|
||||
prefetcht1 -0x1C0(%rcx)
|
||||
prefetcht1 -0x180(%rcx)
|
||||
prefetcht1 -0x140(%rcx)
|
||||
prefetcht1 -0x100(%rcx)
|
||||
prefetcht1 -0xC0(%rcx)
|
||||
prefetcht1 -0x80(%rcx)
|
||||
prefetcht1 -0x40(%rcx)
|
||||
vmovups (%rsi), %zmm0
|
||||
vmovups 0x40(%rsi), %zmm1
|
||||
vmovups 0x80(%rsi), %zmm2
|
||||
vmovups 0xC0(%rsi), %zmm3
|
||||
vmovups 0x100(%rsi), %zmm4
|
||||
vmovups 0x140(%rsi), %zmm5
|
||||
vmovups 0x180(%rsi), %zmm6
|
||||
vmovups 0x1C0(%rsi), %zmm7
|
||||
vmovups -0x200(%rcx), %zmm8
|
||||
vmovups -0x1C0(%rcx), %zmm9
|
||||
vmovups -0x180(%rcx), %zmm10
|
||||
vmovups -0x140(%rcx), %zmm11
|
||||
vmovups -0x100(%rcx), %zmm12
|
||||
vmovups -0xC0(%rcx), %zmm13
|
||||
vmovups -0x80(%rcx), %zmm14
|
||||
vmovups -0x40(%rcx), %zmm15
|
||||
vmovups %zmm0, (%rdi)
|
||||
vmovups %zmm1, 0x40(%rdi)
|
||||
vmovups %zmm2, 0x80(%rdi)
|
||||
vmovups %zmm3, 0xC0(%rdi)
|
||||
vmovups %zmm4, 0x100(%rdi)
|
||||
vmovups %zmm5, 0x140(%rdi)
|
||||
vmovups %zmm6, 0x180(%rdi)
|
||||
vmovups %zmm7, 0x1C0(%rdi)
|
||||
vmovups %zmm8, -0x200(%r9)
|
||||
vmovups %zmm9, -0x1C0(%r9)
|
||||
vmovups %zmm10, -0x180(%r9)
|
||||
vmovups %zmm11, -0x140(%r9)
|
||||
vmovups %zmm12, -0x100(%r9)
|
||||
vmovups %zmm13, -0xC0(%r9)
|
||||
vmovups %zmm14, -0x80(%r9)
|
||||
vmovups %zmm15, -0x40(%r9)
|
||||
ret
|
||||
|
||||
L(1024bytesormore):
|
||||
cmp %rsi, %rdi
|
||||
ja L(1024bytesormore_bkw)
|
||||
sub $512, %r9
|
||||
vmovups -0x200(%rcx), %zmm8
|
||||
vmovups -0x1C0(%rcx), %zmm9
|
||||
vmovups -0x180(%rcx), %zmm10
|
||||
vmovups -0x140(%rcx), %zmm11
|
||||
vmovups -0x100(%rcx), %zmm12
|
||||
vmovups -0xC0(%rcx), %zmm13
|
||||
vmovups -0x80(%rcx), %zmm14
|
||||
vmovups -0x40(%rcx), %zmm15
|
||||
prefetcht1 (%rsi)
|
||||
prefetcht1 0x40(%rsi)
|
||||
prefetcht1 0x80(%rsi)
|
||||
prefetcht1 0xC0(%rsi)
|
||||
prefetcht1 0x100(%rsi)
|
||||
prefetcht1 0x140(%rsi)
|
||||
prefetcht1 0x180(%rsi)
|
||||
prefetcht1 0x1C0(%rsi)
|
||||
|
||||
/* Loop with unaligned memory access. */
|
||||
L(gobble_512bytes_loop):
|
||||
vmovups (%rsi), %zmm0
|
||||
vmovups 0x40(%rsi), %zmm1
|
||||
vmovups 0x80(%rsi), %zmm2
|
||||
vmovups 0xC0(%rsi), %zmm3
|
||||
vmovups 0x100(%rsi), %zmm4
|
||||
vmovups 0x140(%rsi), %zmm5
|
||||
vmovups 0x180(%rsi), %zmm6
|
||||
vmovups 0x1C0(%rsi), %zmm7
|
||||
add $512, %rsi
|
||||
prefetcht1 (%rsi)
|
||||
prefetcht1 0x40(%rsi)
|
||||
prefetcht1 0x80(%rsi)
|
||||
prefetcht1 0xC0(%rsi)
|
||||
prefetcht1 0x100(%rsi)
|
||||
prefetcht1 0x140(%rsi)
|
||||
prefetcht1 0x180(%rsi)
|
||||
prefetcht1 0x1C0(%rsi)
|
||||
vmovups %zmm0, (%rdi)
|
||||
vmovups %zmm1, 0x40(%rdi)
|
||||
vmovups %zmm2, 0x80(%rdi)
|
||||
vmovups %zmm3, 0xC0(%rdi)
|
||||
vmovups %zmm4, 0x100(%rdi)
|
||||
vmovups %zmm5, 0x140(%rdi)
|
||||
vmovups %zmm6, 0x180(%rdi)
|
||||
vmovups %zmm7, 0x1C0(%rdi)
|
||||
add $512, %rdi
|
||||
cmp %r9, %rdi
|
||||
jb L(gobble_512bytes_loop)
|
||||
vmovups %zmm8, (%r9)
|
||||
vmovups %zmm9, 0x40(%r9)
|
||||
vmovups %zmm10, 0x80(%r9)
|
||||
vmovups %zmm11, 0xC0(%r9)
|
||||
vmovups %zmm12, 0x100(%r9)
|
||||
vmovups %zmm13, 0x140(%r9)
|
||||
vmovups %zmm14, 0x180(%r9)
|
||||
vmovups %zmm15, 0x1C0(%r9)
|
||||
ret
|
||||
|
||||
L(1024bytesormore_bkw):
|
||||
add $512, %rdi
|
||||
vmovups 0x1C0(%rsi), %zmm8
|
||||
vmovups 0x180(%rsi), %zmm9
|
||||
vmovups 0x140(%rsi), %zmm10
|
||||
vmovups 0x100(%rsi), %zmm11
|
||||
vmovups 0xC0(%rsi), %zmm12
|
||||
vmovups 0x80(%rsi), %zmm13
|
||||
vmovups 0x40(%rsi), %zmm14
|
||||
vmovups (%rsi), %zmm15
|
||||
prefetcht1 -0x40(%rcx)
|
||||
prefetcht1 -0x80(%rcx)
|
||||
prefetcht1 -0xC0(%rcx)
|
||||
prefetcht1 -0x100(%rcx)
|
||||
prefetcht1 -0x140(%rcx)
|
||||
prefetcht1 -0x180(%rcx)
|
||||
prefetcht1 -0x1C0(%rcx)
|
||||
prefetcht1 -0x200(%rcx)
|
||||
|
||||
/* Backward loop with unaligned memory access. */
|
||||
L(gobble_512bytes_loop_bkw):
|
||||
vmovups -0x40(%rcx), %zmm0
|
||||
vmovups -0x80(%rcx), %zmm1
|
||||
vmovups -0xC0(%rcx), %zmm2
|
||||
vmovups -0x100(%rcx), %zmm3
|
||||
vmovups -0x140(%rcx), %zmm4
|
||||
vmovups -0x180(%rcx), %zmm5
|
||||
vmovups -0x1C0(%rcx), %zmm6
|
||||
vmovups -0x200(%rcx), %zmm7
|
||||
sub $512, %rcx
|
||||
prefetcht1 -0x40(%rcx)
|
||||
prefetcht1 -0x80(%rcx)
|
||||
prefetcht1 -0xC0(%rcx)
|
||||
prefetcht1 -0x100(%rcx)
|
||||
prefetcht1 -0x140(%rcx)
|
||||
prefetcht1 -0x180(%rcx)
|
||||
prefetcht1 -0x1C0(%rcx)
|
||||
prefetcht1 -0x200(%rcx)
|
||||
vmovups %zmm0, -0x40(%r9)
|
||||
vmovups %zmm1, -0x80(%r9)
|
||||
vmovups %zmm2, -0xC0(%r9)
|
||||
vmovups %zmm3, -0x100(%r9)
|
||||
vmovups %zmm4, -0x140(%r9)
|
||||
vmovups %zmm5, -0x180(%r9)
|
||||
vmovups %zmm6, -0x1C0(%r9)
|
||||
vmovups %zmm7, -0x200(%r9)
|
||||
sub $512, %r9
|
||||
cmp %rdi, %r9
|
||||
ja L(gobble_512bytes_loop_bkw)
|
||||
vmovups %zmm8, -0x40(%rdi)
|
||||
vmovups %zmm9, -0x80(%rdi)
|
||||
vmovups %zmm10, -0xC0(%rdi)
|
||||
vmovups %zmm11, -0x100(%rdi)
|
||||
vmovups %zmm12, -0x140(%rdi)
|
||||
vmovups %zmm13, -0x180(%rdi)
|
||||
vmovups %zmm14, -0x1C0(%rdi)
|
||||
vmovups %zmm15, -0x200(%rdi)
|
||||
ret
|
||||
|
||||
L(preloop_large):
|
||||
cmp %rsi, %rdi
|
||||
ja L(preloop_large_bkw)
|
||||
vmovups (%rsi), %zmm4
|
||||
vmovups 0x40(%rsi), %zmm5
|
||||
|
||||
mov %rdi, %r11
|
||||
/* Align destination for access with non-temporal stores in the loop. */
|
||||
mov %rdi, %r8
|
||||
and $-0x80, %rdi
|
||||
add $0x80, %rdi
|
||||
sub %rdi, %r8
|
||||
sub %r8, %rsi
|
||||
add %r8, %rdx
|
||||
L(gobble_256bytes_nt_loop):
|
||||
prefetcht1 0x200(%rsi)
|
||||
prefetcht1 0x240(%rsi)
|
||||
prefetcht1 0x280(%rsi)
|
||||
prefetcht1 0x2C0(%rsi)
|
||||
prefetcht1 0x300(%rsi)
|
||||
prefetcht1 0x340(%rsi)
|
||||
prefetcht1 0x380(%rsi)
|
||||
prefetcht1 0x3C0(%rsi)
|
||||
vmovdqu64 (%rsi), %zmm0
|
||||
vmovdqu64 0x40(%rsi), %zmm1
|
||||
vmovdqu64 0x80(%rsi), %zmm2
|
||||
vmovdqu64 0xC0(%rsi), %zmm3
|
||||
vmovntdq %zmm0, (%rdi)
|
||||
vmovntdq %zmm1, 0x40(%rdi)
|
||||
vmovntdq %zmm2, 0x80(%rdi)
|
||||
vmovntdq %zmm3, 0xC0(%rdi)
|
||||
sub $256, %rdx
|
||||
add $256, %rsi
|
||||
add $256, %rdi
|
||||
cmp $256, %rdx
|
||||
ja L(gobble_256bytes_nt_loop)
|
||||
sfence
|
||||
vmovups %zmm4, (%r11)
|
||||
vmovups %zmm5, 0x40(%r11)
|
||||
jmp L(check)
|
||||
|
||||
L(preloop_large_bkw):
|
||||
vmovups -0x80(%rcx), %zmm4
|
||||
vmovups -0x40(%rcx), %zmm5
|
||||
|
||||
/* Align end of destination for access with non-temporal stores. */
|
||||
mov %r9, %r8
|
||||
and $-0x80, %r9
|
||||
sub %r9, %r8
|
||||
sub %r8, %rcx
|
||||
sub %r8, %rdx
|
||||
add %r9, %r8
|
||||
L(gobble_256bytes_nt_loop_bkw):
|
||||
prefetcht1 -0x400(%rcx)
|
||||
prefetcht1 -0x3C0(%rcx)
|
||||
prefetcht1 -0x380(%rcx)
|
||||
prefetcht1 -0x340(%rcx)
|
||||
prefetcht1 -0x300(%rcx)
|
||||
prefetcht1 -0x2C0(%rcx)
|
||||
prefetcht1 -0x280(%rcx)
|
||||
prefetcht1 -0x240(%rcx)
|
||||
vmovdqu64 -0x100(%rcx), %zmm0
|
||||
vmovdqu64 -0xC0(%rcx), %zmm1
|
||||
vmovdqu64 -0x80(%rcx), %zmm2
|
||||
vmovdqu64 -0x40(%rcx), %zmm3
|
||||
vmovntdq %zmm0, -0x100(%r9)
|
||||
vmovntdq %zmm1, -0xC0(%r9)
|
||||
vmovntdq %zmm2, -0x80(%r9)
|
||||
vmovntdq %zmm3, -0x40(%r9)
|
||||
sub $256, %rdx
|
||||
sub $256, %rcx
|
||||
sub $256, %r9
|
||||
cmp $256, %rdx
|
||||
ja L(gobble_256bytes_nt_loop_bkw)
|
||||
sfence
|
||||
vmovups %zmm4, -0x80(%r8)
|
||||
vmovups %zmm5, -0x40(%r8)
|
||||
jmp L(check)
|
||||
END (__memmove_avx512_no_vzeroupper)
|
||||
|
||||
strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
|
||||
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
|
||||
#endif
|
utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S (new file, 12 lines)
@@ -0,0 +1,12 @@
#if 1
# define VEC_SIZE 64
# define VEC(i) zmm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

# define SECTION(p) p##.avx512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s

# include "memmove-vec-unaligned-erms.S"
#endif
utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S (new file, 33 lines)
@@ -0,0 +1,33 @@
/* memmove with SSE2.
   Copyright (C) 2017-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#if 1
# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
#else
weak_alias (__mempcpy, mempcpy)
#endif

#include "memmove.S"

#if defined SHARED
# include <shlib-compat.h>
# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
/* Use __memmove_sse2_unaligned to support overlapping addresses. */
compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
# endif
#endif
utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S (new file, 559 lines)
@@ -0,0 +1,559 @@
/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store. Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store. */

#include "sysdep.h"
|
||||
|
||||
#ifndef MEMCPY_SYMBOL
|
||||
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
#ifndef MEMPCPY_SYMBOL
|
||||
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
#ifndef MEMMOVE_CHK_SYMBOL
|
||||
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
#ifndef VZEROUPPER
|
||||
# if VEC_SIZE > 16
|
||||
# define VZEROUPPER vzeroupper
|
||||
# else
|
||||
# define VZEROUPPER
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifndef PREFETCH
|
||||
# define PREFETCH(addr) prefetcht0 addr
|
||||
#endif
|
||||
|
||||
/* Assume 64-byte prefetch size. */
|
||||
#ifndef PREFETCH_SIZE
|
||||
# define PREFETCH_SIZE 64
|
||||
#endif
|
||||
|
||||
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
|
||||
|
||||
#if PREFETCH_SIZE == 64
|
||||
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
|
||||
# define PREFETCH_ONE_SET(dir, base, offset) \
|
||||
PREFETCH ((offset)base)
|
||||
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
|
||||
# define PREFETCH_ONE_SET(dir, base, offset) \
|
||||
PREFETCH ((offset)base); \
|
||||
PREFETCH ((offset + dir * PREFETCH_SIZE)base)
|
||||
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
|
||||
# define PREFETCH_ONE_SET(dir, base, offset) \
|
||||
PREFETCH ((offset)base); \
|
||||
PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
|
||||
PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
|
||||
PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
|
||||
# else
|
||||
# error Unsupported PREFETCHED_LOAD_SIZE!
|
||||
# endif
|
||||
#else
|
||||
# error Unsupported PREFETCH_SIZE!
|
||||
#endif
|
||||
|
||||
#ifndef SECTION
|
||||
# error SECTION is not defined!
|
||||
#endif
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
#if defined SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
||||
#endif
|
||||
|
||||
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
||||
mov %RDI_LP, %RAX_LP
|
||||
add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
||||
|
||||
#if defined SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
#endif
|
||||
|
||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
|
||||
movq %rdi, %rax
|
||||
L(start):
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
# endif
|
||||
cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(more_2x_vec)
|
||||
#if !defined USE_MULTIARCH
|
||||
L(last_2x_vec):
|
||||
#endif
|
||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
#if !defined USE_MULTIARCH
|
||||
L(nop):
|
||||
#endif
|
||||
ret
|
||||
#if defined USE_MULTIARCH
|
||||
END (MEMMOVE_SYMBOL (__memmove, unaligned))
|
||||
|
||||
# if VEC_SIZE == 16
|
||||
ENTRY (__mempcpy_chk_erms)
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__mempcpy_chk_erms)
|
||||
|
||||
/* Only used to measure performance of REP MOVSB. */
|
||||
ENTRY (__mempcpy_erms)
|
||||
mov %RDI_LP, %RAX_LP
|
||||
/* Skip zero length. */
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz 2f
|
||||
add %RDX_LP, %RAX_LP
|
||||
jmp L(start_movsb)
|
||||
END (__mempcpy_erms)
|
||||
|
||||
ENTRY (__memmove_chk_erms)
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__memmove_chk_erms)
|
||||
|
||||
ENTRY (__memmove_erms)
|
||||
movq %rdi, %rax
|
||||
/* Skip zero length. */
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz 2f
|
||||
L(start_movsb):
|
||||
mov %RDX_LP, %RCX_LP
|
||||
cmp %RSI_LP, %RDI_LP
|
||||
jb 1f
|
||||
/* Source == destination is less common. */
|
||||
je 2f
|
||||
lea (%rsi,%rcx), %RDX_LP
|
||||
cmp %RDX_LP, %RDI_LP
|
||||
jb L(movsb_backward)
|
||||
1:
|
||||
rep movsb
|
||||
2:
|
||||
ret
|
||||
L(movsb_backward):
|
||||
leaq -1(%rdi,%rcx), %rdi
|
||||
leaq -1(%rsi,%rcx), %rsi
|
||||
std
|
||||
rep movsb
|
||||
cld
|
||||
ret
|
||||
END (__memmove_erms)
|
||||
strong_alias (__memmove_erms, __memcpy_erms)
|
||||
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
|
||||
# endif
|
||||
|
||||
# ifdef SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
||||
mov %RDI_LP, %RAX_LP
|
||||
add %RDX_LP, %RAX_LP
|
||||
jmp L(start_erms)
|
||||
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
||||
|
||||
# ifdef SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
||||
movq %rdi, %rax
|
||||
L(start_erms):
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
# endif
|
||||
cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(movsb_more_2x_vec)
|
||||
L(last_2x_vec):
|
||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
|
||||
L(return):
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
L(movsb):
|
||||
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
|
||||
jae L(more_8x_vec)
|
||||
cmpq %rsi, %rdi
|
||||
jb 1f
|
||||
/* Source == destination is less common. */
|
||||
je L(nop)
|
||||
leaq (%rsi,%rdx), %r9
|
||||
cmpq %r9, %rdi
|
||||
/* Avoid slow backward REP MOVSB. */
|
||||
jb L(more_8x_vec_backward)
|
||||
1:
|
||||
mov %RDX_LP, %RCX_LP
|
||||
rep movsb
|
||||
L(nop):
|
||||
ret
|
||||
#endif
|
||||
|
||||
L(less_vec):
|
||||
/* Less than 1 VEC. */
|
||||
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||
# error Unsupported VEC_SIZE!
|
||||
#endif
|
||||
#if VEC_SIZE > 32
|
||||
cmpb $32, %dl
|
||||
jae L(between_32_63)
|
||||
#endif
|
||||
#if VEC_SIZE > 16
|
||||
cmpb $16, %dl
|
||||
jae L(between_16_31)
|
||||
#endif
|
||||
cmpb $8, %dl
|
||||
jae L(between_8_15)
|
||||
cmpb $4, %dl
|
||||
jae L(between_4_7)
|
||||
cmpb $1, %dl
|
||||
ja L(between_2_3)
|
||||
jb 1f
|
||||
movzbl (%rsi), %ecx
|
||||
movb %cl, (%rdi)
|
||||
1:
|
||||
ret
|
||||
#if VEC_SIZE > 32
|
||||
L(between_32_63):
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
vmovdqu (%rsi), %ymm0
|
||||
vmovdqu -32(%rsi,%rdx), %ymm1
|
||||
vmovdqu %ymm0, (%rdi)
|
||||
vmovdqu %ymm1, -32(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
#endif
|
||||
#if VEC_SIZE > 16
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
L(between_16_31):
|
||||
vmovdqu (%rsi), %xmm0
|
||||
vmovdqu -16(%rsi,%rdx), %xmm1
|
||||
vmovdqu %xmm0, (%rdi)
|
||||
vmovdqu %xmm1, -16(%rdi,%rdx)
|
||||
ret
|
||||
#endif
|
||||
L(between_8_15):
|
||||
/* From 8 to 15. No branch when size == 8. */
|
||||
movq -8(%rsi,%rdx), %rcx
|
||||
movq (%rsi), %rsi
|
||||
movq %rcx, -8(%rdi,%rdx)
|
||||
movq %rsi, (%rdi)
|
||||
ret
|
||||
L(between_4_7):
|
||||
/* From 4 to 7. No branch when size == 4. */
|
||||
movl -4(%rsi,%rdx), %ecx
|
||||
movl (%rsi), %esi
|
||||
movl %ecx, -4(%rdi,%rdx)
|
||||
movl %esi, (%rdi)
|
||||
ret
|
||||
L(between_2_3):
|
||||
/* From 2 to 3. No branch when size == 2. */
|
||||
movzwl -2(%rsi,%rdx), %ecx
|
||||
movzwl (%rsi), %esi
|
||||
movw %cx, -2(%rdi,%rdx)
|
||||
movw %si, (%rdi)
|
||||
ret
|
||||
|
||||
#if defined USE_MULTIARCH
|
||||
L(movsb_more_2x_vec):
|
||||
cmp $REP_MOSB_THRESHOLD, %RDX_LP
|
||||
ja L(movsb)
|
||||
#endif
|
||||
L(more_2x_vec):
|
||||
/* More than 2 * VEC and there may be overlap between destination
|
||||
and source. */
|
||||
cmpq $(VEC_SIZE * 8), %rdx
|
||||
ja L(more_8x_vec)
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
jb L(last_4x_vec)
|
||||
/* Copy from 4 * VEC to 8 * VEC, inclusively. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
|
||||
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
|
||||
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
|
||||
VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
|
||||
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
|
||||
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
L(last_4x_vec):
|
||||
/* Copy from 2 * VEC to 4 * VEC. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
|
||||
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
|
||||
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
L(more_8x_vec):
|
||||
cmpq %rsi, %rdi
|
||||
ja L(more_8x_vec_backward)
|
||||
/* Source == destination is less common. */
|
||||
je L(nop)
|
||||
/* Load the first VEC and last 4 * VEC to support overlapping
|
||||
addresses. */
|
||||
VMOVU (%rsi), %VEC(4)
|
||||
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
|
||||
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
|
||||
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
|
||||
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
|
||||
/* Save start and stop of the destination buffer. */
|
||||
movq %rdi, %r11
|
||||
leaq -VEC_SIZE(%rdi, %rdx), %rcx
|
||||
/* Align destination for aligned stores in the loop. Compute
|
||||
how much destination is misaligned. */
|
||||
movq %rdi, %r8
|
||||
andq $(VEC_SIZE - 1), %r8
|
||||
/* Get the negative of offset for alignment. */
|
||||
subq $VEC_SIZE, %r8
|
||||
/* Adjust source. */
|
||||
subq %r8, %rsi
|
||||
/* Adjust destination which should be aligned now. */
|
||||
subq %r8, %rdi
|
||||
/* Adjust length. */
|
||||
addq %r8, %rdx
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
|
||||
/* Check non-temporal store threshold. */
|
||||
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
|
||||
ja L(large_forward)
|
||||
#endif
|
||||
L(loop_4x_vec_forward):
|
||||
/* Copy 4 * VEC a time forward. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||||
addq $(VEC_SIZE * 4), %rsi
|
||||
subq $(VEC_SIZE * 4), %rdx
|
||||
VMOVA %VEC(0), (%rdi)
|
||||
VMOVA %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
addq $(VEC_SIZE * 4), %rdi
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
ja L(loop_4x_vec_forward)
|
||||
/* Store the last 4 * VEC. */
|
||||
VMOVU %VEC(5), (%rcx)
|
||||
VMOVU %VEC(6), -VEC_SIZE(%rcx)
|
||||
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
|
||||
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
||||
/* Store the first VEC. */
|
||||
VMOVU %VEC(4), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
L(more_8x_vec_backward):
|
||||
/* Load the first 4 * VEC and last VEC to support overlapping
|
||||
addresses. */
|
||||
VMOVU (%rsi), %VEC(4)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(5)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
|
||||
/* Save stop of the destination buffer. */
|
||||
leaq -VEC_SIZE(%rdi, %rdx), %r11
|
||||
/* Align destination end for aligned stores in the loop. Compute
|
||||
how much destination end is misaligned. */
|
||||
leaq -VEC_SIZE(%rsi, %rdx), %rcx
|
||||
movq %r11, %r9
|
||||
movq %r11, %r8
|
||||
andq $(VEC_SIZE - 1), %r8
|
||||
/* Adjust source. */
|
||||
subq %r8, %rcx
|
||||
/* Adjust the end of destination which should be aligned now. */
|
||||
subq %r8, %r9
|
||||
/* Adjust length. */
|
||||
subq %r8, %rdx
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
|
||||
/* Check non-temporal store threshold. */
|
||||
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
|
||||
ja L(large_backward)
|
||||
#endif
|
||||
L(loop_4x_vec_backward):
|
||||
/* Copy 4 * VEC a time backward. */
|
||||
VMOVU (%rcx), %VEC(0)
|
||||
VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
||||
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
||||
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
||||
subq $(VEC_SIZE * 4), %rcx
|
||||
subq $(VEC_SIZE * 4), %rdx
|
||||
VMOVA %VEC(0), (%r9)
|
||||
VMOVA %VEC(1), -VEC_SIZE(%r9)
|
||||
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
|
||||
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
|
||||
subq $(VEC_SIZE * 4), %r9
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
ja L(loop_4x_vec_backward)
|
||||
/* Store the first 4 * VEC. */
|
||||
VMOVU %VEC(4), (%rdi)
|
||||
VMOVU %VEC(5), VEC_SIZE(%rdi)
|
||||
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
||||
/* Store the last VEC. */
|
||||
VMOVU %VEC(8), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
|
||||
L(large_forward):
|
||||
/* Don't use non-temporal store if there is overlap between
|
||||
destination and source since destination may be in cache
|
||||
when source is loaded. */
|
||||
leaq (%rdi, %rdx), %r10
|
||||
cmpq %r10, %rsi
|
||||
jb L(loop_4x_vec_forward)
|
||||
L(loop_large_forward):
|
||||
/* Copy 4 * VEC a time forward with non-temporal stores. */
|
||||
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
||||
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||||
addq $PREFETCHED_LOAD_SIZE, %rsi
|
||||
subq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
VMOVNT %VEC(0), (%rdi)
|
||||
VMOVNT %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
addq $PREFETCHED_LOAD_SIZE, %rdi
|
||||
cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
ja L(loop_large_forward)
|
||||
sfence
|
||||
/* Store the last 4 * VEC. */
|
||||
VMOVU %VEC(5), (%rcx)
|
||||
VMOVU %VEC(6), -VEC_SIZE(%rcx)
|
||||
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
|
||||
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
||||
/* Store the first VEC. */
|
||||
VMOVU %VEC(4), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
L(large_backward):
|
||||
/* Don't use non-temporal store if there is overlap between
|
||||
destination and source since destination may be in cache
|
||||
when source is loaded. */
|
||||
leaq (%rcx, %rdx), %r10
|
||||
cmpq %r10, %r9
|
||||
jb L(loop_4x_vec_backward)
|
||||
L(loop_large_backward):
|
||||
/* Copy 4 * VEC a time backward with non-temporal stores. */
|
||||
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
|
||||
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
|
||||
VMOVU (%rcx), %VEC(0)
|
||||
VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
||||
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
||||
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
||||
subq $PREFETCHED_LOAD_SIZE, %rcx
|
||||
subq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
VMOVNT %VEC(0), (%r9)
|
||||
VMOVNT %VEC(1), -VEC_SIZE(%r9)
|
||||
VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
|
||||
VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
|
||||
subq $PREFETCHED_LOAD_SIZE, %r9
|
||||
cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
ja L(loop_large_backward)
|
||||
sfence
|
||||
/* Store the first 4 * VEC. */
|
||||
VMOVU %VEC(4), (%rdi)
|
||||
VMOVU %VEC(5), VEC_SIZE(%rdi)
|
||||
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
||||
/* Store the last VEC. */
|
||||
VMOVU %VEC(8), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
#endif
|
||||
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
||||
|
||||
#if 1
|
||||
# ifdef USE_MULTIARCH
|
||||
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
|
||||
MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
|
||||
# ifdef SHARED
|
||||
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
|
||||
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
|
||||
# endif
|
||||
# endif
|
||||
# ifdef SHARED
|
||||
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
|
||||
MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
|
||||
# endif
|
||||
#endif
|
||||
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
|
||||
MEMCPY_SYMBOL (__memcpy, unaligned))
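The forward loop above (L(loop_4x_vec_forward)) follows the plan from the header comment of this file: save the first VEC and the last 4 * VEC up front, move 4 vectors per iteration with unaligned loads, and store the saved vectors after the loop so overlapping tails need no extra branches. A rough AVX2 C++ sketch of that loop shape (illustrative only; copy_forward_4x_vec is a made-up name, it uses unaligned stores instead of aligning the destination, and it omits the small-size, backward and non-temporal paths of the real code):

    #include <immintrin.h>
    #include <cstddef>

    // Forward path only: assumes n >= 128 and that dst does not overlap src
    // from above (the real code switches to a backward loop in that case).
    void copy_forward_4x_vec(char * dst, const char * src, std::size_t n)
    {
        // Save the first vector and the last 4 vectors before the loop, so the
        // tail and the head can be stored after the loop without branches.
        const __m256i first = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
        __m256i tail[4];
        for (int k = 0; k < 4; ++k)
            tail[k] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + n - 32 * (4 - k)));

        std::size_t i = 0;
        while (i + 128 <= n)  // 4 x 32-byte vectors per iteration
        {
            __m256i v0 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
            __m256i v1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i + 32));
            __m256i v2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i + 64));
            __m256i v3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i + 96));
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i), v0);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i + 32), v1);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i + 64), v2);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i + 96), v3);
            i += 128;
        }

        // Store the saved last 4 vectors and the saved first vector; these may
        // overlap bytes already written by the loop, which is harmless.
        for (int k = 0; k < 4; ++k)
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + n - 32 * (4 - k)), tail[k]);
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), first);
    }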
utils/memcpy-bench/glibc/memmove.S (new file, 71 lines)
@@ -0,0 +1,71 @@
/* Optimized memmove for x86-64.
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include "sysdep.h"

#define VEC_SIZE 16
#define VEC(i) xmm##i
#define PREFETCHNT prefetchnta
#define VMOVNT movntdq
/* Use movups and movaps for smaller code sizes. */
#define VMOVU movups
#define VMOVA movaps

#define SECTION(p) p

#ifdef USE_MULTIARCH
# if 0
# define MEMCPY_SYMBOL(p,s) memcpy
# endif
#else
# if defined SHARED
# define MEMCPY_SYMBOL(p,s) __memcpy
# else
# define MEMCPY_SYMBOL(p,s) memcpy
# endif
#endif
#if !defined USE_MULTIARCH
# define MEMPCPY_SYMBOL(p,s) __mempcpy
#endif
#ifndef MEMMOVE_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) p
# define MEMMOVE_SYMBOL(p,s) memmove
#endif

#include "memmove-vec-unaligned-erms.S"

#ifndef USE_MULTIARCH
libc_hidden_builtin_def (memmove)
# if defined SHARED && IS_IN (libc)
strong_alias (memmove, __memcpy)
libc_hidden_ver (memmove, memcpy)
# endif
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)

# if defined SHARED && IS_IN (libc)
# undef memcpy
# include <shlib-compat.h>
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);

# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
# endif
# endif
#endif
131
utils/memcpy-bench/glibc/sysdep.h
Normal file
@ -0,0 +1,131 @@
#pragma once

/* Assembler macros for x86-64.
Copyright (C) 2001-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

#ifndef _X86_64_SYSDEP_H
#define _X86_64_SYSDEP_H 1

#include "sysdep_x86.h"

#ifdef __ASSEMBLER__

/* Syntactic details of assembler. */

/* This macro is for setting proper CFI with DW_CFA_expression describing
the register as saved relative to %rsp instead of relative to the CFA.
Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset
from %rsp. */
#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \
0x77, off & 0x7F | 0x80, off >> 7

/* If compiled for profiling, call `mcount' at the start of each function. */
#ifdef PROF
/* The mcount code relies on a normal frame pointer being on the stack
to locate our caller, so push one just for its benefit. */
#define CALL_MCOUNT \
pushq %rbp; \
cfi_adjust_cfa_offset(8); \
movq %rsp, %rbp; \
cfi_def_cfa_register(%rbp); \
call JUMPTARGET(mcount); \
popq %rbp; \
cfi_def_cfa(rsp,8);
#else
#define CALL_MCOUNT /* Do nothing. */
#endif

#define PSEUDO(name, syscall_name, args) \
lose: \
jmp JUMPTARGET(syscall_error) \
.globl syscall_error; \
ENTRY (name) \
DO_CALL (syscall_name, args); \
jb lose

#undef JUMPTARGET
#ifdef SHARED
# ifdef BIND_NOW
# define JUMPTARGET(name) *name##@GOTPCREL(%rip)
# else
# define JUMPTARGET(name) name##@PLT
# endif
#else
/* For static archives, branch to target directly. */
# define JUMPTARGET(name) name
#endif

/* Long and pointer size in bytes. */
#define LP_SIZE 8

/* Instruction to operate on long and pointer. */
#define LP_OP(insn) insn##q

/* Assembler address directive. */
#define ASM_ADDR .quad

/* Registers to hold long and pointer. */
#define RAX_LP rax
#define RBP_LP rbp
#define RBX_LP rbx
#define RCX_LP rcx
#define RDI_LP rdi
#define RDX_LP rdx
#define RSI_LP rsi
#define RSP_LP rsp
#define R8_LP r8
#define R9_LP r9
#define R10_LP r10
#define R11_LP r11
#define R12_LP r12
#define R13_LP r13
#define R14_LP r14
#define R15_LP r15

#else /* __ASSEMBLER__ */

/* Long and pointer size in bytes. */
#define LP_SIZE "8"

/* Instruction to operate on long and pointer. */
#define LP_OP(insn) #insn "q"

/* Assembler address directive. */
#define ASM_ADDR ".quad"

/* Registers to hold long and pointer. */
#define RAX_LP "rax"
#define RBP_LP "rbp"
#define RBX_LP "rbx"
#define RCX_LP "rcx"
#define RDI_LP "rdi"
#define RDX_LP "rdx"
#define RSI_LP "rsi"
#define RSP_LP "rsp"
#define R8_LP "r8"
#define R9_LP "r9"
#define R10_LP "r10"
#define R11_LP "r11"
#define R12_LP "r12"
#define R13_LP "r13"
#define R14_LP "r14"
#define R15_LP "r15"

#endif /* __ASSEMBLER__ */

#endif /* _X86_64_SYSDEP_H */
115
utils/memcpy-bench/glibc/sysdep_generic.h
Normal file
@ -0,0 +1,115 @@
#pragma once

/* Generic asm macros used on many machines.
Copyright (C) 1991-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

#define C_SYMBOL_NAME(name) name
#define HIDDEN_JUMPTARGET(name) 0x0
#define SHARED_CACHE_SIZE_HALF (1024*1024)
#define DATA_CACHE_SIZE_HALF (1024*32/2)
#define DATA_CACHE_SIZE (1024*32)
#define SHARED_NON_TEMPORAL_THRESHOLD (1024*1024*4)
#define REP_MOSB_THRESHOLD 1024

#define USE_MULTIARCH

#define ASM_LINE_SEP ;

#define strong_alias(original, alias) \
.globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \
C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original)

#ifndef C_LABEL

/* Define a macro we can use to construct the asm name for a C symbol. */
# define C_LABEL(name) name##:

#endif

#ifdef __ASSEMBLER__
/* Mark the end of function named SYM. This is used on some platforms
to generate correct debugging information. */
# ifndef END
# define END(sym)
# endif

# ifndef JUMPTARGET
# define JUMPTARGET(sym) sym
# endif
#endif

/* Macros to generate eh_frame unwind information. */
#ifdef __ASSEMBLER__
# define cfi_startproc .cfi_startproc
# define cfi_endproc .cfi_endproc
# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off
# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg
# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
# define cfi_offset(reg, off) .cfi_offset reg, off
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
# define cfi_register(r1, r2) .cfi_register r1, r2
# define cfi_return_column(reg) .cfi_return_column reg
# define cfi_restore(reg) .cfi_restore reg
# define cfi_same_value(reg) .cfi_same_value reg
# define cfi_undefined(reg) .cfi_undefined reg
# define cfi_remember_state .cfi_remember_state
# define cfi_restore_state .cfi_restore_state
# define cfi_window_save .cfi_window_save
# define cfi_personality(enc, exp) .cfi_personality enc, exp
# define cfi_lsda(enc, exp) .cfi_lsda enc, exp

#else /* ! ASSEMBLER */

# define CFI_STRINGIFY(Name) CFI_STRINGIFY2 (Name)
# define CFI_STRINGIFY2(Name) #Name
# define CFI_STARTPROC ".cfi_startproc"
# define CFI_ENDPROC ".cfi_endproc"
# define CFI_DEF_CFA(reg, off) \
".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_DEF_CFA_REGISTER(reg) \
".cfi_def_cfa_register " CFI_STRINGIFY(reg)
# define CFI_DEF_CFA_OFFSET(off) \
".cfi_def_cfa_offset " CFI_STRINGIFY(off)
# define CFI_ADJUST_CFA_OFFSET(off) \
".cfi_adjust_cfa_offset " CFI_STRINGIFY(off)
# define CFI_OFFSET(reg, off) \
".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_REL_OFFSET(reg, off) \
".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_REGISTER(r1, r2) \
".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2)
# define CFI_RETURN_COLUMN(reg) \
".cfi_return_column " CFI_STRINGIFY(reg)
# define CFI_RESTORE(reg) \
".cfi_restore " CFI_STRINGIFY(reg)
# define CFI_UNDEFINED(reg) \
".cfi_undefined " CFI_STRINGIFY(reg)
# define CFI_REMEMBER_STATE \
".cfi_remember_state"
# define CFI_RESTORE_STATE \
".cfi_restore_state"
# define CFI_WINDOW_SAVE \
".cfi_window_save"
# define CFI_PERSONALITY(enc, exp) \
".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp)
# define CFI_LSDA(enc, exp) \
".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp)
#endif

#include "dwarf2.h"
115
utils/memcpy-bench/glibc/sysdep_x86.h
Normal file
@ -0,0 +1,115 @@
#pragma once

/* Assembler macros for x86.
Copyright (C) 2017-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

#ifndef _X86_SYSDEP_H
#define _X86_SYSDEP_H 1

#include "sysdep_generic.h"

/* __CET__ is defined by GCC with Control-Flow Protection values:

enum cf_protection_level
{
CF_NONE = 0,
CF_BRANCH = 1 << 0,
CF_RETURN = 1 << 1,
CF_FULL = CF_BRANCH | CF_RETURN,
CF_SET = 1 << 2
};
*/

/* Set if CF_BRANCH (IBT) is enabled. */
#define X86_FEATURE_1_IBT (1U << 0)
/* Set if CF_RETURN (SHSTK) is enabled. */
#define X86_FEATURE_1_SHSTK (1U << 1)

#ifdef __CET__
# define CET_ENABLED 1
# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT)
# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK)
#else
# define CET_ENABLED 0
# define IBT_ENABLED 0
# define SHSTK_ENABLED 0
#endif

/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need
space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be
aligned to 16 bytes for fxsave and 64 bytes for xsave. */
#define STATE_SAVE_OFFSET (8 * 7 + 8)

/* Save SSE, AVX, AVX512, mask and bound registers. */
#define STATE_SAVE_MASK \
((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))

#ifdef __ASSEMBLER__

/* Syntactic details of assembler. */

#ifdef _CET_ENDBR
# define _CET_NOTRACK notrack
#else
# define _CET_ENDBR
# define _CET_NOTRACK
#endif

/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */
#define ALIGNARG(log2) 1<<log2
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;

/* Define an entry point visible from C. */
#define ENTRY(name) \
.globl C_SYMBOL_NAME(name); \
.type C_SYMBOL_NAME(name),@function; \
.align ALIGNARG(4); \
C_LABEL(name) \
cfi_startproc; \
_CET_ENDBR; \
CALL_MCOUNT

#undef END
#define END(name) \
cfi_endproc; \
ASM_SIZE_DIRECTIVE(name)

#define ENTRY_CHK(name) ENTRY (name)
#define END_CHK(name) END (name)

/* Since C identifiers are not normally prefixed with an underscore
on this system, the asm identifier `syscall_error' intrudes on the
C name space. Make sure we use an innocuous name. */
#define syscall_error __syscall_error
#define mcount _mcount

#undef PSEUDO_END
#define PSEUDO_END(name) \
END (name)

/* Local label name for asm code. */
#ifndef L
/* ELF-like local names start with `.L'. */
# define L(name) .L##name
#endif

#define atom_text_section .section ".text.atom", "ax"

#endif /* __ASSEMBLER__ */

#endif /* _X86_SYSDEP_H */
@ -1,5 +1,6 @@
#include <memory>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <random>
#include <iostream>
@ -14,15 +15,11 @@

#include <Common/Stopwatch.h>

#pragma GCC diagnostic ignored "-Wold-style-cast"
#pragma GCC diagnostic ignored "-Wcast-align"
#pragma GCC diagnostic ignored "-Wcast-qual"
#include "FastMemcpy.h"
//#include "FastMemcpy_Avx.h"

#include <emmintrin.h>
#include <immintrin.h>

#include <boost/program_options.hpp>


template <typename F, typename MemcpyImpl>
void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl)
@ -36,6 +33,9 @@ void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_d
dst += bytes_to_copy;
src += bytes_to_copy;
size -= bytes_to_copy;

/// Execute at least one SSE instruction as a penalty after running AVX code.
__asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7");
}
}

@ -47,7 +47,7 @@ size_t generatorUniform(RNG & rng) { return rng() % N; };


template <typename F, typename MemcpyImpl>
void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl)
uint64_t test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl, const char * name)
{
Stopwatch watch;

@ -76,15 +76,15 @@ void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t n
for (auto & thread : threads)
thread.join();

double elapsed_ns = watch.elapsed();
uint64_t elapsed_ns = watch.elapsed();

/// Validation
size_t sum = 0;
for (size_t i = 0; i < size; ++i)
sum += dst[i];
if (dst[i] != uint8_t(i))
throw std::logic_error("Incorrect result");

std::cerr << std::fixed << std::setprecision(3)
<< "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n";
std::cout << name;
return elapsed_ns;
}


@ -101,9 +101,30 @@ static void * memcpy_erms(void * dst, const void * src, size_t size)
return dst;
}

static void * memcpy_trivial(void * __restrict dst_, const void * __restrict src_, size_t size)
{
char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);
void * ret = dst;

while (size > 0)
{
*dst = *src;
++dst;
++src;
--size;
}

return ret;
}

extern "C" void * memcpy_jart(void * dst, const void * src, size_t size);
extern "C" void MemCpy(void * dst, const void * src, size_t size);

void * memcpy_fast_sse(void * dst, const void * src, size_t size);
void * memcpy_fast_avx(void * dst, const void * src, size_t size);
void * memcpy_tiny(void * dst, const void * src, size_t size);


static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size)
{
@ -329,7 +350,7 @@ void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t *
if (padding > 0)
{
__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
_mm256_storeu_si256((__m256i*)dst, head);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
@ -539,70 +560,141 @@ tail:
return ret;
}

extern "C" void * __memcpy_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_sse2_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_ssse3(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_ssse3_back(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_no_vzeroupper(void * __restrict destination, const void * __restrict source, size_t size);


#define VARIANT(N, NAME) \
if (memcpy_variant == N) \
return test(dst, src, size, iterations, num_threads, std::forward<F>(generator), NAME, #NAME);

template <typename F>
void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
uint64_t dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
{
memcpy_type memcpy_libc = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));
memcpy_type memcpy_libc_old = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));

if (memcpy_variant == 1)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy);
if (memcpy_variant == 2)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_libc);
if (memcpy_variant == 3)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_erms);
if (memcpy_variant == 4)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), MemCpy);
if (memcpy_variant == 5)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2);
if (memcpy_variant == 6)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled2);
if (memcpy_variant == 7)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled4);
if (memcpy_variant == 8)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled8);
// if (memcpy_variant == 9)
// test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_fast_avx);
if (memcpy_variant == 10)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_my);
VARIANT(1, memcpy)
VARIANT(2, memcpy_trivial)
VARIANT(3, memcpy_libc_old)
VARIANT(4, memcpy_erms)
VARIANT(5, MemCpy)
VARIANT(6, memcpySSE2)
VARIANT(7, memcpySSE2Unrolled2)
VARIANT(8, memcpySSE2Unrolled4)
VARIANT(9, memcpySSE2Unrolled8)
VARIANT(10, memcpy_fast_sse)
VARIANT(11, memcpy_fast_avx)
VARIANT(12, memcpy_my)

VARIANT(21, __memcpy_erms)
VARIANT(22, __memcpy_sse2_unaligned)
VARIANT(23, __memcpy_ssse3)
VARIANT(24, __memcpy_ssse3_back)
VARIANT(25, __memcpy_avx_unaligned)
VARIANT(26, __memcpy_avx_unaligned_erms)
VARIANT(27, __memcpy_avx512_unaligned)
VARIANT(28, __memcpy_avx512_unaligned_erms)
VARIANT(29, __memcpy_avx512_no_vzeroupper)

return 0;
}
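For reference, each VARIANT(N, NAME) line above is shorthand: given the macro defined earlier in this diff, it expands into a guarded call of test() that also passes the stringized implementation name. A sketch of one expansion (illustrative, derived from that macro):

// VARIANT(12, memcpy_my) expands roughly to:
if (memcpy_variant == 12)
    return test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_my, "memcpy_my");

This is why selecting --variant 12 on the command line runs the memcpy_my implementation and prints its name in the report.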

void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
uint64_t dispatchVariants(
size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
{
if (generator_variant == 1)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
if (generator_variant == 2)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
if (generator_variant == 3)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
if (generator_variant == 4)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
if (generator_variant == 5)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);

return 0;
}


int main(int argc, char ** argv)
{
size_t size = 1000000000;
if (argc >= 2)
size = std::stoull(argv[1]);
boost::program_options::options_description desc("Allowed options");
desc.add_options()("help,h", "produce help message")
("size", boost::program_options::value<size_t>()->default_value(1000000), "Bytes to copy on every iteration")
("iterations", boost::program_options::value<size_t>(), "Number of iterations")
("threads", boost::program_options::value<size_t>()->default_value(1), "Number of copying threads")
("distribution", boost::program_options::value<size_t>()->default_value(4), "Distribution of chunk sizes to perform copy")
("variant", boost::program_options::value<size_t>(), "Variant of memcpy implementation")
("tsv", "Print result in tab-separated format")
;

size_t iterations = 10;
if (argc >= 3)
iterations = std::stoull(argv[2]);
boost::program_options::variables_map options;
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options);

size_t num_threads = 1;
if (argc >= 4)
num_threads = std::stoull(argv[3]);
if (options.count("help") || !options.count("variant"))
{
std::cout << R"(Usage:

size_t memcpy_variant = 1;
if (argc >= 5)
memcpy_variant = std::stoull(argv[4]);
for size in 4096 16384 50000 65536 100000 1000000 10000000 100000000; do
for threads in 1 2 4 $(($(nproc) / 2)) $(nproc); do
for distribution in 1 2 3 4 5; do
for variant in {1..12} {21..29}; do
for i in {1..10}; do
./memcpy-bench --tsv --size $size --variant $variant --threads $threads --distribution $distribution;
done;
done;
done;
done;
done | tee result.tsv

size_t generator_variant = 1;
if (argc >= 6)
generator_variant = std::stoull(argv[5]);
clickhouse-local --structure '
name String,
size UInt64,
iterations UInt64,
threads UInt16,
generator UInt8,
memcpy UInt8,
elapsed UInt64
' --query "
SELECT
size, name,
avg(1000 * elapsed / size / iterations) AS s,
count() AS c
FROM table
GROUP BY size, name
ORDER BY size ASC, s DESC
" --output-format PrettyCompact < result.tsv

)" << std::endl;
std::cout << desc << std::endl;
return 1;
}

size_t size = options["size"].as<size_t>();
size_t num_threads = options["threads"].as<size_t>();
size_t memcpy_variant = options["variant"].as<size_t>();
size_t generator_variant = options["distribution"].as<size_t>();

size_t iterations;
if (options.count("iterations"))
{
iterations = options["iterations"].as<size_t>();
}
else
{
iterations = 10000000000ULL / size;

if (generator_variant == 1)
iterations /= 10;
}

std::unique_ptr<uint8_t[]> src(new uint8_t[size]);
std::unique_ptr<uint8_t[]> dst(new uint8_t[size]);
@ -614,7 +706,25 @@ int main(int argc, char ** argv)
/// Fill dst to avoid page faults.
memset(dst.get(), 0, size);

dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);
uint64_t elapsed_ns = dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);

std::cout << std::fixed << std::setprecision(3);

if (options.count("tsv"))
{
std::cout
<< '\t' << size
<< '\t' << iterations
<< '\t' << num_threads
<< '\t' << generator_variant
<< '\t' << memcpy_variant
<< '\t' << elapsed_ns
<< '\n';
}
else
{
std::cout << ": processed in " << (elapsed_ns / 1e9) << " sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec\n";
}

return 0;
}
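After building, a single invocation can serve as a quick smoke test before launching the full sweep shown in the help text above; the flags match the options defined in main(), and the concrete numbers are only illustrative (variant 12 selects memcpy_my, distribution 4 selects generatorUniform<65536>):

./memcpy-bench --tsv --size 1000000 --variant 12 --threads 4 --distribution 4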