Add more variants

Alexey Milovidov 2021-03-14 19:52:51 +03:00
parent c94ee3151d
commit 1bc21789d2
18 changed files with 8585 additions and 59 deletions

CMakeLists.txt

@@ -1,5 +1,22 @@
enable_language(ASM)
add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S)
#target_compile_options(memcpy-bench PRIVATE -mavx)
target_link_libraries(memcpy-bench PRIVATE dbms)
add_executable (memcpy-bench
memcpy-bench.cpp
FastMemcpy.cpp
FastMemcpy_Avx.cpp
memcpy_jart.S
glibc/memcpy-ssse3.S
glibc/memcpy-ssse3-back.S
glibc/memmove-sse2-unaligned-erms.S
glibc/memmove-avx-unaligned-erms.S
glibc/memmove-avx512-unaligned-erms.S
glibc/memmove-avx512-no-vzeroupper.S
)
add_compile_options(memcpy-bench PRIVATE -fno-tree-loop-distribute-patterns)
set_source_files_properties(FastMemcpy.cpp PROPERTIES COMPILE_FLAGS "-Wno-old-style-cast")
set_source_files_properties(FastMemcpy_Avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -Wno-old-style-cast -Wno-cast-qual -Wno-cast-align")
target_link_libraries(memcpy-bench PRIVATE dbms boost::program_options)

FastMemcpy.cpp

@@ -0,0 +1 @@
#include "FastMemcpy.h"

FastMemcpy.h

@@ -93,7 +93,7 @@ static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restric
/// Attribute is used to avoid an error with undefined behaviour sanitizer
/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
__attribute__((__no_sanitize__("undefined"))) inline void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
{
unsigned char *dd = ((unsigned char*)dst) + size;
const unsigned char *ss = ((const unsigned char*)src) + size;
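The end-of-buffer pointers dd/ss above are the core trick of FastMemcpy's small-size path: load the head and the tail of the range up front, then store both, so every size in a power-of-two bracket is handled by two possibly-overlapping moves instead of a byte loop. A minimal C++ sketch of the same idea for the 9..16 byte bracket (illustrative only, not the FastMemcpy code itself):

    #include <cstring>
    #include <cstdint>

    /// Copy 9..16 bytes with two 8-byte moves that may overlap in the middle.
    static inline void copy_9_to_16(void * dst, const void * src, size_t size)
    {
        uint64_t head;
        uint64_t tail;
        memcpy(&head, src, 8);                                        /// first 8 bytes
        memcpy(&tail, static_cast<const char *>(src) + size - 8, 8);  /// last 8 bytes, may overlap the head
        memcpy(dst, &head, 8);
        memcpy(static_cast<char *>(dst) + size - 8, &tail, 8);
    }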

FastMemcpy_Avx.cpp

@@ -0,0 +1 @@
#include "FastMemcpy_Avx.h"

glibc/asm-syntax.h

@@ -0,0 +1,24 @@
/* Definitions for x86 syntax variations.
Copyright (C) 1992-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library. Its master source is NOT part of
the C library, however. The master source lives in the GNU MP Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#undef ALIGN
#define ALIGN(log) .align 1<<log
#undef L
#define L(body) .L##body

glibc/dwarf2.h

@@ -0,0 +1,590 @@
/* Declarations and definitions of codes relating to the DWARF2 symbolic
debugging information format.
Copyright (C) 1992-2020 Free Software Foundation, Inc.
Contributed by Gary Funck (gary@intrepid.com). Derived from the
DWARF 1 implementation written by Ron Guilmette (rfg@monkeys.com).
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _DWARF2_H
#define _DWARF2_H 1
/* This file is derived from the DWARF specification (a public document)
Revision 2.0.0 (July 27, 1993) developed by the UNIX International
Programming Languages Special Interest Group (UI/PLSIG) and distributed
by UNIX International. Copies of this specification are available from
UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */
/* This file is shared between GCC and GDB, and should not contain
prototypes. */
#ifndef __ASSEMBLER__
/* Tag names and codes. */
enum dwarf_tag
{
DW_TAG_padding = 0x00,
DW_TAG_array_type = 0x01,
DW_TAG_class_type = 0x02,
DW_TAG_entry_point = 0x03,
DW_TAG_enumeration_type = 0x04,
DW_TAG_formal_parameter = 0x05,
DW_TAG_imported_declaration = 0x08,
DW_TAG_label = 0x0a,
DW_TAG_lexical_block = 0x0b,
DW_TAG_member = 0x0d,
DW_TAG_pointer_type = 0x0f,
DW_TAG_reference_type = 0x10,
DW_TAG_compile_unit = 0x11,
DW_TAG_string_type = 0x12,
DW_TAG_structure_type = 0x13,
DW_TAG_subroutine_type = 0x15,
DW_TAG_typedef = 0x16,
DW_TAG_union_type = 0x17,
DW_TAG_unspecified_parameters = 0x18,
DW_TAG_variant = 0x19,
DW_TAG_common_block = 0x1a,
DW_TAG_common_inclusion = 0x1b,
DW_TAG_inheritance = 0x1c,
DW_TAG_inlined_subroutine = 0x1d,
DW_TAG_module = 0x1e,
DW_TAG_ptr_to_member_type = 0x1f,
DW_TAG_set_type = 0x20,
DW_TAG_subrange_type = 0x21,
DW_TAG_with_stmt = 0x22,
DW_TAG_access_declaration = 0x23,
DW_TAG_base_type = 0x24,
DW_TAG_catch_block = 0x25,
DW_TAG_const_type = 0x26,
DW_TAG_constant = 0x27,
DW_TAG_enumerator = 0x28,
DW_TAG_file_type = 0x29,
DW_TAG_friend = 0x2a,
DW_TAG_namelist = 0x2b,
DW_TAG_namelist_item = 0x2c,
DW_TAG_packed_type = 0x2d,
DW_TAG_subprogram = 0x2e,
DW_TAG_template_type_param = 0x2f,
DW_TAG_template_value_param = 0x30,
DW_TAG_thrown_type = 0x31,
DW_TAG_try_block = 0x32,
DW_TAG_variant_part = 0x33,
DW_TAG_variable = 0x34,
DW_TAG_volatile_type = 0x35,
/* SGI/MIPS Extensions */
DW_TAG_MIPS_loop = 0x4081,
/* GNU extensions */
DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */
DW_TAG_function_template = 0x4102, /* for C++ */
DW_TAG_class_template = 0x4103, /* for C++ */
DW_TAG_GNU_BINCL = 0x4104,
DW_TAG_GNU_EINCL = 0x4105
};
#define DW_TAG_lo_user 0x4080
#define DW_TAG_hi_user 0xffff
/* flag that tells whether entry has a child or not */
#define DW_children_no 0
#define DW_children_yes 1
/* Form names and codes. */
enum dwarf_form
{
DW_FORM_addr = 0x01,
DW_FORM_block2 = 0x03,
DW_FORM_block4 = 0x04,
DW_FORM_data2 = 0x05,
DW_FORM_data4 = 0x06,
DW_FORM_data8 = 0x07,
DW_FORM_string = 0x08,
DW_FORM_block = 0x09,
DW_FORM_block1 = 0x0a,
DW_FORM_data1 = 0x0b,
DW_FORM_flag = 0x0c,
DW_FORM_sdata = 0x0d,
DW_FORM_strp = 0x0e,
DW_FORM_udata = 0x0f,
DW_FORM_ref_addr = 0x10,
DW_FORM_ref1 = 0x11,
DW_FORM_ref2 = 0x12,
DW_FORM_ref4 = 0x13,
DW_FORM_ref8 = 0x14,
DW_FORM_ref_udata = 0x15,
DW_FORM_indirect = 0x16
};
/* Attribute names and codes. */
enum dwarf_attribute
{
DW_AT_sibling = 0x01,
DW_AT_location = 0x02,
DW_AT_name = 0x03,
DW_AT_ordering = 0x09,
DW_AT_subscr_data = 0x0a,
DW_AT_byte_size = 0x0b,
DW_AT_bit_offset = 0x0c,
DW_AT_bit_size = 0x0d,
DW_AT_element_list = 0x0f,
DW_AT_stmt_list = 0x10,
DW_AT_low_pc = 0x11,
DW_AT_high_pc = 0x12,
DW_AT_language = 0x13,
DW_AT_member = 0x14,
DW_AT_discr = 0x15,
DW_AT_discr_value = 0x16,
DW_AT_visibility = 0x17,
DW_AT_import = 0x18,
DW_AT_string_length = 0x19,
DW_AT_common_reference = 0x1a,
DW_AT_comp_dir = 0x1b,
DW_AT_const_value = 0x1c,
DW_AT_containing_type = 0x1d,
DW_AT_default_value = 0x1e,
DW_AT_inline = 0x20,
DW_AT_is_optional = 0x21,
DW_AT_lower_bound = 0x22,
DW_AT_producer = 0x25,
DW_AT_prototyped = 0x27,
DW_AT_return_addr = 0x2a,
DW_AT_start_scope = 0x2c,
DW_AT_stride_size = 0x2e,
DW_AT_upper_bound = 0x2f,
DW_AT_abstract_origin = 0x31,
DW_AT_accessibility = 0x32,
DW_AT_address_class = 0x33,
DW_AT_artificial = 0x34,
DW_AT_base_types = 0x35,
DW_AT_calling_convention = 0x36,
DW_AT_count = 0x37,
DW_AT_data_member_location = 0x38,
DW_AT_decl_column = 0x39,
DW_AT_decl_file = 0x3a,
DW_AT_decl_line = 0x3b,
DW_AT_declaration = 0x3c,
DW_AT_discr_list = 0x3d,
DW_AT_encoding = 0x3e,
DW_AT_external = 0x3f,
DW_AT_frame_base = 0x40,
DW_AT_friend = 0x41,
DW_AT_identifier_case = 0x42,
DW_AT_macro_info = 0x43,
DW_AT_namelist_items = 0x44,
DW_AT_priority = 0x45,
DW_AT_segment = 0x46,
DW_AT_specification = 0x47,
DW_AT_static_link = 0x48,
DW_AT_type = 0x49,
DW_AT_use_location = 0x4a,
DW_AT_variable_parameter = 0x4b,
DW_AT_virtuality = 0x4c,
DW_AT_vtable_elem_location = 0x4d,
/* SGI/MIPS Extensions */
DW_AT_MIPS_fde = 0x2001,
DW_AT_MIPS_loop_begin = 0x2002,
DW_AT_MIPS_tail_loop_begin = 0x2003,
DW_AT_MIPS_epilog_begin = 0x2004,
DW_AT_MIPS_loop_unroll_factor = 0x2005,
DW_AT_MIPS_software_pipeline_depth = 0x2006,
DW_AT_MIPS_linkage_name = 0x2007,
DW_AT_MIPS_stride = 0x2008,
DW_AT_MIPS_abstract_name = 0x2009,
DW_AT_MIPS_clone_origin = 0x200a,
DW_AT_MIPS_has_inlines = 0x200b,
/* GNU extensions. */
DW_AT_sf_names = 0x2101,
DW_AT_src_info = 0x2102,
DW_AT_mac_info = 0x2103,
DW_AT_src_coords = 0x2104,
DW_AT_body_begin = 0x2105,
DW_AT_body_end = 0x2106
};
#define DW_AT_lo_user 0x2000 /* implementation-defined range start */
#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */
/* Location atom names and codes. */
enum dwarf_location_atom
{
DW_OP_addr = 0x03,
DW_OP_deref = 0x06,
DW_OP_const1u = 0x08,
DW_OP_const1s = 0x09,
DW_OP_const2u = 0x0a,
DW_OP_const2s = 0x0b,
DW_OP_const4u = 0x0c,
DW_OP_const4s = 0x0d,
DW_OP_const8u = 0x0e,
DW_OP_const8s = 0x0f,
DW_OP_constu = 0x10,
DW_OP_consts = 0x11,
DW_OP_dup = 0x12,
DW_OP_drop = 0x13,
DW_OP_over = 0x14,
DW_OP_pick = 0x15,
DW_OP_swap = 0x16,
DW_OP_rot = 0x17,
DW_OP_xderef = 0x18,
DW_OP_abs = 0x19,
DW_OP_and = 0x1a,
DW_OP_div = 0x1b,
DW_OP_minus = 0x1c,
DW_OP_mod = 0x1d,
DW_OP_mul = 0x1e,
DW_OP_neg = 0x1f,
DW_OP_not = 0x20,
DW_OP_or = 0x21,
DW_OP_plus = 0x22,
DW_OP_plus_uconst = 0x23,
DW_OP_shl = 0x24,
DW_OP_shr = 0x25,
DW_OP_shra = 0x26,
DW_OP_xor = 0x27,
DW_OP_bra = 0x28,
DW_OP_eq = 0x29,
DW_OP_ge = 0x2a,
DW_OP_gt = 0x2b,
DW_OP_le = 0x2c,
DW_OP_lt = 0x2d,
DW_OP_ne = 0x2e,
DW_OP_skip = 0x2f,
DW_OP_lit0 = 0x30,
DW_OP_lit1 = 0x31,
DW_OP_lit2 = 0x32,
DW_OP_lit3 = 0x33,
DW_OP_lit4 = 0x34,
DW_OP_lit5 = 0x35,
DW_OP_lit6 = 0x36,
DW_OP_lit7 = 0x37,
DW_OP_lit8 = 0x38,
DW_OP_lit9 = 0x39,
DW_OP_lit10 = 0x3a,
DW_OP_lit11 = 0x3b,
DW_OP_lit12 = 0x3c,
DW_OP_lit13 = 0x3d,
DW_OP_lit14 = 0x3e,
DW_OP_lit15 = 0x3f,
DW_OP_lit16 = 0x40,
DW_OP_lit17 = 0x41,
DW_OP_lit18 = 0x42,
DW_OP_lit19 = 0x43,
DW_OP_lit20 = 0x44,
DW_OP_lit21 = 0x45,
DW_OP_lit22 = 0x46,
DW_OP_lit23 = 0x47,
DW_OP_lit24 = 0x48,
DW_OP_lit25 = 0x49,
DW_OP_lit26 = 0x4a,
DW_OP_lit27 = 0x4b,
DW_OP_lit28 = 0x4c,
DW_OP_lit29 = 0x4d,
DW_OP_lit30 = 0x4e,
DW_OP_lit31 = 0x4f,
DW_OP_reg0 = 0x50,
DW_OP_reg1 = 0x51,
DW_OP_reg2 = 0x52,
DW_OP_reg3 = 0x53,
DW_OP_reg4 = 0x54,
DW_OP_reg5 = 0x55,
DW_OP_reg6 = 0x56,
DW_OP_reg7 = 0x57,
DW_OP_reg8 = 0x58,
DW_OP_reg9 = 0x59,
DW_OP_reg10 = 0x5a,
DW_OP_reg11 = 0x5b,
DW_OP_reg12 = 0x5c,
DW_OP_reg13 = 0x5d,
DW_OP_reg14 = 0x5e,
DW_OP_reg15 = 0x5f,
DW_OP_reg16 = 0x60,
DW_OP_reg17 = 0x61,
DW_OP_reg18 = 0x62,
DW_OP_reg19 = 0x63,
DW_OP_reg20 = 0x64,
DW_OP_reg21 = 0x65,
DW_OP_reg22 = 0x66,
DW_OP_reg23 = 0x67,
DW_OP_reg24 = 0x68,
DW_OP_reg25 = 0x69,
DW_OP_reg26 = 0x6a,
DW_OP_reg27 = 0x6b,
DW_OP_reg28 = 0x6c,
DW_OP_reg29 = 0x6d,
DW_OP_reg30 = 0x6e,
DW_OP_reg31 = 0x6f,
DW_OP_breg0 = 0x70,
DW_OP_breg1 = 0x71,
DW_OP_breg2 = 0x72,
DW_OP_breg3 = 0x73,
DW_OP_breg4 = 0x74,
DW_OP_breg5 = 0x75,
DW_OP_breg6 = 0x76,
DW_OP_breg7 = 0x77,
DW_OP_breg8 = 0x78,
DW_OP_breg9 = 0x79,
DW_OP_breg10 = 0x7a,
DW_OP_breg11 = 0x7b,
DW_OP_breg12 = 0x7c,
DW_OP_breg13 = 0x7d,
DW_OP_breg14 = 0x7e,
DW_OP_breg15 = 0x7f,
DW_OP_breg16 = 0x80,
DW_OP_breg17 = 0x81,
DW_OP_breg18 = 0x82,
DW_OP_breg19 = 0x83,
DW_OP_breg20 = 0x84,
DW_OP_breg21 = 0x85,
DW_OP_breg22 = 0x86,
DW_OP_breg23 = 0x87,
DW_OP_breg24 = 0x88,
DW_OP_breg25 = 0x89,
DW_OP_breg26 = 0x8a,
DW_OP_breg27 = 0x8b,
DW_OP_breg28 = 0x8c,
DW_OP_breg29 = 0x8d,
DW_OP_breg30 = 0x8e,
DW_OP_breg31 = 0x8f,
DW_OP_regx = 0x90,
DW_OP_fbreg = 0x91,
DW_OP_bregx = 0x92,
DW_OP_piece = 0x93,
DW_OP_deref_size = 0x94,
DW_OP_xderef_size = 0x95,
DW_OP_nop = 0x96
};
#define DW_OP_lo_user 0x80 /* implementation-defined range start */
#define DW_OP_hi_user 0xff /* implementation-defined range end */
/* Type encodings. */
enum dwarf_type
{
DW_ATE_void = 0x0,
DW_ATE_address = 0x1,
DW_ATE_boolean = 0x2,
DW_ATE_complex_float = 0x3,
DW_ATE_float = 0x4,
DW_ATE_signed = 0x5,
DW_ATE_signed_char = 0x6,
DW_ATE_unsigned = 0x7,
DW_ATE_unsigned_char = 0x8
};
#define DW_ATE_lo_user 0x80
#define DW_ATE_hi_user 0xff
/* Array ordering names and codes. */
enum dwarf_array_dim_ordering
{
DW_ORD_row_major = 0,
DW_ORD_col_major = 1
};
/* access attribute */
enum dwarf_access_attribute
{
DW_ACCESS_public = 1,
DW_ACCESS_protected = 2,
DW_ACCESS_private = 3
};
/* visibility */
enum dwarf_visibility_attribute
{
DW_VIS_local = 1,
DW_VIS_exported = 2,
DW_VIS_qualified = 3
};
/* virtuality */
enum dwarf_virtuality_attribute
{
DW_VIRTUALITY_none = 0,
DW_VIRTUALITY_virtual = 1,
DW_VIRTUALITY_pure_virtual = 2
};
/* case sensitivity */
enum dwarf_id_case
{
DW_ID_case_sensitive = 0,
DW_ID_up_case = 1,
DW_ID_down_case = 2,
DW_ID_case_insensitive = 3
};
/* calling convention */
enum dwarf_calling_convention
{
DW_CC_normal = 0x1,
DW_CC_program = 0x2,
DW_CC_nocall = 0x3
};
#define DW_CC_lo_user 0x40
#define DW_CC_hi_user 0xff
/* inline attribute */
enum dwarf_inline_attribute
{
DW_INL_not_inlined = 0,
DW_INL_inlined = 1,
DW_INL_declared_not_inlined = 2,
DW_INL_declared_inlined = 3
};
/* discriminant lists */
enum dwarf_discrim_list
{
DW_DSC_label = 0,
DW_DSC_range = 1
};
/* line number opcodes */
enum dwarf_line_number_ops
{
DW_LNS_extended_op = 0,
DW_LNS_copy = 1,
DW_LNS_advance_pc = 2,
DW_LNS_advance_line = 3,
DW_LNS_set_file = 4,
DW_LNS_set_column = 5,
DW_LNS_negate_stmt = 6,
DW_LNS_set_basic_block = 7,
DW_LNS_const_add_pc = 8,
DW_LNS_fixed_advance_pc = 9
};
/* line number extended opcodes */
enum dwarf_line_number_x_ops
{
DW_LNE_end_sequence = 1,
DW_LNE_set_address = 2,
DW_LNE_define_file = 3
};
/* call frame information */
enum dwarf_call_frame_info
{
DW_CFA_advance_loc = 0x40,
DW_CFA_offset = 0x80,
DW_CFA_restore = 0xc0,
DW_CFA_nop = 0x00,
DW_CFA_set_loc = 0x01,
DW_CFA_advance_loc1 = 0x02,
DW_CFA_advance_loc2 = 0x03,
DW_CFA_advance_loc4 = 0x04,
DW_CFA_offset_extended = 0x05,
DW_CFA_restore_extended = 0x06,
DW_CFA_undefined = 0x07,
DW_CFA_same_value = 0x08,
DW_CFA_register = 0x09,
DW_CFA_remember_state = 0x0a,
DW_CFA_restore_state = 0x0b,
DW_CFA_def_cfa = 0x0c,
DW_CFA_def_cfa_register = 0x0d,
DW_CFA_def_cfa_offset = 0x0e,
DW_CFA_def_cfa_expression = 0x0f,
DW_CFA_expression = 0x10,
/* Dwarf 2.1 */
DW_CFA_offset_extended_sf = 0x11,
DW_CFA_def_cfa_sf = 0x12,
DW_CFA_def_cfa_offset_sf = 0x13,
/* SGI/MIPS specific */
DW_CFA_MIPS_advance_loc8 = 0x1d,
/* GNU extensions */
DW_CFA_GNU_window_save = 0x2d,
DW_CFA_GNU_args_size = 0x2e,
DW_CFA_GNU_negative_offset_extended = 0x2f
};
#define DW_CIE_ID 0xffffffff
#define DW_CIE_VERSION 1
#define DW_CFA_extended 0
#define DW_CFA_low_user 0x1c
#define DW_CFA_high_user 0x3f
#define DW_CHILDREN_no 0x00
#define DW_CHILDREN_yes 0x01
#define DW_ADDR_none 0
/* Source language names and codes. */
enum dwarf_source_language
{
DW_LANG_C89 = 0x0001,
DW_LANG_C = 0x0002,
DW_LANG_Ada83 = 0x0003,
DW_LANG_C_plus_plus = 0x0004,
DW_LANG_Cobol74 = 0x0005,
DW_LANG_Cobol85 = 0x0006,
DW_LANG_Fortran77 = 0x0007,
DW_LANG_Fortran90 = 0x0008,
DW_LANG_Pascal83 = 0x0009,
DW_LANG_Modula2 = 0x000a,
DW_LANG_Java = 0x000b,
DW_LANG_Mips_Assembler = 0x8001
};
#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */
#define DW_LANG_hi_user 0xffff /* implementation-defined range end */
/* Names and codes for macro information. */
enum dwarf_macinfo_record_type
{
DW_MACINFO_define = 1,
DW_MACINFO_undef = 2,
DW_MACINFO_start_file = 3,
DW_MACINFO_end_file = 4,
DW_MACINFO_vendor_ext = 255
};
#endif /* !ASSEMBLER */
/* @@@ For use with GNU frame unwind information. */
#define DW_EH_PE_absptr 0x00
#define DW_EH_PE_omit 0xff
#define DW_EH_PE_uleb128 0x01
#define DW_EH_PE_udata2 0x02
#define DW_EH_PE_udata4 0x03
#define DW_EH_PE_udata8 0x04
#define DW_EH_PE_sleb128 0x09
#define DW_EH_PE_sdata2 0x0A
#define DW_EH_PE_sdata4 0x0B
#define DW_EH_PE_sdata8 0x0C
#define DW_EH_PE_signed 0x08
#define DW_EH_PE_pcrel 0x10
#define DW_EH_PE_textrel 0x20
#define DW_EH_PE_datarel 0x30
#define DW_EH_PE_funcrel 0x40
#define DW_EH_PE_aligned 0x50
#define DW_EH_PE_indirect 0x80
#endif /* dwarf2.h */
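A DW_EH_PE encoding byte combines one value format from the low group with one application mode from the high group. For example, the common encoding for a PC-relative signed 32-bit pointer is just the bitwise OR of two of the constants above:

    #include <cstdio>

    int main()
    {
        /// DW_EH_PE_pcrel | DW_EH_PE_sdata4 = 0x10 | 0x0B
        unsigned encoding = 0x10 | 0x0B;
        printf("%#x\n", encoding); /// prints 0x1b
    }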

glibc/memcpy-ssse3-back.S: file diff suppressed because it is too large

glibc/memcpy-ssse3.S: file diff suppressed because it is too large

glibc/memmove-avx-unaligned-erms.S

@@ -0,0 +1,12 @@
#if 1
# define VEC_SIZE 32
# define VEC(i) ymm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
# define SECTION(p) p##.avx
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
# include "memmove-vec-unaligned-erms.S"
#endif

glibc/memmove-avx512-no-vzeroupper.S

@@ -0,0 +1,419 @@
/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
Copyright (C) 2016-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "sysdep.h"
#if 1
# include "asm-syntax.h"
.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)
ENTRY (__mempcpy_avx512_no_vzeroupper)
mov %RDI_LP, %RAX_LP
add %RDX_LP, %RAX_LP
jmp L(start)
END (__mempcpy_avx512_no_vzeroupper)
ENTRY (__memmove_chk_avx512_no_vzeroupper)
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)
ENTRY (__memmove_avx512_no_vzeroupper)
mov %RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
add %RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
# endif
lea (%rsi, %rdx), %rcx
lea (%rdi, %rdx), %r9
cmp $512, %rdx
ja L(512bytesormore)
L(check):
cmp $16, %rdx
jbe L(less_16bytes)
cmp $256, %rdx
jb L(less_256bytes)
vmovups (%rsi), %zmm0
vmovups 0x40(%rsi), %zmm1
vmovups 0x80(%rsi), %zmm2
vmovups 0xC0(%rsi), %zmm3
vmovups -0x100(%rcx), %zmm4
vmovups -0xC0(%rcx), %zmm5
vmovups -0x80(%rcx), %zmm6
vmovups -0x40(%rcx), %zmm7
vmovups %zmm0, (%rdi)
vmovups %zmm1, 0x40(%rdi)
vmovups %zmm2, 0x80(%rdi)
vmovups %zmm3, 0xC0(%rdi)
vmovups %zmm4, -0x100(%r9)
vmovups %zmm5, -0xC0(%r9)
vmovups %zmm6, -0x80(%r9)
vmovups %zmm7, -0x40(%r9)
ret
L(less_256bytes):
cmp $128, %dl
jb L(less_128bytes)
vmovups (%rsi), %zmm0
vmovups 0x40(%rsi), %zmm1
vmovups -0x80(%rcx), %zmm2
vmovups -0x40(%rcx), %zmm3
vmovups %zmm0, (%rdi)
vmovups %zmm1, 0x40(%rdi)
vmovups %zmm2, -0x80(%r9)
vmovups %zmm3, -0x40(%r9)
ret
L(less_128bytes):
cmp $64, %dl
jb L(less_64bytes)
vmovdqu (%rsi), %ymm0
vmovdqu 0x20(%rsi), %ymm1
vmovdqu -0x40(%rcx), %ymm2
vmovdqu -0x20(%rcx), %ymm3
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, 0x20(%rdi)
vmovdqu %ymm2, -0x40(%r9)
vmovdqu %ymm3, -0x20(%r9)
ret
L(less_64bytes):
cmp $32, %dl
jb L(less_32bytes)
vmovdqu (%rsi), %ymm0
vmovdqu -0x20(%rcx), %ymm1
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -0x20(%r9)
ret
L(less_32bytes):
vmovdqu (%rsi), %xmm0
vmovdqu -0x10(%rcx), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -0x10(%r9)
ret
L(less_16bytes):
cmp $8, %dl
jb L(less_8bytes)
movq (%rsi), %rsi
movq -0x8(%rcx), %rcx
movq %rsi, (%rdi)
movq %rcx, -0x8(%r9)
ret
L(less_8bytes):
cmp $4, %dl
jb L(less_4bytes)
mov (%rsi), %esi
mov -0x4(%rcx), %ecx
mov %esi, (%rdi)
mov %ecx, -0x4(%r9)
ret
L(less_4bytes):
cmp $2, %dl
jb L(less_2bytes)
mov (%rsi), %si
mov -0x2(%rcx), %cx
mov %si, (%rdi)
mov %cx, -0x2(%r9)
ret
L(less_2bytes):
cmp $1, %dl
jb L(less_1bytes)
mov (%rsi), %cl
mov %cl, (%rdi)
L(less_1bytes):
ret
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
mov $SHARED_CACHE_SIZE_HALF, %r8
# else
mov __x86_shared_cache_size_half(%rip), %r8
# endif
cmp %r8, %rdx
jae L(preloop_large)
cmp $1024, %rdx
ja L(1024bytesormore)
prefetcht1 (%rsi)
prefetcht1 0x40(%rsi)
prefetcht1 0x80(%rsi)
prefetcht1 0xC0(%rsi)
prefetcht1 0x100(%rsi)
prefetcht1 0x140(%rsi)
prefetcht1 0x180(%rsi)
prefetcht1 0x1C0(%rsi)
prefetcht1 -0x200(%rcx)
prefetcht1 -0x1C0(%rcx)
prefetcht1 -0x180(%rcx)
prefetcht1 -0x140(%rcx)
prefetcht1 -0x100(%rcx)
prefetcht1 -0xC0(%rcx)
prefetcht1 -0x80(%rcx)
prefetcht1 -0x40(%rcx)
vmovups (%rsi), %zmm0
vmovups 0x40(%rsi), %zmm1
vmovups 0x80(%rsi), %zmm2
vmovups 0xC0(%rsi), %zmm3
vmovups 0x100(%rsi), %zmm4
vmovups 0x140(%rsi), %zmm5
vmovups 0x180(%rsi), %zmm6
vmovups 0x1C0(%rsi), %zmm7
vmovups -0x200(%rcx), %zmm8
vmovups -0x1C0(%rcx), %zmm9
vmovups -0x180(%rcx), %zmm10
vmovups -0x140(%rcx), %zmm11
vmovups -0x100(%rcx), %zmm12
vmovups -0xC0(%rcx), %zmm13
vmovups -0x80(%rcx), %zmm14
vmovups -0x40(%rcx), %zmm15
vmovups %zmm0, (%rdi)
vmovups %zmm1, 0x40(%rdi)
vmovups %zmm2, 0x80(%rdi)
vmovups %zmm3, 0xC0(%rdi)
vmovups %zmm4, 0x100(%rdi)
vmovups %zmm5, 0x140(%rdi)
vmovups %zmm6, 0x180(%rdi)
vmovups %zmm7, 0x1C0(%rdi)
vmovups %zmm8, -0x200(%r9)
vmovups %zmm9, -0x1C0(%r9)
vmovups %zmm10, -0x180(%r9)
vmovups %zmm11, -0x140(%r9)
vmovups %zmm12, -0x100(%r9)
vmovups %zmm13, -0xC0(%r9)
vmovups %zmm14, -0x80(%r9)
vmovups %zmm15, -0x40(%r9)
ret
L(1024bytesormore):
cmp %rsi, %rdi
ja L(1024bytesormore_bkw)
sub $512, %r9
vmovups -0x200(%rcx), %zmm8
vmovups -0x1C0(%rcx), %zmm9
vmovups -0x180(%rcx), %zmm10
vmovups -0x140(%rcx), %zmm11
vmovups -0x100(%rcx), %zmm12
vmovups -0xC0(%rcx), %zmm13
vmovups -0x80(%rcx), %zmm14
vmovups -0x40(%rcx), %zmm15
prefetcht1 (%rsi)
prefetcht1 0x40(%rsi)
prefetcht1 0x80(%rsi)
prefetcht1 0xC0(%rsi)
prefetcht1 0x100(%rsi)
prefetcht1 0x140(%rsi)
prefetcht1 0x180(%rsi)
prefetcht1 0x1C0(%rsi)
/* Loop with unaligned memory access. */
L(gobble_512bytes_loop):
vmovups (%rsi), %zmm0
vmovups 0x40(%rsi), %zmm1
vmovups 0x80(%rsi), %zmm2
vmovups 0xC0(%rsi), %zmm3
vmovups 0x100(%rsi), %zmm4
vmovups 0x140(%rsi), %zmm5
vmovups 0x180(%rsi), %zmm6
vmovups 0x1C0(%rsi), %zmm7
add $512, %rsi
prefetcht1 (%rsi)
prefetcht1 0x40(%rsi)
prefetcht1 0x80(%rsi)
prefetcht1 0xC0(%rsi)
prefetcht1 0x100(%rsi)
prefetcht1 0x140(%rsi)
prefetcht1 0x180(%rsi)
prefetcht1 0x1C0(%rsi)
vmovups %zmm0, (%rdi)
vmovups %zmm1, 0x40(%rdi)
vmovups %zmm2, 0x80(%rdi)
vmovups %zmm3, 0xC0(%rdi)
vmovups %zmm4, 0x100(%rdi)
vmovups %zmm5, 0x140(%rdi)
vmovups %zmm6, 0x180(%rdi)
vmovups %zmm7, 0x1C0(%rdi)
add $512, %rdi
cmp %r9, %rdi
jb L(gobble_512bytes_loop)
vmovups %zmm8, (%r9)
vmovups %zmm9, 0x40(%r9)
vmovups %zmm10, 0x80(%r9)
vmovups %zmm11, 0xC0(%r9)
vmovups %zmm12, 0x100(%r9)
vmovups %zmm13, 0x140(%r9)
vmovups %zmm14, 0x180(%r9)
vmovups %zmm15, 0x1C0(%r9)
ret
L(1024bytesormore_bkw):
add $512, %rdi
vmovups 0x1C0(%rsi), %zmm8
vmovups 0x180(%rsi), %zmm9
vmovups 0x140(%rsi), %zmm10
vmovups 0x100(%rsi), %zmm11
vmovups 0xC0(%rsi), %zmm12
vmovups 0x80(%rsi), %zmm13
vmovups 0x40(%rsi), %zmm14
vmovups (%rsi), %zmm15
prefetcht1 -0x40(%rcx)
prefetcht1 -0x80(%rcx)
prefetcht1 -0xC0(%rcx)
prefetcht1 -0x100(%rcx)
prefetcht1 -0x140(%rcx)
prefetcht1 -0x180(%rcx)
prefetcht1 -0x1C0(%rcx)
prefetcht1 -0x200(%rcx)
/* Backward loop with unaligned memory access. */
L(gobble_512bytes_loop_bkw):
vmovups -0x40(%rcx), %zmm0
vmovups -0x80(%rcx), %zmm1
vmovups -0xC0(%rcx), %zmm2
vmovups -0x100(%rcx), %zmm3
vmovups -0x140(%rcx), %zmm4
vmovups -0x180(%rcx), %zmm5
vmovups -0x1C0(%rcx), %zmm6
vmovups -0x200(%rcx), %zmm7
sub $512, %rcx
prefetcht1 -0x40(%rcx)
prefetcht1 -0x80(%rcx)
prefetcht1 -0xC0(%rcx)
prefetcht1 -0x100(%rcx)
prefetcht1 -0x140(%rcx)
prefetcht1 -0x180(%rcx)
prefetcht1 -0x1C0(%rcx)
prefetcht1 -0x200(%rcx)
vmovups %zmm0, -0x40(%r9)
vmovups %zmm1, -0x80(%r9)
vmovups %zmm2, -0xC0(%r9)
vmovups %zmm3, -0x100(%r9)
vmovups %zmm4, -0x140(%r9)
vmovups %zmm5, -0x180(%r9)
vmovups %zmm6, -0x1C0(%r9)
vmovups %zmm7, -0x200(%r9)
sub $512, %r9
cmp %rdi, %r9
ja L(gobble_512bytes_loop_bkw)
vmovups %zmm8, -0x40(%rdi)
vmovups %zmm9, -0x80(%rdi)
vmovups %zmm10, -0xC0(%rdi)
vmovups %zmm11, -0x100(%rdi)
vmovups %zmm12, -0x140(%rdi)
vmovups %zmm13, -0x180(%rdi)
vmovups %zmm14, -0x1C0(%rdi)
vmovups %zmm15, -0x200(%rdi)
ret
L(preloop_large):
cmp %rsi, %rdi
ja L(preloop_large_bkw)
vmovups (%rsi), %zmm4
vmovups 0x40(%rsi), %zmm5
mov %rdi, %r11
/* Align destination for access with non-temporal stores in the loop. */
mov %rdi, %r8
and $-0x80, %rdi
add $0x80, %rdi
sub %rdi, %r8
sub %r8, %rsi
add %r8, %rdx
L(gobble_256bytes_nt_loop):
prefetcht1 0x200(%rsi)
prefetcht1 0x240(%rsi)
prefetcht1 0x280(%rsi)
prefetcht1 0x2C0(%rsi)
prefetcht1 0x300(%rsi)
prefetcht1 0x340(%rsi)
prefetcht1 0x380(%rsi)
prefetcht1 0x3C0(%rsi)
vmovdqu64 (%rsi), %zmm0
vmovdqu64 0x40(%rsi), %zmm1
vmovdqu64 0x80(%rsi), %zmm2
vmovdqu64 0xC0(%rsi), %zmm3
vmovntdq %zmm0, (%rdi)
vmovntdq %zmm1, 0x40(%rdi)
vmovntdq %zmm2, 0x80(%rdi)
vmovntdq %zmm3, 0xC0(%rdi)
sub $256, %rdx
add $256, %rsi
add $256, %rdi
cmp $256, %rdx
ja L(gobble_256bytes_nt_loop)
sfence
vmovups %zmm4, (%r11)
vmovups %zmm5, 0x40(%r11)
jmp L(check)
L(preloop_large_bkw):
vmovups -0x80(%rcx), %zmm4
vmovups -0x40(%rcx), %zmm5
/* Align end of destination for access with non-temporal stores. */
mov %r9, %r8
and $-0x80, %r9
sub %r9, %r8
sub %r8, %rcx
sub %r8, %rdx
add %r9, %r8
L(gobble_256bytes_nt_loop_bkw):
prefetcht1 -0x400(%rcx)
prefetcht1 -0x3C0(%rcx)
prefetcht1 -0x380(%rcx)
prefetcht1 -0x340(%rcx)
prefetcht1 -0x300(%rcx)
prefetcht1 -0x2C0(%rcx)
prefetcht1 -0x280(%rcx)
prefetcht1 -0x240(%rcx)
vmovdqu64 -0x100(%rcx), %zmm0
vmovdqu64 -0xC0(%rcx), %zmm1
vmovdqu64 -0x80(%rcx), %zmm2
vmovdqu64 -0x40(%rcx), %zmm3
vmovntdq %zmm0, -0x100(%r9)
vmovntdq %zmm1, -0xC0(%r9)
vmovntdq %zmm2, -0x80(%r9)
vmovntdq %zmm3, -0x40(%r9)
sub $256, %rdx
sub $256, %rcx
sub $256, %r9
cmp $256, %rdx
ja L(gobble_256bytes_nt_loop_bkw)
sfence
vmovups %zmm4, -0x80(%r8)
vmovups %zmm5, -0x40(%r8)
jmp L(check)
END (__memmove_avx512_no_vzeroupper)
strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif
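The large-copy paths above stream data with vmovntdq and finish with sfence: non-temporal stores bypass the cache and are weakly ordered, so the fence is required before returning. The same pattern in C++ intrinsics (a sketch assuming a 32-byte-aligned destination and a size that is a multiple of 32; the assembly above handles alignment and tails itself):

    #include <immintrin.h>
    #include <cstdint>
    #include <cstddef>

    void copy_streaming(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size)
    {
        for (size_t i = 0; i < size; i += 32)
        {
            __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
            _mm256_stream_si256(reinterpret_cast<__m256i *>(dst + i), v); /// non-temporal store
        }
        _mm_sfence(); /// order the NT stores before any subsequent stores
    }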

glibc/memmove-avx512-unaligned-erms.S

@@ -0,0 +1,12 @@
#if 1
# define VEC_SIZE 64
# define VEC(i) zmm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define SECTION(p) p##.avx512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
# include "memmove-vec-unaligned-erms.S"
#endif

glibc/memmove-sse2-unaligned-erms.S

@@ -0,0 +1,33 @@
/* memmove with SSE2.
Copyright (C) 2017-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if 1
# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
#else
weak_alias (__mempcpy, mempcpy)
#endif
#include "memmove.S"
#if defined SHARED
# include <shlib-compat.h>
# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
/* Use __memmove_sse2_unaligned to support overlapping addresses. */
compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
# endif
#endif

glibc/memmove-vec-unaligned-erms.S

@@ -0,0 +1,559 @@
/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
Copyright (C) 2016-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* memmove/memcpy/mempcpy is implemented as:
1. Use overlapping load and store to avoid branch.
2. Load all sources into registers and store them together to avoid
possible address overlap between source and destination.
3. If size is 8 * VEC_SIZE or less, load all sources into registers
and store them together.
4. If address of destination > address of source, backward copy
4 * VEC_SIZE at a time with unaligned load and aligned store.
Load the first 4 * VEC and last VEC before the loop and store
them after the loop to support overlapping addresses.
5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
load and aligned store. Load the last 4 * VEC and first VEC
before the loop and store them after the loop to support
overlapping addresses.
6. If size >= __x86_shared_non_temporal_threshold and there is no
overlap between destination and source, use non-temporal store
instead of aligned store. */
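Steps 1 and 2 are the heart of the algorithm: every source vector is loaded before any destination vector is stored, so the copy stays correct when the ranges overlap, and the head/tail moves overlap each other instead of branching on the exact size. The same logic in portable C++ for the [VEC_SIZE, 2 * VEC_SIZE] case (a sketch with a 16-byte vector emulated via memcpy; the assembly uses VMOVU on xmm/ymm/zmm registers):

    #include <cstring>

    /// Valid for 16 <= size <= 32; handles overlapping dst/src like memmove.
    void copy_vec_to_2x_vec(char * dst, const char * src, size_t size)
    {
        char head[16];
        char tail[16];
        memcpy(head, src, 16);               /// first VEC
        memcpy(tail, src + size - 16, 16);   /// last VEC, may overlap the first
        memcpy(dst, head, 16);               /// all loads happen before any store
        memcpy(dst + size - 16, tail, 16);
    }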
#include "sysdep.h"
#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
# else
# define VZEROUPPER
# endif
#endif
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif
/* Assume 64-byte prefetch size. */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
PREFETCH ((offset)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
PREFETCH ((offset)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
# error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
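PREFETCH_ONE_SET issues one prefetch per 64-byte line so that a whole 4 * VEC_SIZE chunk is requested ahead of the copy loop, walking in the direction of the copy. A rough C++ analogue using the GCC/Clang __builtin_prefetch intrinsic (assuming the same 64-byte granularity):

    #include <cstddef>

    /// dir is +1 for a forward copy, -1 for a backward copy.
    inline void prefetch_one_set(const char * base, std::ptrdiff_t offset, int dir, size_t vec_size)
    {
        std::ptrdiff_t lines = static_cast<std::ptrdiff_t>(4 * vec_size / 64);
        for (std::ptrdiff_t i = 0; i < lines; ++i)
            __builtin_prefetch(base + offset + dir * i * 64);
    }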
#ifndef SECTION
# error SECTION is not defined!
#endif
.section SECTION(.text),"ax",@progbits
#if defined SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
mov %RDI_LP, %RAX_LP
add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
movq %rdi, %rax
L(start):
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
#if !defined USE_MULTIARCH
L(last_2x_vec):
#endif
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
VZEROUPPER
#if !defined USE_MULTIARCH
L(nop):
#endif
ret
#if defined USE_MULTIARCH
END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)
/* Only used to measure performance of REP MOVSB. */
ENTRY (__mempcpy_erms)
mov %RDI_LP, %RAX_LP
/* Skip zero length. */
test %RDX_LP, %RDX_LP
jz 2f
add %RDX_LP, %RAX_LP
jmp L(start_movsb)
END (__mempcpy_erms)
ENTRY (__memmove_chk_erms)
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)
ENTRY (__memmove_erms)
movq %rdi, %rax
/* Skip zero length. */
test %RDX_LP, %RDX_LP
jz 2f
L(start_movsb):
mov %RDX_LP, %RCX_LP
cmp %RSI_LP, %RDI_LP
jb 1f
/* Source == destination is less common. */
je 2f
lea (%rsi,%rcx), %RDX_LP
cmp %RDX_LP, %RDI_LP
jb L(movsb_backward)
1:
rep movsb
2:
ret
L(movsb_backward):
leaq -1(%rdi,%rcx), %rdi
leaq -1(%rsi,%rcx), %rsi
std
rep movsb
cld
ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif
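__memmove_erms exists purely as a REP MOVSB baseline for the benchmark: a forward movsb when the ranges do not force a backward copy, and a byte-reversed movsb under STD/CLD when they do. The forward case is small enough to restate as C++ inline assembly (x86-64, GCC/Clang syntax; overlap handling omitted):

    #include <cstddef>

    void * memmove_erms_forward(void * dst, const void * src, size_t size)
    {
        void * ret = dst;
        /// rep movsb takes dst in rdi, src in rsi and the byte count in rcx.
        asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
        return ret;
    }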
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
mov %RDI_LP, %RAX_LP
add %RDX_LP, %RAX_LP
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
movq %rdi, %rax
L(start_erms):
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
L(last_2x_vec):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
VZEROUPPER
ret
L(movsb):
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
jae L(more_8x_vec)
cmpq %rsi, %rdi
jb 1f
/* Source == destination is less common. */
je L(nop)
leaq (%rsi,%rdx), %r9
cmpq %r9, %rdi
/* Avoid slow backward REP MOVSB. */
jb L(more_8x_vec_backward)
1:
mov %RDX_LP, %RCX_LP
rep movsb
L(nop):
ret
#endif
L(less_vec):
/* Less than 1 VEC. */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
cmpb $32, %dl
jae L(between_32_63)
#endif
#if VEC_SIZE > 16
cmpb $16, %dl
jae L(between_16_31)
#endif
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
jae L(between_4_7)
cmpb $1, %dl
ja L(between_2_3)
jb 1f
movzbl (%rsi), %ecx
movb %cl, (%rdi)
1:
ret
#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
vmovdqu (%rsi), %ymm0
vmovdqu -32(%rsi,%rdx), %ymm1
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -32(%rdi,%rdx)
VZEROUPPER
ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi,%rdx), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -16(%rdi,%rdx)
ret
#endif
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
movq -8(%rsi,%rdx), %rcx
movq (%rsi), %rsi
movq %rcx, -8(%rdi,%rdx)
movq %rsi, (%rdi)
ret
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl -4(%rsi,%rdx), %ecx
movl (%rsi), %esi
movl %ecx, -4(%rdi,%rdx)
movl %esi, (%rdi)
ret
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
movzwl -2(%rsi,%rdx), %ecx
movzwl (%rsi), %esi
movw %cx, -2(%rdi,%rdx)
movw %si, (%rdi)
ret
#if defined USE_MULTIARCH
L(movsb_more_2x_vec):
cmp $REP_MOSB_THRESHOLD, %RDX_LP
ja L(movsb)
#endif
L(more_2x_vec):
/* More than 2 * VEC and there may be overlap between destination
and source. */
cmpq $(VEC_SIZE * 8), %rdx
ja L(more_8x_vec)
cmpq $(VEC_SIZE * 4), %rdx
jb L(last_4x_vec)
/* Copy from 4 * VEC to 8 * VEC, inclusively. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
VZEROUPPER
ret
L(last_4x_vec):
/* Copy from 2 * VEC to 4 * VEC. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
VZEROUPPER
ret
L(more_8x_vec):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
/* Source == destination is less common. */
je L(nop)
/* Load the first VEC and last 4 * VEC to support overlapping
addresses. */
VMOVU (%rsi), %VEC(4)
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
/* Save start and stop of the destination buffer. */
movq %rdi, %r11
leaq -VEC_SIZE(%rdi, %rdx), %rcx
/* Align destination for aligned stores in the loop. Compute
how much destination is misaligned. */
movq %rdi, %r8
andq $(VEC_SIZE - 1), %r8
/* Get the negative of offset for alignment. */
subq $VEC_SIZE, %r8
/* Adjust source. */
subq %r8, %rsi
/* Adjust destination which should be aligned now. */
subq %r8, %rdi
/* Adjust length. */
addq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
/* Check non-temporal store threshold. */
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
ja L(large_forward)
#endif
L(loop_4x_vec_forward):
/* Copy 4 * VEC a time forward. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
addq $(VEC_SIZE * 4), %rsi
subq $(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
addq $(VEC_SIZE * 4), %rdi
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_4x_vec_forward)
/* Store the last 4 * VEC. */
VMOVU %VEC(5), (%rcx)
VMOVU %VEC(6), -VEC_SIZE(%rcx)
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
VZEROUPPER
ret
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
addresses. */
VMOVU (%rsi), %VEC(4)
VMOVU VEC_SIZE(%rsi), %VEC(5)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
/* Save stop of the destination buffer. */
leaq -VEC_SIZE(%rdi, %rdx), %r11
/* Align destination end for aligned stores in the loop. Compute
how much destination end is misaligned. */
leaq -VEC_SIZE(%rsi, %rdx), %rcx
movq %r11, %r9
movq %r11, %r8
andq $(VEC_SIZE - 1), %r8
/* Adjust source. */
subq %r8, %rcx
/* Adjust the end of destination which should be aligned now. */
subq %r8, %r9
/* Adjust length. */
subq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
/* Check non-temporal store threshold. */
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
ja L(large_backward)
#endif
L(loop_4x_vec_backward):
/* Copy 4 * VEC a time backward. */
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
subq $(VEC_SIZE * 4), %rcx
subq $(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%r9)
VMOVA %VEC(1), -VEC_SIZE(%r9)
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
subq $(VEC_SIZE * 4), %r9
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_4x_vec_backward)
/* Store the first 4 * VEC. */
VMOVU %VEC(4), (%rdi)
VMOVU %VEC(5), VEC_SIZE(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
VZEROUPPER
ret
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
L(large_forward):
/* Don't use non-temporal store if there is overlap between
destination and source since destination may be in cache
when source is loaded. */
leaq (%rdi, %rdx), %r10
cmpq %r10, %rsi
jb L(loop_4x_vec_forward)
L(loop_large_forward):
/* Copy 4 * VEC a time forward with non-temporal stores. */
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
addq $PREFETCHED_LOAD_SIZE, %rsi
subq $PREFETCHED_LOAD_SIZE, %rdx
VMOVNT %VEC(0), (%rdi)
VMOVNT %VEC(1), VEC_SIZE(%rdi)
VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
addq $PREFETCHED_LOAD_SIZE, %rdi
cmpq $PREFETCHED_LOAD_SIZE, %rdx
ja L(loop_large_forward)
sfence
/* Store the last 4 * VEC. */
VMOVU %VEC(5), (%rcx)
VMOVU %VEC(6), -VEC_SIZE(%rcx)
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
VZEROUPPER
ret
L(large_backward):
/* Don't use non-temporal store if there is overlap between
destination and source since destination may be in cache
when source is loaded. */
leaq (%rcx, %rdx), %r10
cmpq %r10, %r9
jb L(loop_4x_vec_backward)
L(loop_large_backward):
/* Copy 4 * VEC a time backward with non-temporal stores. */
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
subq $PREFETCHED_LOAD_SIZE, %rcx
subq $PREFETCHED_LOAD_SIZE, %rdx
VMOVNT %VEC(0), (%r9)
VMOVNT %VEC(1), -VEC_SIZE(%r9)
VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
subq $PREFETCHED_LOAD_SIZE, %r9
cmpq $PREFETCHED_LOAD_SIZE, %rdx
ja L(loop_large_backward)
sfence
/* Store the first 4 * VEC. */
VMOVU %VEC(4), (%rdi)
VMOVU %VEC(5), VEC_SIZE(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
VZEROUPPER
ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
#if 1
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
# ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
# endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
MEMCPY_SYMBOL (__memcpy, unaligned))
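Read end to end, the unaligned_erms entry point is a size-based dispatcher. A compact restatement of its decision tree in C++ (simplified; the thresholds are the stub values this commit defines in glibc/sysdep_generic.h, where real glibc detects them at runtime):

    #include <cstddef>
    #include <cstdint>

    const char * pick_path(const uint8_t * dst, const uint8_t * src, size_t size)
    {
        const size_t vec = 32;                          /// VEC_SIZE of the AVX build
        const size_t rep_movsb_threshold = 1024;        /// REP_MOSB_THRESHOLD
        const size_t nt_threshold = 4UL * 1024 * 1024;  /// SHARED_NON_TEMPORAL_THRESHOLD

        if (size < vec)
            return "less_vec: overlapping 1/2/4/8/16-byte moves";
        if (size <= 2 * vec)
            return "last_2x_vec: two overlapping VMOVU";
        if (size <= rep_movsb_threshold)
            return "more_2x_vec: up to 8 vectors, all loads before stores";
        if (size < nt_threshold)
        {
            bool dst_overlaps_ahead = dst > src && dst < src + size;
            return dst_overlaps_ahead
                ? "more_8x_vec_backward: avoid slow backward rep movsb"
                : "rep movsb";
        }
        return "large_forward/backward: non-temporal 4 * VEC loop + sfence";
    }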

glibc/memmove.S

@@ -0,0 +1,71 @@
/* Optimized memmove for x86-64.
Copyright (C) 2016-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "sysdep.h"
#define VEC_SIZE 16
#define VEC(i) xmm##i
#define PREFETCHNT prefetchnta
#define VMOVNT movntdq
/* Use movups and movaps for smaller code sizes. */
#define VMOVU movups
#define VMOVA movaps
#define SECTION(p) p
#ifdef USE_MULTIARCH
# if 0
# define MEMCPY_SYMBOL(p,s) memcpy
# endif
#else
# if defined SHARED
# define MEMCPY_SYMBOL(p,s) __memcpy
# else
# define MEMCPY_SYMBOL(p,s) memcpy
# endif
#endif
#if !defined USE_MULTIARCH
# define MEMPCPY_SYMBOL(p,s) __mempcpy
#endif
#ifndef MEMMOVE_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) p
# define MEMMOVE_SYMBOL(p,s) memmove
#endif
#include "memmove-vec-unaligned-erms.S"
#ifndef USE_MULTIARCH
libc_hidden_builtin_def (memmove)
# if defined SHARED && IS_IN (libc)
strong_alias (memmove, __memcpy)
libc_hidden_ver (memmove, memcpy)
# endif
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)
# if defined SHARED && IS_IN (libc)
# undef memcpy
# include <shlib-compat.h>
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
# endif
# endif
#endif

glibc/sysdep.h

@@ -0,0 +1,129 @@
/* Assembler macros for x86-64.
Copyright (C) 2001-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _X86_64_SYSDEP_H
#define _X86_64_SYSDEP_H 1
#include "sysdep_x86.h"
#ifdef __ASSEMBLER__
/* Syntactic details of assembler. */
/* This macro is for setting proper CFI with DW_CFA_expression describing
the register as saved relative to %rsp instead of relative to the CFA.
Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset
from %rsp. */
#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \
0x77, off & 0x7F | 0x80, off >> 7
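The escape bytes are: DW_CFA_expression (0x10), the register number, a block length of 4, DW_OP_drop (0x13), DW_OP_breg7 (0x77), and a two-byte LEB128 of the offset, split exactly as the macro writes it. A quick C++ check of that split for a sample offset (any small positive offset works the same way):

    #include <cstdio>

    int main()
    {
        unsigned off = 200;                        /// example offset from %rsp
        unsigned byte0 = (off & 0x7F) | 0x80;      /// low 7 bits plus continuation bit
        unsigned byte1 = off >> 7;                 /// remaining bits
        unsigned decoded = (byte0 & 0x7F) | (byte1 << 7);
        printf("%#x %#x -> %u\n", byte0, byte1, decoded); /// 0xc8 0x1 -> 200
    }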
/* If compiled for profiling, call `mcount' at the start of each function. */
#ifdef PROF
/* The mcount code relies on a normal frame pointer being on the stack
to locate our caller, so push one just for its benefit. */
#define CALL_MCOUNT \
pushq %rbp; \
cfi_adjust_cfa_offset(8); \
movq %rsp, %rbp; \
cfi_def_cfa_register(%rbp); \
call JUMPTARGET(mcount); \
popq %rbp; \
cfi_def_cfa(rsp,8);
#else
#define CALL_MCOUNT /* Do nothing. */
#endif
#define PSEUDO(name, syscall_name, args) \
lose: \
jmp JUMPTARGET(syscall_error) \
.globl syscall_error; \
ENTRY (name) \
DO_CALL (syscall_name, args); \
jb lose
#undef JUMPTARGET
#ifdef SHARED
# ifdef BIND_NOW
# define JUMPTARGET(name) *name##@GOTPCREL(%rip)
# else
# define JUMPTARGET(name) name##@PLT
# endif
#else
/* For static archives, branch to target directly. */
# define JUMPTARGET(name) name
#endif
/* Long and pointer size in bytes. */
#define LP_SIZE 8
/* Instruction to operate on long and pointer. */
#define LP_OP(insn) insn##q
/* Assembler address directive. */
#define ASM_ADDR .quad
/* Registers to hold long and pointer. */
#define RAX_LP rax
#define RBP_LP rbp
#define RBX_LP rbx
#define RCX_LP rcx
#define RDI_LP rdi
#define RDX_LP rdx
#define RSI_LP rsi
#define RSP_LP rsp
#define R8_LP r8
#define R9_LP r9
#define R10_LP r10
#define R11_LP r11
#define R12_LP r12
#define R13_LP r13
#define R14_LP r14
#define R15_LP r15
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */
#define LP_SIZE "8"
/* Instruction to operate on long and pointer. */
#define LP_OP(insn) #insn "q"
/* Assembler address directive. */
#define ASM_ADDR ".quad"
/* Registers to hold long and pointer. */
#define RAX_LP "rax"
#define RBP_LP "rbp"
#define RBX_LP "rbx"
#define RCX_LP "rcx"
#define RDI_LP "rdi"
#define RDX_LP "rdx"
#define RSI_LP "rsi"
#define RSP_LP "rsp"
#define R8_LP "r8"
#define R9_LP "r9"
#define R10_LP "r10"
#define R11_LP "r11"
#define R12_LP "r12"
#define R13_LP "r13"
#define R14_LP "r14"
#define R15_LP "r15"
#endif /* __ASSEMBLER__ */
#endif /* _X86_64_SYSDEP_H */

glibc/sysdep_generic.h

@@ -0,0 +1,113 @@
/* Generic asm macros used on many machines.
Copyright (C) 1991-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define C_SYMBOL_NAME(name) name
#define HIDDEN_JUMPTARGET(name) 0x0
#define SHARED_CACHE_SIZE_HALF (1024*1024)
#define DATA_CACHE_SIZE_HALF (1024*32/2)
#define DATA_CACHE_SIZE (1024*32)
#define SHARED_NON_TEMPORAL_THRESHOLD (1024*1024*4)
#define REP_MOSB_THRESHOLD 1024
#define USE_MULTIARCH
#define ASM_LINE_SEP ;
#define strong_alias(original, alias) \
.globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \
C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original)
#ifndef C_LABEL
/* Define a macro we can use to construct the asm name for a C symbol. */
# define C_LABEL(name) name##:
#endif
#ifdef __ASSEMBLER__
/* Mark the end of function named SYM. This is used on some platforms
to generate correct debugging information. */
# ifndef END
# define END(sym)
# endif
# ifndef JUMPTARGET
# define JUMPTARGET(sym) sym
# endif
#endif
/* Macros to generate eh_frame unwind information. */
#ifdef __ASSEMBLER__
# define cfi_startproc .cfi_startproc
# define cfi_endproc .cfi_endproc
# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off
# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg
# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
# define cfi_offset(reg, off) .cfi_offset reg, off
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
# define cfi_register(r1, r2) .cfi_register r1, r2
# define cfi_return_column(reg) .cfi_return_column reg
# define cfi_restore(reg) .cfi_restore reg
# define cfi_same_value(reg) .cfi_same_value reg
# define cfi_undefined(reg) .cfi_undefined reg
# define cfi_remember_state .cfi_remember_state
# define cfi_restore_state .cfi_restore_state
# define cfi_window_save .cfi_window_save
# define cfi_personality(enc, exp) .cfi_personality enc, exp
# define cfi_lsda(enc, exp) .cfi_lsda enc, exp
#else /* ! ASSEMBLER */
# define CFI_STRINGIFY(Name) CFI_STRINGIFY2 (Name)
# define CFI_STRINGIFY2(Name) #Name
# define CFI_STARTPROC ".cfi_startproc"
# define CFI_ENDPROC ".cfi_endproc"
# define CFI_DEF_CFA(reg, off) \
".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_DEF_CFA_REGISTER(reg) \
".cfi_def_cfa_register " CFI_STRINGIFY(reg)
# define CFI_DEF_CFA_OFFSET(off) \
".cfi_def_cfa_offset " CFI_STRINGIFY(off)
# define CFI_ADJUST_CFA_OFFSET(off) \
".cfi_adjust_cfa_offset " CFI_STRINGIFY(off)
# define CFI_OFFSET(reg, off) \
".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_REL_OFFSET(reg, off) \
".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_REGISTER(r1, r2) \
".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2)
# define CFI_RETURN_COLUMN(reg) \
".cfi_return_column " CFI_STRINGIFY(reg)
# define CFI_RESTORE(reg) \
".cfi_restore " CFI_STRINGIFY(reg)
# define CFI_UNDEFINED(reg) \
".cfi_undefined " CFI_STRINGIFY(reg)
# define CFI_REMEMBER_STATE \
".cfi_remember_state"
# define CFI_RESTORE_STATE \
".cfi_restore_state"
# define CFI_WINDOW_SAVE \
".cfi_window_save"
# define CFI_PERSONALITY(enc, exp) \
".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp)
# define CFI_LSDA(enc, exp) \
".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp)
#endif
#include "dwarf2.h"

glibc/sysdep_x86.h

@@ -0,0 +1,113 @@
/* Assembler macros for x86.
Copyright (C) 2017-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _X86_SYSDEP_H
#define _X86_SYSDEP_H 1
#include "sysdep_generic.h"
/* __CET__ is defined by GCC with Control-Flow Protection values:
enum cf_protection_level
{
CF_NONE = 0,
CF_BRANCH = 1 << 0,
CF_RETURN = 1 << 1,
CF_FULL = CF_BRANCH | CF_RETURN,
CF_SET = 1 << 2
};
*/
/* Set if CF_BRANCH (IBT) is enabled. */
#define X86_FEATURE_1_IBT (1U << 0)
/* Set if CF_RETURN (SHSTK) is enabled. */
#define X86_FEATURE_1_SHSTK (1U << 1)
#ifdef __CET__
# define CET_ENABLED 1
# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT)
# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK)
#else
# define CET_ENABLED 0
# define IBT_ENABLED 0
# define SHSTK_ENABLED 0
#endif
/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need
space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be
aligned to 16 bytes for fxsave and 64 bytes for xsave. */
#define STATE_SAVE_OFFSET (8 * 7 + 8)
/* Save SSE, AVX, AVX512, mask and bound registers. */
#define STATE_SAVE_MASK \
((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
#ifdef __ASSEMBLER__
/* Syntactic details of assembler. */
#ifdef _CET_ENDBR
# define _CET_NOTRACK notrack
#else
# define _CET_ENDBR
# define _CET_NOTRACK
#endif
/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */
#define ALIGNARG(log2) 1<<log2
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
/* Define an entry point visible from C. */
#define ENTRY(name) \
.globl C_SYMBOL_NAME(name); \
.type C_SYMBOL_NAME(name),@function; \
.align ALIGNARG(4); \
C_LABEL(name) \
cfi_startproc; \
_CET_ENDBR; \
CALL_MCOUNT
#undef END
#define END(name) \
cfi_endproc; \
ASM_SIZE_DIRECTIVE(name)
#define ENTRY_CHK(name) ENTRY (name)
#define END_CHK(name) END (name)
/* Since C identifiers are not normally prefixed with an underscore
on this system, the asm identifier `syscall_error' intrudes on the
C name space. Make sure we use an innocuous name. */
#define syscall_error __syscall_error
#define mcount _mcount
#undef PSEUDO_END
#define PSEUDO_END(name) \
END (name)
/* Local label name for asm code. */
#ifndef L
/* ELF-like local names start with `.L'. */
# define L(name) .L##name
#endif
#define atom_text_section .section ".text.atom", "ax"
#endif /* __ASSEMBLER__ */
#endif /* _X86_SYSDEP_H */

memcpy-bench.cpp

@@ -1,5 +1,6 @@
#include <memory>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <random>
#include <iostream>
@@ -14,15 +15,11 @@
#include <Common/Stopwatch.h>
#pragma GCC diagnostic ignored "-Wold-style-cast"
#pragma GCC diagnostic ignored "-Wcast-align"
#pragma GCC diagnostic ignored "-Wcast-qual"
#include "FastMemcpy.h"
//#include "FastMemcpy_Avx.h"
#include <emmintrin.h>
#include <immintrin.h>
#include <boost/program_options.hpp>
template <typename F, typename MemcpyImpl>
void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl)
@@ -47,7 +44,7 @@ size_t generatorUniform(RNG & rng) { return rng() % N; };
template <typename F, typename MemcpyImpl>
void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl)
uint64_t test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl, const char * name)
{
Stopwatch watch;
@@ -76,15 +73,22 @@ void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t n
for (auto & thread : threads)
thread.join();
double elapsed_ns = watch.elapsed();
uint64_t elapsed_ns = watch.elapsed();
/// Validation
size_t sum = 0;
size_t reference = 0;
for (size_t i = 0; i < size; ++i)
{
sum += dst[i];
reference += uint8_t(i);
}
std::cerr << std::fixed << std::setprecision(3)
<< "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n";
if (sum != reference)
throw std::logic_error("Incorrect result");
std::cout << name;
return elapsed_ns;
}
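The new validation compares a byte-sum of the destination with the sum of uint8_t(i), which only balances if the source buffer was filled with the byte pattern i beforehand; the filling code is outside the hunks shown here, but it has to look essentially like this (illustrative sketch):

    #include <cstdint>
    #include <cstddef>

    void fill_source(uint8_t * src, size_t size)
    {
        for (size_t i = 0; i < size; ++i)
            src[i] = uint8_t(i);   /// makes the reference sum in test() match
    }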
@@ -101,9 +105,30 @@ static void * memcpy_erms(void * dst, const void * src, size_t size)
return dst;
}
static void * memcpy_trivial(void * __restrict dst_, const void * __restrict src_, size_t size)
{
char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);
void * ret = dst;
while (size > 0)
{
*dst = *src;
++dst;
++src;
--size;
}
return ret;
}
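/// Note: thanks to __restrict the compiler is free to vectorize this loop;
/// the -fno-tree-loop-distribute-patterns build flag presumably prevents it
/// from being folded back into a call to libc memcpy.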
extern "C" void * memcpy_jart(void * dst, const void * src, size_t size);
extern "C" void MemCpy(void * dst, const void * src, size_t size);
void * memcpy_fast_sse(void * dst, const void * src, size_t size);
void * memcpy_fast_avx(void * dst, const void * src, size_t size);
void * memcpy_tiny(void * dst, const void * src, size_t size);
static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size)
{
@@ -329,7 +354,7 @@ void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t *
if (padding > 0)
{
__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
_mm256_storeu_si256((__m256i*)dst, head);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
@@ -539,70 +564,125 @@ tail:
return ret;
}
extern "C" void * __memcpy_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_sse2_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_ssse3(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_ssse3_back(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_no_vzeroupper(void * __restrict destination, const void * __restrict source, size_t size);
#define VARIANT(N, NAME) \
if (memcpy_variant == N) \
return test(dst, src, size, iterations, num_threads, std::forward<F>(generator), NAME, #NAME);
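/// For example, VARIANT(1, memcpy) expands to:
///
///     if (memcpy_variant == 1)
///         return test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy, "memcpy");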
template <typename F>
void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
uint64_t dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
{
memcpy_type memcpy_libc = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));
memcpy_type memcpy_libc_old = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));
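/// RTLD_NEXT resolves the next "memcpy" in library search order, i.e. the
/// libc implementation rather than any memcpy defined in this binary itself.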
if (memcpy_variant == 1)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy);
if (memcpy_variant == 2)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_libc);
if (memcpy_variant == 3)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_erms);
if (memcpy_variant == 4)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), MemCpy);
if (memcpy_variant == 5)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2);
if (memcpy_variant == 6)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled2);
if (memcpy_variant == 7)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled4);
if (memcpy_variant == 8)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled8);
// if (memcpy_variant == 9)
// test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_fast_avx);
if (memcpy_variant == 10)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_my);
VARIANT(1, memcpy)
VARIANT(2, memcpy_trivial)
VARIANT(3, memcpy_libc_old)
VARIANT(4, memcpy_erms)
VARIANT(5, MemCpy)
VARIANT(6, memcpySSE2)
VARIANT(7, memcpySSE2Unrolled2)
VARIANT(8, memcpySSE2Unrolled4)
VARIANT(9, memcpySSE2Unrolled8)
VARIANT(10, memcpy_fast_sse)
VARIANT(11, memcpy_fast_avx)
VARIANT(12, memcpy_my)
VARIANT(21, __memcpy_erms)
VARIANT(22, __memcpy_sse2_unaligned)
VARIANT(23, __memcpy_ssse3)
VARIANT(24, __memcpy_ssse3_back)
VARIANT(25, __memcpy_avx_unaligned)
VARIANT(26, __memcpy_avx_unaligned_erms)
VARIANT(27, __memcpy_avx512_unaligned)
VARIANT(28, __memcpy_avx512_unaligned_erms)
VARIANT(29, __memcpy_avx512_no_vzeroupper)
return 0;
}
void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
uint64_t dispatchVariants(
size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
{
if (generator_variant == 1)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
if (generator_variant == 2)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
if (generator_variant == 3)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
if (generator_variant == 4)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
if (generator_variant == 5)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);
return 0;
}
int main(int argc, char ** argv)
{
size_t size = 1000000000;
if (argc >= 2)
size = std::stoull(argv[1]);
size_t iterations = 10;
if (argc >= 3)
iterations = std::stoull(argv[2]);
size_t num_threads = 1;
if (argc >= 4)
num_threads = std::stoull(argv[3]);
size_t memcpy_variant = 1;
if (argc >= 5)
memcpy_variant = std::stoull(argv[4]);
size_t generator_variant = 1;
if (argc >= 6)
generator_variant = std::stoull(argv[5]);
boost::program_options::options_description desc("Allowed options");
desc.add_options()("help,h", "produce help message")
("size", boost::program_options::value<size_t>()->default_value(1000000), "Bytes to copy on every iteration")
("iterations", boost::program_options::value<size_t>(), "Number of iterations")
("threads", boost::program_options::value<size_t>()->default_value(1), "Number of copying threads")
("distribution", boost::program_options::value<size_t>()->default_value(4), "Distribution of chunk sizes to perform copy")
("variant", boost::program_options::value<size_t>(), "Variant of memcpy implementation")
("tsv", "Print result in tab-separated format")
;
boost::program_options::variables_map options;
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options);
if (options.count("help") || !options.count("variant"))
{
std::cout << R"(Usage:
for size in 4096 16384 50000 65536 100000 1000000 10000000 100000000; do
    for threads in 1 2 4 $(($(nproc) / 2)) $(nproc); do
        for distribution in 1 2 3 4 5; do
            for variant in {1..12} {21..29}; do
                for i in {1..10}; do
                    ./memcpy-bench --tsv --size $size --variant $variant --threads $threads --distribution $distribution;
                done;
            done;
        done;
    done;
done | tee result.tsv
)" << std::endl;
std::cout << desc << std::endl;
return 1;
}
size_t size = options["size"].as<size_t>();
size_t num_threads = options["threads"].as<size_t>();
size_t memcpy_variant = options["variant"].as<size_t>();
size_t generator_variant = options["distribution"].as<size_t>();
size_t iterations;
if (options.count("iterations"))
{
iterations = options["iterations"].as<size_t>();
}
else
{
iterations = 10000000000ULL * num_threads / size;
if (generator_variant == 1)
iterations /= 100;
if (generator_variant == 2)
iterations /= 10;
}
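/// The default presumably targets roughly 10 GB of copied data per thread;
/// distributions 1 and 2 produce tiny chunks dominated by per-call overhead,
/// so their iteration counts are scaled down by 100x and 10x respectively.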
std::unique_ptr<uint8_t[]> src(new uint8_t[size]);
std::unique_ptr<uint8_t[]> dst(new uint8_t[size]);
@@ -614,7 +694,25 @@ int main(int argc, char ** argv)
/// Fill dst to avoid page faults.
memset(dst.get(), 0, size);
dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);
uint64_t elapsed_ns = dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);
std::cout << std::fixed << std::setprecision(3);
if (options.count("tsv"))
{
std::cout
<< '\t' << size
<< '\t' << iterations
<< '\t' << num_threads
<< '\t' << generator_variant
<< '\t' << memcpy_variant
<< '\t' << elapsed_ns
<< '\n';
}
else
{
std::cout << ": processed in " << (elapsed_ns / 1e9) << " sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec\n";
}
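/// Bytes divided by nanoseconds equals GB/sec with no extra factor
/// (10^9 bytes per 10^9 ns), hence the bare ratio in the formula above.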
return 0;
}