From 1bc21789d239cd3f90703711629b6d15905d86c1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 19:52:51 +0300 Subject: [PATCH 1/6] Add more variants --- utils/memcpy-bench/CMakeLists.txt | 23 +- utils/memcpy-bench/FastMemcpy.cpp | 1 + utils/memcpy-bench/FastMemcpy.h | 2 +- utils/memcpy-bench/FastMemcpy_Avx.cpp | 1 + utils/memcpy-bench/glibc/asm-syntax.h | 24 + utils/memcpy-bench/glibc/dwarf2.h | 590 +++ utils/memcpy-bench/glibc/memcpy-ssse3-back.S | 3182 +++++++++++++++++ utils/memcpy-bench/glibc/memcpy-ssse3.S | 3152 ++++++++++++++++ .../glibc/memmove-avx-unaligned-erms.S | 12 + .../glibc/memmove-avx512-no-vzeroupper.S | 419 +++ .../glibc/memmove-avx512-unaligned-erms.S | 12 + .../glibc/memmove-sse2-unaligned-erms.S | 33 + .../glibc/memmove-vec-unaligned-erms.S | 559 +++ utils/memcpy-bench/glibc/memmove.S | 71 + utils/memcpy-bench/glibc/sysdep.h | 129 + utils/memcpy-bench/glibc/sysdep_generic.h | 113 + utils/memcpy-bench/glibc/sysdep_x86.h | 113 + utils/memcpy-bench/memcpy-bench.cpp | 208 +- 18 files changed, 8585 insertions(+), 59 deletions(-) create mode 100644 utils/memcpy-bench/FastMemcpy.cpp create mode 100644 utils/memcpy-bench/FastMemcpy_Avx.cpp create mode 100644 utils/memcpy-bench/glibc/asm-syntax.h create mode 100644 utils/memcpy-bench/glibc/dwarf2.h create mode 100644 utils/memcpy-bench/glibc/memcpy-ssse3-back.S create mode 100644 utils/memcpy-bench/glibc/memcpy-ssse3.S create mode 100644 utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S create mode 100644 utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S create mode 100644 utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S create mode 100644 utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S create mode 100644 utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S create mode 100644 utils/memcpy-bench/glibc/memmove.S create mode 100644 utils/memcpy-bench/glibc/sysdep.h create mode 100644 utils/memcpy-bench/glibc/sysdep_generic.h create mode 100644 utils/memcpy-bench/glibc/sysdep_x86.h diff --git a/utils/memcpy-bench/CMakeLists.txt b/utils/memcpy-bench/CMakeLists.txt index 54dd0398912..5fcde231688 100644 --- a/utils/memcpy-bench/CMakeLists.txt +++ b/utils/memcpy-bench/CMakeLists.txt @@ -1,5 +1,22 @@ enable_language(ASM) -add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S) -#target_compile_options(memcpy-bench PRIVATE -mavx) -target_link_libraries(memcpy-bench PRIVATE dbms) + +add_executable (memcpy-bench + memcpy-bench.cpp + FastMemcpy.cpp + FastMemcpy_Avx.cpp + memcpy_jart.S + glibc/memcpy-ssse3.S + glibc/memcpy-ssse3-back.S + glibc/memmove-sse2-unaligned-erms.S + glibc/memmove-avx-unaligned-erms.S + glibc/memmove-avx512-unaligned-erms.S + glibc/memmove-avx512-no-vzeroupper.S + ) + +add_compile_options(memcpy-bench PRIVATE -fno-tree-loop-distribute-patterns) + +set_source_files_properties(FastMemcpy.cpp PROPERTIES COMPILE_FLAGS "-Wno-old-style-cast") +set_source_files_properties(FastMemcpy_Avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -Wno-old-style-cast -Wno-cast-qual -Wno-cast-align") + +target_link_libraries(memcpy-bench PRIVATE dbms boost::program_options) diff --git a/utils/memcpy-bench/FastMemcpy.cpp b/utils/memcpy-bench/FastMemcpy.cpp new file mode 100644 index 00000000000..9a50caba2b1 --- /dev/null +++ b/utils/memcpy-bench/FastMemcpy.cpp @@ -0,0 +1 @@ +#include "FastMemcpy.h" diff --git a/utils/memcpy-bench/FastMemcpy.h b/utils/memcpy-bench/FastMemcpy.h index 9c37524443a..85d09c5f53e 100644 --- a/utils/memcpy-bench/FastMemcpy.h +++ b/utils/memcpy-bench/FastMemcpy.h @@ -93,7 
+93,7 @@ static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restric
 /// Attribute is used to avoid an error with undefined behaviour sanitizer
 /// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
 /// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
-__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
+__attribute__((__no_sanitize__("undefined"))) inline void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
 {
     unsigned char *dd = ((unsigned char*)dst) + size;
     const unsigned char *ss = ((const unsigned char*)src) + size;
diff --git a/utils/memcpy-bench/FastMemcpy_Avx.cpp b/utils/memcpy-bench/FastMemcpy_Avx.cpp
new file mode 100644
index 00000000000..8cef0f89507
--- /dev/null
+++ b/utils/memcpy-bench/FastMemcpy_Avx.cpp
@@ -0,0 +1 @@
+#include "FastMemcpy_Avx.h"
diff --git a/utils/memcpy-bench/glibc/asm-syntax.h b/utils/memcpy-bench/glibc/asm-syntax.h
new file mode 100644
index 00000000000..6e299c1fec2
--- /dev/null
+++ b/utils/memcpy-bench/glibc/asm-syntax.h
@@ -0,0 +1,24 @@
+/* Definitions for x86 syntax variations.
+   Copyright (C) 1992-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library. Its master source is NOT part of
+   the C library, however. The master source lives in the GNU MP Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#undef ALIGN
+#define ALIGN(log) .align 1<<log
+
+#undef L
+#define L(body) .L##body
diff --git a/utils/memcpy-bench/glibc/dwarf2.h b/utils/memcpy-bench/glibc/dwarf2.h
new file mode 100644
--- /dev/null
+++ b/utils/memcpy-bench/glibc/dwarf2.h
@@ -0,0 +1,590 @@
+/* Declarations and definitions of codes relating to the DWARF2 symbolic
+   debugging information format.
+   Copyright (C) 1992-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#ifndef _DWARF2_H
+#define _DWARF2_H 1
+
+/* This file is derived from the DWARF specification (a public document)
+   Revision 2.0.0 (July 27, 1993) developed by the UNIX International
+   Programming Languages Special Interest Group (UI/PLSIG) and distributed
+   by UNIX International. Copies of this specification are available from
+   UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */
+
+/* This file is shared between GCC and GDB, and should not contain
+   prototypes. */
+
+#ifndef __ASSEMBLER__
+/* Tag names and codes.
*/ + +enum dwarf_tag + { + DW_TAG_padding = 0x00, + DW_TAG_array_type = 0x01, + DW_TAG_class_type = 0x02, + DW_TAG_entry_point = 0x03, + DW_TAG_enumeration_type = 0x04, + DW_TAG_formal_parameter = 0x05, + DW_TAG_imported_declaration = 0x08, + DW_TAG_label = 0x0a, + DW_TAG_lexical_block = 0x0b, + DW_TAG_member = 0x0d, + DW_TAG_pointer_type = 0x0f, + DW_TAG_reference_type = 0x10, + DW_TAG_compile_unit = 0x11, + DW_TAG_string_type = 0x12, + DW_TAG_structure_type = 0x13, + DW_TAG_subroutine_type = 0x15, + DW_TAG_typedef = 0x16, + DW_TAG_union_type = 0x17, + DW_TAG_unspecified_parameters = 0x18, + DW_TAG_variant = 0x19, + DW_TAG_common_block = 0x1a, + DW_TAG_common_inclusion = 0x1b, + DW_TAG_inheritance = 0x1c, + DW_TAG_inlined_subroutine = 0x1d, + DW_TAG_module = 0x1e, + DW_TAG_ptr_to_member_type = 0x1f, + DW_TAG_set_type = 0x20, + DW_TAG_subrange_type = 0x21, + DW_TAG_with_stmt = 0x22, + DW_TAG_access_declaration = 0x23, + DW_TAG_base_type = 0x24, + DW_TAG_catch_block = 0x25, + DW_TAG_const_type = 0x26, + DW_TAG_constant = 0x27, + DW_TAG_enumerator = 0x28, + DW_TAG_file_type = 0x29, + DW_TAG_friend = 0x2a, + DW_TAG_namelist = 0x2b, + DW_TAG_namelist_item = 0x2c, + DW_TAG_packed_type = 0x2d, + DW_TAG_subprogram = 0x2e, + DW_TAG_template_type_param = 0x2f, + DW_TAG_template_value_param = 0x30, + DW_TAG_thrown_type = 0x31, + DW_TAG_try_block = 0x32, + DW_TAG_variant_part = 0x33, + DW_TAG_variable = 0x34, + DW_TAG_volatile_type = 0x35, + /* SGI/MIPS Extensions */ + DW_TAG_MIPS_loop = 0x4081, + /* GNU extensions */ + DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */ + DW_TAG_function_template = 0x4102, /* for C++ */ + DW_TAG_class_template = 0x4103, /* for C++ */ + DW_TAG_GNU_BINCL = 0x4104, + DW_TAG_GNU_EINCL = 0x4105 + }; + +#define DW_TAG_lo_user 0x4080 +#define DW_TAG_hi_user 0xffff + +/* flag that tells whether entry has a child or not */ +#define DW_children_no 0 +#define DW_children_yes 1 + +/* Form names and codes. */ +enum dwarf_form + { + DW_FORM_addr = 0x01, + DW_FORM_block2 = 0x03, + DW_FORM_block4 = 0x04, + DW_FORM_data2 = 0x05, + DW_FORM_data4 = 0x06, + DW_FORM_data8 = 0x07, + DW_FORM_string = 0x08, + DW_FORM_block = 0x09, + DW_FORM_block1 = 0x0a, + DW_FORM_data1 = 0x0b, + DW_FORM_flag = 0x0c, + DW_FORM_sdata = 0x0d, + DW_FORM_strp = 0x0e, + DW_FORM_udata = 0x0f, + DW_FORM_ref_addr = 0x10, + DW_FORM_ref1 = 0x11, + DW_FORM_ref2 = 0x12, + DW_FORM_ref4 = 0x13, + DW_FORM_ref8 = 0x14, + DW_FORM_ref_udata = 0x15, + DW_FORM_indirect = 0x16 + }; + +/* Attribute names and codes. 
*/ + +enum dwarf_attribute + { + DW_AT_sibling = 0x01, + DW_AT_location = 0x02, + DW_AT_name = 0x03, + DW_AT_ordering = 0x09, + DW_AT_subscr_data = 0x0a, + DW_AT_byte_size = 0x0b, + DW_AT_bit_offset = 0x0c, + DW_AT_bit_size = 0x0d, + DW_AT_element_list = 0x0f, + DW_AT_stmt_list = 0x10, + DW_AT_low_pc = 0x11, + DW_AT_high_pc = 0x12, + DW_AT_language = 0x13, + DW_AT_member = 0x14, + DW_AT_discr = 0x15, + DW_AT_discr_value = 0x16, + DW_AT_visibility = 0x17, + DW_AT_import = 0x18, + DW_AT_string_length = 0x19, + DW_AT_common_reference = 0x1a, + DW_AT_comp_dir = 0x1b, + DW_AT_const_value = 0x1c, + DW_AT_containing_type = 0x1d, + DW_AT_default_value = 0x1e, + DW_AT_inline = 0x20, + DW_AT_is_optional = 0x21, + DW_AT_lower_bound = 0x22, + DW_AT_producer = 0x25, + DW_AT_prototyped = 0x27, + DW_AT_return_addr = 0x2a, + DW_AT_start_scope = 0x2c, + DW_AT_stride_size = 0x2e, + DW_AT_upper_bound = 0x2f, + DW_AT_abstract_origin = 0x31, + DW_AT_accessibility = 0x32, + DW_AT_address_class = 0x33, + DW_AT_artificial = 0x34, + DW_AT_base_types = 0x35, + DW_AT_calling_convention = 0x36, + DW_AT_count = 0x37, + DW_AT_data_member_location = 0x38, + DW_AT_decl_column = 0x39, + DW_AT_decl_file = 0x3a, + DW_AT_decl_line = 0x3b, + DW_AT_declaration = 0x3c, + DW_AT_discr_list = 0x3d, + DW_AT_encoding = 0x3e, + DW_AT_external = 0x3f, + DW_AT_frame_base = 0x40, + DW_AT_friend = 0x41, + DW_AT_identifier_case = 0x42, + DW_AT_macro_info = 0x43, + DW_AT_namelist_items = 0x44, + DW_AT_priority = 0x45, + DW_AT_segment = 0x46, + DW_AT_specification = 0x47, + DW_AT_static_link = 0x48, + DW_AT_type = 0x49, + DW_AT_use_location = 0x4a, + DW_AT_variable_parameter = 0x4b, + DW_AT_virtuality = 0x4c, + DW_AT_vtable_elem_location = 0x4d, + /* SGI/MIPS Extensions */ + DW_AT_MIPS_fde = 0x2001, + DW_AT_MIPS_loop_begin = 0x2002, + DW_AT_MIPS_tail_loop_begin = 0x2003, + DW_AT_MIPS_epilog_begin = 0x2004, + DW_AT_MIPS_loop_unroll_factor = 0x2005, + DW_AT_MIPS_software_pipeline_depth = 0x2006, + DW_AT_MIPS_linkage_name = 0x2007, + DW_AT_MIPS_stride = 0x2008, + DW_AT_MIPS_abstract_name = 0x2009, + DW_AT_MIPS_clone_origin = 0x200a, + DW_AT_MIPS_has_inlines = 0x200b, + /* GNU extensions. */ + DW_AT_sf_names = 0x2101, + DW_AT_src_info = 0x2102, + DW_AT_mac_info = 0x2103, + DW_AT_src_coords = 0x2104, + DW_AT_body_begin = 0x2105, + DW_AT_body_end = 0x2106 + }; + +#define DW_AT_lo_user 0x2000 /* implementation-defined range start */ +#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */ + +/* Location atom names and codes. 
*/ + +enum dwarf_location_atom + { + DW_OP_addr = 0x03, + DW_OP_deref = 0x06, + DW_OP_const1u = 0x08, + DW_OP_const1s = 0x09, + DW_OP_const2u = 0x0a, + DW_OP_const2s = 0x0b, + DW_OP_const4u = 0x0c, + DW_OP_const4s = 0x0d, + DW_OP_const8u = 0x0e, + DW_OP_const8s = 0x0f, + DW_OP_constu = 0x10, + DW_OP_consts = 0x11, + DW_OP_dup = 0x12, + DW_OP_drop = 0x13, + DW_OP_over = 0x14, + DW_OP_pick = 0x15, + DW_OP_swap = 0x16, + DW_OP_rot = 0x17, + DW_OP_xderef = 0x18, + DW_OP_abs = 0x19, + DW_OP_and = 0x1a, + DW_OP_div = 0x1b, + DW_OP_minus = 0x1c, + DW_OP_mod = 0x1d, + DW_OP_mul = 0x1e, + DW_OP_neg = 0x1f, + DW_OP_not = 0x20, + DW_OP_or = 0x21, + DW_OP_plus = 0x22, + DW_OP_plus_uconst = 0x23, + DW_OP_shl = 0x24, + DW_OP_shr = 0x25, + DW_OP_shra = 0x26, + DW_OP_xor = 0x27, + DW_OP_bra = 0x28, + DW_OP_eq = 0x29, + DW_OP_ge = 0x2a, + DW_OP_gt = 0x2b, + DW_OP_le = 0x2c, + DW_OP_lt = 0x2d, + DW_OP_ne = 0x2e, + DW_OP_skip = 0x2f, + DW_OP_lit0 = 0x30, + DW_OP_lit1 = 0x31, + DW_OP_lit2 = 0x32, + DW_OP_lit3 = 0x33, + DW_OP_lit4 = 0x34, + DW_OP_lit5 = 0x35, + DW_OP_lit6 = 0x36, + DW_OP_lit7 = 0x37, + DW_OP_lit8 = 0x38, + DW_OP_lit9 = 0x39, + DW_OP_lit10 = 0x3a, + DW_OP_lit11 = 0x3b, + DW_OP_lit12 = 0x3c, + DW_OP_lit13 = 0x3d, + DW_OP_lit14 = 0x3e, + DW_OP_lit15 = 0x3f, + DW_OP_lit16 = 0x40, + DW_OP_lit17 = 0x41, + DW_OP_lit18 = 0x42, + DW_OP_lit19 = 0x43, + DW_OP_lit20 = 0x44, + DW_OP_lit21 = 0x45, + DW_OP_lit22 = 0x46, + DW_OP_lit23 = 0x47, + DW_OP_lit24 = 0x48, + DW_OP_lit25 = 0x49, + DW_OP_lit26 = 0x4a, + DW_OP_lit27 = 0x4b, + DW_OP_lit28 = 0x4c, + DW_OP_lit29 = 0x4d, + DW_OP_lit30 = 0x4e, + DW_OP_lit31 = 0x4f, + DW_OP_reg0 = 0x50, + DW_OP_reg1 = 0x51, + DW_OP_reg2 = 0x52, + DW_OP_reg3 = 0x53, + DW_OP_reg4 = 0x54, + DW_OP_reg5 = 0x55, + DW_OP_reg6 = 0x56, + DW_OP_reg7 = 0x57, + DW_OP_reg8 = 0x58, + DW_OP_reg9 = 0x59, + DW_OP_reg10 = 0x5a, + DW_OP_reg11 = 0x5b, + DW_OP_reg12 = 0x5c, + DW_OP_reg13 = 0x5d, + DW_OP_reg14 = 0x5e, + DW_OP_reg15 = 0x5f, + DW_OP_reg16 = 0x60, + DW_OP_reg17 = 0x61, + DW_OP_reg18 = 0x62, + DW_OP_reg19 = 0x63, + DW_OP_reg20 = 0x64, + DW_OP_reg21 = 0x65, + DW_OP_reg22 = 0x66, + DW_OP_reg23 = 0x67, + DW_OP_reg24 = 0x68, + DW_OP_reg25 = 0x69, + DW_OP_reg26 = 0x6a, + DW_OP_reg27 = 0x6b, + DW_OP_reg28 = 0x6c, + DW_OP_reg29 = 0x6d, + DW_OP_reg30 = 0x6e, + DW_OP_reg31 = 0x6f, + DW_OP_breg0 = 0x70, + DW_OP_breg1 = 0x71, + DW_OP_breg2 = 0x72, + DW_OP_breg3 = 0x73, + DW_OP_breg4 = 0x74, + DW_OP_breg5 = 0x75, + DW_OP_breg6 = 0x76, + DW_OP_breg7 = 0x77, + DW_OP_breg8 = 0x78, + DW_OP_breg9 = 0x79, + DW_OP_breg10 = 0x7a, + DW_OP_breg11 = 0x7b, + DW_OP_breg12 = 0x7c, + DW_OP_breg13 = 0x7d, + DW_OP_breg14 = 0x7e, + DW_OP_breg15 = 0x7f, + DW_OP_breg16 = 0x80, + DW_OP_breg17 = 0x81, + DW_OP_breg18 = 0x82, + DW_OP_breg19 = 0x83, + DW_OP_breg20 = 0x84, + DW_OP_breg21 = 0x85, + DW_OP_breg22 = 0x86, + DW_OP_breg23 = 0x87, + DW_OP_breg24 = 0x88, + DW_OP_breg25 = 0x89, + DW_OP_breg26 = 0x8a, + DW_OP_breg27 = 0x8b, + DW_OP_breg28 = 0x8c, + DW_OP_breg29 = 0x8d, + DW_OP_breg30 = 0x8e, + DW_OP_breg31 = 0x8f, + DW_OP_regx = 0x90, + DW_OP_fbreg = 0x91, + DW_OP_bregx = 0x92, + DW_OP_piece = 0x93, + DW_OP_deref_size = 0x94, + DW_OP_xderef_size = 0x95, + DW_OP_nop = 0x96 + }; + +#define DW_OP_lo_user 0x80 /* implementation-defined range start */ +#define DW_OP_hi_user 0xff /* implementation-defined range end */ + +/* Type encodings. 
*/ + +enum dwarf_type + { + DW_ATE_void = 0x0, + DW_ATE_address = 0x1, + DW_ATE_boolean = 0x2, + DW_ATE_complex_float = 0x3, + DW_ATE_float = 0x4, + DW_ATE_signed = 0x5, + DW_ATE_signed_char = 0x6, + DW_ATE_unsigned = 0x7, + DW_ATE_unsigned_char = 0x8 + }; + +#define DW_ATE_lo_user 0x80 +#define DW_ATE_hi_user 0xff + +/* Array ordering names and codes. */ +enum dwarf_array_dim_ordering + { + DW_ORD_row_major = 0, + DW_ORD_col_major = 1 + }; + +/* access attribute */ +enum dwarf_access_attribute + { + DW_ACCESS_public = 1, + DW_ACCESS_protected = 2, + DW_ACCESS_private = 3 + }; + +/* visibility */ +enum dwarf_visibility_attribute + { + DW_VIS_local = 1, + DW_VIS_exported = 2, + DW_VIS_qualified = 3 + }; + +/* virtuality */ +enum dwarf_virtuality_attribute + { + DW_VIRTUALITY_none = 0, + DW_VIRTUALITY_virtual = 1, + DW_VIRTUALITY_pure_virtual = 2 + }; + +/* case sensitivity */ +enum dwarf_id_case + { + DW_ID_case_sensitive = 0, + DW_ID_up_case = 1, + DW_ID_down_case = 2, + DW_ID_case_insensitive = 3 + }; + +/* calling convention */ +enum dwarf_calling_convention + { + DW_CC_normal = 0x1, + DW_CC_program = 0x2, + DW_CC_nocall = 0x3 + }; + +#define DW_CC_lo_user 0x40 +#define DW_CC_hi_user 0xff + +/* inline attribute */ +enum dwarf_inline_attribute + { + DW_INL_not_inlined = 0, + DW_INL_inlined = 1, + DW_INL_declared_not_inlined = 2, + DW_INL_declared_inlined = 3 + }; + +/* discriminant lists */ +enum dwarf_discrim_list + { + DW_DSC_label = 0, + DW_DSC_range = 1 + }; + +/* line number opcodes */ +enum dwarf_line_number_ops + { + DW_LNS_extended_op = 0, + DW_LNS_copy = 1, + DW_LNS_advance_pc = 2, + DW_LNS_advance_line = 3, + DW_LNS_set_file = 4, + DW_LNS_set_column = 5, + DW_LNS_negate_stmt = 6, + DW_LNS_set_basic_block = 7, + DW_LNS_const_add_pc = 8, + DW_LNS_fixed_advance_pc = 9 + }; + +/* line number extended opcodes */ +enum dwarf_line_number_x_ops + { + DW_LNE_end_sequence = 1, + DW_LNE_set_address = 2, + DW_LNE_define_file = 3 + }; + +/* call frame information */ +enum dwarf_call_frame_info + { + DW_CFA_advance_loc = 0x40, + DW_CFA_offset = 0x80, + DW_CFA_restore = 0xc0, + DW_CFA_nop = 0x00, + DW_CFA_set_loc = 0x01, + DW_CFA_advance_loc1 = 0x02, + DW_CFA_advance_loc2 = 0x03, + DW_CFA_advance_loc4 = 0x04, + DW_CFA_offset_extended = 0x05, + DW_CFA_restore_extended = 0x06, + DW_CFA_undefined = 0x07, + DW_CFA_same_value = 0x08, + DW_CFA_register = 0x09, + DW_CFA_remember_state = 0x0a, + DW_CFA_restore_state = 0x0b, + DW_CFA_def_cfa = 0x0c, + DW_CFA_def_cfa_register = 0x0d, + DW_CFA_def_cfa_offset = 0x0e, + DW_CFA_def_cfa_expression = 0x0f, + DW_CFA_expression = 0x10, + /* Dwarf 2.1 */ + DW_CFA_offset_extended_sf = 0x11, + DW_CFA_def_cfa_sf = 0x12, + DW_CFA_def_cfa_offset_sf = 0x13, + + /* SGI/MIPS specific */ + DW_CFA_MIPS_advance_loc8 = 0x1d, + + /* GNU extensions */ + DW_CFA_GNU_window_save = 0x2d, + DW_CFA_GNU_args_size = 0x2e, + DW_CFA_GNU_negative_offset_extended = 0x2f + }; + +#define DW_CIE_ID 0xffffffff +#define DW_CIE_VERSION 1 + +#define DW_CFA_extended 0 +#define DW_CFA_low_user 0x1c +#define DW_CFA_high_user 0x3f + +#define DW_CHILDREN_no 0x00 +#define DW_CHILDREN_yes 0x01 + +#define DW_ADDR_none 0 + +/* Source language names and codes. 
*/
+
+enum dwarf_source_language
+  {
+    DW_LANG_C89 = 0x0001,
+    DW_LANG_C = 0x0002,
+    DW_LANG_Ada83 = 0x0003,
+    DW_LANG_C_plus_plus = 0x0004,
+    DW_LANG_Cobol74 = 0x0005,
+    DW_LANG_Cobol85 = 0x0006,
+    DW_LANG_Fortran77 = 0x0007,
+    DW_LANG_Fortran90 = 0x0008,
+    DW_LANG_Pascal83 = 0x0009,
+    DW_LANG_Modula2 = 0x000a,
+    DW_LANG_Java = 0x000b,
+    DW_LANG_Mips_Assembler = 0x8001
+  };
+
+
+#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */
+#define DW_LANG_hi_user 0xffff /* implementation-defined range end */
+
+/* Names and codes for macro information. */
+
+enum dwarf_macinfo_record_type
+  {
+    DW_MACINFO_define = 1,
+    DW_MACINFO_undef = 2,
+    DW_MACINFO_start_file = 3,
+    DW_MACINFO_end_file = 4,
+    DW_MACINFO_vendor_ext = 255
+  };
+
+#endif /* !__ASSEMBLER__ */
+
+/* @@@ For use with GNU frame unwind information. */
+
+#define DW_EH_PE_absptr 0x00
+#define DW_EH_PE_omit 0xff
+
+#define DW_EH_PE_uleb128 0x01
+#define DW_EH_PE_udata2 0x02
+#define DW_EH_PE_udata4 0x03
+#define DW_EH_PE_udata8 0x04
+#define DW_EH_PE_sleb128 0x09
+#define DW_EH_PE_sdata2 0x0A
+#define DW_EH_PE_sdata4 0x0B
+#define DW_EH_PE_sdata8 0x0C
+#define DW_EH_PE_signed 0x08
+
+#define DW_EH_PE_pcrel 0x10
+#define DW_EH_PE_textrel 0x20
+#define DW_EH_PE_datarel 0x30
+#define DW_EH_PE_funcrel 0x40
+#define DW_EH_PE_aligned 0x50
+
+#define DW_EH_PE_indirect 0x80
+
+#endif /* dwarf2.h */
diff --git a/utils/memcpy-bench/glibc/memcpy-ssse3-back.S b/utils/memcpy-bench/glibc/memcpy-ssse3-back.S
new file mode 100644
index 00000000000..1492dd38e73
--- /dev/null
+++ b/utils/memcpy-bench/glibc/memcpy-ssse3-back.S
@@ -0,0 +1,3182 @@
+/* memcpy with SSSE3 and REP string
+   Copyright (C) 2010-2020 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+
+#if 1
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3_back
+# define MEMCPY_CHK __memcpy_chk_ssse3_back
+# define MEMPCPY __mempcpy_ssse3_back
+# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
+#endif
+
+#define JMPTBL(I, B) I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register that contains the index into the
+   jump table.  SCALE is the scale of INDEX.
*/ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + _CET_NOTRACK jmp *INDEX; \ + ud2 + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) +END (MEMPCPY) +#endif + +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %RDI_LP, %RAX_LP +#ifdef USE_AS_MEMPCPY + add %RDX_LP, %RAX_LP +#endif + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +#endif + +#ifdef USE_AS_MEMMOVE + cmp %rsi, %rdi + jb L(copy_forward) + je L(bwd_write_0bytes) + cmp $144, %rdx + jae L(copy_backward) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +L(copy_forward): +#endif +L(start): + cmp $144, %rdx + jae L(144bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jbe L(bk_write) +#endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +#endif + + .p2align 4 +L(144bytesormore): + +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jle L(copy_backward) +#endif + movdqu (%rsi), %xmm0 + mov %rdi, %r8 + and $-16, %rdi + add $16, %rdi + mov %rdi, %r9 + sub %r8, %r9 + sub %r9, %rdx + add %r9, %rsi + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0) +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + cmp %rcx, %rdx + jae L(gobble_mem_fwd) + lea L(shl_table_fwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 + + .p2align 4 +L(copy_backward): +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + shl $1, %rcx + cmp %rcx, %rdx + ja L(gobble_mem_bwd) + + add %rdx, %rdi + add %rdx, %rsi + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $0xf, %r9 + xor %r9, %rdi + sub %r9, %rsi + sub %r9, %rdx + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0_bwd) + lea L(shl_table_bwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 + + .p2align 4 +L(shl_0): + + mov %rdx, %r9 + shr $8, %r9 + add %rdx, %r9 +#ifdef DATA_CACHE_SIZE + cmp $DATA_CACHE_SIZE_HALF, %R9_LP +#else + cmp __x86_data_cache_size_half(%rip), %R9_LP +#endif + jae L(gobble_mem_fwd) + sub $0x80, %rdx + .p2align 4 +L(shl_0_loop): + movdqa (%rsi), %xmm1 + movdqa %xmm1, (%rdi) + movaps 0x10(%rsi), %xmm2 + movaps %xmm2, 0x10(%rdi) + movaps 0x20(%rsi), %xmm3 + movaps %xmm3, 0x20(%rdi) + movaps 0x30(%rsi), %xmm4 + movaps %xmm4, 0x30(%rdi) + movaps 0x40(%rsi), %xmm1 + movaps %xmm1, 0x40(%rdi) + movaps 0x50(%rsi), %xmm2 + movaps %xmm2, 0x50(%rdi) + movaps 0x60(%rsi), %xmm3 + movaps %xmm3, 0x60(%rdi) + movaps 0x70(%rsi), %xmm4 + movaps %xmm4, 0x70(%rdi) + sub $0x80, %rdx + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(shl_0_loop) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $0x80, %rdx +L(copy_backward_loop): + movaps -0x10(%rsi), %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps -0x20(%rsi), %xmm2 + movaps %xmm2, 
-0x20(%rdi) + movaps -0x30(%rsi), %xmm3 + movaps %xmm3, -0x30(%rdi) + movaps -0x40(%rsi), %xmm4 + movaps %xmm4, -0x40(%rdi) + movaps -0x50(%rsi), %xmm5 + movaps %xmm5, -0x50(%rdi) + movaps -0x60(%rsi), %xmm5 + movaps %xmm5, -0x60(%rdi) + movaps -0x70(%rsi), %xmm5 + movaps %xmm5, -0x70(%rdi) + movaps -0x80(%rsi), %xmm5 + movaps %xmm5, -0x80(%rdi) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(copy_backward_loop) + + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_1): + sub $0x80, %rdx + movaps -0x01(%rsi), %xmm1 + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movaps 0x4f(%rsi), %xmm6 + movaps 0x5f(%rsi), %xmm7 + movaps 0x6f(%rsi), %xmm8 + movaps 0x7f(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $1, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $1, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $1, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $1, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $1, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $1, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $1, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_1) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + movaps -0x01(%rsi), %xmm1 + + movaps -0x11(%rsi), %xmm2 + palignr $1, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x21(%rsi), %xmm3 + palignr $1, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x31(%rsi), %xmm4 + palignr $1, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x41(%rsi), %xmm5 + palignr $1, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x51(%rsi), %xmm6 + palignr $1, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x61(%rsi), %xmm7 + palignr $1, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x71(%rsi), %xmm8 + palignr $1, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x81(%rsi), %xmm9 + palignr $1, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_1_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_2): + sub $0x80, %rdx + movaps -0x02(%rsi), %xmm1 + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movaps 0x4e(%rsi), %xmm6 + movaps 0x5e(%rsi), %xmm7 + movaps 0x6e(%rsi), %xmm8 + movaps 0x7e(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $2, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $2, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $2, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $2, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $2, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $2, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $2, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_2) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + movaps -0x02(%rsi), %xmm1 + + movaps -0x12(%rsi), %xmm2 + palignr $2, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x22(%rsi), %xmm3 + palignr $2, %xmm3, %xmm2 
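Aside: every L(shl_N) / L(shl_N_bwd) block in this file is one instance of the same realignment pattern. The source pointer sits N bytes past a 16-byte boundary, only aligned 16-byte loads and stores touch memory, and palignr stitches each output block together from two adjacent input blocks. A minimal C++ intrinsics sketch of one forward pass, not part of the patch; the names and signature are invented, and it assumes dst and src - Shift are 16-byte aligned with n a multiple of 16:

    #include <tmmintrin.h> // SSSE3: _mm_alignr_epi8 (build with -mssse3)
    #include <cstddef>
    #include <cstdint>

    template <int Shift>
    void copy_shifted(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t n)
    {
        const __m128i * s = reinterpret_cast<const __m128i *>(src - Shift); // aligned
        __m128i * d = reinterpret_cast<__m128i *>(dst);                     // aligned
        __m128i prev = _mm_load_si128(s);
        for (size_t i = 0; i != n / 16; ++i)
        {
            __m128i next = _mm_load_si128(s + i + 1);
            // bytes Shift..15 of `prev` followed by bytes 0..Shift-1 of `next`,
            // i.e. 16 consecutive source bytes, exactly what `palignr $Shift` produces
            _mm_store_si128(d + i, _mm_alignr_epi8(next, prev, Shift));
            prev = next;
        }
    }

Shift must be a template (compile-time) parameter because palignr takes an immediate, which is why the assembly needs sixteen near-identical L(shl_0)..L(shl_15) copies selected through L(shl_table_fwd).
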
+ movaps %xmm2, -0x20(%rdi) + + movaps -0x32(%rsi), %xmm4 + palignr $2, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x42(%rsi), %xmm5 + palignr $2, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x52(%rsi), %xmm6 + palignr $2, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x62(%rsi), %xmm7 + palignr $2, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x72(%rsi), %xmm8 + palignr $2, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x82(%rsi), %xmm9 + palignr $2, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_2_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_3): + sub $0x80, %rdx + movaps -0x03(%rsi), %xmm1 + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movaps 0x4d(%rsi), %xmm6 + movaps 0x5d(%rsi), %xmm7 + movaps 0x6d(%rsi), %xmm8 + movaps 0x7d(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $3, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $3, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $3, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $3, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $3, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $3, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $3, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_3) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + movaps -0x03(%rsi), %xmm1 + + movaps -0x13(%rsi), %xmm2 + palignr $3, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x23(%rsi), %xmm3 + palignr $3, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x33(%rsi), %xmm4 + palignr $3, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x43(%rsi), %xmm5 + palignr $3, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x53(%rsi), %xmm6 + palignr $3, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x63(%rsi), %xmm7 + palignr $3, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x73(%rsi), %xmm8 + palignr $3, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x83(%rsi), %xmm9 + palignr $3, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_3_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_4): + sub $0x80, %rdx + movaps -0x04(%rsi), %xmm1 + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movaps 0x4c(%rsi), %xmm6 + movaps 0x5c(%rsi), %xmm7 + movaps 0x6c(%rsi), %xmm8 + movaps 0x7c(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $4, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $4, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $4, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $4, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $4, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $4, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $4, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 
+L(shl_4_bwd): + movaps -0x04(%rsi), %xmm1 + + movaps -0x14(%rsi), %xmm2 + palignr $4, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x24(%rsi), %xmm3 + palignr $4, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x34(%rsi), %xmm4 + palignr $4, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x44(%rsi), %xmm5 + palignr $4, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x54(%rsi), %xmm6 + palignr $4, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x64(%rsi), %xmm7 + palignr $4, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x74(%rsi), %xmm8 + palignr $4, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x84(%rsi), %xmm9 + palignr $4, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_4_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_5): + sub $0x80, %rdx + movaps -0x05(%rsi), %xmm1 + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movaps 0x4b(%rsi), %xmm6 + movaps 0x5b(%rsi), %xmm7 + movaps 0x6b(%rsi), %xmm8 + movaps 0x7b(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $5, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $5, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $5, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $5, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $5, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $5, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $5, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_5) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + movaps -0x05(%rsi), %xmm1 + + movaps -0x15(%rsi), %xmm2 + palignr $5, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x25(%rsi), %xmm3 + palignr $5, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x35(%rsi), %xmm4 + palignr $5, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x45(%rsi), %xmm5 + palignr $5, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x55(%rsi), %xmm6 + palignr $5, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x65(%rsi), %xmm7 + palignr $5, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x75(%rsi), %xmm8 + palignr $5, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x85(%rsi), %xmm9 + palignr $5, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_5_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_6): + sub $0x80, %rdx + movaps -0x06(%rsi), %xmm1 + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movaps 0x4a(%rsi), %xmm6 + movaps 0x5a(%rsi), %xmm7 + movaps 0x6a(%rsi), %xmm8 + movaps 0x7a(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $6, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $6, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $6, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $6, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $6, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $6, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $6, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + 
lea 0x80(%rdi), %rdi + jae L(shl_6) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + movaps -0x06(%rsi), %xmm1 + + movaps -0x16(%rsi), %xmm2 + palignr $6, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x26(%rsi), %xmm3 + palignr $6, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x36(%rsi), %xmm4 + palignr $6, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x46(%rsi), %xmm5 + palignr $6, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x56(%rsi), %xmm6 + palignr $6, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x66(%rsi), %xmm7 + palignr $6, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x76(%rsi), %xmm8 + palignr $6, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x86(%rsi), %xmm9 + palignr $6, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_6_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_7): + sub $0x80, %rdx + movaps -0x07(%rsi), %xmm1 + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movaps 0x49(%rsi), %xmm6 + movaps 0x59(%rsi), %xmm7 + movaps 0x69(%rsi), %xmm8 + movaps 0x79(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $7, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $7, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $7, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $7, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $7, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $7, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $7, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_7) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + movaps -0x07(%rsi), %xmm1 + + movaps -0x17(%rsi), %xmm2 + palignr $7, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x27(%rsi), %xmm3 + palignr $7, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x37(%rsi), %xmm4 + palignr $7, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x47(%rsi), %xmm5 + palignr $7, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x57(%rsi), %xmm6 + palignr $7, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x67(%rsi), %xmm7 + palignr $7, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x77(%rsi), %xmm8 + palignr $7, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x87(%rsi), %xmm9 + palignr $7, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_7_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_8): + sub $0x80, %rdx + movaps -0x08(%rsi), %xmm1 + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movaps 0x48(%rsi), %xmm6 + movaps 0x58(%rsi), %xmm7 + movaps 0x68(%rsi), %xmm8 + movaps 0x78(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $8, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $8, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $8, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $8, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $8, %xmm4, %xmm5 + 
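Aside: each of these unrolled loops hands its residual 0..144 bytes to BRANCH_TO_JMPTBL_ENTRY, which indexes a table of self-relative 32-bit offsets (lea the table, movslq the entry, add, indirect jump). A hedged C++ analogue of that dispatch, with function pointers standing in for the branch targets and every name invented:

    #include <cstddef>
    #include <cstring>

    using TailCopier = void (*)(unsigned char * dst_end, const unsigned char * src_end);

    template <size_t N>
    static void copy_n(unsigned char * dst_end, const unsigned char * src_end)
    {
        std::memcpy(dst_end - N, src_end - N, N); // end-relative, like L(fwd_write_Nbytes)
    }

    // Dense table indexed by residual size; the real L(table_144_bytes_fwd)
    // holds 145 label offsets, this sketch covers only sizes 0..3.
    static constexpr TailCopier jump_table[] = {copy_n<0>, copy_n<1>, copy_n<2>, copy_n<3>};

    void copy_tail(unsigned char * dst, const unsigned char * src, size_t n) // requires n < 4
    {
        // mirrors: add %rdx, %rsi ; add %rdx, %rdi ; jmp *table(, %rdx, 4)
        jump_table[n](dst + n, src + n);
    }
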
movaps %xmm5, 0x30(%rdi) + palignr $8, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $8, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_8) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + movaps -0x08(%rsi), %xmm1 + + movaps -0x18(%rsi), %xmm2 + palignr $8, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x28(%rsi), %xmm3 + palignr $8, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x38(%rsi), %xmm4 + palignr $8, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x48(%rsi), %xmm5 + palignr $8, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x58(%rsi), %xmm6 + palignr $8, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x68(%rsi), %xmm7 + palignr $8, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x78(%rsi), %xmm8 + palignr $8, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x88(%rsi), %xmm9 + palignr $8, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_8_bwd) +L(shl_8_end_bwd): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_9): + sub $0x80, %rdx + movaps -0x09(%rsi), %xmm1 + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movaps 0x47(%rsi), %xmm6 + movaps 0x57(%rsi), %xmm7 + movaps 0x67(%rsi), %xmm8 + movaps 0x77(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $9, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $9, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $9, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $9, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $9, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $9, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $9, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_9) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + movaps -0x09(%rsi), %xmm1 + + movaps -0x19(%rsi), %xmm2 + palignr $9, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x29(%rsi), %xmm3 + palignr $9, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x39(%rsi), %xmm4 + palignr $9, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x49(%rsi), %xmm5 + palignr $9, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x59(%rsi), %xmm6 + palignr $9, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x69(%rsi), %xmm7 + palignr $9, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x79(%rsi), %xmm8 + palignr $9, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x89(%rsi), %xmm9 + palignr $9, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_9_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_10): + sub $0x80, %rdx + movaps -0x0a(%rsi), %xmm1 + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movaps 0x46(%rsi), %xmm6 + movaps 0x56(%rsi), %xmm7 + movaps 0x66(%rsi), %xmm8 + movaps 0x76(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $10, %xmm8, %xmm9 + movaps %xmm9, 
0x70(%rdi) + palignr $10, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $10, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $10, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $10, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $10, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $10, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_10) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + movaps -0x0a(%rsi), %xmm1 + + movaps -0x1a(%rsi), %xmm2 + palignr $10, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2a(%rsi), %xmm3 + palignr $10, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3a(%rsi), %xmm4 + palignr $10, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4a(%rsi), %xmm5 + palignr $10, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5a(%rsi), %xmm6 + palignr $10, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6a(%rsi), %xmm7 + palignr $10, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7a(%rsi), %xmm8 + palignr $10, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8a(%rsi), %xmm9 + palignr $10, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_10_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_11): + sub $0x80, %rdx + movaps -0x0b(%rsi), %xmm1 + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movaps 0x45(%rsi), %xmm6 + movaps 0x55(%rsi), %xmm7 + movaps 0x65(%rsi), %xmm8 + movaps 0x75(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $11, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $11, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $11, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $11, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $11, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $11, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $11, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_11) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + movaps -0x0b(%rsi), %xmm1 + + movaps -0x1b(%rsi), %xmm2 + palignr $11, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2b(%rsi), %xmm3 + palignr $11, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3b(%rsi), %xmm4 + palignr $11, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4b(%rsi), %xmm5 + palignr $11, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5b(%rsi), %xmm6 + palignr $11, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6b(%rsi), %xmm7 + palignr $11, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7b(%rsi), %xmm8 + palignr $11, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8b(%rsi), %xmm9 + palignr $11, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_11_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_12): + sub $0x80, %rdx + movdqa -0x0c(%rsi), %xmm1 + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + 
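Aside: the *_bwd twin of every block exists for the USE_AS_MEMMOVE build. When the destination overlaps the source from above, a forward pass would clobber source bytes before reading them, so L(copy_backward) runs the copy from the end downward. A scalar C++ sketch of just that direction choice (illustrative, not the patch's code):

    #include <cstddef>

    void * overlap_safe_copy(void * dst, const void * src, size_t n)
    {
        unsigned char * d = static_cast<unsigned char *>(dst);
        const unsigned char * s = static_cast<const unsigned char *>(src);
        if (d <= s || d >= s + n)
            for (size_t i = 0; i < n; ++i)      // forward, cf. L(copy_forward)
                d[i] = s[i];
        else
            for (size_t i = n; i > 0; --i)      // backward, cf. L(copy_backward)
                d[i - 1] = s[i - 1];
        return dst;
    }
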
movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movaps 0x44(%rsi), %xmm6 + movaps 0x54(%rsi), %xmm7 + movaps 0x64(%rsi), %xmm8 + movaps 0x74(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $12, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $12, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $12, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $12, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $12, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $12, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $12, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + + lea 0x80(%rdi), %rdi + jae L(shl_12) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + movaps -0x0c(%rsi), %xmm1 + + movaps -0x1c(%rsi), %xmm2 + palignr $12, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2c(%rsi), %xmm3 + palignr $12, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3c(%rsi), %xmm4 + palignr $12, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4c(%rsi), %xmm5 + palignr $12, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5c(%rsi), %xmm6 + palignr $12, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6c(%rsi), %xmm7 + palignr $12, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7c(%rsi), %xmm8 + palignr $12, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8c(%rsi), %xmm9 + palignr $12, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_12_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_13): + sub $0x80, %rdx + movaps -0x0d(%rsi), %xmm1 + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movaps 0x43(%rsi), %xmm6 + movaps 0x53(%rsi), %xmm7 + movaps 0x63(%rsi), %xmm8 + movaps 0x73(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $13, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $13, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $13, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $13, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $13, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $13, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $13, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_13) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + movaps -0x0d(%rsi), %xmm1 + + movaps -0x1d(%rsi), %xmm2 + palignr $13, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2d(%rsi), %xmm3 + palignr $13, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3d(%rsi), %xmm4 + palignr $13, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4d(%rsi), %xmm5 + palignr $13, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5d(%rsi), %xmm6 + palignr $13, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6d(%rsi), %xmm7 + palignr $13, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7d(%rsi), %xmm8 + palignr $13, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8d(%rsi), %xmm9 + palignr $13, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_13_bwd) + movdqu %xmm0, (%r8) + add $0x80, 
%rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_14): + sub $0x80, %rdx + movaps -0x0e(%rsi), %xmm1 + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movaps 0x42(%rsi), %xmm6 + movaps 0x52(%rsi), %xmm7 + movaps 0x62(%rsi), %xmm8 + movaps 0x72(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $14, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $14, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $14, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $14, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $14, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $14, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $14, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_14) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + movaps -0x0e(%rsi), %xmm1 + + movaps -0x1e(%rsi), %xmm2 + palignr $14, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2e(%rsi), %xmm3 + palignr $14, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3e(%rsi), %xmm4 + palignr $14, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4e(%rsi), %xmm5 + palignr $14, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5e(%rsi), %xmm6 + palignr $14, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6e(%rsi), %xmm7 + palignr $14, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7e(%rsi), %xmm8 + palignr $14, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8e(%rsi), %xmm9 + palignr $14, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_14_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_15): + sub $0x80, %rdx + movaps -0x0f(%rsi), %xmm1 + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movaps 0x41(%rsi), %xmm6 + movaps 0x51(%rsi), %xmm7 + movaps 0x61(%rsi), %xmm8 + movaps 0x71(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $15, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $15, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $15, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $15, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $15, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $15, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $15, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_15) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + movaps -0x0f(%rsi), %xmm1 + + movaps -0x1f(%rsi), %xmm2 + palignr $15, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2f(%rsi), %xmm3 + palignr $15, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3f(%rsi), %xmm4 + palignr $15, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4f(%rsi), %xmm5 + palignr $15, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5f(%rsi), %xmm6 + palignr $15, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6f(%rsi), %xmm7 + palignr $15, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7f(%rsi), %xmm8 + palignr $15, %xmm8, %xmm7 + movaps %xmm7, 
-0x70(%rdi) + + movaps -0x8f(%rsi), %xmm9 + palignr $15, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_15_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_fwd): + movdqu (%rsi), %xmm1 + movdqu %xmm0, (%r8) + movdqa %xmm1, (%rdi) + sub $16, %rdx + add $16, %rsi + add $16, %rdi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif + cmp %rcx, %rdx + ja L(bigger_in_fwd) + mov %rdx, %rcx +L(bigger_in_fwd): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy_fwd) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy_fwd) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy_fwd): + sub $0x80, %rdx +L(gobble_mem_fwd_loop): + sub $0x80, %rdx + prefetcht0 0x200(%rsi) + prefetcht0 0x300(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lfence + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_mem_fwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_fwd_end) + add $0x80, %rdx +L(ll_cache_copy_fwd): + add %rcx, %rdx +L(ll_cache_copy_fwd_start): + sub $0x80, %rdx +L(gobble_ll_loop_fwd): + prefetchnta 0x1c0(%rsi) + prefetchnta 0x280(%rsi) + prefetchnta 0x1c0(%rdi) + prefetchnta 0x280(%rdi) + sub $0x80, %rdx + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_ll_loop_fwd) +L(gobble_mem_fwd_end): + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_bwd): + add %rdx, %rsi + add %rdx, %rdi + + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $-16, %rdi + sub %rdi, %r9 + sub %r9, %rsi + sub %r9, %rdx + + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif + cmp %rcx, %rdx + ja L(bigger) + mov %rdx, %rcx +L(bigger): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy): + sub $0x80, %rdx +L(gobble_mem_bwd_loop): + sub $0x80, %rdx + prefetcht0 -0x200(%rsi) + prefetcht0 -0x300(%rsi) + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), 
%xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + lfence + movntdq %xmm1, -0x10(%rdi) + movntdq %xmm2, -0x20(%rdi) + movntdq %xmm3, -0x30(%rdi) + movntdq %xmm4, -0x40(%rdi) + movntdq %xmm5, -0x50(%rdi) + movntdq %xmm6, -0x60(%rdi) + movntdq %xmm7, -0x70(%rdi) + movntdq %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_mem_bwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_bwd_end) + add $0x80, %rdx +L(ll_cache_copy): + add %rcx, %rdx +L(ll_cache_copy_bwd_start): + sub $0x80, %rdx +L(gobble_ll_loop): + prefetchnta -0x1c0(%rsi) + prefetchnta -0x280(%rsi) + prefetchnta -0x1c0(%rdi) + prefetchnta -0x280(%rdi) + sub $0x80, %rdx + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + movdqa %xmm1, -0x10(%rdi) + movdqa %xmm2, -0x20(%rdi) + movdqa %xmm3, -0x30(%rdi) + movdqa %xmm4, -0x40(%rdi) + movdqa %xmm5, -0x50(%rdi) + movdqa %xmm6, -0x60(%rdi) + movdqa %xmm7, -0x70(%rdi) + movdqa %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_ll_loop) +L(gobble_mem_bwd_end): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rsi + sub %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(fwd_write_128bytes): + lddqu -128(%rsi), %xmm0 + movdqu %xmm0, -128(%rdi) +L(fwd_write_112bytes): + lddqu -112(%rsi), %xmm0 + movdqu %xmm0, -112(%rdi) +L(fwd_write_96bytes): + lddqu -96(%rsi), %xmm0 + movdqu %xmm0, -96(%rdi) +L(fwd_write_80bytes): + lddqu -80(%rsi), %xmm0 + movdqu %xmm0, -80(%rdi) +L(fwd_write_64bytes): + lddqu -64(%rsi), %xmm0 + movdqu %xmm0, -64(%rdi) +L(fwd_write_48bytes): + lddqu -48(%rsi), %xmm0 + movdqu %xmm0, -48(%rdi) +L(fwd_write_32bytes): + lddqu -32(%rsi), %xmm0 + movdqu %xmm0, -32(%rdi) +L(fwd_write_16bytes): + lddqu -16(%rsi), %xmm0 + movdqu %xmm0, -16(%rdi) +L(fwd_write_0bytes): + ret + + + .p2align 4 +L(fwd_write_143bytes): + lddqu -143(%rsi), %xmm0 + movdqu %xmm0, -143(%rdi) +L(fwd_write_127bytes): + lddqu -127(%rsi), %xmm0 + movdqu %xmm0, -127(%rdi) +L(fwd_write_111bytes): + lddqu -111(%rsi), %xmm0 + movdqu %xmm0, -111(%rdi) +L(fwd_write_95bytes): + lddqu -95(%rsi), %xmm0 + movdqu %xmm0, -95(%rdi) +L(fwd_write_79bytes): + lddqu -79(%rsi), %xmm0 + movdqu %xmm0, -79(%rdi) +L(fwd_write_63bytes): + lddqu -63(%rsi), %xmm0 + movdqu %xmm0, -63(%rdi) +L(fwd_write_47bytes): + lddqu -47(%rsi), %xmm0 + movdqu %xmm0, -47(%rdi) +L(fwd_write_31bytes): + lddqu -31(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -31(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_15bytes): + mov -15(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -15(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_142bytes): + lddqu -142(%rsi), %xmm0 + movdqu %xmm0, -142(%rdi) +L(fwd_write_126bytes): + lddqu -126(%rsi), %xmm0 + movdqu %xmm0, -126(%rdi) +L(fwd_write_110bytes): + lddqu -110(%rsi), %xmm0 + movdqu %xmm0, -110(%rdi) +L(fwd_write_94bytes): + lddqu -94(%rsi), %xmm0 + movdqu %xmm0, -94(%rdi) +L(fwd_write_78bytes): + lddqu -78(%rsi), %xmm0 + movdqu %xmm0, -78(%rdi) +L(fwd_write_62bytes): + lddqu -62(%rsi), %xmm0 + movdqu %xmm0, -62(%rdi) +L(fwd_write_46bytes): + lddqu -46(%rsi), %xmm0 + movdqu %xmm0, -46(%rdi) +L(fwd_write_30bytes): + lddqu -30(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, 
-30(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_14bytes): + mov -14(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -14(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_141bytes): + lddqu -141(%rsi), %xmm0 + movdqu %xmm0, -141(%rdi) +L(fwd_write_125bytes): + lddqu -125(%rsi), %xmm0 + movdqu %xmm0, -125(%rdi) +L(fwd_write_109bytes): + lddqu -109(%rsi), %xmm0 + movdqu %xmm0, -109(%rdi) +L(fwd_write_93bytes): + lddqu -93(%rsi), %xmm0 + movdqu %xmm0, -93(%rdi) +L(fwd_write_77bytes): + lddqu -77(%rsi), %xmm0 + movdqu %xmm0, -77(%rdi) +L(fwd_write_61bytes): + lddqu -61(%rsi), %xmm0 + movdqu %xmm0, -61(%rdi) +L(fwd_write_45bytes): + lddqu -45(%rsi), %xmm0 + movdqu %xmm0, -45(%rdi) +L(fwd_write_29bytes): + lddqu -29(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -29(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_13bytes): + mov -13(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -13(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_140bytes): + lddqu -140(%rsi), %xmm0 + movdqu %xmm0, -140(%rdi) +L(fwd_write_124bytes): + lddqu -124(%rsi), %xmm0 + movdqu %xmm0, -124(%rdi) +L(fwd_write_108bytes): + lddqu -108(%rsi), %xmm0 + movdqu %xmm0, -108(%rdi) +L(fwd_write_92bytes): + lddqu -92(%rsi), %xmm0 + movdqu %xmm0, -92(%rdi) +L(fwd_write_76bytes): + lddqu -76(%rsi), %xmm0 + movdqu %xmm0, -76(%rdi) +L(fwd_write_60bytes): + lddqu -60(%rsi), %xmm0 + movdqu %xmm0, -60(%rdi) +L(fwd_write_44bytes): + lddqu -44(%rsi), %xmm0 + movdqu %xmm0, -44(%rdi) +L(fwd_write_28bytes): + lddqu -28(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -28(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_12bytes): + mov -12(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -12(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_139bytes): + lddqu -139(%rsi), %xmm0 + movdqu %xmm0, -139(%rdi) +L(fwd_write_123bytes): + lddqu -123(%rsi), %xmm0 + movdqu %xmm0, -123(%rdi) +L(fwd_write_107bytes): + lddqu -107(%rsi), %xmm0 + movdqu %xmm0, -107(%rdi) +L(fwd_write_91bytes): + lddqu -91(%rsi), %xmm0 + movdqu %xmm0, -91(%rdi) +L(fwd_write_75bytes): + lddqu -75(%rsi), %xmm0 + movdqu %xmm0, -75(%rdi) +L(fwd_write_59bytes): + lddqu -59(%rsi), %xmm0 + movdqu %xmm0, -59(%rdi) +L(fwd_write_43bytes): + lddqu -43(%rsi), %xmm0 + movdqu %xmm0, -43(%rdi) +L(fwd_write_27bytes): + lddqu -27(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -27(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_11bytes): + mov -11(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -11(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_138bytes): + lddqu -138(%rsi), %xmm0 + movdqu %xmm0, -138(%rdi) +L(fwd_write_122bytes): + lddqu -122(%rsi), %xmm0 + movdqu %xmm0, -122(%rdi) +L(fwd_write_106bytes): + lddqu -106(%rsi), %xmm0 + movdqu %xmm0, -106(%rdi) +L(fwd_write_90bytes): + lddqu -90(%rsi), %xmm0 + movdqu %xmm0, -90(%rdi) +L(fwd_write_74bytes): + lddqu -74(%rsi), %xmm0 + movdqu %xmm0, -74(%rdi) +L(fwd_write_58bytes): + lddqu -58(%rsi), %xmm0 + movdqu %xmm0, -58(%rdi) +L(fwd_write_42bytes): + lddqu -42(%rsi), %xmm0 + movdqu %xmm0, -42(%rdi) +L(fwd_write_26bytes): + lddqu -26(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -26(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_10bytes): + mov -10(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -10(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_137bytes): + lddqu -137(%rsi), %xmm0 + movdqu %xmm0, -137(%rdi) +L(fwd_write_121bytes): + lddqu -121(%rsi), %xmm0 + movdqu %xmm0, -121(%rdi) 
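+/* Each L(fwd_write_Nbytes) entry in these ladders falls through the
+   entries below it, copying 16 bytes per label with an unaligned
+   lddqu load and movdqu store addressed back from the end of the
+   block, so tails under 144 bytes need no loop or counter.  */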
+L(fwd_write_105bytes): + lddqu -105(%rsi), %xmm0 + movdqu %xmm0, -105(%rdi) +L(fwd_write_89bytes): + lddqu -89(%rsi), %xmm0 + movdqu %xmm0, -89(%rdi) +L(fwd_write_73bytes): + lddqu -73(%rsi), %xmm0 + movdqu %xmm0, -73(%rdi) +L(fwd_write_57bytes): + lddqu -57(%rsi), %xmm0 + movdqu %xmm0, -57(%rdi) +L(fwd_write_41bytes): + lddqu -41(%rsi), %xmm0 + movdqu %xmm0, -41(%rdi) +L(fwd_write_25bytes): + lddqu -25(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -25(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_9bytes): + mov -9(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -9(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_136bytes): + lddqu -136(%rsi), %xmm0 + movdqu %xmm0, -136(%rdi) +L(fwd_write_120bytes): + lddqu -120(%rsi), %xmm0 + movdqu %xmm0, -120(%rdi) +L(fwd_write_104bytes): + lddqu -104(%rsi), %xmm0 + movdqu %xmm0, -104(%rdi) +L(fwd_write_88bytes): + lddqu -88(%rsi), %xmm0 + movdqu %xmm0, -88(%rdi) +L(fwd_write_72bytes): + lddqu -72(%rsi), %xmm0 + movdqu %xmm0, -72(%rdi) +L(fwd_write_56bytes): + lddqu -56(%rsi), %xmm0 + movdqu %xmm0, -56(%rdi) +L(fwd_write_40bytes): + lddqu -40(%rsi), %xmm0 + movdqu %xmm0, -40(%rdi) +L(fwd_write_24bytes): + lddqu -24(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -24(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_135bytes): + lddqu -135(%rsi), %xmm0 + movdqu %xmm0, -135(%rdi) +L(fwd_write_119bytes): + lddqu -119(%rsi), %xmm0 + movdqu %xmm0, -119(%rdi) +L(fwd_write_103bytes): + lddqu -103(%rsi), %xmm0 + movdqu %xmm0, -103(%rdi) +L(fwd_write_87bytes): + lddqu -87(%rsi), %xmm0 + movdqu %xmm0, -87(%rdi) +L(fwd_write_71bytes): + lddqu -71(%rsi), %xmm0 + movdqu %xmm0, -71(%rdi) +L(fwd_write_55bytes): + lddqu -55(%rsi), %xmm0 + movdqu %xmm0, -55(%rdi) +L(fwd_write_39bytes): + lddqu -39(%rsi), %xmm0 + movdqu %xmm0, -39(%rdi) +L(fwd_write_23bytes): + lddqu -23(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -23(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_134bytes): + lddqu -134(%rsi), %xmm0 + movdqu %xmm0, -134(%rdi) +L(fwd_write_118bytes): + lddqu -118(%rsi), %xmm0 + movdqu %xmm0, -118(%rdi) +L(fwd_write_102bytes): + lddqu -102(%rsi), %xmm0 + movdqu %xmm0, -102(%rdi) +L(fwd_write_86bytes): + lddqu -86(%rsi), %xmm0 + movdqu %xmm0, -86(%rdi) +L(fwd_write_70bytes): + lddqu -70(%rsi), %xmm0 + movdqu %xmm0, -70(%rdi) +L(fwd_write_54bytes): + lddqu -54(%rsi), %xmm0 + movdqu %xmm0, -54(%rdi) +L(fwd_write_38bytes): + lddqu -38(%rsi), %xmm0 + movdqu %xmm0, -38(%rdi) +L(fwd_write_22bytes): + lddqu -22(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -22(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_133bytes): + lddqu -133(%rsi), %xmm0 + movdqu %xmm0, -133(%rdi) +L(fwd_write_117bytes): + lddqu -117(%rsi), %xmm0 + movdqu %xmm0, -117(%rdi) +L(fwd_write_101bytes): + lddqu -101(%rsi), %xmm0 + movdqu %xmm0, -101(%rdi) +L(fwd_write_85bytes): + lddqu -85(%rsi), %xmm0 + movdqu %xmm0, -85(%rdi) +L(fwd_write_69bytes): + lddqu -69(%rsi), %xmm0 + movdqu %xmm0, -69(%rdi) +L(fwd_write_53bytes): + lddqu -53(%rsi), %xmm0 + movdqu %xmm0, -53(%rdi) +L(fwd_write_37bytes): + lddqu -37(%rsi), %xmm0 + movdqu %xmm0, -37(%rdi) +L(fwd_write_21bytes): 
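+ /* The two stores below overlap by 11 bytes; both registers are
+    loaded before either store is issued, so the overlap is harmless
+    and avoids a branch on the odd-sized tail.  */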
+ lddqu -21(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -21(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_132bytes): + lddqu -132(%rsi), %xmm0 + movdqu %xmm0, -132(%rdi) +L(fwd_write_116bytes): + lddqu -116(%rsi), %xmm0 + movdqu %xmm0, -116(%rdi) +L(fwd_write_100bytes): + lddqu -100(%rsi), %xmm0 + movdqu %xmm0, -100(%rdi) +L(fwd_write_84bytes): + lddqu -84(%rsi), %xmm0 + movdqu %xmm0, -84(%rdi) +L(fwd_write_68bytes): + lddqu -68(%rsi), %xmm0 + movdqu %xmm0, -68(%rdi) +L(fwd_write_52bytes): + lddqu -52(%rsi), %xmm0 + movdqu %xmm0, -52(%rdi) +L(fwd_write_36bytes): + lddqu -36(%rsi), %xmm0 + movdqu %xmm0, -36(%rdi) +L(fwd_write_20bytes): + lddqu -20(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -20(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_131bytes): + lddqu -131(%rsi), %xmm0 + movdqu %xmm0, -131(%rdi) +L(fwd_write_115bytes): + lddqu -115(%rsi), %xmm0 + movdqu %xmm0, -115(%rdi) +L(fwd_write_99bytes): + lddqu -99(%rsi), %xmm0 + movdqu %xmm0, -99(%rdi) +L(fwd_write_83bytes): + lddqu -83(%rsi), %xmm0 + movdqu %xmm0, -83(%rdi) +L(fwd_write_67bytes): + lddqu -67(%rsi), %xmm0 + movdqu %xmm0, -67(%rdi) +L(fwd_write_51bytes): + lddqu -51(%rsi), %xmm0 + movdqu %xmm0, -51(%rdi) +L(fwd_write_35bytes): + lddqu -35(%rsi), %xmm0 + movdqu %xmm0, -35(%rdi) +L(fwd_write_19bytes): + lddqu -19(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -19(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_130bytes): + lddqu -130(%rsi), %xmm0 + movdqu %xmm0, -130(%rdi) +L(fwd_write_114bytes): + lddqu -114(%rsi), %xmm0 + movdqu %xmm0, -114(%rdi) +L(fwd_write_98bytes): + lddqu -98(%rsi), %xmm0 + movdqu %xmm0, -98(%rdi) +L(fwd_write_82bytes): + lddqu -82(%rsi), %xmm0 + movdqu %xmm0, -82(%rdi) +L(fwd_write_66bytes): + lddqu -66(%rsi), %xmm0 + movdqu %xmm0, -66(%rdi) +L(fwd_write_50bytes): + lddqu -50(%rsi), %xmm0 + movdqu %xmm0, -50(%rdi) +L(fwd_write_34bytes): + lddqu -34(%rsi), %xmm0 + movdqu %xmm0, -34(%rdi) +L(fwd_write_18bytes): + lddqu -18(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -18(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_2bytes): + movzwl -2(%rsi), %edx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_129bytes): + lddqu -129(%rsi), %xmm0 + movdqu %xmm0, -129(%rdi) +L(fwd_write_113bytes): + lddqu -113(%rsi), %xmm0 + movdqu %xmm0, -113(%rdi) +L(fwd_write_97bytes): + lddqu -97(%rsi), %xmm0 + movdqu %xmm0, -97(%rdi) +L(fwd_write_81bytes): + lddqu -81(%rsi), %xmm0 + movdqu %xmm0, -81(%rdi) +L(fwd_write_65bytes): + lddqu -65(%rsi), %xmm0 + movdqu %xmm0, -65(%rdi) +L(fwd_write_49bytes): + lddqu -49(%rsi), %xmm0 + movdqu %xmm0, -49(%rdi) +L(fwd_write_33bytes): + lddqu -33(%rsi), %xmm0 + movdqu %xmm0, -33(%rdi) +L(fwd_write_17bytes): + lddqu -17(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -17(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_1bytes): + movzbl -1(%rsi), %edx + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(bwd_write_128bytes): + lddqu 112(%rsi), %xmm0 + movdqu %xmm0, 112(%rdi) +L(bwd_write_112bytes): + lddqu 96(%rsi), %xmm0 + movdqu %xmm0, 96(%rdi) +L(bwd_write_96bytes): + lddqu 80(%rsi), %xmm0 + movdqu %xmm0, 80(%rdi) 
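+/* The L(bwd_write_Nbytes) ladder mirrors the forward one, but the
+   backward dispatch code left %rsi and %rdi pointing at the start of
+   the remaining block, so these entries address it with ascending
+   offsets from zero instead of negative offsets from the end.  */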
+L(bwd_write_80bytes): + lddqu 64(%rsi), %xmm0 + movdqu %xmm0, 64(%rdi) +L(bwd_write_64bytes): + lddqu 48(%rsi), %xmm0 + movdqu %xmm0, 48(%rdi) +L(bwd_write_48bytes): + lddqu 32(%rsi), %xmm0 + movdqu %xmm0, 32(%rdi) +L(bwd_write_32bytes): + lddqu 16(%rsi), %xmm0 + movdqu %xmm0, 16(%rdi) +L(bwd_write_16bytes): + lddqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +L(bwd_write_0bytes): + ret + + .p2align 4 +L(bwd_write_143bytes): + lddqu 127(%rsi), %xmm0 + movdqu %xmm0, 127(%rdi) +L(bwd_write_127bytes): + lddqu 111(%rsi), %xmm0 + movdqu %xmm0, 111(%rdi) +L(bwd_write_111bytes): + lddqu 95(%rsi), %xmm0 + movdqu %xmm0, 95(%rdi) +L(bwd_write_95bytes): + lddqu 79(%rsi), %xmm0 + movdqu %xmm0, 79(%rdi) +L(bwd_write_79bytes): + lddqu 63(%rsi), %xmm0 + movdqu %xmm0, 63(%rdi) +L(bwd_write_63bytes): + lddqu 47(%rsi), %xmm0 + movdqu %xmm0, 47(%rdi) +L(bwd_write_47bytes): + lddqu 31(%rsi), %xmm0 + movdqu %xmm0, 31(%rdi) +L(bwd_write_31bytes): + lddqu 15(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 15(%rdi) + movdqu %xmm1, (%rdi) + ret + + + .p2align 4 +L(bwd_write_15bytes): + mov 7(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 7(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_142bytes): + lddqu 126(%rsi), %xmm0 + movdqu %xmm0, 126(%rdi) +L(bwd_write_126bytes): + lddqu 110(%rsi), %xmm0 + movdqu %xmm0, 110(%rdi) +L(bwd_write_110bytes): + lddqu 94(%rsi), %xmm0 + movdqu %xmm0, 94(%rdi) +L(bwd_write_94bytes): + lddqu 78(%rsi), %xmm0 + movdqu %xmm0, 78(%rdi) +L(bwd_write_78bytes): + lddqu 62(%rsi), %xmm0 + movdqu %xmm0, 62(%rdi) +L(bwd_write_62bytes): + lddqu 46(%rsi), %xmm0 + movdqu %xmm0, 46(%rdi) +L(bwd_write_46bytes): + lddqu 30(%rsi), %xmm0 + movdqu %xmm0, 30(%rdi) +L(bwd_write_30bytes): + lddqu 14(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 14(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_14bytes): + mov 6(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 6(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_141bytes): + lddqu 125(%rsi), %xmm0 + movdqu %xmm0, 125(%rdi) +L(bwd_write_125bytes): + lddqu 109(%rsi), %xmm0 + movdqu %xmm0, 109(%rdi) +L(bwd_write_109bytes): + lddqu 93(%rsi), %xmm0 + movdqu %xmm0, 93(%rdi) +L(bwd_write_93bytes): + lddqu 77(%rsi), %xmm0 + movdqu %xmm0, 77(%rdi) +L(bwd_write_77bytes): + lddqu 61(%rsi), %xmm0 + movdqu %xmm0, 61(%rdi) +L(bwd_write_61bytes): + lddqu 45(%rsi), %xmm0 + movdqu %xmm0, 45(%rdi) +L(bwd_write_45bytes): + lddqu 29(%rsi), %xmm0 + movdqu %xmm0, 29(%rdi) +L(bwd_write_29bytes): + lddqu 13(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 13(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_13bytes): + mov 5(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 5(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_140bytes): + lddqu 124(%rsi), %xmm0 + movdqu %xmm0, 124(%rdi) +L(bwd_write_124bytes): + lddqu 108(%rsi), %xmm0 + movdqu %xmm0, 108(%rdi) +L(bwd_write_108bytes): + lddqu 92(%rsi), %xmm0 + movdqu %xmm0, 92(%rdi) +L(bwd_write_92bytes): + lddqu 76(%rsi), %xmm0 + movdqu %xmm0, 76(%rdi) +L(bwd_write_76bytes): + lddqu 60(%rsi), %xmm0 + movdqu %xmm0, 60(%rdi) +L(bwd_write_60bytes): + lddqu 44(%rsi), %xmm0 + movdqu %xmm0, 44(%rdi) +L(bwd_write_44bytes): + lddqu 28(%rsi), %xmm0 + movdqu %xmm0, 28(%rdi) +L(bwd_write_28bytes): + lddqu 12(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 12(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_12bytes): + mov 4(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 4(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_139bytes): + lddqu 123(%rsi), %xmm0 + movdqu %xmm0, 
123(%rdi) +L(bwd_write_123bytes): + lddqu 107(%rsi), %xmm0 + movdqu %xmm0, 107(%rdi) +L(bwd_write_107bytes): + lddqu 91(%rsi), %xmm0 + movdqu %xmm0, 91(%rdi) +L(bwd_write_91bytes): + lddqu 75(%rsi), %xmm0 + movdqu %xmm0, 75(%rdi) +L(bwd_write_75bytes): + lddqu 59(%rsi), %xmm0 + movdqu %xmm0, 59(%rdi) +L(bwd_write_59bytes): + lddqu 43(%rsi), %xmm0 + movdqu %xmm0, 43(%rdi) +L(bwd_write_43bytes): + lddqu 27(%rsi), %xmm0 + movdqu %xmm0, 27(%rdi) +L(bwd_write_27bytes): + lddqu 11(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 11(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_11bytes): + mov 3(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 3(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_138bytes): + lddqu 122(%rsi), %xmm0 + movdqu %xmm0, 122(%rdi) +L(bwd_write_122bytes): + lddqu 106(%rsi), %xmm0 + movdqu %xmm0, 106(%rdi) +L(bwd_write_106bytes): + lddqu 90(%rsi), %xmm0 + movdqu %xmm0, 90(%rdi) +L(bwd_write_90bytes): + lddqu 74(%rsi), %xmm0 + movdqu %xmm0, 74(%rdi) +L(bwd_write_74bytes): + lddqu 58(%rsi), %xmm0 + movdqu %xmm0, 58(%rdi) +L(bwd_write_58bytes): + lddqu 42(%rsi), %xmm0 + movdqu %xmm0, 42(%rdi) +L(bwd_write_42bytes): + lddqu 26(%rsi), %xmm0 + movdqu %xmm0, 26(%rdi) +L(bwd_write_26bytes): + lddqu 10(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 10(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_10bytes): + mov 2(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 2(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_137bytes): + lddqu 121(%rsi), %xmm0 + movdqu %xmm0, 121(%rdi) +L(bwd_write_121bytes): + lddqu 105(%rsi), %xmm0 + movdqu %xmm0, 105(%rdi) +L(bwd_write_105bytes): + lddqu 89(%rsi), %xmm0 + movdqu %xmm0, 89(%rdi) +L(bwd_write_89bytes): + lddqu 73(%rsi), %xmm0 + movdqu %xmm0, 73(%rdi) +L(bwd_write_73bytes): + lddqu 57(%rsi), %xmm0 + movdqu %xmm0, 57(%rdi) +L(bwd_write_57bytes): + lddqu 41(%rsi), %xmm0 + movdqu %xmm0, 41(%rdi) +L(bwd_write_41bytes): + lddqu 25(%rsi), %xmm0 + movdqu %xmm0, 25(%rdi) +L(bwd_write_25bytes): + lddqu 9(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 9(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_9bytes): + mov 1(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 1(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_136bytes): + lddqu 120(%rsi), %xmm0 + movdqu %xmm0, 120(%rdi) +L(bwd_write_120bytes): + lddqu 104(%rsi), %xmm0 + movdqu %xmm0, 104(%rdi) +L(bwd_write_104bytes): + lddqu 88(%rsi), %xmm0 + movdqu %xmm0, 88(%rdi) +L(bwd_write_88bytes): + lddqu 72(%rsi), %xmm0 + movdqu %xmm0, 72(%rdi) +L(bwd_write_72bytes): + lddqu 56(%rsi), %xmm0 + movdqu %xmm0, 56(%rdi) +L(bwd_write_56bytes): + lddqu 40(%rsi), %xmm0 + movdqu %xmm0, 40(%rdi) +L(bwd_write_40bytes): + lddqu 24(%rsi), %xmm0 + movdqu %xmm0, 24(%rdi) +L(bwd_write_24bytes): + lddqu 8(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 8(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_8bytes): + mov (%rsi), %rdx + mov %rdx, (%rdi) + ret + + .p2align 4 +L(bwd_write_135bytes): + lddqu 119(%rsi), %xmm0 + movdqu %xmm0, 119(%rdi) +L(bwd_write_119bytes): + lddqu 103(%rsi), %xmm0 + movdqu %xmm0, 103(%rdi) +L(bwd_write_103bytes): + lddqu 87(%rsi), %xmm0 + movdqu %xmm0, 87(%rdi) +L(bwd_write_87bytes): + lddqu 71(%rsi), %xmm0 + movdqu %xmm0, 71(%rdi) +L(bwd_write_71bytes): + lddqu 55(%rsi), %xmm0 + movdqu %xmm0, 55(%rdi) +L(bwd_write_55bytes): + lddqu 39(%rsi), %xmm0 + movdqu %xmm0, 39(%rdi) +L(bwd_write_39bytes): + lddqu 23(%rsi), %xmm0 + movdqu %xmm0, 23(%rdi) +L(bwd_write_23bytes): + lddqu 7(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + 
movdqu %xmm0, 7(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_7bytes): + mov 3(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 3(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_134bytes): + lddqu 118(%rsi), %xmm0 + movdqu %xmm0, 118(%rdi) +L(bwd_write_118bytes): + lddqu 102(%rsi), %xmm0 + movdqu %xmm0, 102(%rdi) +L(bwd_write_102bytes): + lddqu 86(%rsi), %xmm0 + movdqu %xmm0, 86(%rdi) +L(bwd_write_86bytes): + lddqu 70(%rsi), %xmm0 + movdqu %xmm0, 70(%rdi) +L(bwd_write_70bytes): + lddqu 54(%rsi), %xmm0 + movdqu %xmm0, 54(%rdi) +L(bwd_write_54bytes): + lddqu 38(%rsi), %xmm0 + movdqu %xmm0, 38(%rdi) +L(bwd_write_38bytes): + lddqu 22(%rsi), %xmm0 + movdqu %xmm0, 22(%rdi) +L(bwd_write_22bytes): + lddqu 6(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 6(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_6bytes): + mov 2(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 2(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_133bytes): + lddqu 117(%rsi), %xmm0 + movdqu %xmm0, 117(%rdi) +L(bwd_write_117bytes): + lddqu 101(%rsi), %xmm0 + movdqu %xmm0, 101(%rdi) +L(bwd_write_101bytes): + lddqu 85(%rsi), %xmm0 + movdqu %xmm0, 85(%rdi) +L(bwd_write_85bytes): + lddqu 69(%rsi), %xmm0 + movdqu %xmm0, 69(%rdi) +L(bwd_write_69bytes): + lddqu 53(%rsi), %xmm0 + movdqu %xmm0, 53(%rdi) +L(bwd_write_53bytes): + lddqu 37(%rsi), %xmm0 + movdqu %xmm0, 37(%rdi) +L(bwd_write_37bytes): + lddqu 21(%rsi), %xmm0 + movdqu %xmm0, 21(%rdi) +L(bwd_write_21bytes): + lddqu 5(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 5(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_5bytes): + mov 1(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 1(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_132bytes): + lddqu 116(%rsi), %xmm0 + movdqu %xmm0, 116(%rdi) +L(bwd_write_116bytes): + lddqu 100(%rsi), %xmm0 + movdqu %xmm0, 100(%rdi) +L(bwd_write_100bytes): + lddqu 84(%rsi), %xmm0 + movdqu %xmm0, 84(%rdi) +L(bwd_write_84bytes): + lddqu 68(%rsi), %xmm0 + movdqu %xmm0, 68(%rdi) +L(bwd_write_68bytes): + lddqu 52(%rsi), %xmm0 + movdqu %xmm0, 52(%rdi) +L(bwd_write_52bytes): + lddqu 36(%rsi), %xmm0 + movdqu %xmm0, 36(%rdi) +L(bwd_write_36bytes): + lddqu 20(%rsi), %xmm0 + movdqu %xmm0, 20(%rdi) +L(bwd_write_20bytes): + lddqu 4(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 4(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_4bytes): + mov (%rsi), %edx + mov %edx, (%rdi) + ret + + .p2align 4 +L(bwd_write_131bytes): + lddqu 115(%rsi), %xmm0 + movdqu %xmm0, 115(%rdi) +L(bwd_write_115bytes): + lddqu 99(%rsi), %xmm0 + movdqu %xmm0, 99(%rdi) +L(bwd_write_99bytes): + lddqu 83(%rsi), %xmm0 + movdqu %xmm0, 83(%rdi) +L(bwd_write_83bytes): + lddqu 67(%rsi), %xmm0 + movdqu %xmm0, 67(%rdi) +L(bwd_write_67bytes): + lddqu 51(%rsi), %xmm0 + movdqu %xmm0, 51(%rdi) +L(bwd_write_51bytes): + lddqu 35(%rsi), %xmm0 + movdqu %xmm0, 35(%rdi) +L(bwd_write_35bytes): + lddqu 19(%rsi), %xmm0 + movdqu %xmm0, 19(%rdi) +L(bwd_write_19bytes): + lddqu 3(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 3(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_3bytes): + mov 1(%rsi), %dx + mov (%rsi), %cx + mov %dx, 1(%rdi) + mov %cx, (%rdi) + ret + + .p2align 4 +L(bwd_write_130bytes): + lddqu 114(%rsi), %xmm0 + movdqu %xmm0, 114(%rdi) +L(bwd_write_114bytes): + lddqu 98(%rsi), %xmm0 + movdqu %xmm0, 98(%rdi) +L(bwd_write_98bytes): + lddqu 82(%rsi), %xmm0 + movdqu %xmm0, 82(%rdi) +L(bwd_write_82bytes): + lddqu 66(%rsi), %xmm0 + movdqu %xmm0, 66(%rdi) +L(bwd_write_66bytes): + lddqu 50(%rsi), %xmm0 + 
movdqu %xmm0, 50(%rdi) +L(bwd_write_50bytes): + lddqu 34(%rsi), %xmm0 + movdqu %xmm0, 34(%rdi) +L(bwd_write_34bytes): + lddqu 18(%rsi), %xmm0 + movdqu %xmm0, 18(%rdi) +L(bwd_write_18bytes): + lddqu 2(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 2(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_2bytes): + movzwl (%rsi), %edx + mov %dx, (%rdi) + ret + + .p2align 4 +L(bwd_write_129bytes): + lddqu 113(%rsi), %xmm0 + movdqu %xmm0, 113(%rdi) +L(bwd_write_113bytes): + lddqu 97(%rsi), %xmm0 + movdqu %xmm0, 97(%rdi) +L(bwd_write_97bytes): + lddqu 81(%rsi), %xmm0 + movdqu %xmm0, 81(%rdi) +L(bwd_write_81bytes): + lddqu 65(%rsi), %xmm0 + movdqu %xmm0, 65(%rdi) +L(bwd_write_65bytes): + lddqu 49(%rsi), %xmm0 + movdqu %xmm0, 49(%rdi) +L(bwd_write_49bytes): + lddqu 33(%rsi), %xmm0 + movdqu %xmm0, 33(%rdi) +L(bwd_write_33bytes): + lddqu 17(%rsi), %xmm0 + movdqu %xmm0, 17(%rdi) +L(bwd_write_17bytes): + lddqu 1(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 1(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_1bytes): + movzbl (%rsi), %edx + mov %dl, (%rdi) + ret + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_144_bytes_bwd): + .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_38bytes), 
L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_96bytes), 
L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) + + .p2align 3 +L(table_144_bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), 
L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_67bytes), 
L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) + .int JMPTBL 
(L(fwd_write_125bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) + + .p2align 3 +L(shl_table_fwd): + .int JMPTBL (L(shl_0), L(shl_table_fwd)) + .int JMPTBL (L(shl_1), L(shl_table_fwd)) + .int JMPTBL (L(shl_2), L(shl_table_fwd)) + .int JMPTBL (L(shl_3), L(shl_table_fwd)) + .int JMPTBL (L(shl_4), L(shl_table_fwd)) + .int JMPTBL (L(shl_5), L(shl_table_fwd)) + .int JMPTBL (L(shl_6), L(shl_table_fwd)) + .int JMPTBL (L(shl_7), L(shl_table_fwd)) + .int JMPTBL (L(shl_8), L(shl_table_fwd)) + .int JMPTBL (L(shl_9), L(shl_table_fwd)) + .int JMPTBL (L(shl_10), L(shl_table_fwd)) + .int JMPTBL (L(shl_11), L(shl_table_fwd)) + .int JMPTBL (L(shl_12), L(shl_table_fwd)) + .int JMPTBL (L(shl_13), L(shl_table_fwd)) + .int JMPTBL (L(shl_14), L(shl_table_fwd)) + .int JMPTBL (L(shl_15), L(shl_table_fwd)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/utils/memcpy-bench/glibc/memcpy-ssse3.S b/utils/memcpy-bench/glibc/memcpy-ssse3.S new file mode 100644 index 00000000000..2fd26651645 --- /dev/null +++ b/utils/memcpy-bench/glibc/memcpy-ssse3.S @@ -0,0 +1,3152 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2020 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "sysdep.h" + +#if 1 + +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# define MEMPCPY __mempcpy_ssse3 +# define MEMPCPY_CHK __mempcpy_chk_ssse3 +#endif + +#define JMPTBL(I, B) I - B + +/* Branch to an entry in a jump table. TABLE is a jump table with + relative offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + _CET_NOTRACK jmp *INDEX; \ + ud2 + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) +END (MEMPCPY) +#endif + +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %RDI_LP, %RAX_LP +#ifdef USE_AS_MEMPCPY + add %RDX_LP, %RAX_LP +#endif + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +#endif + +#ifdef USE_AS_MEMMOVE + cmp %rsi, %rdi + jb L(copy_forward) + je L(write_0bytes) + cmp $79, %rdx + jbe L(copy_forward) + jmp L(copy_backward) +L(copy_forward): +#endif +L(start): + cmp $79, %rdx + lea L(table_less_80bytes)(%rip), %r11 + ja L(80bytesormore) + movslq (%r11, %rdx, 4), %r9 + add %rdx, %rsi + add %rdx, %rdi + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 + + .p2align 4 +L(80bytesormore): +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jle L(copy_backward) +#endif + + movdqu (%rsi), %xmm0 + mov %rdi, %rcx + and $-16, %rdi + add $16, %rdi + mov %rcx, %r8 + sub %rdi, %rcx + add %rcx, %rdx + sub %rcx, %rsi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif + cmp %rcx, %rdx + mov %rsi, %r9 + ja L(large_page_fwd) + and $0xf, %r9 + jz L(shl_0) +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_data_cache_size_half(%rip), %RCX_LP +#endif + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) + + .p2align 4 +L(copy_backward): + movdqu -16(%rsi, %rdx), %xmm0 + add %rdx, %rsi + lea -16(%rdi, %rdx), %r8 + add %rdx, %rdi + + mov %rdi, %rcx + and $0xf, %rcx + xor %rcx, %rdi + sub %rcx, %rdx + sub %rcx, %rsi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif + + cmp %rcx, %rdx + mov %rsi, %r9 + ja L(large_page_bwd) + and $0xf, %r9 + jz L(shl_0_bwd) +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_data_cache_size_half(%rip), %RCX_LP +#endif + BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) + + .p2align 4 +L(shl_0): + sub $16, %rdx + movdqa (%rsi), %xmm1 + add $16, %rsi + movdqa %xmm1, (%rdi) + add $16, %rdi + cmp $128, %rdx + movdqu %xmm0, (%r8) + ja L(shl_0_gobble) + cmp $64, %rdx + jb L(shl_0_less_64bytes) + movaps (%rsi), %xmm4 + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + movaps %xmm4, (%rdi) + movaps %xmm1, 16(%rdi) + movaps %xmm2, 32(%rdi) + movaps %xmm3, 48(%rdi) + sub $64, %rdx + add $64, %rsi + add $64, %rdi 
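+/* Fewer than 64 bytes remain.  Advancing both pointers past the tail
+   lets the entry picked from L(table_less_80bytes) copy it with
+   negative offsets from the end, the same trick the 144-byte tables
+   use.  */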
+L(shl_0_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_loop) +L(shl_0_gobble_cache_loop): + movdqa (%rsi), %xmm4 + movaps 0x10(%rsi), %xmm1 + movaps 0x20(%rsi), %xmm2 + movaps 0x30(%rsi), %xmm3 + + movdqa %xmm4, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + + sub $128, %rdx + movaps 0x40(%rsi), %xmm4 + movaps 0x50(%rsi), %xmm5 + movaps 0x60(%rsi), %xmm6 + movaps 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_cache_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_cache_less_64bytes) + + movdqa (%rsi), %xmm4 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm4, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm4 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm4, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_cache_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x280(%rsi) + + movdqa (%rsi), %xmm0 + movdqa 0x10(%rsi), %xmm1 + movdqa 0x20(%rsi), %xmm2 + movdqa 0x30(%rsi), %xmm3 + movdqa 0x40(%rsi), %xmm4 + movdqa 0x50(%rsi), %xmm5 + movdqa 0x60(%rsi), %xmm6 + movdqa 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_less_64bytes) + + movdqa (%rsi), %xmm0 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm0 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm0, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_mem_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_less_32bytes) + movdqa (%rsi), %xmm0 + sub $0x20, %rdx + movdqa 0x10(%rsi), %xmm1 + add $0x20, %rsi + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + add $0x20, %rdi +L(shl_0_mem_less_32bytes): + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $16, %rdx + movdqa -0x10(%rsi), %xmm1 + sub $16, %rsi + movdqa %xmm1, -0x10(%rdi) + sub $16, %rdi + cmp $0x80, %rdx + movdqu %xmm0, (%r8) + ja L(shl_0_gobble_bwd) + cmp $64, %rdx + jb L(shl_0_less_64bytes_bwd) + movaps -0x10(%rsi), %xmm0 + movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + sub $64, %rdx + sub $0x40, %rsi + sub $0x40, %rdi +L(shl_0_less_64bytes_bwd): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_bwd): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_bwd_loop) +L(shl_0_gobble_bwd_loop): + movdqa -0x10(%rsi), %xmm0 + 
movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + + movdqa %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + + sub $0x80, %rdx + movaps -0x50(%rsi), %xmm4 + movaps -0x60(%rsi), %xmm5 + movaps -0x70(%rsi), %xmm6 + movaps -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_gobble_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_gobble_bwd_less_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_bwd_loop): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x280(%rsi) + movdqa -0x10(%rsi), %xmm0 + movdqa -0x20(%rsi), %xmm1 + movdqa -0x30(%rsi), %xmm2 + movdqa -0x40(%rsi), %xmm3 + movdqa -0x50(%rsi), %xmm4 + movdqa -0x60(%rsi), %xmm5 + movdqa -0x70(%rsi), %xmm6 + movdqa -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + movdqa %xmm2, -0x30(%rdi) + movdqa %xmm3, -0x40(%rdi) + movdqa %xmm4, -0x50(%rdi) + movdqa %xmm5, -0x60(%rdi) + movdqa %xmm6, -0x70(%rdi) + movdqa %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_mem_bwd_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_bwd_less_32bytes) + movdqa -0x10(%rsi), %xmm0 + sub $0x20, %rdx + movdqa -0x20(%rsi), %xmm1 + sub $0x20, %rsi + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + sub $0x20, %rdi +L(shl_0_mem_bwd_less_32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1): + lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_fwd) + lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 +L(L1_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_1_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_1_loop_L1): + sub $64, %rdx + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $1, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $1, %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $1, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_1_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_1_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_bwd) + lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 +L(L1_bwd): + lea 
-64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_1_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_1_bwd_loop_L1): + movaps -0x11(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x21(%rsi), %xmm3 + movaps -0x31(%rsi), %xmm4 + movaps -0x41(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $1, %xmm2, %xmm1 + palignr $1, %xmm3, %xmm2 + palignr $1, %xmm4, %xmm3 + palignr $1, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_1_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_1_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2): + lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_fwd) + lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 +L(L2_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_2_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_2_loop_L1): + sub $64, %rdx + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $2, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $2, %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $2, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_2_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_2_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_bwd) + lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 +L(L2_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_2_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_2_bwd_loop_L1): + movaps -0x12(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x22(%rsi), %xmm3 + movaps -0x32(%rsi), %xmm4 + movaps -0x42(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $2, %xmm2, %xmm1 + palignr $2, %xmm3, %xmm2 + palignr $2, %xmm4, %xmm3 + palignr $2, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_2_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_2_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3): + lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x03(%rsi), %xmm1 + jb L(L3_fwd) + lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 +L(L3_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_3_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_3_loop_L1): + sub $64, %rdx + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $3, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $3, %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $3, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_3_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_3_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, 
-0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x03(%rsi), %xmm1 + jb L(L3_bwd) + lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 +L(L3_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_3_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_3_bwd_loop_L1): + movaps -0x13(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x23(%rsi), %xmm3 + movaps -0x33(%rsi), %xmm4 + movaps -0x43(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $3, %xmm2, %xmm1 + palignr $3, %xmm3, %xmm2 + palignr $3, %xmm4, %xmm3 + palignr $3, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_3_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_3_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4): + lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_fwd) + lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 +L(L4_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_4_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_4_loop_L1): + sub $64, %rdx + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $4, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $4, %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $4, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_4_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_4_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4_bwd): + lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_bwd) + lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 +L(L4_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_4_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_4_bwd_loop_L1): + movaps -0x14(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x24(%rsi), %xmm3 + movaps -0x34(%rsi), %xmm4 + movaps -0x44(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $4, %xmm2, %xmm1 + palignr $4, %xmm3, %xmm2 + palignr $4, %xmm4, %xmm3 + palignr $4, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_4_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_4_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5): + lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_fwd) + lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 +L(L5_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_5_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_5_loop_L1): + sub $64, %rdx + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $5, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $5, %xmm3, %xmm4 
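+ /* Each palignr $5 concatenates two neighboring 16-byte source
+    blocks and extracts the 16 bytes starting at byte 5, so four
+    aligned movaps loads of a source that is 5 bytes out of phase
+    with the destination become four aligned stores.  */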
+ palignr $5, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $5, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_5_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_5_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_bwd) + lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 +L(L5_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_5_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_5_bwd_loop_L1): + movaps -0x15(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x25(%rsi), %xmm3 + movaps -0x35(%rsi), %xmm4 + movaps -0x45(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $5, %xmm2, %xmm1 + palignr $5, %xmm3, %xmm2 + palignr $5, %xmm4, %xmm3 + palignr $5, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_5_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_5_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6): + lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_fwd) + lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 +L(L6_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_6_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_6_loop_L1): + sub $64, %rdx + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $6, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $6, %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $6, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_6_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_6_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_bwd) + lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 +L(L6_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_6_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_6_bwd_loop_L1): + movaps -0x16(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x26(%rsi), %xmm3 + movaps -0x36(%rsi), %xmm4 + movaps -0x46(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $6, %xmm2, %xmm1 + palignr $6, %xmm3, %xmm2 + palignr $6, %xmm4, %xmm3 + palignr $6, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_6_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_6_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7): + lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_fwd) + lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 +L(L7_fwd): 
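+ /* %r9 was pointed at L(shl_7_loop_L1) by the shl dispatch; for copies at
+    least as large as the bound in %rcx (a cache-size derived threshold set
+    earlier) it was advanced to L(shl_7_loop_L2), the same loop entered
+    through an extra prefetchnta.  The computed jump below enters the
+    chosen variant; the trailing ud2 guarantees a fault on any
+    fall-through past it. */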
+ lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_7_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_7_loop_L1): + sub $64, %rdx + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $7, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $7, %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $7, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_7_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_7_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_bwd) + lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 +L(L7_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_7_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_7_bwd_loop_L1): + movaps -0x17(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x27(%rsi), %xmm3 + movaps -0x37(%rsi), %xmm4 + movaps -0x47(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $7, %xmm2, %xmm1 + palignr $7, %xmm3, %xmm2 + palignr $7, %xmm4, %xmm3 + palignr $7, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_7_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_7_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8): + lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_fwd) + lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 +L(L8_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 +L(shl_8_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_8_loop_L1): + sub $64, %rdx + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $8, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $8, %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $8, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_8_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 + .p2align 4 +L(shl_8_end): + lea 64(%rdx), %rdx + movaps %xmm4, -0x20(%rdi) + add %rdx, %rsi + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_bwd) + lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 +L(L8_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_8_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_8_bwd_loop_L1): + movaps -0x18(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x28(%rsi), %xmm3 + movaps -0x38(%rsi), %xmm4 + movaps -0x48(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $8, %xmm2, %xmm1 + palignr $8, %xmm3, %xmm2 + palignr $8, %xmm4, %xmm3 + palignr $8, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_8_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp 
*%r9 + ud2 +L(shl_8_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9): + lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_fwd) + lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 +L(L9_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_9_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_9_loop_L1): + sub $64, %rdx + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $9, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $9, %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $9, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_9_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_9_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_bwd) + lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 +L(L9_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_9_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_9_bwd_loop_L1): + movaps -0x19(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x29(%rsi), %xmm3 + movaps -0x39(%rsi), %xmm4 + movaps -0x49(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $9, %xmm2, %xmm1 + palignr $9, %xmm3, %xmm2 + palignr $9, %xmm4, %xmm3 + palignr $9, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_9_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_9_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10): + lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_fwd) + lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 +L(L10_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_10_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_10_loop_L1): + sub $64, %rdx + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $10, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $10, %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $10, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_10_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_10_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_bwd) + lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 +L(L10_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_10_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_10_bwd_loop_L1): + movaps -0x1a(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2a(%rsi), %xmm3 + movaps -0x3a(%rsi), 
%xmm4 + movaps -0x4a(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $10, %xmm2, %xmm1 + palignr $10, %xmm3, %xmm2 + palignr $10, %xmm4, %xmm3 + palignr $10, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_10_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_10_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11): + lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0b(%rsi), %xmm1 + jb L(L11_fwd) + lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 +L(L11_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_11_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_11_loop_L1): + sub $64, %rdx + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $11, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $11, %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $11, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_11_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_11_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0b(%rsi), %xmm1 + jb L(L11_bwd) + lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 +L(L11_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_11_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_11_bwd_loop_L1): + movaps -0x1b(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2b(%rsi), %xmm3 + movaps -0x3b(%rsi), %xmm4 + movaps -0x4b(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $11, %xmm2, %xmm1 + palignr $11, %xmm3, %xmm2 + palignr $11, %xmm4, %xmm3 + palignr $11, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_11_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_11_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12): + lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_fwd) + lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 +L(L12_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_12_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_12_loop_L1): + sub $64, %rdx + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $12, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $12, %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $12, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_12_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_12_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + lea 
(L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_bwd) + lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 +L(L12_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_12_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_12_bwd_loop_L1): + movaps -0x1c(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2c(%rsi), %xmm3 + movaps -0x3c(%rsi), %xmm4 + movaps -0x4c(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $12, %xmm2, %xmm1 + palignr $12, %xmm3, %xmm2 + palignr $12, %xmm4, %xmm3 + palignr $12, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_12_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_12_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13): + lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_fwd) + lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 +L(L13_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_13_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_13_loop_L1): + sub $64, %rdx + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $13, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $13, %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $13, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_13_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_13_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_bwd) + lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 +L(L13_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_13_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_13_bwd_loop_L1): + movaps -0x1d(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2d(%rsi), %xmm3 + movaps -0x3d(%rsi), %xmm4 + movaps -0x4d(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $13, %xmm2, %xmm1 + palignr $13, %xmm3, %xmm2 + palignr $13, %xmm4, %xmm3 + palignr $13, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_13_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_13_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14): + lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_fwd) + lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 +L(L14_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_14_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_14_loop_L1): + sub $64, %rdx + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $14, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $14, %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $14, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + 
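+ /* %xmm6 holds the last source block unshifted; seeding %xmm1 with it
+    carries the overlap into the next iteration's palignr chain. */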
movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_14_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_14_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_bwd) + lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 +L(L14_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_14_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_14_bwd_loop_L1): + movaps -0x1e(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2e(%rsi), %xmm3 + movaps -0x3e(%rsi), %xmm4 + movaps -0x4e(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $14, %xmm2, %xmm1 + palignr $14, %xmm3, %xmm2 + palignr $14, %xmm4, %xmm3 + palignr $14, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_14_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_14_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15): + lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_fwd) + lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 +L(L15_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_15_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_15_loop_L1): + sub $64, %rdx + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $15, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $15, %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $15, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_15_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_15_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_bwd) + lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 +L(L15_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_15_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_15_bwd_loop_L1): + movaps -0x1f(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2f(%rsi), %xmm3 + movaps -0x3f(%rsi), %xmm4 + movaps -0x4f(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $15, %xmm2, %xmm1 + palignr $15, %xmm3, %xmm2 + palignr $15, %xmm4, %xmm3 + palignr $15, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_15_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_15_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(write_72bytes): + movdqu -72(%rsi), %xmm0 + movdqu -56(%rsi), %xmm1 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -72(%rdi) + movdqu %xmm1, -56(%rdi) 
+ mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_64bytes): + movdqu -64(%rsi), %xmm0 + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + movdqu %xmm0, -64(%rdi) + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_56bytes): + movdqu -56(%rsi), %xmm0 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -56(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_48bytes): + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_40bytes): + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_32bytes): + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_24bytes): + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_16bytes): + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) +L(write_0bytes): + ret + + .p2align 4 +L(write_73bytes): + movdqu -73(%rsi), %xmm0 + movdqu -57(%rsi), %xmm1 + mov -41(%rsi), %rcx + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %r8 + mov -4(%rsi), %edx + movdqu %xmm0, -73(%rdi) + movdqu %xmm1, -57(%rdi) + mov %rcx, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %r8, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_65bytes): + movdqu -65(%rsi), %xmm0 + movdqu -49(%rsi), %xmm1 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -65(%rdi) + movdqu %xmm1, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_57bytes): + movdqu -57(%rsi), %xmm0 + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -57(%rdi) + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_49bytes): + movdqu -49(%rsi), %xmm0 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_41bytes): + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, 
-25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_33bytes): + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_25bytes): + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_17bytes): + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_9bytes): + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_1bytes): + mov -1(%rsi), %dl + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_74bytes): + movdqu -74(%rsi), %xmm0 + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -74(%rdi) + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_66bytes): + movdqu -66(%rsi), %xmm0 + movdqu -50(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -66(%rdi) + movdqu %xmm1, -50(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_58bytes): + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_50bytes): + movdqu -50(%rsi), %xmm0 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -50(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_42bytes): + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_34bytes): + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_26bytes): + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_18bytes): + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_10bytes): + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_2bytes): + mov -2(%rsi), %dx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(write_75bytes): + 
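+ /* As in every L(write_NNbytes) case: %rsi and %rdi point just past the
+    end of the copy, so exactly NN bytes are moved with a few wide loads
+    and stores at fixed negative offsets, overlapping where NN is not a
+    multiple of the access size.  These cases are reached through the
+    L(table_less_80bytes) jump table, indexed by the remaining count. */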
movdqu -75(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -75(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_67bytes): + movdqu -67(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -67(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_59bytes): + movdqu -59(%rsi), %xmm0 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_51bytes): + movdqu -51(%rsi), %xmm0 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -51(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_43bytes): + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_35bytes): + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_27bytes): + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_19bytes): + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_11bytes): + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(write_76bytes): + movdqu -76(%rsi), %xmm0 + movdqu -60(%rsi), %xmm1 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -76(%rdi) + movdqu %xmm1, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_68bytes): + movdqu -68(%rsi), %xmm0 + movdqu -52(%rsi), %xmm1 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -68(%rdi) + movdqu %xmm1, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_60bytes): + movdqu -60(%rsi), %xmm0 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), 
%edx + movdqu %xmm0, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_52bytes): + movdqu -52(%rsi), %xmm0 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_44bytes): + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_36bytes): + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_28bytes): + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_20bytes): + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_12bytes): + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_77bytes): + movdqu -77(%rsi), %xmm0 + movdqu -61(%rsi), %xmm1 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -77(%rdi) + movdqu %xmm1, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_69bytes): + movdqu -69(%rsi), %xmm0 + movdqu -53(%rsi), %xmm1 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -69(%rdi) + movdqu %xmm1, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_61bytes): + movdqu -61(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_53bytes): + movdqu -53(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_45bytes): + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_37bytes): + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, 
-21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_29bytes): + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_21bytes): + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_13bytes): + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_78bytes): + movdqu -78(%rsi), %xmm0 + movdqu -62(%rsi), %xmm1 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -78(%rdi) + movdqu %xmm1, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_70bytes): + movdqu -70(%rsi), %xmm0 + movdqu -54(%rsi), %xmm1 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -70(%rdi) + movdqu %xmm1, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_62bytes): + movdqu -62(%rsi), %xmm0 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_54bytes): + movdqu -54(%rsi), %xmm0 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_46bytes): + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_38bytes): + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_30bytes): + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_22bytes): + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_14bytes): + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_79bytes): + movdqu -79(%rsi), %xmm0 + movdqu -63(%rsi), %xmm1 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -79(%rdi) + movdqu 
%xmm1, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_71bytes): + movdqu -71(%rsi), %xmm0 + movdqu -55(%rsi), %xmm1 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -71(%rdi) + movdqu %xmm1, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_63bytes): + movdqu -63(%rsi), %xmm0 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_55bytes): + movdqu -55(%rsi), %xmm0 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_47bytes): + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_39bytes): + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_31bytes): + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_23bytes): + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_15bytes): + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(large_page_fwd): + movdqu (%rsi), %xmm1 + lea 16(%rsi), %rsi + movdqu %xmm0, (%r8) + movntdq %xmm1, (%rdi) + lea 16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + shl $2, %rcx + cmp %rcx, %rdx + jb L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif +L(large_page_loop): + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(large_page_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 
0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_fwd_start): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x200(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(ll_cache_copy_fwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_fwd_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_fwd_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#endif + .p2align 4 +L(large_page_bwd): + movdqu -0x10(%rsi), %xmm1 + lea -16(%rsi), %rsi + movdqu %xmm0, (%r8) + movdqa %xmm1, -0x10(%rdi) + lea -16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jb L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif +L(large_page_bwd_loop): + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + movntdq %xmm4, -0x50(%rdi) + movntdq %xmm5, -0x60(%rdi) + movntdq %xmm6, -0x70(%rdi) + movntdq %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(large_page_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_bwd_64bytes): + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_bwd_start): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x200(%rsi) + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(ll_cache_copy_bwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu 
-0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_bwd_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) +#endif + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_less_80bytes): + .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_55bytes), 
L(table_less_80bytes)) + .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) + + .p2align 3 +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S new file mode 100644 index 00000000000..9ee6f0a71c3 --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S @@ -0,0 +1,12 @@ +#if 1 +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define SECTION(p) p##.avx +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S b/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S new file mode 100644 index 00000000000..b14d92fd6a8 --- /dev/null +++ 
b/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S @@ -0,0 +1,419 @@ +/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware. + Copyright (C) 2016-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "sysdep.h" + +#if 1 + +# include "asm-syntax.h" + + .section .text.avx512,"ax",@progbits +ENTRY (__mempcpy_chk_avx512_no_vzeroupper) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_avx512_no_vzeroupper) + +ENTRY (__mempcpy_avx512_no_vzeroupper) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) +END (__mempcpy_avx512_no_vzeroupper) + +ENTRY (__memmove_chk_avx512_no_vzeroupper) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_avx512_no_vzeroupper) + +ENTRY (__memmove_avx512_no_vzeroupper) + mov %RDI_LP, %RAX_LP +# ifdef USE_AS_MEMPCPY + add %RDX_LP, %RAX_LP +# endif +L(start): +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +# endif + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx + ja L(512bytesormore) + +L(check): + cmp $16, %rdx + jbe L(less_16bytes) + cmp $256, %rdx + jb L(less_256bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups -0x100(%rcx), %zmm4 + vmovups -0xC0(%rcx), %zmm5 + vmovups -0x80(%rcx), %zmm6 + vmovups -0x40(%rcx), %zmm7 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, -0x100(%r9) + vmovups %zmm5, -0xC0(%r9) + vmovups %zmm6, -0x80(%r9) + vmovups %zmm7, -0x40(%r9) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups -0x80(%rcx), %zmm2 + vmovups -0x40(%rcx), %zmm3 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, -0x80(%r9) + vmovups %zmm3, -0x40(%r9) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu -0x40(%rcx), %ymm2 + vmovdqu -0x20(%rcx), %ymm3 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 0x20(%rdi) + vmovdqu %ymm2, -0x40(%r9) + vmovdqu %ymm3, -0x20(%r9) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu -0x20(%rcx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -0x20(%r9) + ret + +L(less_32bytes): + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rcx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -0x10(%r9) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + movq (%rsi), %rsi + movq -0x8(%rcx), %rcx + movq %rsi, (%rdi) + movq %rcx, -0x8(%r9) + ret + +L(less_8bytes): + cmp $4, %dl + jb L(less_4bytes) + mov (%rsi), %esi + mov -0x4(%rcx), %ecx + mov %esi, (%rdi) + mov %ecx, -0x4(%r9) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov (%rsi), %si + mov -0x2(%rcx), %cx + mov %si, (%rdi) + mov %cx, -0x2(%r9) + ret + 
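+ /* All the size buckets above pair a load from the head (%rsi) with a
+    load from the tail (%rcx = src + len) and store both through %rdi and
+    %r9 = dst + len; the two halves overlap as needed, so every length in
+    the bucket is covered without a byte loop.  Only lengths 0 and 1
+    remain below. */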
+L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov (%rsi), %cl + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): +# ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %r8 +# else + mov __x86_shared_cache_size_half(%rip), %r8 +# endif + cmp %r8, %rdx + jae L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + prefetcht1 -0x200(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0x40(%rcx) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + vmovups %zmm8, -0x200(%r9) + vmovups %zmm9, -0x1C0(%r9) + vmovups %zmm10, -0x180(%r9) + vmovups %zmm11, -0x140(%r9) + vmovups %zmm12, -0x100(%r9) + vmovups %zmm13, -0xC0(%r9) + vmovups %zmm14, -0x80(%r9) + vmovups %zmm15, -0x40(%r9) + ret + +L(1024bytesormore): + cmp %rsi, %rdi + ja L(1024bytesormore_bkw) + sub $512, %r9 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + +/* Loop with unaligned memory access. 
*/ +L(gobble_512bytes_loop): + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + add $512, %rsi + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + add $512, %rdi + cmp %r9, %rdi + jb L(gobble_512bytes_loop) + vmovups %zmm8, (%r9) + vmovups %zmm9, 0x40(%r9) + vmovups %zmm10, 0x80(%r9) + vmovups %zmm11, 0xC0(%r9) + vmovups %zmm12, 0x100(%r9) + vmovups %zmm13, 0x140(%r9) + vmovups %zmm14, 0x180(%r9) + vmovups %zmm15, 0x1C0(%r9) + ret + +L(1024bytesormore_bkw): + add $512, %rdi + vmovups 0x1C0(%rsi), %zmm8 + vmovups 0x180(%rsi), %zmm9 + vmovups 0x140(%rsi), %zmm10 + vmovups 0x100(%rsi), %zmm11 + vmovups 0xC0(%rsi), %zmm12 + vmovups 0x80(%rsi), %zmm13 + vmovups 0x40(%rsi), %zmm14 + vmovups (%rsi), %zmm15 + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + +/* Backward loop with unaligned memory access. */ +L(gobble_512bytes_loop_bkw): + vmovups -0x40(%rcx), %zmm0 + vmovups -0x80(%rcx), %zmm1 + vmovups -0xC0(%rcx), %zmm2 + vmovups -0x100(%rcx), %zmm3 + vmovups -0x140(%rcx), %zmm4 + vmovups -0x180(%rcx), %zmm5 + vmovups -0x1C0(%rcx), %zmm6 + vmovups -0x200(%rcx), %zmm7 + sub $512, %rcx + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + vmovups %zmm0, -0x40(%r9) + vmovups %zmm1, -0x80(%r9) + vmovups %zmm2, -0xC0(%r9) + vmovups %zmm3, -0x100(%r9) + vmovups %zmm4, -0x140(%r9) + vmovups %zmm5, -0x180(%r9) + vmovups %zmm6, -0x1C0(%r9) + vmovups %zmm7, -0x200(%r9) + sub $512, %r9 + cmp %rdi, %r9 + ja L(gobble_512bytes_loop_bkw) + vmovups %zmm8, -0x40(%rdi) + vmovups %zmm9, -0x80(%rdi) + vmovups %zmm10, -0xC0(%rdi) + vmovups %zmm11, -0x100(%rdi) + vmovups %zmm12, -0x140(%rdi) + vmovups %zmm13, -0x180(%rdi) + vmovups %zmm14, -0x1C0(%rdi) + vmovups %zmm15, -0x200(%rdi) + ret + +L(preloop_large): + cmp %rsi, %rdi + ja L(preloop_large_bkw) + vmovups (%rsi), %zmm4 + vmovups 0x40(%rsi), %zmm5 + + mov %rdi, %r11 +/* Align destination for access with non-temporal stores in the loop. 
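The non-temporal zmm stores in the loop need an aligned destination, so %rdi is rounded up to the next 128-byte boundary and the source pointer and length are shifted by the same displacement; the skipped head (at most 128 bytes) was saved in zmm4/zmm5 and is stored to the original destination after the loop.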
*/ + mov %rdi, %r8 + and $-0x80, %rdi + add $0x80, %rdi + sub %rdi, %r8 + sub %r8, %rsi + add %r8, %rdx +L(gobble_256bytes_nt_loop): + prefetcht1 0x200(%rsi) + prefetcht1 0x240(%rsi) + prefetcht1 0x280(%rsi) + prefetcht1 0x2C0(%rsi) + prefetcht1 0x300(%rsi) + prefetcht1 0x340(%rsi) + prefetcht1 0x380(%rsi) + prefetcht1 0x3C0(%rsi) + vmovdqu64 (%rsi), %zmm0 + vmovdqu64 0x40(%rsi), %zmm1 + vmovdqu64 0x80(%rsi), %zmm2 + vmovdqu64 0xC0(%rsi), %zmm3 + vmovntdq %zmm0, (%rdi) + vmovntdq %zmm1, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm3, 0xC0(%rdi) + sub $256, %rdx + add $256, %rsi + add $256, %rdi + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop) + sfence + vmovups %zmm4, (%r11) + vmovups %zmm5, 0x40(%r11) + jmp L(check) + +L(preloop_large_bkw): + vmovups -0x80(%rcx), %zmm4 + vmovups -0x40(%rcx), %zmm5 + +/* Align end of destination for access with non-temporal stores. */ + mov %r9, %r8 + and $-0x80, %r9 + sub %r9, %r8 + sub %r8, %rcx + sub %r8, %rdx + add %r9, %r8 +L(gobble_256bytes_nt_loop_bkw): + prefetcht1 -0x400(%rcx) + prefetcht1 -0x3C0(%rcx) + prefetcht1 -0x380(%rcx) + prefetcht1 -0x340(%rcx) + prefetcht1 -0x300(%rcx) + prefetcht1 -0x2C0(%rcx) + prefetcht1 -0x280(%rcx) + prefetcht1 -0x240(%rcx) + vmovdqu64 -0x100(%rcx), %zmm0 + vmovdqu64 -0xC0(%rcx), %zmm1 + vmovdqu64 -0x80(%rcx), %zmm2 + vmovdqu64 -0x40(%rcx), %zmm3 + vmovntdq %zmm0, -0x100(%r9) + vmovntdq %zmm1, -0xC0(%r9) + vmovntdq %zmm2, -0x80(%r9) + vmovntdq %zmm3, -0x40(%r9) + sub $256, %rdx + sub $256, %rcx + sub $256, %r9 + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop_bkw) + sfence + vmovups %zmm4, -0x80(%r8) + vmovups %zmm5, -0x40(%r8) + jmp L(check) +END (__memmove_avx512_no_vzeroupper) + +strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper) +strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper) +#endif diff --git a/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S new file mode 100644 index 00000000000..db70fdf1b4e --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S @@ -0,0 +1,12 @@ +#if 1 +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define SECTION(p) p##.avx512 +# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S new file mode 100644 index 00000000000..17b4f861621 --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S @@ -0,0 +1,33 @@ +/* memmove with SSE2. + Copyright (C) 2017-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#if 1 +# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s +#else +weak_alias (__mempcpy, mempcpy) +#endif + +#include "memmove.S" + +#if defined SHARED +# include +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +/* Use __memmove_sse2_unaligned to support overlapping addresses. */ +compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5); +# endif +#endif diff --git a/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S new file mode 100644 index 00000000000..21be351b4e7 --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S @@ -0,0 +1,559 @@ +/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb + Copyright (C) 2016-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* memmove/memcpy/mempcpy is implemented as: + 1. Use overlapping load and store to avoid branch. + 2. Load all sources into registers and store them together to avoid + possible address overlap between source and destination. + 3. If size is 8 * VEC_SIZE or less, load all sources into registers + and store them together. + 4. If address of destination > address of source, backward copy + 4 * VEC_SIZE at a time with unaligned load and aligned store. + Load the first 4 * VEC and last VEC before the loop and store + them after the loop to support overlapping addresses. + 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned + load and aligned store. Load the last 4 * VEC and first VEC + before the loop and store them after the loop to support + overlapping addresses. + 6. If size >= __x86_shared_non_temporal_threshold and there is no + overlap between destination and source, use non-temporal store + instead of aligned store. */ + +#include "sysdep.h" + +#ifndef MEMCPY_SYMBOL +# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMPCPY_SYMBOL +# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMMOVE_CHK_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +#ifndef PREFETCH +# define PREFETCH(addr) prefetcht0 addr +#endif + +/* Assume 64-byte prefetch size. 
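A single prefetch instruction pulls in one 64-byte cache line, so PREFETCH_ONE_SET below expands to as many prefetches as are needed to cover the PREFETCHED_LOAD_SIZE (4 * VEC_SIZE) bytes consumed per loop iteration.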
*/ +#ifndef PREFETCH_SIZE +# define PREFETCH_SIZE 64 +#endif + +#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) + +#if PREFETCH_SIZE == 64 +# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base) +# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base) +# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) +# else +# error Unsupported PREFETCHED_LOAD_SIZE! +# endif +#else +# error Unsupported PREFETCH_SIZE! +#endif + +#ifndef SECTION +# error SECTION is not defined! +#endif + + .section SECTION(.text),"ax",@progbits +#if defined SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) +#endif + +ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) +END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + +#if defined SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) +#endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) + movq %rdi, %rax +L(start): +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) +#if !defined USE_MULTIARCH +L(last_2x_vec): +#endif + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VZEROUPPER +#if !defined USE_MULTIARCH +L(nop): +#endif + ret +#if defined USE_MULTIARCH +END (MEMMOVE_SYMBOL (__memmove, unaligned)) + +# if VEC_SIZE == 16 +ENTRY (__mempcpy_chk_erms) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_erms) + +/* Only used to measure performance of REP MOVSB. */ +ENTRY (__mempcpy_erms) + mov %RDI_LP, %RAX_LP + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f + add %RDX_LP, %RAX_LP + jmp L(start_movsb) +END (__mempcpy_erms) + +ENTRY (__memmove_chk_erms) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_erms) + +ENTRY (__memmove_erms) + movq %rdi, %rax + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f +L(start_movsb): + mov %RDX_LP, %RCX_LP + cmp %RSI_LP, %RDI_LP + jb 1f + /* Source == destination is less common. 
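It is therefore tested only after the more likely dst < src case has already branched away; a copy of a buffer onto itself simply returns.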
*/ + je 2f + lea (%rsi,%rcx), %RDX_LP + cmp %RDX_LP, %RDI_LP + jb L(movsb_backward) +1: + rep movsb +2: + ret +L(movsb_backward): + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret +END (__memmove_erms) +strong_alias (__memmove_erms, __memcpy_erms) +strong_alias (__memmove_chk_erms, __memcpy_chk_erms) +# endif + +# ifdef SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start_erms) +END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + +# ifdef SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax +L(start_erms): +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(movsb_more_2x_vec) +L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(movsb): + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je L(nop) + leaq (%rsi,%rdx), %r9 + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) +1: + mov %RDX_LP, %RCX_LP + rep movsb +L(nop): + ret +#endif + +L(less_vec): + /* Less than 1 VEC. */ +#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +#endif +#if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +#endif +#if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +#endif + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movzbl (%rsi), %ecx + movb %cl, (%rdi) +1: + ret +#if VEC_SIZE > 32 +L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -32(%rdi,%rdx) + VZEROUPPER + ret +#endif +#if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu (%rsi), %xmm0 + vmovdqu -16(%rsi,%rdx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -16(%rdi,%rdx) + ret +#endif +L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ + movq -8(%rsi,%rdx), %rcx + movq (%rsi), %rsi + movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl -4(%rsi,%rdx), %ecx + movl (%rsi), %esi + movl %ecx, -4(%rdi,%rdx) + movl %esi, (%rdi) + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movzwl -2(%rsi,%rdx), %ecx + movzwl (%rsi), %esi + movw %cx, -2(%rdi,%rdx) + movw %si, (%rdi) + ret + +#if defined USE_MULTIARCH +L(movsb_more_2x_vec): + cmp $REP_MOSB_THRESHOLD, %RDX_LP + ja L(movsb) +#endif +L(more_2x_vec): + /* More than 2 * VEC and there may be overlap between destination + and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + /* Copy from 4 * VEC to 8 * VEC, inclusively. 
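All eight vectors are loaded before any store is issued, so the copy remains correct even when the source and destination ranges overlap.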
*/ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) + VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) + VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER + ret +L(last_4x_vec): + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + VZEROUPPER + ret + +L(more_8x_vec): + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ + je L(nop) + /* Load the first VEC and last 4 * VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) + /* Save start and stop of the destination buffer. */ + movq %rdi, %r11 + leaq -VEC_SIZE(%rdi, %rdx), %rcx + /* Align destination for aligned stores in the loop. Compute + how much destination is misaligned. */ + movq %rdi, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. */ + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) + /* Check non-temporal store threshold. */ + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + ja L(large_forward) +#endif +L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $(VEC_SIZE * 4), %rsi + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) + /* Save stop of the destination buffer. */ + leaq -VEC_SIZE(%rdi, %rdx), %r11 + /* Align destination end for aligned stores in the loop. Compute + how much destination end is misaligned. */ + leaq -VEC_SIZE(%rsi, %rdx), %rcx + movq %r11, %r9 + movq %r11, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Adjust source. */ + subq %r8, %rcx + /* Adjust the end of destination which should be aligned now. */ + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) + /* Check non-temporal store threshold. 
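Copies larger than SHARED_NON_TEMPORAL_THRESHOLD (4 MiB in this benchmark port, see sysdep_generic.h) switch to non-temporal stores so that a huge copy does not evict the entire cache.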
*/ + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + ja L(large_backward) +#endif +L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $(VEC_SIZE * 4), %rcx + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret + +#if (defined USE_MULTIARCH || VEC_SIZE == 16) +L(large_forward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rdi, %rdx), %r10 + cmpq %r10, %rsi + jb L(loop_4x_vec_forward) +L(loop_large_forward): + /* Copy 4 * VEC a time forward with non-temporal stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $PREFETCHED_LOAD_SIZE, %rsi + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%rdi) + VMOVNT %VEC(1), VEC_SIZE(%rdi) + VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $PREFETCHED_LOAD_SIZE, %rdi + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_forward) + sfence + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(large_backward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rcx, %rdx), %r10 + cmpq %r10, %r9 + jb L(loop_4x_vec_backward) +L(loop_large_backward): + /* Copy 4 * VEC a time backward with non-temporal stores. */ + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $PREFETCHED_LOAD_SIZE, %rcx + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%r9) + VMOVNT %VEC(1), -VEC_SIZE(%r9) + VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $PREFETCHED_LOAD_SIZE, %r9 + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_backward) + sfence + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. 
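VEC(8) was loaded from the end of the source before the loop started, so an overlapping copy cannot have clobbered it.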
*/ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret +#endif +END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + +#if 1 +# ifdef USE_MULTIARCH +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) +# ifdef SHARED +strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) +# endif +# endif +# ifdef SHARED +strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) +# endif +#endif +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), + MEMCPY_SYMBOL (__memcpy, unaligned)) diff --git a/utils/memcpy-bench/glibc/memmove.S b/utils/memcpy-bench/glibc/memmove.S new file mode 100644 index 00000000000..97e735facff --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove.S @@ -0,0 +1,71 @@ +/* Optimized memmove for x86-64. + Copyright (C) 2016-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "sysdep.h" + +#define VEC_SIZE 16 +#define VEC(i) xmm##i +#define PREFETCHNT prefetchnta +#define VMOVNT movntdq +/* Use movups and movaps for smaller code sizes. */ +#define VMOVU movups +#define VMOVA movaps + +#define SECTION(p) p + +#ifdef USE_MULTIARCH +# if 0 +# define MEMCPY_SYMBOL(p,s) memcpy +# endif +#else +# if defined SHARED +# define MEMCPY_SYMBOL(p,s) __memcpy +# else +# define MEMCPY_SYMBOL(p,s) memcpy +# endif +#endif +#if !defined USE_MULTIARCH +# define MEMPCPY_SYMBOL(p,s) __mempcpy +#endif +#ifndef MEMMOVE_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) p +# define MEMMOVE_SYMBOL(p,s) memmove +#endif + +#include "memmove-vec-unaligned-erms.S" + +#ifndef USE_MULTIARCH +libc_hidden_builtin_def (memmove) +# if defined SHARED && IS_IN (libc) +strong_alias (memmove, __memcpy) +libc_hidden_ver (memmove, memcpy) +# endif +libc_hidden_def (__mempcpy) +weak_alias (__mempcpy, mempcpy) +libc_hidden_builtin_def (mempcpy) + +# if defined SHARED && IS_IN (libc) +# undef memcpy +# include +versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14); + +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5); +# endif +# endif +#endif diff --git a/utils/memcpy-bench/glibc/sysdep.h b/utils/memcpy-bench/glibc/sysdep.h new file mode 100644 index 00000000000..099134b2a2f --- /dev/null +++ b/utils/memcpy-bench/glibc/sysdep.h @@ -0,0 +1,129 @@ +/* Assembler macros for x86-64. + Copyright (C) 2001-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _X86_64_SYSDEP_H +#define _X86_64_SYSDEP_H 1 + +#include "sysdep_x86.h" + +#ifdef __ASSEMBLER__ + +/* Syntactic details of assembler. */ + +/* This macro is for setting proper CFI with DW_CFA_expression describing + the register as saved relative to %rsp instead of relative to the CFA. + Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset + from %rsp. */ +#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ + 0x77, off & 0x7F | 0x80, off >> 7 + +/* If compiled for profiling, call `mcount' at the start of each function. */ +#ifdef PROF +/* The mcount code relies on a normal frame pointer being on the stack + to locate our caller, so push one just for its benefit. */ +#define CALL_MCOUNT \ + pushq %rbp; \ + cfi_adjust_cfa_offset(8); \ + movq %rsp, %rbp; \ + cfi_def_cfa_register(%rbp); \ + call JUMPTARGET(mcount); \ + popq %rbp; \ + cfi_def_cfa(rsp,8); +#else +#define CALL_MCOUNT /* Do nothing. */ +#endif + +#define PSEUDO(name, syscall_name, args) \ +lose: \ + jmp JUMPTARGET(syscall_error) \ + .globl syscall_error; \ + ENTRY (name) \ + DO_CALL (syscall_name, args); \ + jb lose + +#undef JUMPTARGET +#ifdef SHARED +# ifdef BIND_NOW +# define JUMPTARGET(name) *name##@GOTPCREL(%rip) +# else +# define JUMPTARGET(name) name##@PLT +# endif +#else +/* For static archives, branch to target directly. */ +# define JUMPTARGET(name) name +#endif + +/* Long and pointer size in bytes. */ +#define LP_SIZE 8 + +/* Instruction to operate on long and pointer. */ +#define LP_OP(insn) insn##q + +/* Assembler address directive. */ +#define ASM_ADDR .quad + +/* Registers to hold long and pointer. */ +#define RAX_LP rax +#define RBP_LP rbp +#define RBX_LP rbx +#define RCX_LP rcx +#define RDI_LP rdi +#define RDX_LP rdx +#define RSI_LP rsi +#define RSP_LP rsp +#define R8_LP r8 +#define R9_LP r9 +#define R10_LP r10 +#define R11_LP r11 +#define R12_LP r12 +#define R13_LP r13 +#define R14_LP r14 +#define R15_LP r15 + +#else /* __ASSEMBLER__ */ + +/* Long and pointer size in bytes. */ +#define LP_SIZE "8" + +/* Instruction to operate on long and pointer. */ +#define LP_OP(insn) #insn "q" + +/* Assembler address directive. */ +#define ASM_ADDR ".quad" + +/* Registers to hold long and pointer. */ +#define RAX_LP "rax" +#define RBP_LP "rbp" +#define RBX_LP "rbx" +#define RCX_LP "rcx" +#define RDI_LP "rdi" +#define RDX_LP "rdx" +#define RSI_LP "rsi" +#define RSP_LP "rsp" +#define R8_LP "r8" +#define R9_LP "r9" +#define R10_LP "r10" +#define R11_LP "r11" +#define R12_LP "r12" +#define R13_LP "r13" +#define R14_LP "r14" +#define R15_LP "r15" + +#endif /* __ASSEMBLER__ */ + +#endif /* _X86_64_SYSDEP_H */ diff --git a/utils/memcpy-bench/glibc/sysdep_generic.h b/utils/memcpy-bench/glibc/sysdep_generic.h new file mode 100644 index 00000000000..91f78e1b04d --- /dev/null +++ b/utils/memcpy-bench/glibc/sysdep_generic.h @@ -0,0 +1,113 @@ +/* Generic asm macros used on many machines. + Copyright (C) 1991-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define C_SYMBOL_NAME(name) name
+#define HIDDEN_JUMPTARGET(name) 0x0
+#define SHARED_CACHE_SIZE_HALF (1024*1024)
+#define DATA_CACHE_SIZE_HALF (1024*32/2)
+#define DATA_CACHE_SIZE (1024*32)
+#define SHARED_NON_TEMPORAL_THRESHOLD (1024*1024*4)
+#define REP_MOSB_THRESHOLD 1024
+
+#define USE_MULTIARCH
+
+#define ASM_LINE_SEP ;
+
+#define strong_alias(original, alias) \
+    .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \
+    C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original)
+
+#ifndef C_LABEL
+
+/* Define a macro we can use to construct the asm name for a C symbol. */
+# define C_LABEL(name) name##:
+
+#endif
+
+#ifdef __ASSEMBLER__
+/* Mark the end of function named SYM. This is used on some platforms
+   to generate correct debugging information. */
+# ifndef END
+# define END(sym)
+# endif
+
+# ifndef JUMPTARGET
+# define JUMPTARGET(sym) sym
+# endif
+#endif
+
+/* Macros to generate eh_frame unwind information. */
+#ifdef __ASSEMBLER__
+# define cfi_startproc .cfi_startproc
+# define cfi_endproc .cfi_endproc
+# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off
+# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg
+# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+# define cfi_offset(reg, off) .cfi_offset reg, off
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+# define cfi_register(r1, r2) .cfi_register r1, r2
+# define cfi_return_column(reg) .cfi_return_column reg
+# define cfi_restore(reg) .cfi_restore reg
+# define cfi_same_value(reg) .cfi_same_value reg
+# define cfi_undefined(reg) .cfi_undefined reg
+# define cfi_remember_state .cfi_remember_state
+# define cfi_restore_state .cfi_restore_state
+# define cfi_window_save .cfi_window_save
+# define cfi_personality(enc, exp) .cfi_personality enc, exp
+# define cfi_lsda(enc, exp) .cfi_lsda enc, exp
+
+#else /* ! 
ASSEMBLER */ + +# define CFI_STRINGIFY(Name) CFI_STRINGIFY2 (Name) +# define CFI_STRINGIFY2(Name) #Name +# define CFI_STARTPROC ".cfi_startproc" +# define CFI_ENDPROC ".cfi_endproc" +# define CFI_DEF_CFA(reg, off) \ + ".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) +# define CFI_DEF_CFA_REGISTER(reg) \ + ".cfi_def_cfa_register " CFI_STRINGIFY(reg) +# define CFI_DEF_CFA_OFFSET(off) \ + ".cfi_def_cfa_offset " CFI_STRINGIFY(off) +# define CFI_ADJUST_CFA_OFFSET(off) \ + ".cfi_adjust_cfa_offset " CFI_STRINGIFY(off) +# define CFI_OFFSET(reg, off) \ + ".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) +# define CFI_REL_OFFSET(reg, off) \ + ".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) +# define CFI_REGISTER(r1, r2) \ + ".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2) +# define CFI_RETURN_COLUMN(reg) \ + ".cfi_return_column " CFI_STRINGIFY(reg) +# define CFI_RESTORE(reg) \ + ".cfi_restore " CFI_STRINGIFY(reg) +# define CFI_UNDEFINED(reg) \ + ".cfi_undefined " CFI_STRINGIFY(reg) +# define CFI_REMEMBER_STATE \ + ".cfi_remember_state" +# define CFI_RESTORE_STATE \ + ".cfi_restore_state" +# define CFI_WINDOW_SAVE \ + ".cfi_window_save" +# define CFI_PERSONALITY(enc, exp) \ + ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) +# define CFI_LSDA(enc, exp) \ + ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) +#endif + +#include "dwarf2.h" diff --git a/utils/memcpy-bench/glibc/sysdep_x86.h b/utils/memcpy-bench/glibc/sysdep_x86.h new file mode 100644 index 00000000000..a3fecd01268 --- /dev/null +++ b/utils/memcpy-bench/glibc/sysdep_x86.h @@ -0,0 +1,113 @@ +/* Assembler macros for x86. + Copyright (C) 2017-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _X86_SYSDEP_H +#define _X86_SYSDEP_H 1 + +#include "sysdep_generic.h" + +/* __CET__ is defined by GCC with Control-Flow Protection values: + +enum cf_protection_level +{ + CF_NONE = 0, + CF_BRANCH = 1 << 0, + CF_RETURN = 1 << 1, + CF_FULL = CF_BRANCH | CF_RETURN, + CF_SET = 1 << 2 +}; +*/ + +/* Set if CF_BRANCH (IBT) is enabled. */ +#define X86_FEATURE_1_IBT (1U << 0) +/* Set if CF_RETURN (SHSTK) is enabled. */ +#define X86_FEATURE_1_SHSTK (1U << 1) + +#ifdef __CET__ +# define CET_ENABLED 1 +# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT) +# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK) +#else +# define CET_ENABLED 0 +# define IBT_ENABLED 0 +# define SHSTK_ENABLED 0 +#endif + +/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need + space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be + aligned to 16 bytes for fxsave and 64 bytes for xsave. */ +#define STATE_SAVE_OFFSET (8 * 7 + 8) + +/* Save SSE, AVX, AVX512, mask and bound registers. 
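These are XSAVE state components 1-3 and 5-7: SSE, AVX, the MPX bound registers, and the AVX-512 opmask and upper-ZMM state, i.e. a mask of 0xee.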
*/ +#define STATE_SAVE_MASK \ + ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) + +#ifdef __ASSEMBLER__ + +/* Syntactic details of assembler. */ + +#ifdef _CET_ENDBR +# define _CET_NOTRACK notrack +#else +# define _CET_ENDBR +# define _CET_NOTRACK +#endif + +/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */ +#define ALIGNARG(log2) 1< #include +#include #include #include #include @@ -14,15 +15,11 @@ #include -#pragma GCC diagnostic ignored "-Wold-style-cast" -#pragma GCC diagnostic ignored "-Wcast-align" -#pragma GCC diagnostic ignored "-Wcast-qual" -#include "FastMemcpy.h" -//#include "FastMemcpy_Avx.h" - #include #include +#include + template void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl) @@ -47,7 +44,7 @@ size_t generatorUniform(RNG & rng) { return rng() % N; }; template -void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl) +uint64_t test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl, const char * name) { Stopwatch watch; @@ -76,15 +73,22 @@ void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t n for (auto & thread : threads) thread.join(); - double elapsed_ns = watch.elapsed(); + uint64_t elapsed_ns = watch.elapsed(); /// Validation size_t sum = 0; + size_t reference = 0; for (size_t i = 0; i < size; ++i) + { sum += dst[i]; + reference += uint8_t(i); + } - std::cerr << std::fixed << std::setprecision(3) - << "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n"; + if (sum != reference) + throw std::logic_error("Incorrect result"); + + std::cout << name; + return elapsed_ns; } @@ -101,9 +105,30 @@ static void * memcpy_erms(void * dst, const void * src, size_t size) return dst; } +static void * memcpy_trivial(void * __restrict dst_, const void * __restrict src_, size_t size) +{ + char * __restrict dst = reinterpret_cast(dst_); + const char * __restrict src = reinterpret_cast(src_); + void * ret = dst; + + while (size > 0) + { + *dst = *src; + ++dst; + ++src; + --size; + } + + return ret; +} + extern "C" void * memcpy_jart(void * dst, const void * src, size_t size); extern "C" void MemCpy(void * dst, const void * src, size_t size); +void * memcpy_fast_sse(void * dst, const void * src, size_t size); +void * memcpy_fast_avx(void * dst, const void * src, size_t size); +void * memcpy_tiny(void * dst, const void * src, size_t size); + static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size) { @@ -329,7 +354,7 @@ void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t * if (padding > 0) { __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); - _mm256_storeu_si256((__m256i*)dst, head); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head); dst += padding; src += padding; size -= padding; @@ -539,70 +564,125 @@ tail: return ret; } +extern "C" void * __memcpy_erms(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_sse2_unaligned(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_ssse3(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_ssse3_back(void * __restrict destination, const void * __restrict source, 
size_t size); +extern "C" void * __memcpy_avx_unaligned(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_avx_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_avx512_unaligned(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_avx512_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_avx512_no_vzeroupper(void * __restrict destination, const void * __restrict source, size_t size); + + +#define VARIANT(N, NAME) \ + if (memcpy_variant == N) \ + return test(dst, src, size, iterations, num_threads, std::forward(generator), NAME, #NAME); template -void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator) +uint64_t dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator) { - memcpy_type memcpy_libc = reinterpret_cast(dlsym(RTLD_NEXT, "memcpy")); + memcpy_type memcpy_libc_old = reinterpret_cast(dlsym(RTLD_NEXT, "memcpy")); - if (memcpy_variant == 1) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy); - if (memcpy_variant == 2) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_libc); - if (memcpy_variant == 3) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_erms); - if (memcpy_variant == 4) - test(dst, src, size, iterations, num_threads, std::forward(generator), MemCpy); - if (memcpy_variant == 5) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2); - if (memcpy_variant == 6) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2Unrolled2); - if (memcpy_variant == 7) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2Unrolled4); - if (memcpy_variant == 8) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2Unrolled8); -// if (memcpy_variant == 9) -// test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_fast_avx); - if (memcpy_variant == 10) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_my); + VARIANT(1, memcpy) + VARIANT(2, memcpy_trivial) + VARIANT(3, memcpy_libc_old) + VARIANT(4, memcpy_erms) + VARIANT(5, MemCpy) + VARIANT(6, memcpySSE2) + VARIANT(7, memcpySSE2Unrolled2) + VARIANT(8, memcpySSE2Unrolled4) + VARIANT(9, memcpySSE2Unrolled8) + VARIANT(10, memcpy_fast_sse) + VARIANT(11, memcpy_fast_avx) + VARIANT(12, memcpy_my) + + VARIANT(21, __memcpy_erms) + VARIANT(22, __memcpy_sse2_unaligned) + VARIANT(23, __memcpy_ssse3) + VARIANT(24, __memcpy_ssse3_back) + VARIANT(25, __memcpy_avx_unaligned) + VARIANT(26, __memcpy_avx_unaligned_erms) + VARIANT(27, __memcpy_avx512_unaligned) + VARIANT(28, __memcpy_avx512_unaligned_erms) + VARIANT(29, __memcpy_avx512_no_vzeroupper) + + return 0; } -void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads) +uint64_t dispatchVariants( + size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads) { if (generator_variant == 1) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, 
generatorUniform<16>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>); if (generator_variant == 2) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>); if (generator_variant == 3) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>); if (generator_variant == 4) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>); if (generator_variant == 5) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>); + + return 0; } int main(int argc, char ** argv) { - size_t size = 1000000000; - if (argc >= 2) - size = std::stoull(argv[1]); + boost::program_options::options_description desc("Allowed options"); + desc.add_options()("help,h", "produce help message") + ("size", boost::program_options::value()->default_value(1000000), "Bytes to copy on every iteration") + ("iterations", boost::program_options::value(), "Number of iterations") + ("threads", boost::program_options::value()->default_value(1), "Number of copying threads") + ("distribution", boost::program_options::value()->default_value(4), "Distribution of chunk sizes to perform copy") + ("variant", boost::program_options::value(), "Variant of memcpy implementation") + ("tsv", "Print result in tab-separated format") + ; - size_t iterations = 10; - if (argc >= 3) - iterations = std::stoull(argv[2]); + boost::program_options::variables_map options; + boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); - size_t num_threads = 1; - if (argc >= 4) - num_threads = std::stoull(argv[3]); + if (options.count("help") || !options.count("variant")) + { + std::cout << R"(Usage: - size_t memcpy_variant = 1; - if (argc >= 5) - memcpy_variant = std::stoull(argv[4]); +for size in 4096 16384 50000 65536 100000 1000000 10000000 100000000; do + for threads in 1 2 4 $(($(nproc) / 2)) $(nproc); do + for distribution in 1 2 3 4 5; do + for variant in {1..12} {21..29}; do + for i in {1..10}; do + ./memcpy-bench --tsv --size $size --variant $variant --threads $threads --distribution $distribution; + done; + done; + done; + done; +done | tee result.tsv - size_t generator_variant = 1; - if (argc >= 6) - generator_variant = std::stoull(argv[5]); +)" << std::endl; + std::cout << desc << std::endl; + return 1; + } + + size_t size = options["size"].as(); + size_t num_threads = options["threads"].as(); + size_t memcpy_variant = options["variant"].as(); + size_t generator_variant = options["distribution"].as(); + + size_t iterations; + if (options.count("iterations")) + { + iterations = options["iterations"].as(); + } + else + { + iterations = 10000000000ULL * num_threads / size; + + if (generator_variant == 1) + iterations /= 100; + if (generator_variant == 2) + iterations /= 10; + } std::unique_ptr src(new uint8_t[size]); std::unique_ptr dst(new uint8_t[size]); @@ -614,7 +694,25 @@ int main(int argc, char ** 
argv) /// Fill dst to avoid page faults. memset(dst.get(), 0, size); - dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads); + uint64_t elapsed_ns = dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads); + + std::cout << std::fixed << std::setprecision(3); + + if (options.count("tsv")) + { + std::cout + << '\t' << size + << '\t' << iterations + << '\t' << num_threads + << '\t' << generator_variant + << '\t' << memcpy_variant + << '\t' << elapsed_ns + << '\n'; + } + else + { + std::cout << ": processed in " << (elapsed_ns / 1e9) << " sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec\n"; + } return 0; } From 145116bfb64a9a135cd60a8d5b5ebdd6d8310676 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 23:22:12 +0300 Subject: [PATCH 2/6] Fix style --- utils/memcpy-bench/glibc/dwarf2.h | 74 +- utils/memcpy-bench/glibc/memcpy-ssse3-back.S | 4988 ++++++++-------- utils/memcpy-bench/glibc/memcpy-ssse3.S | 5176 ++++++++--------- .../glibc/memmove-avx-unaligned-erms.S | 14 +- .../glibc/memmove-avx512-no-vzeroupper.S | 662 +-- .../glibc/memmove-avx512-unaligned-erms.S | 14 +- .../glibc/memmove-sse2-unaligned-erms.S | 2 +- .../glibc/memmove-vec-unaligned-erms.S | 694 +-- utils/memcpy-bench/glibc/memmove.S | 26 +- utils/memcpy-bench/glibc/sysdep.h | 100 +- utils/memcpy-bench/glibc/sysdep_generic.h | 52 +- utils/memcpy-bench/glibc/sysdep_x86.h | 52 +- 12 files changed, 5927 insertions(+), 5927 deletions(-) diff --git a/utils/memcpy-bench/glibc/dwarf2.h b/utils/memcpy-bench/glibc/dwarf2.h index 4c7de0d8737..2be827f00ae 100644 --- a/utils/memcpy-bench/glibc/dwarf2.h +++ b/utils/memcpy-bench/glibc/dwarf2.h @@ -21,7 +21,7 @@ . */ #ifndef _DWARF2_H -#define _DWARF2_H 1 +#define _DWARF2_H 1 /* This file is derived from the DWARF specification (a public document) Revision 2.0.0 (July 27, 1993) developed by the UNIX International @@ -88,19 +88,19 @@ enum dwarf_tag /* SGI/MIPS Extensions */ DW_TAG_MIPS_loop = 0x4081, /* GNU extensions */ - DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */ - DW_TAG_function_template = 0x4102, /* for C++ */ - DW_TAG_class_template = 0x4103, /* for C++ */ + DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */ + DW_TAG_function_template = 0x4102, /* for C++ */ + DW_TAG_class_template = 0x4103, /* for C++ */ DW_TAG_GNU_BINCL = 0x4104, DW_TAG_GNU_EINCL = 0x4105 }; -#define DW_TAG_lo_user 0x4080 -#define DW_TAG_hi_user 0xffff +#define DW_TAG_lo_user 0x4080 +#define DW_TAG_hi_user 0xffff /* flag that tells whether entry has a child or not */ #define DW_children_no 0 -#define DW_children_yes 1 +#define DW_children_yes 1 /* Form names and codes. */ enum dwarf_form @@ -215,8 +215,8 @@ enum dwarf_attribute DW_AT_body_end = 0x2106 }; -#define DW_AT_lo_user 0x2000 /* implementation-defined range start */ -#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */ +#define DW_AT_lo_user 0x2000 /* implementation-defined range start */ +#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */ /* Location atom names and codes. */ @@ -369,8 +369,8 @@ enum dwarf_location_atom DW_OP_nop = 0x96 }; -#define DW_OP_lo_user 0x80 /* implementation-defined range start */ -#define DW_OP_hi_user 0xff /* implementation-defined range end */ +#define DW_OP_lo_user 0x80 /* implementation-defined range start */ +#define DW_OP_hi_user 0xff /* implementation-defined range end */ /* Type encodings. 
*/ @@ -387,8 +387,8 @@ enum dwarf_type DW_ATE_unsigned_char = 0x8 }; -#define DW_ATE_lo_user 0x80 -#define DW_ATE_hi_user 0xff +#define DW_ATE_lo_user 0x80 +#define DW_ATE_hi_user 0xff /* Array ordering names and codes. */ enum dwarf_array_dim_ordering @@ -517,17 +517,17 @@ enum dwarf_call_frame_info DW_CFA_GNU_negative_offset_extended = 0x2f }; -#define DW_CIE_ID 0xffffffff -#define DW_CIE_VERSION 1 +#define DW_CIE_ID 0xffffffff +#define DW_CIE_VERSION 1 #define DW_CFA_extended 0 #define DW_CFA_low_user 0x1c #define DW_CFA_high_user 0x3f -#define DW_CHILDREN_no 0x00 -#define DW_CHILDREN_yes 0x01 +#define DW_CHILDREN_no 0x00 +#define DW_CHILDREN_yes 0x01 -#define DW_ADDR_none 0 +#define DW_ADDR_none 0 /* Source language names and codes. */ @@ -548,8 +548,8 @@ enum dwarf_source_language }; -#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */ -#define DW_LANG_hi_user 0xffff /* implementation-defined range start */ +#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */ +#define DW_LANG_hi_user 0xffff /* implementation-defined range start */ /* Names and codes for macro information. */ @@ -566,25 +566,25 @@ enum dwarf_macinfo_record_type /* @@@ For use with GNU frame unwind information. */ -#define DW_EH_PE_absptr 0x00 -#define DW_EH_PE_omit 0xff +#define DW_EH_PE_absptr 0x00 +#define DW_EH_PE_omit 0xff -#define DW_EH_PE_uleb128 0x01 -#define DW_EH_PE_udata2 0x02 -#define DW_EH_PE_udata4 0x03 -#define DW_EH_PE_udata8 0x04 -#define DW_EH_PE_sleb128 0x09 -#define DW_EH_PE_sdata2 0x0A -#define DW_EH_PE_sdata4 0x0B -#define DW_EH_PE_sdata8 0x0C -#define DW_EH_PE_signed 0x08 +#define DW_EH_PE_uleb128 0x01 +#define DW_EH_PE_udata2 0x02 +#define DW_EH_PE_udata4 0x03 +#define DW_EH_PE_udata8 0x04 +#define DW_EH_PE_sleb128 0x09 +#define DW_EH_PE_sdata2 0x0A +#define DW_EH_PE_sdata4 0x0B +#define DW_EH_PE_sdata8 0x0C +#define DW_EH_PE_signed 0x08 -#define DW_EH_PE_pcrel 0x10 -#define DW_EH_PE_textrel 0x20 -#define DW_EH_PE_datarel 0x30 -#define DW_EH_PE_funcrel 0x40 -#define DW_EH_PE_aligned 0x50 +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 -#define DW_EH_PE_indirect 0x80 +#define DW_EH_PE_indirect 0x80 #endif /* dwarf2.h */ diff --git a/utils/memcpy-bench/glibc/memcpy-ssse3-back.S b/utils/memcpy-bench/glibc/memcpy-ssse3-back.S index 1492dd38e73..c5257592efa 100644 --- a/utils/memcpy-bench/glibc/memcpy-ssse3-back.S +++ b/utils/memcpy-bench/glibc/memcpy-ssse3-back.S @@ -24,3159 +24,3159 @@ #include "asm-syntax.h" #ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_back -# define MEMCPY_CHK __memcpy_chk_ssse3_back -# define MEMPCPY __mempcpy_ssse3_back -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back +# define MEMCPY __memcpy_ssse3_back +# define MEMCPY_CHK __memcpy_chk_ssse3_back +# define MEMPCPY __mempcpy_ssse3_back +# define MEMPCPY_CHK __mempcpy_chk_ssse3_back #endif -#define JMPTBL(I, B) I - B +#define JMPTBL(I, B) I - B /* Branch to an entry in a jump table. TABLE is a jump table with relative offsets. INDEX is a register contains the index into the jump table. SCALE is the scale of INDEX. 
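The table entries are 32-bit offsets relative to the start of the table itself; the macro rebases the loaded offset on the table address and then jumps to it indirectly.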
*/ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + _CET_NOTRACK jmp *INDEX; \ ud2 - .section .text.ssse3,"ax",@progbits + .section .text.ssse3,"ax",@progbits #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMPCPY_CHK) ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) END (MEMPCPY) #endif #if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMCPY_CHK) #endif ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP + mov %RDI_LP, %RAX_LP #ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP + add %RDX_LP, %RAX_LP #endif #ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx + /* Clear the upper 32 bits. */ + mov %edx, %edx #endif #ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(bwd_write_0bytes) - cmp $144, %rdx - jae L(copy_backward) - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + cmp %rsi, %rdi + jb L(copy_forward) + je L(bwd_write_0bytes) + cmp $144, %rdx + jae L(copy_backward) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) L(copy_forward): #endif L(start): - cmp $144, %rdx - jae L(144bytesormore) + cmp $144, %rdx + jae L(144bytesormore) L(fwd_write_less32bytes): #ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jbe L(bk_write) + cmp %dil, %sil + jbe L(bk_write) #endif - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) #ifndef USE_AS_MEMMOVE L(bk_write): - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) #endif - .p2align 4 + .p2align 4 L(144bytesormore): #ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) + cmp %dil, %sil + jle L(copy_backward) #endif - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0) + movdqu (%rsi), %xmm0 + mov %rdi, %r8 + and $-16, %rdi + add $16, %rdi + mov %rdi, %r9 + sub %r8, %r9 + sub %r9, %rdx + add %r9, %rsi + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0) #ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP + mov $DATA_CACHE_SIZE, %RCX_LP #else - mov __x86_data_cache_size(%rip), %RCX_LP + mov __x86_data_cache_size(%rip), %RCX_LP #endif - cmp %rcx, %rdx - jae L(gobble_mem_fwd) - lea L(shl_table_fwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 + cmp %rcx, %rdx + jae L(gobble_mem_fwd) + lea L(shl_table_fwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 - .p2align 4 + .p2align 4 L(copy_backward): #ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP + mov $DATA_CACHE_SIZE, %RCX_LP #else - mov __x86_data_cache_size(%rip), %RCX_LP + mov __x86_data_cache_size(%rip), %RCX_LP #endif - shl $1, %rcx - cmp %rcx, %rdx - ja L(gobble_mem_bwd) + shl $1, %rcx + cmp %rcx, %rdx 
+ ja L(gobble_mem_bwd) - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0_bwd) - lea L(shl_table_bwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 + add %rdx, %rdi + add %rdx, %rsi + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $0xf, %r9 + xor %r9, %rdi + sub %r9, %rsi + sub %r9, %rdx + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0_bwd) + lea L(shl_table_bwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 - .p2align 4 + .p2align 4 L(shl_0): - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 + mov %rdx, %r9 + shr $8, %r9 + add %rdx, %r9 #ifdef DATA_CACHE_SIZE - cmp $DATA_CACHE_SIZE_HALF, %R9_LP + cmp $DATA_CACHE_SIZE_HALF, %R9_LP #else - cmp __x86_data_cache_size_half(%rip), %R9_LP + cmp __x86_data_cache_size_half(%rip), %R9_LP #endif - jae L(gobble_mem_fwd) - sub $0x80, %rdx - .p2align 4 + jae L(gobble_mem_fwd) + sub $0x80, %rdx + .p2align 4 L(shl_0_loop): - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), %xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(shl_0_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + movdqa (%rsi), %xmm1 + movdqa %xmm1, (%rdi) + movaps 0x10(%rsi), %xmm2 + movaps %xmm2, 0x10(%rdi) + movaps 0x20(%rsi), %xmm3 + movaps %xmm3, 0x20(%rdi) + movaps 0x30(%rsi), %xmm4 + movaps %xmm4, 0x30(%rdi) + movaps 0x40(%rsi), %xmm1 + movaps %xmm1, 0x40(%rdi) + movaps 0x50(%rsi), %xmm2 + movaps %xmm2, 0x50(%rdi) + movaps 0x60(%rsi), %xmm3 + movaps %xmm3, 0x60(%rdi) + movaps 0x70(%rsi), %xmm4 + movaps %xmm4, 0x70(%rdi) + sub $0x80, %rdx + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(shl_0_loop) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_0_bwd): - sub $0x80, %rdx + sub $0x80, %rdx L(copy_backward_loop): - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, -0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(copy_backward_loop) + movaps -0x10(%rsi), %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps -0x20(%rsi), %xmm2 + movaps %xmm2, -0x20(%rdi) + movaps -0x30(%rsi), %xmm3 + movaps %xmm3, -0x30(%rdi) + movaps -0x40(%rsi), %xmm4 + movaps %xmm4, -0x40(%rdi) + movaps -0x50(%rsi), %xmm5 + movaps %xmm5, -0x50(%rdi) + movaps -0x60(%rsi), %xmm5 + movaps %xmm5, -0x60(%rdi) + movaps -0x70(%rsi), %xmm5 + movaps %xmm5, -0x70(%rdi) + movaps -0x80(%rsi), %xmm5 + movaps %xmm5, -0x80(%rdi) + sub $0x80, %rdx + lea 
-0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(copy_backward_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_1): - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_1) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x01(%rsi), %xmm1 + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movaps 0x4f(%rsi), %xmm6 + movaps 0x5f(%rsi), %xmm7 + movaps 0x6f(%rsi), %xmm8 + movaps 0x7f(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $1, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $1, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $1, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $1, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $1, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $1, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $1, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_1) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_1_bwd): - movaps -0x01(%rsi), %xmm1 + movaps -0x01(%rsi), %xmm1 - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x11(%rsi), %xmm2 + palignr $1, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x21(%rsi), %xmm3 + palignr $1, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x31(%rsi), %xmm4 + palignr $1, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x41(%rsi), %xmm5 + palignr $1, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x51(%rsi), %xmm6 + palignr $1, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x61(%rsi), %xmm7 + palignr $1, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x71(%rsi), %xmm8 + palignr $1, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x81(%rsi), %xmm9 + palignr $1, %xmm9, %xmm8 + movaps %xmm8, 
-0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_1_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_1_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_2): - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_2) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x02(%rsi), %xmm1 + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movaps 0x4e(%rsi), %xmm6 + movaps 0x5e(%rsi), %xmm7 + movaps 0x6e(%rsi), %xmm8 + movaps 0x7e(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $2, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $2, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $2, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $2, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $2, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $2, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $2, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_2) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_2_bwd): - movaps -0x02(%rsi), %xmm1 + movaps -0x02(%rsi), %xmm1 - movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x12(%rsi), %xmm2 + palignr $2, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x22(%rsi), %xmm3 + palignr $2, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x32(%rsi), %xmm4 + palignr $2, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x42(%rsi), %xmm5 + palignr $2, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x52(%rsi), %xmm6 + palignr $2, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x62(%rsi), %xmm7 + palignr $2, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x72(%rsi), %xmm8 + palignr $2, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x82(%rsi), %xmm9 - palignr $2, 
%xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x82(%rsi), %xmm9 + palignr $2, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_2_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_2_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_3): - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_3) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x03(%rsi), %xmm1 + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movaps 0x4d(%rsi), %xmm6 + movaps 0x5d(%rsi), %xmm7 + movaps 0x6d(%rsi), %xmm8 + movaps 0x7d(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $3, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $3, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $3, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $3, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $3, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $3, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $3, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_3) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_3_bwd): - movaps -0x03(%rsi), %xmm1 + movaps -0x03(%rsi), %xmm1 - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x13(%rsi), %xmm2 + palignr $3, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x23(%rsi), %xmm3 + palignr $3, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x33(%rsi), %xmm4 + palignr $3, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x43(%rsi), %xmm5 + palignr $3, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x53(%rsi), %xmm6 + palignr $3, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x63(%rsi), %xmm7 + palignr $3, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps 
-0x73(%rsi), %xmm8 + palignr $3, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x83(%rsi), %xmm9 + palignr $3, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_3_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_3_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_4): - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_4) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x04(%rsi), %xmm1 + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movaps 0x4c(%rsi), %xmm6 + movaps 0x5c(%rsi), %xmm7 + movaps 0x6c(%rsi), %xmm8 + movaps 0x7c(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $4, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $4, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $4, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $4, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $4, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $4, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $4, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_4_bwd): - movaps -0x04(%rsi), %xmm1 + movaps -0x04(%rsi), %xmm1 - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x14(%rsi), %xmm2 + palignr $4, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x24(%rsi), %xmm3 + palignr $4, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x34(%rsi), %xmm4 + palignr $4, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x44(%rsi), %xmm5 + palignr $4, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x54(%rsi), %xmm6 + palignr $4, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x64(%rsi), %xmm7 + palignr $4, %xmm7, %xmm6 + movaps 
%xmm6, -0x60(%rdi) - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x74(%rsi), %xmm8 + palignr $4, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x84(%rsi), %xmm9 + palignr $4, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_4_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_4_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_5): - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_5) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x05(%rsi), %xmm1 + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movaps 0x4b(%rsi), %xmm6 + movaps 0x5b(%rsi), %xmm7 + movaps 0x6b(%rsi), %xmm8 + movaps 0x7b(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $5, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $5, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $5, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $5, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $5, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $5, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $5, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_5) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_5_bwd): - movaps -0x05(%rsi), %xmm1 + movaps -0x05(%rsi), %xmm1 - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x15(%rsi), %xmm2 + palignr $5, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x25(%rsi), %xmm3 + palignr $5, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x35(%rsi), %xmm4 + palignr $5, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x45(%rsi), %xmm5 + palignr $5, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x55(%rsi), %xmm6 + palignr $5, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x65(%rsi), %xmm7 - palignr 
$5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x65(%rsi), %xmm7 + palignr $5, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x75(%rsi), %xmm8 + palignr $5, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x85(%rsi), %xmm9 + palignr $5, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_5_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_5_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_6): - sub $0x80, %rdx - movaps -0x06(%rsi), %xmm1 - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movaps 0x4a(%rsi), %xmm6 - movaps 0x5a(%rsi), %xmm7 - movaps 0x6a(%rsi), %xmm8 - movaps 0x7a(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $6, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $6, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $6, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $6, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $6, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $6, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $6, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_6) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x06(%rsi), %xmm1 + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movaps 0x4a(%rsi), %xmm6 + movaps 0x5a(%rsi), %xmm7 + movaps 0x6a(%rsi), %xmm8 + movaps 0x7a(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $6, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $6, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $6, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $6, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $6, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $6, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $6, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_6) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_6_bwd): - movaps -0x06(%rsi), %xmm1 + movaps -0x06(%rsi), %xmm1 - movaps -0x16(%rsi), %xmm2 - palignr $6, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x16(%rsi), %xmm2 + palignr $6, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x26(%rsi), %xmm3 - palignr $6, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x26(%rsi), %xmm3 + palignr $6, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x36(%rsi), %xmm4 - palignr $6, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x36(%rsi), %xmm4 + palignr $6, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x46(%rsi), %xmm5 - palignr $6, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x46(%rsi), %xmm5 + palignr $6, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x56(%rsi), %xmm6 - palignr $6, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps 
-0x56(%rsi), %xmm6 + palignr $6, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x66(%rsi), %xmm7 - palignr $6, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x66(%rsi), %xmm7 + palignr $6, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x76(%rsi), %xmm8 - palignr $6, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x76(%rsi), %xmm8 + palignr $6, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x86(%rsi), %xmm9 - palignr $6, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x86(%rsi), %xmm9 + palignr $6, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_6_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_6_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_7): - sub $0x80, %rdx - movaps -0x07(%rsi), %xmm1 - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movaps 0x49(%rsi), %xmm6 - movaps 0x59(%rsi), %xmm7 - movaps 0x69(%rsi), %xmm8 - movaps 0x79(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $7, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $7, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $7, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $7, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $7, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $7, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $7, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_7) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x07(%rsi), %xmm1 + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movaps 0x49(%rsi), %xmm6 + movaps 0x59(%rsi), %xmm7 + movaps 0x69(%rsi), %xmm8 + movaps 0x79(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $7, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $7, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $7, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $7, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $7, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $7, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $7, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_7) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_7_bwd): - movaps -0x07(%rsi), %xmm1 + movaps -0x07(%rsi), %xmm1 - movaps -0x17(%rsi), %xmm2 - palignr $7, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x17(%rsi), %xmm2 + palignr $7, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x27(%rsi), %xmm3 - palignr $7, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x27(%rsi), %xmm3 + palignr $7, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x37(%rsi), %xmm4 - palignr $7, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x37(%rsi), %xmm4 + palignr $7, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x47(%rsi), %xmm5 - palignr $7, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x47(%rsi), %xmm5 + palignr $7, %xmm5, %xmm4 + movaps 
%xmm4, -0x40(%rdi) - movaps -0x57(%rsi), %xmm6 - palignr $7, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x57(%rsi), %xmm6 + palignr $7, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x67(%rsi), %xmm7 - palignr $7, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x67(%rsi), %xmm7 + palignr $7, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x77(%rsi), %xmm8 - palignr $7, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x77(%rsi), %xmm8 + palignr $7, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x87(%rsi), %xmm9 - palignr $7, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x87(%rsi), %xmm9 + palignr $7, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_7_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_7_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_8): - sub $0x80, %rdx - movaps -0x08(%rsi), %xmm1 - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movaps 0x48(%rsi), %xmm6 - movaps 0x58(%rsi), %xmm7 - movaps 0x68(%rsi), %xmm8 - movaps 0x78(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $8, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $8, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $8, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $8, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $8, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $8, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $8, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_8) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x08(%rsi), %xmm1 + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movaps 0x48(%rsi), %xmm6 + movaps 0x58(%rsi), %xmm7 + movaps 0x68(%rsi), %xmm8 + movaps 0x78(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $8, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $8, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $8, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $8, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $8, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $8, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $8, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_8) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_8_bwd): - movaps -0x08(%rsi), %xmm1 + movaps -0x08(%rsi), %xmm1 - movaps -0x18(%rsi), %xmm2 - palignr $8, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x18(%rsi), %xmm2 + palignr $8, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x28(%rsi), %xmm3 - palignr $8, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x28(%rsi), %xmm3 + palignr $8, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x38(%rsi), %xmm4 - palignr $8, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x38(%rsi), %xmm4 + palignr $8, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x48(%rsi), %xmm5 - palignr 
$8, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x48(%rsi), %xmm5 + palignr $8, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x58(%rsi), %xmm6 - palignr $8, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x58(%rsi), %xmm6 + palignr $8, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x68(%rsi), %xmm7 - palignr $8, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x68(%rsi), %xmm7 + palignr $8, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x78(%rsi), %xmm8 - palignr $8, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x78(%rsi), %xmm8 + palignr $8, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x88(%rsi), %xmm9 - palignr $8, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x88(%rsi), %xmm9 + palignr $8, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_8_bwd) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_8_bwd) L(shl_8_end_bwd): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_9): - sub $0x80, %rdx - movaps -0x09(%rsi), %xmm1 - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movaps 0x47(%rsi), %xmm6 - movaps 0x57(%rsi), %xmm7 - movaps 0x67(%rsi), %xmm8 - movaps 0x77(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $9, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $9, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $9, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $9, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $9, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $9, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $9, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_9) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x09(%rsi), %xmm1 + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movaps 0x47(%rsi), %xmm6 + movaps 0x57(%rsi), %xmm7 + movaps 0x67(%rsi), %xmm8 + movaps 0x77(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $9, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $9, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $9, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $9, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $9, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $9, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $9, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_9) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_9_bwd): - movaps -0x09(%rsi), %xmm1 + movaps -0x09(%rsi), %xmm1 - movaps -0x19(%rsi), %xmm2 - palignr $9, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x19(%rsi), %xmm2 + palignr $9, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x29(%rsi), %xmm3 - palignr $9, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x29(%rsi), %xmm3 + palignr $9, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x39(%rsi), %xmm4 - palignr $9, %xmm4, %xmm3 - movaps %xmm3, 
-0x30(%rdi) + movaps -0x39(%rsi), %xmm4 + palignr $9, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x49(%rsi), %xmm5 - palignr $9, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x49(%rsi), %xmm5 + palignr $9, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x59(%rsi), %xmm6 - palignr $9, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x59(%rsi), %xmm6 + palignr $9, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x69(%rsi), %xmm7 - palignr $9, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x69(%rsi), %xmm7 + palignr $9, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x79(%rsi), %xmm8 - palignr $9, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x79(%rsi), %xmm8 + palignr $9, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x89(%rsi), %xmm9 - palignr $9, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x89(%rsi), %xmm9 + palignr $9, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_9_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_9_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_10): - sub $0x80, %rdx - movaps -0x0a(%rsi), %xmm1 - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movaps 0x46(%rsi), %xmm6 - movaps 0x56(%rsi), %xmm7 - movaps 0x66(%rsi), %xmm8 - movaps 0x76(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $10, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $10, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $10, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $10, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $10, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $10, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $10, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_10) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0a(%rsi), %xmm1 + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movaps 0x46(%rsi), %xmm6 + movaps 0x56(%rsi), %xmm7 + movaps 0x66(%rsi), %xmm8 + movaps 0x76(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $10, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $10, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $10, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $10, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $10, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $10, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $10, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_10) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_10_bwd): - movaps -0x0a(%rsi), %xmm1 + movaps -0x0a(%rsi), %xmm1 - movaps -0x1a(%rsi), %xmm2 - palignr $10, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1a(%rsi), %xmm2 + palignr $10, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2a(%rsi), %xmm3 - palignr $10, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2a(%rsi), 
%xmm3 + palignr $10, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3a(%rsi), %xmm4 - palignr $10, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3a(%rsi), %xmm4 + palignr $10, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4a(%rsi), %xmm5 - palignr $10, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4a(%rsi), %xmm5 + palignr $10, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5a(%rsi), %xmm6 - palignr $10, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5a(%rsi), %xmm6 + palignr $10, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6a(%rsi), %xmm7 - palignr $10, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6a(%rsi), %xmm7 + palignr $10, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7a(%rsi), %xmm8 - palignr $10, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7a(%rsi), %xmm8 + palignr $10, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8a(%rsi), %xmm9 - palignr $10, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8a(%rsi), %xmm9 + palignr $10, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_10_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_10_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_11): - sub $0x80, %rdx - movaps -0x0b(%rsi), %xmm1 - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movaps 0x45(%rsi), %xmm6 - movaps 0x55(%rsi), %xmm7 - movaps 0x65(%rsi), %xmm8 - movaps 0x75(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $11, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $11, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $11, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $11, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $11, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $11, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $11, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_11) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0b(%rsi), %xmm1 + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movaps 0x45(%rsi), %xmm6 + movaps 0x55(%rsi), %xmm7 + movaps 0x65(%rsi), %xmm8 + movaps 0x75(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $11, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $11, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $11, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $11, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $11, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $11, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $11, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_11) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_11_bwd): - movaps -0x0b(%rsi), %xmm1 + movaps -0x0b(%rsi), %xmm1 - movaps -0x1b(%rsi), %xmm2 - palignr $11, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1b(%rsi), %xmm2 + palignr $11, 
%xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2b(%rsi), %xmm3 - palignr $11, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2b(%rsi), %xmm3 + palignr $11, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3b(%rsi), %xmm4 - palignr $11, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3b(%rsi), %xmm4 + palignr $11, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4b(%rsi), %xmm5 - palignr $11, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4b(%rsi), %xmm5 + palignr $11, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5b(%rsi), %xmm6 - palignr $11, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5b(%rsi), %xmm6 + palignr $11, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6b(%rsi), %xmm7 - palignr $11, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6b(%rsi), %xmm7 + palignr $11, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7b(%rsi), %xmm8 - palignr $11, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7b(%rsi), %xmm8 + palignr $11, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8b(%rsi), %xmm9 - palignr $11, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8b(%rsi), %xmm9 + palignr $11, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_11_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_11_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_12): - sub $0x80, %rdx - movdqa -0x0c(%rsi), %xmm1 - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movaps 0x44(%rsi), %xmm6 - movaps 0x54(%rsi), %xmm7 - movaps 0x64(%rsi), %xmm8 - movaps 0x74(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $12, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $12, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $12, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $12, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $12, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $12, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $12, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdi) + sub $0x80, %rdx + movdqa -0x0c(%rsi), %xmm1 + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movaps 0x44(%rsi), %xmm6 + movaps 0x54(%rsi), %xmm7 + movaps 0x64(%rsi), %xmm8 + movaps 0x74(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $12, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $12, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $12, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $12, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $12, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $12, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $12, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_12) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + lea 0x80(%rdi), %rdi + jae L(shl_12) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_12_bwd): - movaps -0x0c(%rsi), %xmm1 + 
movaps -0x0c(%rsi), %xmm1 - movaps -0x1c(%rsi), %xmm2 - palignr $12, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1c(%rsi), %xmm2 + palignr $12, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2c(%rsi), %xmm3 - palignr $12, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2c(%rsi), %xmm3 + palignr $12, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3c(%rsi), %xmm4 - palignr $12, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3c(%rsi), %xmm4 + palignr $12, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4c(%rsi), %xmm5 - palignr $12, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4c(%rsi), %xmm5 + palignr $12, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5c(%rsi), %xmm6 - palignr $12, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5c(%rsi), %xmm6 + palignr $12, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6c(%rsi), %xmm7 - palignr $12, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6c(%rsi), %xmm7 + palignr $12, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7c(%rsi), %xmm8 - palignr $12, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7c(%rsi), %xmm8 + palignr $12, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8c(%rsi), %xmm9 - palignr $12, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8c(%rsi), %xmm9 + palignr $12, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_12_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_12_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_13): - sub $0x80, %rdx - movaps -0x0d(%rsi), %xmm1 - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movaps 0x43(%rsi), %xmm6 - movaps 0x53(%rsi), %xmm7 - movaps 0x63(%rsi), %xmm8 - movaps 0x73(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $13, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $13, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $13, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $13, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $13, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $13, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $13, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_13) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0d(%rsi), %xmm1 + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movaps 0x43(%rsi), %xmm6 + movaps 0x53(%rsi), %xmm7 + movaps 0x63(%rsi), %xmm8 + movaps 0x73(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $13, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $13, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $13, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $13, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $13, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $13, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $13, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_13) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi 
+ add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_13_bwd): - movaps -0x0d(%rsi), %xmm1 + movaps -0x0d(%rsi), %xmm1 - movaps -0x1d(%rsi), %xmm2 - palignr $13, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1d(%rsi), %xmm2 + palignr $13, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2d(%rsi), %xmm3 - palignr $13, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2d(%rsi), %xmm3 + palignr $13, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3d(%rsi), %xmm4 - palignr $13, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3d(%rsi), %xmm4 + palignr $13, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4d(%rsi), %xmm5 - palignr $13, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4d(%rsi), %xmm5 + palignr $13, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5d(%rsi), %xmm6 - palignr $13, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5d(%rsi), %xmm6 + palignr $13, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6d(%rsi), %xmm7 - palignr $13, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6d(%rsi), %xmm7 + palignr $13, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7d(%rsi), %xmm8 - palignr $13, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7d(%rsi), %xmm8 + palignr $13, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8d(%rsi), %xmm9 - palignr $13, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8d(%rsi), %xmm9 + palignr $13, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_13_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_13_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_14): - sub $0x80, %rdx - movaps -0x0e(%rsi), %xmm1 - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movaps 0x42(%rsi), %xmm6 - movaps 0x52(%rsi), %xmm7 - movaps 0x62(%rsi), %xmm8 - movaps 0x72(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $14, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $14, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $14, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $14, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $14, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $14, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $14, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_14) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0e(%rsi), %xmm1 + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movaps 0x42(%rsi), %xmm6 + movaps 0x52(%rsi), %xmm7 + movaps 0x62(%rsi), %xmm8 + movaps 0x72(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $14, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $14, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $14, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $14, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $14, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $14, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $14, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) 
+ palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_14) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_14_bwd): - movaps -0x0e(%rsi), %xmm1 + movaps -0x0e(%rsi), %xmm1 - movaps -0x1e(%rsi), %xmm2 - palignr $14, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1e(%rsi), %xmm2 + palignr $14, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2e(%rsi), %xmm3 - palignr $14, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2e(%rsi), %xmm3 + palignr $14, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3e(%rsi), %xmm4 - palignr $14, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3e(%rsi), %xmm4 + palignr $14, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4e(%rsi), %xmm5 - palignr $14, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4e(%rsi), %xmm5 + palignr $14, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5e(%rsi), %xmm6 - palignr $14, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5e(%rsi), %xmm6 + palignr $14, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6e(%rsi), %xmm7 - palignr $14, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6e(%rsi), %xmm7 + palignr $14, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7e(%rsi), %xmm8 - palignr $14, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7e(%rsi), %xmm8 + palignr $14, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8e(%rsi), %xmm9 - palignr $14, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8e(%rsi), %xmm9 + palignr $14, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_14_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_14_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_15): - sub $0x80, %rdx - movaps -0x0f(%rsi), %xmm1 - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movaps 0x41(%rsi), %xmm6 - movaps 0x51(%rsi), %xmm7 - movaps 0x61(%rsi), %xmm8 - movaps 0x71(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $15, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $15, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $15, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $15, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $15, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $15, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $15, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_15) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0f(%rsi), %xmm1 + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movaps 0x41(%rsi), %xmm6 + movaps 0x51(%rsi), %xmm7 + movaps 0x61(%rsi), %xmm8 + movaps 0x71(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $15, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $15, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $15, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $15, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $15, 
%xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $15, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $15, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_15) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_15_bwd): - movaps -0x0f(%rsi), %xmm1 + movaps -0x0f(%rsi), %xmm1 - movaps -0x1f(%rsi), %xmm2 - palignr $15, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1f(%rsi), %xmm2 + palignr $15, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2f(%rsi), %xmm3 - palignr $15, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2f(%rsi), %xmm3 + palignr $15, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3f(%rsi), %xmm4 - palignr $15, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3f(%rsi), %xmm4 + palignr $15, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4f(%rsi), %xmm5 - palignr $15, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4f(%rsi), %xmm5 + palignr $15, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5f(%rsi), %xmm6 - palignr $15, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5f(%rsi), %xmm6 + palignr $15, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6f(%rsi), %xmm7 - palignr $15, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6f(%rsi), %xmm7 + palignr $15, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7f(%rsi), %xmm8 - palignr $15, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7f(%rsi), %xmm8 + palignr $15, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8f(%rsi), %xmm9 - palignr $15, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8f(%rsi), %xmm9 + palignr $15, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_15_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_15_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(gobble_mem_fwd): - movdqu (%rsi), %xmm1 - movdqu %xmm0, (%r8) - movdqa %xmm1, (%rdi) - sub $16, %rdx - add $16, %rsi - add $16, %rdi + movdqu (%rsi), %xmm1 + movdqu %xmm0, (%r8) + movdqa %xmm1, (%rdi) + sub $16, %rdx + add $16, %rsi + add $16, %rdi #ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP #else - mov __x86_shared_cache_size_half(%rip), %RCX_LP + mov __x86_shared_cache_size_half(%rip), %RCX_LP #endif #ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_fwd_start) + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_fwd_start) L(memmove_is_memcpy_fwd): #endif - cmp %rcx, %rdx - ja L(bigger_in_fwd) - mov %rdx, %rcx + cmp %rcx, %rdx + ja L(bigger_in_fwd) + mov %rdx, %rcx L(bigger_in_fwd): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy_fwd) + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy_fwd) - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy_fwd) - add %rcx, %rdx - xor %rcx, %rcx + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy_fwd) + add %rcx, %rdx + xor %rcx, %rcx L(2steps_copy_fwd): - sub 
$0x80, %rdx + sub $0x80, %rdx L(gobble_mem_fwd_loop): - sub $0x80, %rdx - prefetcht0 0x200(%rsi) - prefetcht0 0x300(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lfence - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_mem_fwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_fwd_end) - add $0x80, %rdx + sub $0x80, %rdx + prefetcht0 0x200(%rsi) + prefetcht0 0x300(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lfence + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_mem_fwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_fwd_end) + add $0x80, %rdx L(ll_cache_copy_fwd): - add %rcx, %rdx + add %rcx, %rdx L(ll_cache_copy_fwd_start): - sub $0x80, %rdx + sub $0x80, %rdx L(gobble_ll_loop_fwd): - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - prefetchnta 0x1c0(%rdi) - prefetchnta 0x280(%rdi) - sub $0x80, %rdx - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_ll_loop_fwd) + prefetchnta 0x1c0(%rsi) + prefetchnta 0x280(%rsi) + prefetchnta 0x1c0(%rdi) + prefetchnta 0x280(%rdi) + sub $0x80, %rdx + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_ll_loop_fwd) L(gobble_mem_fwd_end): - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(gobble_mem_bwd): - add %rdx, %rsi - add %rdx, %rdi + add %rdx, %rsi + add %rdx, %rdi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $-16, %rdi - sub %rdi, %r9 - sub %r9, %rsi - sub %r9, %rdx + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $-16, %rdi + sub %rdi, %r9 + sub %r9, %rsi + sub %r9, %rdx #ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP #else - mov __x86_shared_cache_size_half(%rip), %RCX_LP + mov 
__x86_shared_cache_size_half(%rip), %RCX_LP #endif #ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_bwd_start) + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_bwd_start) L(memmove_is_memcpy_bwd): #endif - cmp %rcx, %rdx - ja L(bigger) - mov %rdx, %rcx + cmp %rcx, %rdx + ja L(bigger) + mov %rdx, %rcx L(bigger): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy) + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy) - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy) - add %rcx, %rdx - xor %rcx, %rcx + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy) + add %rcx, %rdx + xor %rcx, %rcx L(2steps_copy): - sub $0x80, %rdx + sub $0x80, %rdx L(gobble_mem_bwd_loop): - sub $0x80, %rdx - prefetcht0 -0x200(%rsi) - prefetcht0 -0x300(%rsi) - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - lfence - movntdq %xmm1, -0x10(%rdi) - movntdq %xmm2, -0x20(%rdi) - movntdq %xmm3, -0x30(%rdi) - movntdq %xmm4, -0x40(%rdi) - movntdq %xmm5, -0x50(%rdi) - movntdq %xmm6, -0x60(%rdi) - movntdq %xmm7, -0x70(%rdi) - movntdq %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_mem_bwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_bwd_end) - add $0x80, %rdx + sub $0x80, %rdx + prefetcht0 -0x200(%rsi) + prefetcht0 -0x300(%rsi) + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + lfence + movntdq %xmm1, -0x10(%rdi) + movntdq %xmm2, -0x20(%rdi) + movntdq %xmm3, -0x30(%rdi) + movntdq %xmm4, -0x40(%rdi) + movntdq %xmm5, -0x50(%rdi) + movntdq %xmm6, -0x60(%rdi) + movntdq %xmm7, -0x70(%rdi) + movntdq %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_mem_bwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_bwd_end) + add $0x80, %rdx L(ll_cache_copy): - add %rcx, %rdx + add %rcx, %rdx L(ll_cache_copy_bwd_start): - sub $0x80, %rdx + sub $0x80, %rdx L(gobble_ll_loop): - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - prefetchnta -0x1c0(%rdi) - prefetchnta -0x280(%rdi) - sub $0x80, %rdx - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - movdqa %xmm1, -0x10(%rdi) - movdqa %xmm2, -0x20(%rdi) - movdqa %xmm3, -0x30(%rdi) - movdqa %xmm4, -0x40(%rdi) - movdqa %xmm5, -0x50(%rdi) - movdqa %xmm6, -0x60(%rdi) - movdqa %xmm7, -0x70(%rdi) - movdqa %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_ll_loop) + prefetchnta -0x1c0(%rsi) + prefetchnta -0x280(%rsi) + prefetchnta -0x1c0(%rdi) + prefetchnta -0x280(%rdi) + sub $0x80, %rdx + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + movdqa %xmm1, -0x10(%rdi) + movdqa %xmm2, -0x20(%rdi) + movdqa %xmm3, -0x30(%rdi) + movdqa %xmm4, -0x40(%rdi) + movdqa %xmm5, -0x50(%rdi) + movdqa %xmm6, -0x60(%rdi) + movdqa %xmm7, 
-0x70(%rdi) + movdqa %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_ll_loop) L(gobble_mem_bwd_end): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rsi - sub %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rsi + sub %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(fwd_write_128bytes): - lddqu -128(%rsi), %xmm0 - movdqu %xmm0, -128(%rdi) + lddqu -128(%rsi), %xmm0 + movdqu %xmm0, -128(%rdi) L(fwd_write_112bytes): - lddqu -112(%rsi), %xmm0 - movdqu %xmm0, -112(%rdi) + lddqu -112(%rsi), %xmm0 + movdqu %xmm0, -112(%rdi) L(fwd_write_96bytes): - lddqu -96(%rsi), %xmm0 - movdqu %xmm0, -96(%rdi) + lddqu -96(%rsi), %xmm0 + movdqu %xmm0, -96(%rdi) L(fwd_write_80bytes): - lddqu -80(%rsi), %xmm0 - movdqu %xmm0, -80(%rdi) + lddqu -80(%rsi), %xmm0 + movdqu %xmm0, -80(%rdi) L(fwd_write_64bytes): - lddqu -64(%rsi), %xmm0 - movdqu %xmm0, -64(%rdi) + lddqu -64(%rsi), %xmm0 + movdqu %xmm0, -64(%rdi) L(fwd_write_48bytes): - lddqu -48(%rsi), %xmm0 - movdqu %xmm0, -48(%rdi) + lddqu -48(%rsi), %xmm0 + movdqu %xmm0, -48(%rdi) L(fwd_write_32bytes): - lddqu -32(%rsi), %xmm0 - movdqu %xmm0, -32(%rdi) + lddqu -32(%rsi), %xmm0 + movdqu %xmm0, -32(%rdi) L(fwd_write_16bytes): - lddqu -16(%rsi), %xmm0 - movdqu %xmm0, -16(%rdi) + lddqu -16(%rsi), %xmm0 + movdqu %xmm0, -16(%rdi) L(fwd_write_0bytes): - ret + ret - .p2align 4 + .p2align 4 L(fwd_write_143bytes): - lddqu -143(%rsi), %xmm0 - movdqu %xmm0, -143(%rdi) + lddqu -143(%rsi), %xmm0 + movdqu %xmm0, -143(%rdi) L(fwd_write_127bytes): - lddqu -127(%rsi), %xmm0 - movdqu %xmm0, -127(%rdi) + lddqu -127(%rsi), %xmm0 + movdqu %xmm0, -127(%rdi) L(fwd_write_111bytes): - lddqu -111(%rsi), %xmm0 - movdqu %xmm0, -111(%rdi) + lddqu -111(%rsi), %xmm0 + movdqu %xmm0, -111(%rdi) L(fwd_write_95bytes): - lddqu -95(%rsi), %xmm0 - movdqu %xmm0, -95(%rdi) + lddqu -95(%rsi), %xmm0 + movdqu %xmm0, -95(%rdi) L(fwd_write_79bytes): - lddqu -79(%rsi), %xmm0 - movdqu %xmm0, -79(%rdi) + lddqu -79(%rsi), %xmm0 + movdqu %xmm0, -79(%rdi) L(fwd_write_63bytes): - lddqu -63(%rsi), %xmm0 - movdqu %xmm0, -63(%rdi) + lddqu -63(%rsi), %xmm0 + movdqu %xmm0, -63(%rdi) L(fwd_write_47bytes): - lddqu -47(%rsi), %xmm0 - movdqu %xmm0, -47(%rdi) + lddqu -47(%rsi), %xmm0 + movdqu %xmm0, -47(%rdi) L(fwd_write_31bytes): - lddqu -31(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -31(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -31(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -31(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_15bytes): - mov -15(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -15(%rdi) - mov %rcx, -8(%rdi) - ret + mov -15(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -15(%rdi) + mov %rcx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_142bytes): - lddqu -142(%rsi), %xmm0 - movdqu %xmm0, -142(%rdi) + lddqu -142(%rsi), %xmm0 + movdqu %xmm0, -142(%rdi) L(fwd_write_126bytes): - lddqu -126(%rsi), %xmm0 - movdqu %xmm0, -126(%rdi) + lddqu -126(%rsi), %xmm0 + movdqu %xmm0, -126(%rdi) L(fwd_write_110bytes): - lddqu -110(%rsi), %xmm0 - movdqu %xmm0, -110(%rdi) + lddqu -110(%rsi), %xmm0 + movdqu %xmm0, -110(%rdi) L(fwd_write_94bytes): - lddqu -94(%rsi), %xmm0 - movdqu %xmm0, -94(%rdi) + lddqu -94(%rsi), %xmm0 + movdqu %xmm0, -94(%rdi) L(fwd_write_78bytes): - lddqu -78(%rsi), %xmm0 - movdqu %xmm0, -78(%rdi) + lddqu -78(%rsi), %xmm0 + movdqu %xmm0, -78(%rdi) L(fwd_write_62bytes): - lddqu -62(%rsi), %xmm0 - 
movdqu %xmm0, -62(%rdi) + lddqu -62(%rsi), %xmm0 + movdqu %xmm0, -62(%rdi) L(fwd_write_46bytes): - lddqu -46(%rsi), %xmm0 - movdqu %xmm0, -46(%rdi) + lddqu -46(%rsi), %xmm0 + movdqu %xmm0, -46(%rdi) L(fwd_write_30bytes): - lddqu -30(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -30(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -30(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -30(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_14bytes): - mov -14(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -14(%rdi) - mov %rcx, -8(%rdi) - ret + mov -14(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -14(%rdi) + mov %rcx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_141bytes): - lddqu -141(%rsi), %xmm0 - movdqu %xmm0, -141(%rdi) + lddqu -141(%rsi), %xmm0 + movdqu %xmm0, -141(%rdi) L(fwd_write_125bytes): - lddqu -125(%rsi), %xmm0 - movdqu %xmm0, -125(%rdi) + lddqu -125(%rsi), %xmm0 + movdqu %xmm0, -125(%rdi) L(fwd_write_109bytes): - lddqu -109(%rsi), %xmm0 - movdqu %xmm0, -109(%rdi) + lddqu -109(%rsi), %xmm0 + movdqu %xmm0, -109(%rdi) L(fwd_write_93bytes): - lddqu -93(%rsi), %xmm0 - movdqu %xmm0, -93(%rdi) + lddqu -93(%rsi), %xmm0 + movdqu %xmm0, -93(%rdi) L(fwd_write_77bytes): - lddqu -77(%rsi), %xmm0 - movdqu %xmm0, -77(%rdi) + lddqu -77(%rsi), %xmm0 + movdqu %xmm0, -77(%rdi) L(fwd_write_61bytes): - lddqu -61(%rsi), %xmm0 - movdqu %xmm0, -61(%rdi) + lddqu -61(%rsi), %xmm0 + movdqu %xmm0, -61(%rdi) L(fwd_write_45bytes): - lddqu -45(%rsi), %xmm0 - movdqu %xmm0, -45(%rdi) + lddqu -45(%rsi), %xmm0 + movdqu %xmm0, -45(%rdi) L(fwd_write_29bytes): - lddqu -29(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -29(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -29(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -29(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_13bytes): - mov -13(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -13(%rdi) - mov %rcx, -8(%rdi) - ret + mov -13(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -13(%rdi) + mov %rcx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_140bytes): - lddqu -140(%rsi), %xmm0 - movdqu %xmm0, -140(%rdi) + lddqu -140(%rsi), %xmm0 + movdqu %xmm0, -140(%rdi) L(fwd_write_124bytes): - lddqu -124(%rsi), %xmm0 - movdqu %xmm0, -124(%rdi) + lddqu -124(%rsi), %xmm0 + movdqu %xmm0, -124(%rdi) L(fwd_write_108bytes): - lddqu -108(%rsi), %xmm0 - movdqu %xmm0, -108(%rdi) + lddqu -108(%rsi), %xmm0 + movdqu %xmm0, -108(%rdi) L(fwd_write_92bytes): - lddqu -92(%rsi), %xmm0 - movdqu %xmm0, -92(%rdi) + lddqu -92(%rsi), %xmm0 + movdqu %xmm0, -92(%rdi) L(fwd_write_76bytes): - lddqu -76(%rsi), %xmm0 - movdqu %xmm0, -76(%rdi) + lddqu -76(%rsi), %xmm0 + movdqu %xmm0, -76(%rdi) L(fwd_write_60bytes): - lddqu -60(%rsi), %xmm0 - movdqu %xmm0, -60(%rdi) + lddqu -60(%rsi), %xmm0 + movdqu %xmm0, -60(%rdi) L(fwd_write_44bytes): - lddqu -44(%rsi), %xmm0 - movdqu %xmm0, -44(%rdi) + lddqu -44(%rsi), %xmm0 + movdqu %xmm0, -44(%rdi) L(fwd_write_28bytes): - lddqu -28(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -28(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -28(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -28(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_12bytes): - mov -12(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -12(%rdi) - mov %ecx, -4(%rdi) - ret + mov -12(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -12(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_139bytes): - lddqu -139(%rsi), %xmm0 - movdqu %xmm0, -139(%rdi) + lddqu -139(%rsi), %xmm0 
+ movdqu %xmm0, -139(%rdi) L(fwd_write_123bytes): - lddqu -123(%rsi), %xmm0 - movdqu %xmm0, -123(%rdi) + lddqu -123(%rsi), %xmm0 + movdqu %xmm0, -123(%rdi) L(fwd_write_107bytes): - lddqu -107(%rsi), %xmm0 - movdqu %xmm0, -107(%rdi) + lddqu -107(%rsi), %xmm0 + movdqu %xmm0, -107(%rdi) L(fwd_write_91bytes): - lddqu -91(%rsi), %xmm0 - movdqu %xmm0, -91(%rdi) + lddqu -91(%rsi), %xmm0 + movdqu %xmm0, -91(%rdi) L(fwd_write_75bytes): - lddqu -75(%rsi), %xmm0 - movdqu %xmm0, -75(%rdi) + lddqu -75(%rsi), %xmm0 + movdqu %xmm0, -75(%rdi) L(fwd_write_59bytes): - lddqu -59(%rsi), %xmm0 - movdqu %xmm0, -59(%rdi) + lddqu -59(%rsi), %xmm0 + movdqu %xmm0, -59(%rdi) L(fwd_write_43bytes): - lddqu -43(%rsi), %xmm0 - movdqu %xmm0, -43(%rdi) + lddqu -43(%rsi), %xmm0 + movdqu %xmm0, -43(%rdi) L(fwd_write_27bytes): - lddqu -27(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -27(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -27(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -27(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_11bytes): - mov -11(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -11(%rdi) - mov %ecx, -4(%rdi) - ret + mov -11(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -11(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_138bytes): - lddqu -138(%rsi), %xmm0 - movdqu %xmm0, -138(%rdi) + lddqu -138(%rsi), %xmm0 + movdqu %xmm0, -138(%rdi) L(fwd_write_122bytes): - lddqu -122(%rsi), %xmm0 - movdqu %xmm0, -122(%rdi) + lddqu -122(%rsi), %xmm0 + movdqu %xmm0, -122(%rdi) L(fwd_write_106bytes): - lddqu -106(%rsi), %xmm0 - movdqu %xmm0, -106(%rdi) + lddqu -106(%rsi), %xmm0 + movdqu %xmm0, -106(%rdi) L(fwd_write_90bytes): - lddqu -90(%rsi), %xmm0 - movdqu %xmm0, -90(%rdi) + lddqu -90(%rsi), %xmm0 + movdqu %xmm0, -90(%rdi) L(fwd_write_74bytes): - lddqu -74(%rsi), %xmm0 - movdqu %xmm0, -74(%rdi) + lddqu -74(%rsi), %xmm0 + movdqu %xmm0, -74(%rdi) L(fwd_write_58bytes): - lddqu -58(%rsi), %xmm0 - movdqu %xmm0, -58(%rdi) + lddqu -58(%rsi), %xmm0 + movdqu %xmm0, -58(%rdi) L(fwd_write_42bytes): - lddqu -42(%rsi), %xmm0 - movdqu %xmm0, -42(%rdi) + lddqu -42(%rsi), %xmm0 + movdqu %xmm0, -42(%rdi) L(fwd_write_26bytes): - lddqu -26(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -26(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -26(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -26(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_10bytes): - mov -10(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -10(%rdi) - mov %ecx, -4(%rdi) - ret + mov -10(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -10(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_137bytes): - lddqu -137(%rsi), %xmm0 - movdqu %xmm0, -137(%rdi) + lddqu -137(%rsi), %xmm0 + movdqu %xmm0, -137(%rdi) L(fwd_write_121bytes): - lddqu -121(%rsi), %xmm0 - movdqu %xmm0, -121(%rdi) + lddqu -121(%rsi), %xmm0 + movdqu %xmm0, -121(%rdi) L(fwd_write_105bytes): - lddqu -105(%rsi), %xmm0 - movdqu %xmm0, -105(%rdi) + lddqu -105(%rsi), %xmm0 + movdqu %xmm0, -105(%rdi) L(fwd_write_89bytes): - lddqu -89(%rsi), %xmm0 - movdqu %xmm0, -89(%rdi) + lddqu -89(%rsi), %xmm0 + movdqu %xmm0, -89(%rdi) L(fwd_write_73bytes): - lddqu -73(%rsi), %xmm0 - movdqu %xmm0, -73(%rdi) + lddqu -73(%rsi), %xmm0 + movdqu %xmm0, -73(%rdi) L(fwd_write_57bytes): - lddqu -57(%rsi), %xmm0 - movdqu %xmm0, -57(%rdi) + lddqu -57(%rsi), %xmm0 + movdqu %xmm0, -57(%rdi) L(fwd_write_41bytes): - lddqu -41(%rsi), %xmm0 - movdqu %xmm0, -41(%rdi) + lddqu -41(%rsi), %xmm0 + movdqu %xmm0, -41(%rdi) 
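The L(fwd_write_*bytes) labels above and below form a fall-through chain: each label copies one 16-byte block at a fixed negative offset from the end of the buffer and drops into the next, so a remainder of any size up to 143 bytes is handled without a loop, and sub-16-byte tails are finished with deliberately overlapping loads and stores. A minimal C++ sketch of the same idea (my own names, not the glibc code; the pointers are one past the end, as %rsi/%rdi are at dispatch time):

    #include <emmintrin.h>
    #include <cstring>
    #include <cstddef>
    #include <cstdint>

    /// Copy the trailing n bytes (0 <= n <= 143), mirroring the
    /// L(fwd_write_*bytes) jump-table targets.
    static void copy_tail_fwd(uint8_t * dst_end, const uint8_t * src_end, size_t n)
    {
        if (n >= 16)
        {
            /// One 16-byte block per fall-through label: -143, -127, ..., -31.
            while (n > 16)
            {
                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst_end - n),
                                 _mm_loadu_si128(reinterpret_cast<const __m128i *>(src_end - n)));
                n -= 16;
            }
            /// The final block may overlap the previous one, exactly like the
            /// lddqu -31(%rsi) / lddqu -16(%rsi) pair in L(fwd_write_31bytes).
            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst_end - 16),
                             _mm_loadu_si128(reinterpret_cast<const __m128i *>(src_end - 16)));
        }
        else if (n >= 8)
        {
            /// Overlapping 8-byte pair, cf. L(fwd_write_15bytes).
            memcpy(dst_end - n, src_end - n, 8);
            memcpy(dst_end - 8, src_end - 8, 8);
        }
        else if (n >= 4)
        {
            memcpy(dst_end - n, src_end - n, 4);
            memcpy(dst_end - 4, src_end - 4, 4);
        }
        else if (n >= 2)
        {
            memcpy(dst_end - n, src_end - n, 2);
            memcpy(dst_end - 2, src_end - 2, 2);
        }
        else if (n == 1)
            dst_end[-1] = src_end[-1];
    }

The overlap trick is why no target ever needs a byte loop: two unaligned blocks whose ranges intersect cover any in-between length, and the redundant writes are harmless because source and destination ranges were already checked for safe direction.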
L(fwd_write_25bytes): - lddqu -25(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -25(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -25(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -25(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_9bytes): - mov -9(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -9(%rdi) - mov %ecx, -4(%rdi) - ret + mov -9(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -9(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_136bytes): - lddqu -136(%rsi), %xmm0 - movdqu %xmm0, -136(%rdi) + lddqu -136(%rsi), %xmm0 + movdqu %xmm0, -136(%rdi) L(fwd_write_120bytes): - lddqu -120(%rsi), %xmm0 - movdqu %xmm0, -120(%rdi) + lddqu -120(%rsi), %xmm0 + movdqu %xmm0, -120(%rdi) L(fwd_write_104bytes): - lddqu -104(%rsi), %xmm0 - movdqu %xmm0, -104(%rdi) + lddqu -104(%rsi), %xmm0 + movdqu %xmm0, -104(%rdi) L(fwd_write_88bytes): - lddqu -88(%rsi), %xmm0 - movdqu %xmm0, -88(%rdi) + lddqu -88(%rsi), %xmm0 + movdqu %xmm0, -88(%rdi) L(fwd_write_72bytes): - lddqu -72(%rsi), %xmm0 - movdqu %xmm0, -72(%rdi) + lddqu -72(%rsi), %xmm0 + movdqu %xmm0, -72(%rdi) L(fwd_write_56bytes): - lddqu -56(%rsi), %xmm0 - movdqu %xmm0, -56(%rdi) + lddqu -56(%rsi), %xmm0 + movdqu %xmm0, -56(%rdi) L(fwd_write_40bytes): - lddqu -40(%rsi), %xmm0 - movdqu %xmm0, -40(%rdi) + lddqu -40(%rsi), %xmm0 + movdqu %xmm0, -40(%rdi) L(fwd_write_24bytes): - lddqu -24(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -24(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -24(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -24(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) - ret + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_135bytes): - lddqu -135(%rsi), %xmm0 - movdqu %xmm0, -135(%rdi) + lddqu -135(%rsi), %xmm0 + movdqu %xmm0, -135(%rdi) L(fwd_write_119bytes): - lddqu -119(%rsi), %xmm0 - movdqu %xmm0, -119(%rdi) + lddqu -119(%rsi), %xmm0 + movdqu %xmm0, -119(%rdi) L(fwd_write_103bytes): - lddqu -103(%rsi), %xmm0 - movdqu %xmm0, -103(%rdi) + lddqu -103(%rsi), %xmm0 + movdqu %xmm0, -103(%rdi) L(fwd_write_87bytes): - lddqu -87(%rsi), %xmm0 - movdqu %xmm0, -87(%rdi) + lddqu -87(%rsi), %xmm0 + movdqu %xmm0, -87(%rdi) L(fwd_write_71bytes): - lddqu -71(%rsi), %xmm0 - movdqu %xmm0, -71(%rdi) + lddqu -71(%rsi), %xmm0 + movdqu %xmm0, -71(%rdi) L(fwd_write_55bytes): - lddqu -55(%rsi), %xmm0 - movdqu %xmm0, -55(%rdi) + lddqu -55(%rsi), %xmm0 + movdqu %xmm0, -55(%rdi) L(fwd_write_39bytes): - lddqu -39(%rsi), %xmm0 - movdqu %xmm0, -39(%rdi) + lddqu -39(%rsi), %xmm0 + movdqu %xmm0, -39(%rdi) L(fwd_write_23bytes): - lddqu -23(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -23(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -23(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -23(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_134bytes): - lddqu -134(%rsi), %xmm0 - movdqu %xmm0, -134(%rdi) + lddqu -134(%rsi), %xmm0 + movdqu %xmm0, -134(%rdi) L(fwd_write_118bytes): - lddqu -118(%rsi), %xmm0 - movdqu %xmm0, -118(%rdi) + lddqu -118(%rsi), %xmm0 + movdqu %xmm0, -118(%rdi) L(fwd_write_102bytes): - lddqu -102(%rsi), %xmm0 - movdqu %xmm0, -102(%rdi) + lddqu -102(%rsi), %xmm0 + movdqu %xmm0, -102(%rdi) 
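For contrast with these small-size targets, the L(gobble_mem_fwd_loop) body earlier in this hunk is the large-copy path: once the remaining size exceeds half the shared cache (SHARED_CACHE_SIZE_HALF, or __x86_shared_cache_size_half at run time), it prefetches ahead, reads 128 bytes per iteration with unaligned loads, and writes them with movntdq non-temporal stores that bypass the cache, fencing once with sfence at the end. A minimal intrinsics sketch of that technique, assuming dst is already 16-byte aligned (which the asm establishes before entering the loop; the asm's per-iteration lfence between loads and stores is omitted here):

    #include <emmintrin.h>
    #include <cstring>
    #include <cstddef>
    #include <cstdint>

    /// Cache-bypassing forward copy for very large n; the sub-128-byte
    /// remainder is left to the small-size path (the jump table, in the asm).
    static void stream_copy_fwd(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t n)
    {
        while (n >= 128)
        {
            /// Prefetch well ahead of the loads, like prefetcht0 0x200/0x300(%rsi).
            _mm_prefetch(reinterpret_cast<const char *>(src + 0x200), _MM_HINT_T0);
            _mm_prefetch(reinterpret_cast<const char *>(src + 0x300), _MM_HINT_T0);
            for (size_t i = 0; i < 128; i += 16)
                _mm_stream_si128(reinterpret_cast<__m128i *>(dst + i),
                                 _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i)));
            src += 128;
            dst += 128;
            n -= 128;
        }
        _mm_sfence();   /// order the streaming stores, like the asm's sfence
        if (n)
            memcpy(dst, src, n);
    }

Skipping the cache on stores pays off only when the destination will not be re-read soon, which is why the asm gates this path on the copy size relative to the last-level cache and falls back to the prefetchnta/movdqa L(gobble_ll_loop_fwd) variant when source and destination are close enough to overlap in cache.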
L(fwd_write_86bytes): - lddqu -86(%rsi), %xmm0 - movdqu %xmm0, -86(%rdi) + lddqu -86(%rsi), %xmm0 + movdqu %xmm0, -86(%rdi) L(fwd_write_70bytes): - lddqu -70(%rsi), %xmm0 - movdqu %xmm0, -70(%rdi) + lddqu -70(%rsi), %xmm0 + movdqu %xmm0, -70(%rdi) L(fwd_write_54bytes): - lddqu -54(%rsi), %xmm0 - movdqu %xmm0, -54(%rdi) + lddqu -54(%rsi), %xmm0 + movdqu %xmm0, -54(%rdi) L(fwd_write_38bytes): - lddqu -38(%rsi), %xmm0 - movdqu %xmm0, -38(%rdi) + lddqu -38(%rsi), %xmm0 + movdqu %xmm0, -38(%rdi) L(fwd_write_22bytes): - lddqu -22(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -22(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -22(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -22(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_133bytes): - lddqu -133(%rsi), %xmm0 - movdqu %xmm0, -133(%rdi) + lddqu -133(%rsi), %xmm0 + movdqu %xmm0, -133(%rdi) L(fwd_write_117bytes): - lddqu -117(%rsi), %xmm0 - movdqu %xmm0, -117(%rdi) + lddqu -117(%rsi), %xmm0 + movdqu %xmm0, -117(%rdi) L(fwd_write_101bytes): - lddqu -101(%rsi), %xmm0 - movdqu %xmm0, -101(%rdi) + lddqu -101(%rsi), %xmm0 + movdqu %xmm0, -101(%rdi) L(fwd_write_85bytes): - lddqu -85(%rsi), %xmm0 - movdqu %xmm0, -85(%rdi) + lddqu -85(%rsi), %xmm0 + movdqu %xmm0, -85(%rdi) L(fwd_write_69bytes): - lddqu -69(%rsi), %xmm0 - movdqu %xmm0, -69(%rdi) + lddqu -69(%rsi), %xmm0 + movdqu %xmm0, -69(%rdi) L(fwd_write_53bytes): - lddqu -53(%rsi), %xmm0 - movdqu %xmm0, -53(%rdi) + lddqu -53(%rsi), %xmm0 + movdqu %xmm0, -53(%rdi) L(fwd_write_37bytes): - lddqu -37(%rsi), %xmm0 - movdqu %xmm0, -37(%rdi) + lddqu -37(%rsi), %xmm0 + movdqu %xmm0, -37(%rdi) L(fwd_write_21bytes): - lddqu -21(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -21(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -21(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -21(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_132bytes): - lddqu -132(%rsi), %xmm0 - movdqu %xmm0, -132(%rdi) + lddqu -132(%rsi), %xmm0 + movdqu %xmm0, -132(%rdi) L(fwd_write_116bytes): - lddqu -116(%rsi), %xmm0 - movdqu %xmm0, -116(%rdi) + lddqu -116(%rsi), %xmm0 + movdqu %xmm0, -116(%rdi) L(fwd_write_100bytes): - lddqu -100(%rsi), %xmm0 - movdqu %xmm0, -100(%rdi) + lddqu -100(%rsi), %xmm0 + movdqu %xmm0, -100(%rdi) L(fwd_write_84bytes): - lddqu -84(%rsi), %xmm0 - movdqu %xmm0, -84(%rdi) + lddqu -84(%rsi), %xmm0 + movdqu %xmm0, -84(%rdi) L(fwd_write_68bytes): - lddqu -68(%rsi), %xmm0 - movdqu %xmm0, -68(%rdi) + lddqu -68(%rsi), %xmm0 + movdqu %xmm0, -68(%rdi) L(fwd_write_52bytes): - lddqu -52(%rsi), %xmm0 - movdqu %xmm0, -52(%rdi) + lddqu -52(%rsi), %xmm0 + movdqu %xmm0, -52(%rdi) L(fwd_write_36bytes): - lddqu -36(%rsi), %xmm0 - movdqu %xmm0, -36(%rdi) + lddqu -36(%rsi), %xmm0 + movdqu %xmm0, -36(%rdi) L(fwd_write_20bytes): - lddqu -20(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -20(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -20(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -20(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_4bytes): - mov -4(%rsi), %edx - 
mov %edx, -4(%rdi) - ret + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_131bytes): - lddqu -131(%rsi), %xmm0 - movdqu %xmm0, -131(%rdi) + lddqu -131(%rsi), %xmm0 + movdqu %xmm0, -131(%rdi) L(fwd_write_115bytes): - lddqu -115(%rsi), %xmm0 - movdqu %xmm0, -115(%rdi) + lddqu -115(%rsi), %xmm0 + movdqu %xmm0, -115(%rdi) L(fwd_write_99bytes): - lddqu -99(%rsi), %xmm0 - movdqu %xmm0, -99(%rdi) + lddqu -99(%rsi), %xmm0 + movdqu %xmm0, -99(%rdi) L(fwd_write_83bytes): - lddqu -83(%rsi), %xmm0 - movdqu %xmm0, -83(%rdi) + lddqu -83(%rsi), %xmm0 + movdqu %xmm0, -83(%rdi) L(fwd_write_67bytes): - lddqu -67(%rsi), %xmm0 - movdqu %xmm0, -67(%rdi) + lddqu -67(%rsi), %xmm0 + movdqu %xmm0, -67(%rdi) L(fwd_write_51bytes): - lddqu -51(%rsi), %xmm0 - movdqu %xmm0, -51(%rdi) + lddqu -51(%rsi), %xmm0 + movdqu %xmm0, -51(%rdi) L(fwd_write_35bytes): - lddqu -35(%rsi), %xmm0 - movdqu %xmm0, -35(%rdi) + lddqu -35(%rsi), %xmm0 + movdqu %xmm0, -35(%rdi) L(fwd_write_19bytes): - lddqu -19(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -19(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -19(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -19(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_130bytes): - lddqu -130(%rsi), %xmm0 - movdqu %xmm0, -130(%rdi) + lddqu -130(%rsi), %xmm0 + movdqu %xmm0, -130(%rdi) L(fwd_write_114bytes): - lddqu -114(%rsi), %xmm0 - movdqu %xmm0, -114(%rdi) + lddqu -114(%rsi), %xmm0 + movdqu %xmm0, -114(%rdi) L(fwd_write_98bytes): - lddqu -98(%rsi), %xmm0 - movdqu %xmm0, -98(%rdi) + lddqu -98(%rsi), %xmm0 + movdqu %xmm0, -98(%rdi) L(fwd_write_82bytes): - lddqu -82(%rsi), %xmm0 - movdqu %xmm0, -82(%rdi) + lddqu -82(%rsi), %xmm0 + movdqu %xmm0, -82(%rdi) L(fwd_write_66bytes): - lddqu -66(%rsi), %xmm0 - movdqu %xmm0, -66(%rdi) + lddqu -66(%rsi), %xmm0 + movdqu %xmm0, -66(%rdi) L(fwd_write_50bytes): - lddqu -50(%rsi), %xmm0 - movdqu %xmm0, -50(%rdi) + lddqu -50(%rsi), %xmm0 + movdqu %xmm0, -50(%rdi) L(fwd_write_34bytes): - lddqu -34(%rsi), %xmm0 - movdqu %xmm0, -34(%rdi) + lddqu -34(%rsi), %xmm0 + movdqu %xmm0, -34(%rdi) L(fwd_write_18bytes): - lddqu -18(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -18(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -18(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -18(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_2bytes): - movzwl -2(%rsi), %edx - mov %dx, -2(%rdi) - ret + movzwl -2(%rsi), %edx + mov %dx, -2(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_129bytes): - lddqu -129(%rsi), %xmm0 - movdqu %xmm0, -129(%rdi) + lddqu -129(%rsi), %xmm0 + movdqu %xmm0, -129(%rdi) L(fwd_write_113bytes): - lddqu -113(%rsi), %xmm0 - movdqu %xmm0, -113(%rdi) + lddqu -113(%rsi), %xmm0 + movdqu %xmm0, -113(%rdi) L(fwd_write_97bytes): - lddqu -97(%rsi), %xmm0 - movdqu %xmm0, -97(%rdi) + lddqu -97(%rsi), %xmm0 + movdqu %xmm0, -97(%rdi) L(fwd_write_81bytes): - lddqu -81(%rsi), %xmm0 - movdqu %xmm0, -81(%rdi) + lddqu -81(%rsi), %xmm0 + movdqu %xmm0, -81(%rdi) L(fwd_write_65bytes): - lddqu -65(%rsi), %xmm0 - movdqu %xmm0, -65(%rdi) + lddqu -65(%rsi), %xmm0 + movdqu %xmm0, -65(%rdi) L(fwd_write_49bytes): - lddqu -49(%rsi), %xmm0 - movdqu %xmm0, -49(%rdi) + lddqu -49(%rsi), %xmm0 + movdqu %xmm0, -49(%rdi) L(fwd_write_33bytes): - lddqu -33(%rsi), %xmm0 - 
movdqu %xmm0, -33(%rdi) + lddqu -33(%rsi), %xmm0 + movdqu %xmm0, -33(%rdi) L(fwd_write_17bytes): - lddqu -17(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -17(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -17(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -17(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_1bytes): - movzbl -1(%rsi), %edx - mov %dl, -1(%rdi) - ret + movzbl -1(%rsi), %edx + mov %dl, -1(%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_128bytes): - lddqu 112(%rsi), %xmm0 - movdqu %xmm0, 112(%rdi) + lddqu 112(%rsi), %xmm0 + movdqu %xmm0, 112(%rdi) L(bwd_write_112bytes): - lddqu 96(%rsi), %xmm0 - movdqu %xmm0, 96(%rdi) + lddqu 96(%rsi), %xmm0 + movdqu %xmm0, 96(%rdi) L(bwd_write_96bytes): - lddqu 80(%rsi), %xmm0 - movdqu %xmm0, 80(%rdi) + lddqu 80(%rsi), %xmm0 + movdqu %xmm0, 80(%rdi) L(bwd_write_80bytes): - lddqu 64(%rsi), %xmm0 - movdqu %xmm0, 64(%rdi) + lddqu 64(%rsi), %xmm0 + movdqu %xmm0, 64(%rdi) L(bwd_write_64bytes): - lddqu 48(%rsi), %xmm0 - movdqu %xmm0, 48(%rdi) + lddqu 48(%rsi), %xmm0 + movdqu %xmm0, 48(%rdi) L(bwd_write_48bytes): - lddqu 32(%rsi), %xmm0 - movdqu %xmm0, 32(%rdi) + lddqu 32(%rsi), %xmm0 + movdqu %xmm0, 32(%rdi) L(bwd_write_32bytes): - lddqu 16(%rsi), %xmm0 - movdqu %xmm0, 16(%rdi) + lddqu 16(%rsi), %xmm0 + movdqu %xmm0, 16(%rdi) L(bwd_write_16bytes): - lddqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) + lddqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) L(bwd_write_0bytes): - ret + ret - .p2align 4 + .p2align 4 L(bwd_write_143bytes): - lddqu 127(%rsi), %xmm0 - movdqu %xmm0, 127(%rdi) + lddqu 127(%rsi), %xmm0 + movdqu %xmm0, 127(%rdi) L(bwd_write_127bytes): - lddqu 111(%rsi), %xmm0 - movdqu %xmm0, 111(%rdi) + lddqu 111(%rsi), %xmm0 + movdqu %xmm0, 111(%rdi) L(bwd_write_111bytes): - lddqu 95(%rsi), %xmm0 - movdqu %xmm0, 95(%rdi) + lddqu 95(%rsi), %xmm0 + movdqu %xmm0, 95(%rdi) L(bwd_write_95bytes): - lddqu 79(%rsi), %xmm0 - movdqu %xmm0, 79(%rdi) + lddqu 79(%rsi), %xmm0 + movdqu %xmm0, 79(%rdi) L(bwd_write_79bytes): - lddqu 63(%rsi), %xmm0 - movdqu %xmm0, 63(%rdi) + lddqu 63(%rsi), %xmm0 + movdqu %xmm0, 63(%rdi) L(bwd_write_63bytes): - lddqu 47(%rsi), %xmm0 - movdqu %xmm0, 47(%rdi) + lddqu 47(%rsi), %xmm0 + movdqu %xmm0, 47(%rdi) L(bwd_write_47bytes): - lddqu 31(%rsi), %xmm0 - movdqu %xmm0, 31(%rdi) + lddqu 31(%rsi), %xmm0 + movdqu %xmm0, 31(%rdi) L(bwd_write_31bytes): - lddqu 15(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 15(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 15(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 15(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_15bytes): - mov 7(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 7(%rdi) - mov %rcx, (%rdi) - ret + mov 7(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 7(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_142bytes): - lddqu 126(%rsi), %xmm0 - movdqu %xmm0, 126(%rdi) + lddqu 126(%rsi), %xmm0 + movdqu %xmm0, 126(%rdi) L(bwd_write_126bytes): - lddqu 110(%rsi), %xmm0 - movdqu %xmm0, 110(%rdi) + lddqu 110(%rsi), %xmm0 + movdqu %xmm0, 110(%rdi) L(bwd_write_110bytes): - lddqu 94(%rsi), %xmm0 - movdqu %xmm0, 94(%rdi) + lddqu 94(%rsi), %xmm0 + movdqu %xmm0, 94(%rdi) L(bwd_write_94bytes): - lddqu 78(%rsi), %xmm0 - movdqu %xmm0, 78(%rdi) + lddqu 78(%rsi), %xmm0 + movdqu %xmm0, 78(%rdi) L(bwd_write_78bytes): - lddqu 62(%rsi), %xmm0 - movdqu %xmm0, 62(%rdi) + lddqu 62(%rsi), %xmm0 + movdqu %xmm0, 62(%rdi) L(bwd_write_62bytes): - lddqu 46(%rsi), %xmm0 - movdqu %xmm0, 46(%rdi) + lddqu 46(%rsi), %xmm0 + movdqu %xmm0, 46(%rdi) 
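The L(bwd_write_*bytes) family beginning above is the mirror image for backward copies: after walking down from the end, the leftover bytes sit at the front of the buffer, so the same overlapping 16-byte blocks are addressed at positive offsets (lddqu 112(%rsi) down to lddqu (%rsi)) instead of negative ones. Dispatch into either family is position-independent: each table slot assembled by JMPTBL holds a 32-bit distance from the table to the target rather than an absolute address (an 8-byte address would not fit in .int), and BRANCH_TO_JMPTBL_ENTRY rebuilds the absolute target at run time. Roughly, as a hypothetical GNU C rendering rather than the real macro expansion:

    /// table[n] == (int)((char *) &&target_n - (char *) table)
    void * target = (char *) table + table[n];   /// n = remaining byte count
    goto *target;   /// GNU C "labels as values" computed goto

Keeping the entries relative lets the .rodata.ssse3 tables below work unchanged under ASLR and in shared objects, at the cost of one extra add per dispatch.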
L(bwd_write_46bytes): - lddqu 30(%rsi), %xmm0 - movdqu %xmm0, 30(%rdi) + lddqu 30(%rsi), %xmm0 + movdqu %xmm0, 30(%rdi) L(bwd_write_30bytes): - lddqu 14(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 14(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 14(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 14(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_14bytes): - mov 6(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 6(%rdi) - mov %rcx, (%rdi) - ret + mov 6(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 6(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_141bytes): - lddqu 125(%rsi), %xmm0 - movdqu %xmm0, 125(%rdi) + lddqu 125(%rsi), %xmm0 + movdqu %xmm0, 125(%rdi) L(bwd_write_125bytes): - lddqu 109(%rsi), %xmm0 - movdqu %xmm0, 109(%rdi) + lddqu 109(%rsi), %xmm0 + movdqu %xmm0, 109(%rdi) L(bwd_write_109bytes): - lddqu 93(%rsi), %xmm0 - movdqu %xmm0, 93(%rdi) + lddqu 93(%rsi), %xmm0 + movdqu %xmm0, 93(%rdi) L(bwd_write_93bytes): - lddqu 77(%rsi), %xmm0 - movdqu %xmm0, 77(%rdi) + lddqu 77(%rsi), %xmm0 + movdqu %xmm0, 77(%rdi) L(bwd_write_77bytes): - lddqu 61(%rsi), %xmm0 - movdqu %xmm0, 61(%rdi) + lddqu 61(%rsi), %xmm0 + movdqu %xmm0, 61(%rdi) L(bwd_write_61bytes): - lddqu 45(%rsi), %xmm0 - movdqu %xmm0, 45(%rdi) + lddqu 45(%rsi), %xmm0 + movdqu %xmm0, 45(%rdi) L(bwd_write_45bytes): - lddqu 29(%rsi), %xmm0 - movdqu %xmm0, 29(%rdi) + lddqu 29(%rsi), %xmm0 + movdqu %xmm0, 29(%rdi) L(bwd_write_29bytes): - lddqu 13(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 13(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 13(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 13(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_13bytes): - mov 5(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 5(%rdi) - mov %rcx, (%rdi) - ret + mov 5(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 5(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_140bytes): - lddqu 124(%rsi), %xmm0 - movdqu %xmm0, 124(%rdi) + lddqu 124(%rsi), %xmm0 + movdqu %xmm0, 124(%rdi) L(bwd_write_124bytes): - lddqu 108(%rsi), %xmm0 - movdqu %xmm0, 108(%rdi) + lddqu 108(%rsi), %xmm0 + movdqu %xmm0, 108(%rdi) L(bwd_write_108bytes): - lddqu 92(%rsi), %xmm0 - movdqu %xmm0, 92(%rdi) + lddqu 92(%rsi), %xmm0 + movdqu %xmm0, 92(%rdi) L(bwd_write_92bytes): - lddqu 76(%rsi), %xmm0 - movdqu %xmm0, 76(%rdi) + lddqu 76(%rsi), %xmm0 + movdqu %xmm0, 76(%rdi) L(bwd_write_76bytes): - lddqu 60(%rsi), %xmm0 - movdqu %xmm0, 60(%rdi) + lddqu 60(%rsi), %xmm0 + movdqu %xmm0, 60(%rdi) L(bwd_write_60bytes): - lddqu 44(%rsi), %xmm0 - movdqu %xmm0, 44(%rdi) + lddqu 44(%rsi), %xmm0 + movdqu %xmm0, 44(%rdi) L(bwd_write_44bytes): - lddqu 28(%rsi), %xmm0 - movdqu %xmm0, 28(%rdi) + lddqu 28(%rsi), %xmm0 + movdqu %xmm0, 28(%rdi) L(bwd_write_28bytes): - lddqu 12(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 12(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 12(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 12(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_12bytes): - mov 4(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 4(%rdi) - mov %rcx, (%rdi) - ret + mov 4(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 4(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_139bytes): - lddqu 123(%rsi), %xmm0 - movdqu %xmm0, 123(%rdi) + lddqu 123(%rsi), %xmm0 + movdqu %xmm0, 123(%rdi) L(bwd_write_123bytes): - lddqu 107(%rsi), %xmm0 - movdqu %xmm0, 107(%rdi) + lddqu 107(%rsi), %xmm0 + movdqu %xmm0, 107(%rdi) L(bwd_write_107bytes): - lddqu 91(%rsi), %xmm0 - movdqu %xmm0, 91(%rdi) + lddqu 91(%rsi), 
%xmm0 + movdqu %xmm0, 91(%rdi) L(bwd_write_91bytes): - lddqu 75(%rsi), %xmm0 - movdqu %xmm0, 75(%rdi) + lddqu 75(%rsi), %xmm0 + movdqu %xmm0, 75(%rdi) L(bwd_write_75bytes): - lddqu 59(%rsi), %xmm0 - movdqu %xmm0, 59(%rdi) + lddqu 59(%rsi), %xmm0 + movdqu %xmm0, 59(%rdi) L(bwd_write_59bytes): - lddqu 43(%rsi), %xmm0 - movdqu %xmm0, 43(%rdi) + lddqu 43(%rsi), %xmm0 + movdqu %xmm0, 43(%rdi) L(bwd_write_43bytes): - lddqu 27(%rsi), %xmm0 - movdqu %xmm0, 27(%rdi) + lddqu 27(%rsi), %xmm0 + movdqu %xmm0, 27(%rdi) L(bwd_write_27bytes): - lddqu 11(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 11(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 11(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 11(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_11bytes): - mov 3(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 3(%rdi) - mov %rcx, (%rdi) - ret + mov 3(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 3(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_138bytes): - lddqu 122(%rsi), %xmm0 - movdqu %xmm0, 122(%rdi) + lddqu 122(%rsi), %xmm0 + movdqu %xmm0, 122(%rdi) L(bwd_write_122bytes): - lddqu 106(%rsi), %xmm0 - movdqu %xmm0, 106(%rdi) + lddqu 106(%rsi), %xmm0 + movdqu %xmm0, 106(%rdi) L(bwd_write_106bytes): - lddqu 90(%rsi), %xmm0 - movdqu %xmm0, 90(%rdi) + lddqu 90(%rsi), %xmm0 + movdqu %xmm0, 90(%rdi) L(bwd_write_90bytes): - lddqu 74(%rsi), %xmm0 - movdqu %xmm0, 74(%rdi) + lddqu 74(%rsi), %xmm0 + movdqu %xmm0, 74(%rdi) L(bwd_write_74bytes): - lddqu 58(%rsi), %xmm0 - movdqu %xmm0, 58(%rdi) + lddqu 58(%rsi), %xmm0 + movdqu %xmm0, 58(%rdi) L(bwd_write_58bytes): - lddqu 42(%rsi), %xmm0 - movdqu %xmm0, 42(%rdi) + lddqu 42(%rsi), %xmm0 + movdqu %xmm0, 42(%rdi) L(bwd_write_42bytes): - lddqu 26(%rsi), %xmm0 - movdqu %xmm0, 26(%rdi) + lddqu 26(%rsi), %xmm0 + movdqu %xmm0, 26(%rdi) L(bwd_write_26bytes): - lddqu 10(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 10(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 10(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 10(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_10bytes): - mov 2(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 2(%rdi) - mov %rcx, (%rdi) - ret + mov 2(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 2(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_137bytes): - lddqu 121(%rsi), %xmm0 - movdqu %xmm0, 121(%rdi) + lddqu 121(%rsi), %xmm0 + movdqu %xmm0, 121(%rdi) L(bwd_write_121bytes): - lddqu 105(%rsi), %xmm0 - movdqu %xmm0, 105(%rdi) + lddqu 105(%rsi), %xmm0 + movdqu %xmm0, 105(%rdi) L(bwd_write_105bytes): - lddqu 89(%rsi), %xmm0 - movdqu %xmm0, 89(%rdi) + lddqu 89(%rsi), %xmm0 + movdqu %xmm0, 89(%rdi) L(bwd_write_89bytes): - lddqu 73(%rsi), %xmm0 - movdqu %xmm0, 73(%rdi) + lddqu 73(%rsi), %xmm0 + movdqu %xmm0, 73(%rdi) L(bwd_write_73bytes): - lddqu 57(%rsi), %xmm0 - movdqu %xmm0, 57(%rdi) + lddqu 57(%rsi), %xmm0 + movdqu %xmm0, 57(%rdi) L(bwd_write_57bytes): - lddqu 41(%rsi), %xmm0 - movdqu %xmm0, 41(%rdi) + lddqu 41(%rsi), %xmm0 + movdqu %xmm0, 41(%rdi) L(bwd_write_41bytes): - lddqu 25(%rsi), %xmm0 - movdqu %xmm0, 25(%rdi) + lddqu 25(%rsi), %xmm0 + movdqu %xmm0, 25(%rdi) L(bwd_write_25bytes): - lddqu 9(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 9(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 9(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 9(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_9bytes): - mov 1(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 1(%rdi) - mov %rcx, (%rdi) - ret + mov 1(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 1(%rdi) + mov 
%rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_136bytes): - lddqu 120(%rsi), %xmm0 - movdqu %xmm0, 120(%rdi) + lddqu 120(%rsi), %xmm0 + movdqu %xmm0, 120(%rdi) L(bwd_write_120bytes): - lddqu 104(%rsi), %xmm0 - movdqu %xmm0, 104(%rdi) + lddqu 104(%rsi), %xmm0 + movdqu %xmm0, 104(%rdi) L(bwd_write_104bytes): - lddqu 88(%rsi), %xmm0 - movdqu %xmm0, 88(%rdi) + lddqu 88(%rsi), %xmm0 + movdqu %xmm0, 88(%rdi) L(bwd_write_88bytes): - lddqu 72(%rsi), %xmm0 - movdqu %xmm0, 72(%rdi) + lddqu 72(%rsi), %xmm0 + movdqu %xmm0, 72(%rdi) L(bwd_write_72bytes): - lddqu 56(%rsi), %xmm0 - movdqu %xmm0, 56(%rdi) + lddqu 56(%rsi), %xmm0 + movdqu %xmm0, 56(%rdi) L(bwd_write_56bytes): - lddqu 40(%rsi), %xmm0 - movdqu %xmm0, 40(%rdi) + lddqu 40(%rsi), %xmm0 + movdqu %xmm0, 40(%rdi) L(bwd_write_40bytes): - lddqu 24(%rsi), %xmm0 - movdqu %xmm0, 24(%rdi) + lddqu 24(%rsi), %xmm0 + movdqu %xmm0, 24(%rdi) L(bwd_write_24bytes): - lddqu 8(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 8(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 8(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 8(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_8bytes): - mov (%rsi), %rdx - mov %rdx, (%rdi) - ret + mov (%rsi), %rdx + mov %rdx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_135bytes): - lddqu 119(%rsi), %xmm0 - movdqu %xmm0, 119(%rdi) + lddqu 119(%rsi), %xmm0 + movdqu %xmm0, 119(%rdi) L(bwd_write_119bytes): - lddqu 103(%rsi), %xmm0 - movdqu %xmm0, 103(%rdi) + lddqu 103(%rsi), %xmm0 + movdqu %xmm0, 103(%rdi) L(bwd_write_103bytes): - lddqu 87(%rsi), %xmm0 - movdqu %xmm0, 87(%rdi) + lddqu 87(%rsi), %xmm0 + movdqu %xmm0, 87(%rdi) L(bwd_write_87bytes): - lddqu 71(%rsi), %xmm0 - movdqu %xmm0, 71(%rdi) + lddqu 71(%rsi), %xmm0 + movdqu %xmm0, 71(%rdi) L(bwd_write_71bytes): - lddqu 55(%rsi), %xmm0 - movdqu %xmm0, 55(%rdi) + lddqu 55(%rsi), %xmm0 + movdqu %xmm0, 55(%rdi) L(bwd_write_55bytes): - lddqu 39(%rsi), %xmm0 - movdqu %xmm0, 39(%rdi) + lddqu 39(%rsi), %xmm0 + movdqu %xmm0, 39(%rdi) L(bwd_write_39bytes): - lddqu 23(%rsi), %xmm0 - movdqu %xmm0, 23(%rdi) + lddqu 23(%rsi), %xmm0 + movdqu %xmm0, 23(%rdi) L(bwd_write_23bytes): - lddqu 7(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 7(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 7(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 7(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_7bytes): - mov 3(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 3(%rdi) - mov %ecx, (%rdi) - ret + mov 3(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 3(%rdi) + mov %ecx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_134bytes): - lddqu 118(%rsi), %xmm0 - movdqu %xmm0, 118(%rdi) + lddqu 118(%rsi), %xmm0 + movdqu %xmm0, 118(%rdi) L(bwd_write_118bytes): - lddqu 102(%rsi), %xmm0 - movdqu %xmm0, 102(%rdi) + lddqu 102(%rsi), %xmm0 + movdqu %xmm0, 102(%rdi) L(bwd_write_102bytes): - lddqu 86(%rsi), %xmm0 - movdqu %xmm0, 86(%rdi) + lddqu 86(%rsi), %xmm0 + movdqu %xmm0, 86(%rdi) L(bwd_write_86bytes): - lddqu 70(%rsi), %xmm0 - movdqu %xmm0, 70(%rdi) + lddqu 70(%rsi), %xmm0 + movdqu %xmm0, 70(%rdi) L(bwd_write_70bytes): - lddqu 54(%rsi), %xmm0 - movdqu %xmm0, 54(%rdi) + lddqu 54(%rsi), %xmm0 + movdqu %xmm0, 54(%rdi) L(bwd_write_54bytes): - lddqu 38(%rsi), %xmm0 - movdqu %xmm0, 38(%rdi) + lddqu 38(%rsi), %xmm0 + movdqu %xmm0, 38(%rdi) L(bwd_write_38bytes): - lddqu 22(%rsi), %xmm0 - movdqu %xmm0, 22(%rdi) + lddqu 22(%rsi), %xmm0 + movdqu %xmm0, 22(%rdi) L(bwd_write_22bytes): - lddqu 6(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 6(%rdi) - movdqu %xmm1, (%rdi) - 
ret + lddqu 6(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 6(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_6bytes): - mov 2(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 2(%rdi) - mov %ecx, (%rdi) - ret + mov 2(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 2(%rdi) + mov %ecx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_133bytes): - lddqu 117(%rsi), %xmm0 - movdqu %xmm0, 117(%rdi) + lddqu 117(%rsi), %xmm0 + movdqu %xmm0, 117(%rdi) L(bwd_write_117bytes): - lddqu 101(%rsi), %xmm0 - movdqu %xmm0, 101(%rdi) + lddqu 101(%rsi), %xmm0 + movdqu %xmm0, 101(%rdi) L(bwd_write_101bytes): - lddqu 85(%rsi), %xmm0 - movdqu %xmm0, 85(%rdi) + lddqu 85(%rsi), %xmm0 + movdqu %xmm0, 85(%rdi) L(bwd_write_85bytes): - lddqu 69(%rsi), %xmm0 - movdqu %xmm0, 69(%rdi) + lddqu 69(%rsi), %xmm0 + movdqu %xmm0, 69(%rdi) L(bwd_write_69bytes): - lddqu 53(%rsi), %xmm0 - movdqu %xmm0, 53(%rdi) + lddqu 53(%rsi), %xmm0 + movdqu %xmm0, 53(%rdi) L(bwd_write_53bytes): - lddqu 37(%rsi), %xmm0 - movdqu %xmm0, 37(%rdi) + lddqu 37(%rsi), %xmm0 + movdqu %xmm0, 37(%rdi) L(bwd_write_37bytes): - lddqu 21(%rsi), %xmm0 - movdqu %xmm0, 21(%rdi) + lddqu 21(%rsi), %xmm0 + movdqu %xmm0, 21(%rdi) L(bwd_write_21bytes): - lddqu 5(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 5(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 5(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 5(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_5bytes): - mov 1(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 1(%rdi) - mov %ecx, (%rdi) - ret + mov 1(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 1(%rdi) + mov %ecx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_132bytes): - lddqu 116(%rsi), %xmm0 - movdqu %xmm0, 116(%rdi) + lddqu 116(%rsi), %xmm0 + movdqu %xmm0, 116(%rdi) L(bwd_write_116bytes): - lddqu 100(%rsi), %xmm0 - movdqu %xmm0, 100(%rdi) + lddqu 100(%rsi), %xmm0 + movdqu %xmm0, 100(%rdi) L(bwd_write_100bytes): - lddqu 84(%rsi), %xmm0 - movdqu %xmm0, 84(%rdi) + lddqu 84(%rsi), %xmm0 + movdqu %xmm0, 84(%rdi) L(bwd_write_84bytes): - lddqu 68(%rsi), %xmm0 - movdqu %xmm0, 68(%rdi) + lddqu 68(%rsi), %xmm0 + movdqu %xmm0, 68(%rdi) L(bwd_write_68bytes): - lddqu 52(%rsi), %xmm0 - movdqu %xmm0, 52(%rdi) + lddqu 52(%rsi), %xmm0 + movdqu %xmm0, 52(%rdi) L(bwd_write_52bytes): - lddqu 36(%rsi), %xmm0 - movdqu %xmm0, 36(%rdi) + lddqu 36(%rsi), %xmm0 + movdqu %xmm0, 36(%rdi) L(bwd_write_36bytes): - lddqu 20(%rsi), %xmm0 - movdqu %xmm0, 20(%rdi) + lddqu 20(%rsi), %xmm0 + movdqu %xmm0, 20(%rdi) L(bwd_write_20bytes): - lddqu 4(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 4(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 4(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 4(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_4bytes): - mov (%rsi), %edx - mov %edx, (%rdi) - ret + mov (%rsi), %edx + mov %edx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_131bytes): - lddqu 115(%rsi), %xmm0 - movdqu %xmm0, 115(%rdi) + lddqu 115(%rsi), %xmm0 + movdqu %xmm0, 115(%rdi) L(bwd_write_115bytes): - lddqu 99(%rsi), %xmm0 - movdqu %xmm0, 99(%rdi) + lddqu 99(%rsi), %xmm0 + movdqu %xmm0, 99(%rdi) L(bwd_write_99bytes): - lddqu 83(%rsi), %xmm0 - movdqu %xmm0, 83(%rdi) + lddqu 83(%rsi), %xmm0 + movdqu %xmm0, 83(%rdi) L(bwd_write_83bytes): - lddqu 67(%rsi), %xmm0 - movdqu %xmm0, 67(%rdi) + lddqu 67(%rsi), %xmm0 + movdqu %xmm0, 67(%rdi) L(bwd_write_67bytes): - lddqu 51(%rsi), %xmm0 - movdqu %xmm0, 51(%rdi) + lddqu 51(%rsi), %xmm0 + movdqu %xmm0, 51(%rdi) L(bwd_write_51bytes): - lddqu 35(%rsi), %xmm0 - movdqu %xmm0, 
35(%rdi) + lddqu 35(%rsi), %xmm0 + movdqu %xmm0, 35(%rdi) L(bwd_write_35bytes): - lddqu 19(%rsi), %xmm0 - movdqu %xmm0, 19(%rdi) + lddqu 19(%rsi), %xmm0 + movdqu %xmm0, 19(%rdi) L(bwd_write_19bytes): - lddqu 3(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 3(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 3(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 3(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_3bytes): - mov 1(%rsi), %dx - mov (%rsi), %cx - mov %dx, 1(%rdi) - mov %cx, (%rdi) - ret + mov 1(%rsi), %dx + mov (%rsi), %cx + mov %dx, 1(%rdi) + mov %cx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_130bytes): - lddqu 114(%rsi), %xmm0 - movdqu %xmm0, 114(%rdi) + lddqu 114(%rsi), %xmm0 + movdqu %xmm0, 114(%rdi) L(bwd_write_114bytes): - lddqu 98(%rsi), %xmm0 - movdqu %xmm0, 98(%rdi) + lddqu 98(%rsi), %xmm0 + movdqu %xmm0, 98(%rdi) L(bwd_write_98bytes): - lddqu 82(%rsi), %xmm0 - movdqu %xmm0, 82(%rdi) + lddqu 82(%rsi), %xmm0 + movdqu %xmm0, 82(%rdi) L(bwd_write_82bytes): - lddqu 66(%rsi), %xmm0 - movdqu %xmm0, 66(%rdi) + lddqu 66(%rsi), %xmm0 + movdqu %xmm0, 66(%rdi) L(bwd_write_66bytes): - lddqu 50(%rsi), %xmm0 - movdqu %xmm0, 50(%rdi) + lddqu 50(%rsi), %xmm0 + movdqu %xmm0, 50(%rdi) L(bwd_write_50bytes): - lddqu 34(%rsi), %xmm0 - movdqu %xmm0, 34(%rdi) + lddqu 34(%rsi), %xmm0 + movdqu %xmm0, 34(%rdi) L(bwd_write_34bytes): - lddqu 18(%rsi), %xmm0 - movdqu %xmm0, 18(%rdi) + lddqu 18(%rsi), %xmm0 + movdqu %xmm0, 18(%rdi) L(bwd_write_18bytes): - lddqu 2(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 2(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 2(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 2(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_2bytes): - movzwl (%rsi), %edx - mov %dx, (%rdi) - ret + movzwl (%rsi), %edx + mov %dx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_129bytes): - lddqu 113(%rsi), %xmm0 - movdqu %xmm0, 113(%rdi) + lddqu 113(%rsi), %xmm0 + movdqu %xmm0, 113(%rdi) L(bwd_write_113bytes): - lddqu 97(%rsi), %xmm0 - movdqu %xmm0, 97(%rdi) + lddqu 97(%rsi), %xmm0 + movdqu %xmm0, 97(%rdi) L(bwd_write_97bytes): - lddqu 81(%rsi), %xmm0 - movdqu %xmm0, 81(%rdi) + lddqu 81(%rsi), %xmm0 + movdqu %xmm0, 81(%rdi) L(bwd_write_81bytes): - lddqu 65(%rsi), %xmm0 - movdqu %xmm0, 65(%rdi) + lddqu 65(%rsi), %xmm0 + movdqu %xmm0, 65(%rdi) L(bwd_write_65bytes): - lddqu 49(%rsi), %xmm0 - movdqu %xmm0, 49(%rdi) + lddqu 49(%rsi), %xmm0 + movdqu %xmm0, 49(%rdi) L(bwd_write_49bytes): - lddqu 33(%rsi), %xmm0 - movdqu %xmm0, 33(%rdi) + lddqu 33(%rsi), %xmm0 + movdqu %xmm0, 33(%rdi) L(bwd_write_33bytes): - lddqu 17(%rsi), %xmm0 - movdqu %xmm0, 17(%rdi) + lddqu 17(%rsi), %xmm0 + movdqu %xmm0, 17(%rdi) L(bwd_write_17bytes): - lddqu 1(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 1(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 1(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 1(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_1bytes): - movzbl (%rsi), %edx - mov %dl, (%rdi) - ret + movzbl (%rsi), %edx + mov %dl, (%rdi) + ret END (MEMCPY) - .section .rodata.ssse3,"a",@progbits - .p2align 3 + .section .rodata.ssse3,"a",@progbits + .p2align 3 L(table_144_bytes_bwd): - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_5bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_63bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) - .int JMPTBL 
(L(bwd_write_121bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) + .int JMPTBL 
(L(bwd_write_35bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) + .int JMPTBL 
+    .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-	.p2align 3
+    .p2align 3
 L(table_144_bytes_fwd):
-	.int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-	.p2align 3
+    .p2align 3
 L(shl_table_fwd):
-	.int JMPTBL (L(shl_0), L(shl_table_fwd))
-	.int JMPTBL (L(shl_1), L(shl_table_fwd))
-	.int JMPTBL (L(shl_2), L(shl_table_fwd))
-	.int JMPTBL (L(shl_3), L(shl_table_fwd))
-	.int JMPTBL (L(shl_4), L(shl_table_fwd))
-	.int JMPTBL (L(shl_5), L(shl_table_fwd))
-	.int JMPTBL (L(shl_6), L(shl_table_fwd))
-	.int JMPTBL (L(shl_7), L(shl_table_fwd))
-	.int JMPTBL (L(shl_8), L(shl_table_fwd))
-	.int JMPTBL (L(shl_9), L(shl_table_fwd))
-	.int JMPTBL (L(shl_10), L(shl_table_fwd))
-	.int JMPTBL (L(shl_11), L(shl_table_fwd))
-	.int JMPTBL (L(shl_12), L(shl_table_fwd))
-	.int JMPTBL (L(shl_13), L(shl_table_fwd))
-	.int JMPTBL (L(shl_14), L(shl_table_fwd))
-	.int JMPTBL (L(shl_15), L(shl_table_fwd))
+    .int JMPTBL (L(shl_0), L(shl_table_fwd))
+    .int JMPTBL (L(shl_1), L(shl_table_fwd))
+    .int JMPTBL (L(shl_2), L(shl_table_fwd))
+    .int JMPTBL (L(shl_3), L(shl_table_fwd))
+    .int JMPTBL (L(shl_4), L(shl_table_fwd))
+    .int JMPTBL (L(shl_5), L(shl_table_fwd))
+    .int JMPTBL (L(shl_6), L(shl_table_fwd))
+    .int JMPTBL (L(shl_7), L(shl_table_fwd))
+    .int JMPTBL (L(shl_8), L(shl_table_fwd))
+    .int JMPTBL (L(shl_9), L(shl_table_fwd))
+    .int JMPTBL (L(shl_10), L(shl_table_fwd))
+    .int JMPTBL (L(shl_11), L(shl_table_fwd))
+    .int JMPTBL (L(shl_12), L(shl_table_fwd))
+    .int JMPTBL (L(shl_13), L(shl_table_fwd))
+    .int JMPTBL (L(shl_14), L(shl_table_fwd))
+    .int JMPTBL (L(shl_15), L(shl_table_fwd))
-	.p2align 3
+    .p2align 3
 L(shl_table_bwd):
-	.int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
 #endif
diff --git a/utils/memcpy-bench/glibc/memcpy-ssse3.S b/utils/memcpy-bench/glibc/memcpy-ssse3.S
index 2fd26651645..11cb6559a8b 100644
--- a/utils/memcpy-bench/glibc/memcpy-ssse3.S
+++ b/utils/memcpy-bench/glibc/memcpy-ssse3.S
@@ -24,3129 +24,3129 @@
 #include "asm-syntax.h"
 #ifndef MEMCPY
-# define MEMCPY	__memcpy_ssse3
-# define MEMCPY_CHK	__memcpy_chk_ssse3
-# define MEMPCPY	__mempcpy_ssse3
-# define MEMPCPY_CHK	__mempcpy_chk_ssse3
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
 #endif
-#define JMPTBL(I, B)	I - B
+#define JMPTBL(I, B) I - B
 /* Branch to an entry in a jump table.  TABLE is a jump table with
    relative offsets.  INDEX is a register contains the index into the
    jump table.  SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
-	lea TABLE(%rip), %r11; \
-	movslq (%r11, INDEX, SCALE), INDEX; \
-	lea (%r11, INDEX), INDEX; \
-	_CET_NOTRACK jmp *INDEX; \
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+    lea TABLE(%rip), %r11; \
+    movslq (%r11, INDEX, SCALE), INDEX; \
+    lea (%r11, INDEX), INDEX; \
+    _CET_NOTRACK jmp *INDEX; \
 ud2
-	.section .text.ssse3,"ax",@progbits
+    .section .text.ssse3,"ax",@progbits
 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
 ENTRY (MEMPCPY_CHK)
-	cmp %RDX_LP, %RCX_LP
-	jb HIDDEN_JUMPTARGET (__chk_fail)
+    cmp %RDX_LP, %RCX_LP
+    jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMPCPY_CHK)
 ENTRY (MEMPCPY)
-	mov %RDI_LP, %RAX_LP
-	add %RDX_LP, %RAX_LP
-	jmp L(start)
+    mov %RDI_LP, %RAX_LP
+    add %RDX_LP, %RAX_LP
+    jmp L(start)
 END (MEMPCPY)
 #endif
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
-	cmp %RDX_LP, %RCX_LP
-	jb HIDDEN_JUMPTARGET (__chk_fail)
+    cmp %RDX_LP, %RCX_LP
+    jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
 #endif
 ENTRY (MEMCPY)
-	mov %RDI_LP, %RAX_LP
+    mov %RDI_LP, %RAX_LP
 #ifdef USE_AS_MEMPCPY
-	add %RDX_LP, %RAX_LP
+    add %RDX_LP, %RAX_LP
 #endif
 #ifdef __ILP32__
-	/* Clear the upper 32 bits. */
-	mov %edx, %edx
+    /* Clear the upper 32 bits. */
+    mov %edx, %edx
 #endif
 #ifdef USE_AS_MEMMOVE
-	cmp %rsi, %rdi
-	jb L(copy_forward)
-	je L(write_0bytes)
-	cmp $79, %rdx
-	jbe L(copy_forward)
-	jmp L(copy_backward)
+    cmp %rsi, %rdi
+    jb L(copy_forward)
+    je L(write_0bytes)
+    cmp $79, %rdx
+    jbe L(copy_forward)
+    jmp L(copy_backward)
 L(copy_forward):
 #endif
 L(start):
-	cmp $79, %rdx
-	lea L(table_less_80bytes)(%rip), %r11
-	ja L(80bytesormore)
-	movslq (%r11, %rdx, 4), %r9
-	add %rdx, %rsi
-	add %rdx, %rdi
-	add %r11, %r9
-	_CET_NOTRACK jmp *%r9
-	ud2
+    cmp $79, %rdx
+    lea L(table_less_80bytes)(%rip), %r11
+    ja L(80bytesormore)
+    movslq (%r11, %rdx, 4), %r9
+    add %rdx, %rsi
+    add %rdx, %rdi
+    add %r11, %r9
+    _CET_NOTRACK jmp *%r9
+    ud2
-	.p2align 4
+    .p2align 4
 L(80bytesormore):
 #ifndef USE_AS_MEMMOVE
-	cmp %dil, %sil
-	jle L(copy_backward)
+    cmp %dil, %sil
+    jle L(copy_backward)
 #endif
-	movdqu (%rsi), %xmm0
-	mov %rdi, %rcx
-	and $-16, %rdi
-	add $16, %rdi
-	mov %rcx, %r8
-	sub %rdi, %rcx
-	add %rcx, %rdx
-	sub %rcx, %rsi
+    movdqu (%rsi), %xmm0
+    mov %rdi, %rcx
+    and $-16, %rdi
+    add $16, %rdi
+    mov %rcx, %r8
+    sub %rdi, %rcx
+    add %rcx, %rdx
+    sub %rcx, %rsi
 #ifdef SHARED_CACHE_SIZE_HALF
-	mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
+    mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
#else
-	mov __x86_shared_cache_size_half(%rip), %RCX_LP
+    mov __x86_shared_cache_size_half(%rip), %RCX_LP
 #endif
-	cmp %rcx, %rdx
-	mov %rsi, %r9
-	ja L(large_page_fwd)
-	and $0xf, %r9
-	jz L(shl_0)
+    cmp %rcx, %rdx
+    mov %rsi, %r9
+    ja L(large_page_fwd)
+    and $0xf, %r9
+    jz L(shl_0)
 #ifdef DATA_CACHE_SIZE_HALF
-	mov $DATA_CACHE_SIZE_HALF, %RCX_LP
+    mov $DATA_CACHE_SIZE_HALF, %RCX_LP
 #else
-	mov __x86_data_cache_size_half(%rip), %RCX_LP
+    mov __x86_data_cache_size_half(%rip), %RCX_LP
 #endif
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-	.p2align 4
+    .p2align 4
 L(copy_backward):
-	movdqu -16(%rsi, %rdx), %xmm0
-	add %rdx, %rsi
-	lea -16(%rdi, %rdx), %r8
-	add %rdx, %rdi
+    movdqu -16(%rsi, %rdx), %xmm0
+    add %rdx, %rsi
+    lea -16(%rdi, %rdx), %r8
+    add %rdx, %rdi
-	mov %rdi, %rcx
-	and $0xf, %rcx
-	xor %rcx, %rdi
-	sub %rcx, %rdx
-	sub %rcx, %rsi
+    mov %rdi, %rcx
+    and $0xf, %rcx
+    xor %rcx, %rdi
+    sub %rcx, %rdx
+    sub %rcx, %rsi
 #ifdef SHARED_CACHE_SIZE_HALF
-	mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
+    mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
 #else
-	mov __x86_shared_cache_size_half(%rip), %RCX_LP
+    mov __x86_shared_cache_size_half(%rip), %RCX_LP
 #endif
-	cmp %rcx, %rdx
-	mov %rsi, %r9
-	ja L(large_page_bwd)
-	and $0xf, %r9
-	jz L(shl_0_bwd)
+    cmp %rcx, %rdx
+    mov %rsi, %r9
+    ja L(large_page_bwd)
+    and $0xf, %r9
+    jz L(shl_0_bwd)
 #ifdef DATA_CACHE_SIZE_HALF
-	mov $DATA_CACHE_SIZE_HALF, %RCX_LP
+    mov $DATA_CACHE_SIZE_HALF, %RCX_LP
 #else
-	mov __x86_data_cache_size_half(%rip), %RCX_LP
+    mov __x86_data_cache_size_half(%rip), %RCX_LP
 #endif
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0):
-	sub $16, %rdx
-	movdqa (%rsi), %xmm1
-	add $16, %rsi
-	movdqa %xmm1, (%rdi)
-	add $16, %rdi
-	cmp $128, %rdx
-	movdqu %xmm0, (%r8)
-	ja L(shl_0_gobble)
-	cmp $64, %rdx
-	jb L(shl_0_less_64bytes)
-	movaps (%rsi), %xmm4
-	movaps 16(%rsi), %xmm1
-	movaps 32(%rsi), %xmm2
-	movaps 48(%rsi), %xmm3
-	movaps %xmm4, (%rdi)
-	movaps %xmm1, 16(%rdi)
-	movaps %xmm2, 32(%rdi)
-	movaps %xmm3, 48(%rdi)
-	sub $64, %rdx
-	add $64, %rsi
-	add $64, %rdi
+    sub $16, %rdx
+    movdqa (%rsi), %xmm1
+    add $16, %rsi
+    movdqa %xmm1, (%rdi)
+    add $16, %rdi
+    cmp $128, %rdx
+    movdqu %xmm0, (%r8)
+    ja L(shl_0_gobble)
+    cmp $64, %rdx
+    jb L(shl_0_less_64bytes)
+    movaps (%rsi), %xmm4
+    movaps 16(%rsi), %xmm1
+    movaps 32(%rsi), %xmm2
+    movaps 48(%rsi), %xmm3
+    movaps %xmm4, (%rdi)
+    movaps %xmm1, 16(%rdi)
+    movaps %xmm2, 32(%rdi)
+    movaps %xmm3, 48(%rdi)
+    sub $64, %rdx
+    add $64, %rsi
+    add $64, %rdi
 L(shl_0_less_64bytes):
-	add %rdx, %rsi
-	add %rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    add %rdx, %rsi
+    add %rdx, %rdi
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_gobble):
 #ifdef DATA_CACHE_SIZE_HALF
-	cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
+    cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
 #else
-	cmp __x86_data_cache_size_half(%rip), %RDX_LP
+    cmp __x86_data_cache_size_half(%rip), %RDX_LP
 #endif
-	lea -128(%rdx), %rdx
-	jae L(shl_0_gobble_mem_loop)
+    lea -128(%rdx), %rdx
+    jae L(shl_0_gobble_mem_loop)
 L(shl_0_gobble_cache_loop):
-	movdqa (%rsi), %xmm4
-	movaps 0x10(%rsi), %xmm1
-	movaps 0x20(%rsi), %xmm2
-	movaps 0x30(%rsi), %xmm3
+    movdqa (%rsi), %xmm4
+    movaps 0x10(%rsi), %xmm1
+    movaps 0x20(%rsi), %xmm2
+    movaps 0x30(%rsi), %xmm3
-	movdqa %xmm4, (%rdi)
-	movaps %xmm1, 0x10(%rdi)
-	movaps %xmm2, 0x20(%rdi)
-	movaps %xmm3, 0x30(%rdi)
+    movdqa %xmm4, (%rdi)
+    movaps %xmm1, 0x10(%rdi)
+    movaps %xmm2, 0x20(%rdi)
+    movaps %xmm3, 0x30(%rdi)
-	sub $128, %rdx
-	movaps 0x40(%rsi), %xmm4
-	movaps 0x50(%rsi), %xmm5
-	movaps 0x60(%rsi), %xmm6
-	movaps 0x70(%rsi), %xmm7
-	lea 0x80(%rsi), %rsi
-	movaps %xmm4, 0x40(%rdi)
-	movaps %xmm5, 0x50(%rdi)
-	movaps %xmm6, 0x60(%rdi)
-	movaps %xmm7, 0x70(%rdi)
-	lea 0x80(%rdi), %rdi
+    sub $128, %rdx
+    movaps 0x40(%rsi), %xmm4
+    movaps 0x50(%rsi), %xmm5
+    movaps 0x60(%rsi), %xmm6
+    movaps 0x70(%rsi), %xmm7
+    lea 0x80(%rsi), %rsi
+    movaps %xmm4, 0x40(%rdi)
+    movaps %xmm5, 0x50(%rdi)
+    movaps %xmm6, 0x60(%rdi)
+    movaps %xmm7, 0x70(%rdi)
+    lea 0x80(%rdi), %rdi
-	jae L(shl_0_gobble_cache_loop)
-	cmp $-0x40, %rdx
-	lea 0x80(%rdx), %rdx
-	jl L(shl_0_cache_less_64bytes)
+    jae L(shl_0_gobble_cache_loop)
+    cmp $-0x40, %rdx
+    lea 0x80(%rdx), %rdx
+    jl L(shl_0_cache_less_64bytes)
-	movdqa (%rsi), %xmm4
-	sub $0x40, %rdx
-	movdqa 0x10(%rsi), %xmm1
+    movdqa (%rsi), %xmm4
+    sub $0x40, %rdx
+    movdqa 0x10(%rsi), %xmm1
-	movdqa %xmm4, (%rdi)
-	movdqa %xmm1, 0x10(%rdi)
+    movdqa %xmm4, (%rdi)
+    movdqa %xmm1, 0x10(%rdi)
-	movdqa 0x20(%rsi), %xmm4
-	movdqa 0x30(%rsi), %xmm1
-	add $0x40, %rsi
+    movdqa 0x20(%rsi), %xmm4
+    movdqa 0x30(%rsi), %xmm1
+    add $0x40, %rsi
-	movdqa %xmm4, 0x20(%rdi)
-	movdqa %xmm1, 0x30(%rdi)
-	add $0x40, %rdi
+    movdqa %xmm4, 0x20(%rdi)
+    movdqa %xmm1, 0x30(%rdi)
+    add $0x40, %rdi
 L(shl_0_cache_less_64bytes):
-	add %rdx, %rsi
-	add %rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    add %rdx, %rsi
+    add %rdx, %rdi
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_gobble_mem_loop):
-	prefetcht0 0x1c0(%rsi)
-	prefetcht0 0x280(%rsi)
+    prefetcht0 0x1c0(%rsi)
+    prefetcht0 0x280(%rsi)
-	movdqa (%rsi), %xmm0
-	movdqa 0x10(%rsi), %xmm1
-	movdqa 0x20(%rsi), %xmm2
-	movdqa 0x30(%rsi), %xmm3
-	movdqa 0x40(%rsi), %xmm4
-	movdqa 0x50(%rsi), %xmm5
-	movdqa 0x60(%rsi), %xmm6
-	movdqa 0x70(%rsi), %xmm7
-	lea 0x80(%rsi), %rsi
-	sub $0x80, %rdx
-	movdqa %xmm0, (%rdi)
-	movdqa %xmm1, 0x10(%rdi)
-	movdqa %xmm2, 0x20(%rdi)
-	movdqa %xmm3, 0x30(%rdi)
-	movdqa %xmm4, 0x40(%rdi)
-	movdqa %xmm5, 0x50(%rdi)
-	movdqa %xmm6, 0x60(%rdi)
-	movdqa %xmm7, 0x70(%rdi)
-	lea 0x80(%rdi), %rdi
+    movdqa (%rsi), %xmm0
+    movdqa 0x10(%rsi), %xmm1
+    movdqa 0x20(%rsi), %xmm2
+    movdqa 0x30(%rsi), %xmm3
+    movdqa 0x40(%rsi), %xmm4
+    movdqa 0x50(%rsi), %xmm5
+    movdqa 0x60(%rsi), %xmm6
+    movdqa 0x70(%rsi), %xmm7
+    lea 0x80(%rsi), %rsi
+    sub $0x80, %rdx
+    movdqa %xmm0, (%rdi)
+    movdqa %xmm1, 0x10(%rdi)
+    movdqa %xmm2, 0x20(%rdi)
+    movdqa %xmm3, 0x30(%rdi)
+    movdqa %xmm4, 0x40(%rdi)
+    movdqa %xmm5, 0x50(%rdi)
+    movdqa %xmm6, 0x60(%rdi)
+    movdqa %xmm7, 0x70(%rdi)
+    lea 0x80(%rdi), %rdi
-	jae L(shl_0_gobble_mem_loop)
-	cmp $-0x40, %rdx
-	lea 0x80(%rdx), %rdx
-	jl L(shl_0_mem_less_64bytes)
+    jae L(shl_0_gobble_mem_loop)
+    cmp $-0x40, %rdx
+    lea 0x80(%rdx), %rdx
+    jl L(shl_0_mem_less_64bytes)
-	movdqa (%rsi), %xmm0
-	sub $0x40, %rdx
-	movdqa 0x10(%rsi), %xmm1
+    movdqa (%rsi), %xmm0
+    sub $0x40, %rdx
+    movdqa 0x10(%rsi), %xmm1
-	movdqa %xmm0, (%rdi)
-	movdqa %xmm1, 0x10(%rdi)
+    movdqa %xmm0, (%rdi)
+    movdqa %xmm1, 0x10(%rdi)
-	movdqa 0x20(%rsi), %xmm0
-	movdqa 0x30(%rsi), %xmm1
-	add $0x40, %rsi
+    movdqa 0x20(%rsi), %xmm0
+    movdqa 0x30(%rsi), %xmm1
+    add $0x40, %rsi
-	movdqa %xmm0, 0x20(%rdi)
-	movdqa %xmm1, 0x30(%rdi)
-	add $0x40, %rdi
+    movdqa %xmm0, 0x20(%rdi)
+    movdqa %xmm1, 0x30(%rdi)
+    add $0x40, %rdi
 L(shl_0_mem_less_64bytes):
-	cmp $0x20, %rdx
-	jb L(shl_0_mem_less_32bytes)
-	movdqa (%rsi), %xmm0
-	sub $0x20, %rdx
-	movdqa 0x10(%rsi), %xmm1
-	add $0x20, %rsi
-	movdqa %xmm0, (%rdi)
-	movdqa %xmm1, 0x10(%rdi)
-	add $0x20, %rdi
+    cmp $0x20, %rdx
+    jb L(shl_0_mem_less_32bytes)
+    movdqa (%rsi), %xmm0
+    sub $0x20, %rdx
+    movdqa 0x10(%rsi), %xmm1
+    add $0x20, %rsi
+    movdqa %xmm0, (%rdi)
+    movdqa %xmm1, 0x10(%rdi)
+    add $0x20, %rdi
 L(shl_0_mem_less_32bytes):
-	add %rdx, %rdi
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    add %rdx, %rdi
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_bwd):
-	sub $16, %rdx
-	movdqa -0x10(%rsi), %xmm1
-	sub $16, %rsi
-	movdqa %xmm1, -0x10(%rdi)
-	sub $16, %rdi
-	cmp $0x80, %rdx
-	movdqu %xmm0, (%r8)
-	ja L(shl_0_gobble_bwd)
-	cmp $64, %rdx
-	jb L(shl_0_less_64bytes_bwd)
-	movaps -0x10(%rsi), %xmm0
-	movaps -0x20(%rsi), %xmm1
-	movaps -0x30(%rsi), %xmm2
-	movaps -0x40(%rsi), %xmm3
-	movaps %xmm0, -0x10(%rdi)
-	movaps %xmm1, -0x20(%rdi)
-	movaps %xmm2, -0x30(%rdi)
-	movaps %xmm3, -0x40(%rdi)
-	sub $64, %rdx
-	sub $0x40, %rsi
-	sub $0x40, %rdi
+    sub $16, %rdx
+    movdqa -0x10(%rsi), %xmm1
+    sub $16, %rsi
+    movdqa %xmm1, -0x10(%rdi)
+    sub $16, %rdi
+    cmp $0x80, %rdx
+    movdqu %xmm0, (%r8)
+    ja L(shl_0_gobble_bwd)
+    cmp $64, %rdx
+    jb L(shl_0_less_64bytes_bwd)
+    movaps -0x10(%rsi), %xmm0
+    movaps -0x20(%rsi), %xmm1
+    movaps -0x30(%rsi), %xmm2
+    movaps -0x40(%rsi), %xmm3
+    movaps %xmm0, -0x10(%rdi)
+    movaps %xmm1, -0x20(%rdi)
+    movaps %xmm2, -0x30(%rdi)
+    movaps %xmm3, -0x40(%rdi)
+    sub $64, %rdx
+    sub $0x40, %rsi
+    sub $0x40, %rdi
 L(shl_0_less_64bytes_bwd):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_gobble_bwd):
 #ifdef DATA_CACHE_SIZE_HALF
-	cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
+    cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
 #else
-	cmp __x86_data_cache_size_half(%rip), %RDX_LP
+    cmp __x86_data_cache_size_half(%rip), %RDX_LP
 #endif
-	lea -128(%rdx), %rdx
-	jae L(shl_0_gobble_mem_bwd_loop)
+    lea -128(%rdx), %rdx
+    jae L(shl_0_gobble_mem_bwd_loop)
 L(shl_0_gobble_bwd_loop):
-	movdqa -0x10(%rsi), %xmm0
-	movaps -0x20(%rsi), %xmm1
-	movaps -0x30(%rsi), %xmm2
-	movaps -0x40(%rsi), %xmm3
+    movdqa -0x10(%rsi), %xmm0
+    movaps -0x20(%rsi), %xmm1
+    movaps -0x30(%rsi), %xmm2
+    movaps -0x40(%rsi), %xmm3
-	movdqa %xmm0, -0x10(%rdi)
-	movaps %xmm1, -0x20(%rdi)
-	movaps %xmm2, -0x30(%rdi)
-	movaps %xmm3, -0x40(%rdi)
+    movdqa %xmm0, -0x10(%rdi)
+    movaps %xmm1, -0x20(%rdi)
+    movaps %xmm2, -0x30(%rdi)
+    movaps %xmm3, -0x40(%rdi)
-	sub $0x80, %rdx
-	movaps -0x50(%rsi), %xmm4
-	movaps -0x60(%rsi), %xmm5
-	movaps -0x70(%rsi), %xmm6
-	movaps -0x80(%rsi), %xmm7
-	lea -0x80(%rsi), %rsi
-	movaps %xmm4, -0x50(%rdi)
-	movaps %xmm5, -0x60(%rdi)
-	movaps %xmm6, -0x70(%rdi)
-	movaps %xmm7, -0x80(%rdi)
-	lea -0x80(%rdi), %rdi
+    sub $0x80, %rdx
+    movaps -0x50(%rsi), %xmm4
+    movaps -0x60(%rsi), %xmm5
+    movaps -0x70(%rsi), %xmm6
+    movaps -0x80(%rsi), %xmm7
+    lea -0x80(%rsi), %rsi
+    movaps %xmm4, -0x50(%rdi)
+    movaps %xmm5, -0x60(%rdi)
+    movaps %xmm6, -0x70(%rdi)
+    movaps %xmm7, -0x80(%rdi)
+    lea -0x80(%rdi), %rdi
-	jae L(shl_0_gobble_bwd_loop)
-	cmp $-0x40, %rdx
-	lea 0x80(%rdx), %rdx
-	jl L(shl_0_gobble_bwd_less_64bytes)
+    jae L(shl_0_gobble_bwd_loop)
+    cmp $-0x40, %rdx
+    lea 0x80(%rdx), %rdx
+    jl L(shl_0_gobble_bwd_less_64bytes)
-	movdqa -0x10(%rsi), %xmm0
-	sub $0x40, %rdx
-	movdqa -0x20(%rsi), %xmm1
+    movdqa -0x10(%rsi), %xmm0
+    sub $0x40, %rdx
+    movdqa -0x20(%rsi), %xmm1
-	movdqa %xmm0, -0x10(%rdi)
-	movdqa %xmm1, -0x20(%rdi)
+    movdqa %xmm0, -0x10(%rdi)
+    movdqa %xmm1, -0x20(%rdi)
-	movdqa -0x30(%rsi), %xmm0
-	movdqa -0x40(%rsi), %xmm1
-	sub $0x40, %rsi
+    movdqa -0x30(%rsi), %xmm0
+    movdqa -0x40(%rsi), %xmm1
+    sub $0x40, %rsi
-	movdqa %xmm0, -0x30(%rdi)
-	movdqa %xmm1, -0x40(%rdi)
-	sub $0x40, %rdi
+    movdqa %xmm0, -0x30(%rdi)
+    movdqa %xmm1, -0x40(%rdi)
+    sub $0x40, %rdi
 L(shl_0_gobble_bwd_less_64bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_gobble_mem_bwd_loop):
-	prefetcht0 -0x1c0(%rsi)
-	prefetcht0 -0x280(%rsi)
-	movdqa -0x10(%rsi), %xmm0
-	movdqa -0x20(%rsi), %xmm1
-	movdqa -0x30(%rsi), %xmm2
-	movdqa -0x40(%rsi), %xmm3
-	movdqa -0x50(%rsi), %xmm4
-	movdqa -0x60(%rsi), %xmm5
-	movdqa -0x70(%rsi), %xmm6
-	movdqa -0x80(%rsi), %xmm7
-	lea -0x80(%rsi), %rsi
-	sub $0x80, %rdx
-	movdqa %xmm0, -0x10(%rdi)
-	movdqa %xmm1, -0x20(%rdi)
-	movdqa %xmm2, -0x30(%rdi)
-	movdqa %xmm3, -0x40(%rdi)
-	movdqa %xmm4, -0x50(%rdi)
-	movdqa %xmm5, -0x60(%rdi)
-	movdqa %xmm6, -0x70(%rdi)
-	movdqa %xmm7, -0x80(%rdi)
-	lea -0x80(%rdi), %rdi
+    prefetcht0 -0x1c0(%rsi)
+    prefetcht0 -0x280(%rsi)
+    movdqa -0x10(%rsi), %xmm0
+    movdqa -0x20(%rsi), %xmm1
+    movdqa -0x30(%rsi), %xmm2
+    movdqa -0x40(%rsi), %xmm3
+    movdqa -0x50(%rsi), %xmm4
+    movdqa -0x60(%rsi), %xmm5
+    movdqa -0x70(%rsi), %xmm6
+    movdqa -0x80(%rsi), %xmm7
+    lea -0x80(%rsi), %rsi
+    sub $0x80, %rdx
+    movdqa %xmm0, -0x10(%rdi)
+    movdqa %xmm1, -0x20(%rdi)
+    movdqa %xmm2, -0x30(%rdi)
+    movdqa %xmm3, -0x40(%rdi)
+    movdqa %xmm4, -0x50(%rdi)
+    movdqa %xmm5, -0x60(%rdi)
+    movdqa %xmm6, -0x70(%rdi)
+    movdqa %xmm7, -0x80(%rdi)
+    lea -0x80(%rdi), %rdi
-	jae L(shl_0_gobble_mem_bwd_loop)
-	cmp $-0x40, %rdx
-	lea 0x80(%rdx), %rdx
-	jl L(shl_0_mem_bwd_less_64bytes)
+    jae L(shl_0_gobble_mem_bwd_loop)
+    cmp $-0x40, %rdx
+    lea 0x80(%rdx), %rdx
+    jl L(shl_0_mem_bwd_less_64bytes)
-	movdqa -0x10(%rsi), %xmm0
-	sub $0x40, %rdx
-	movdqa -0x20(%rsi), %xmm1
+    movdqa -0x10(%rsi), %xmm0
+    sub $0x40, %rdx
+    movdqa -0x20(%rsi), %xmm1
-	movdqa %xmm0, -0x10(%rdi)
-	movdqa %xmm1, -0x20(%rdi)
+    movdqa %xmm0, -0x10(%rdi)
+    movdqa %xmm1, -0x20(%rdi)
-	movdqa -0x30(%rsi), %xmm0
-	movdqa -0x40(%rsi), %xmm1
-	sub $0x40, %rsi
+    movdqa -0x30(%rsi), %xmm0
+    movdqa -0x40(%rsi), %xmm1
+    sub $0x40, %rsi
-	movdqa %xmm0, -0x30(%rdi)
-	movdqa %xmm1, -0x40(%rdi)
-	sub $0x40, %rdi
+    movdqa %xmm0, -0x30(%rdi)
+    movdqa %xmm1, -0x40(%rdi)
+    sub $0x40, %rdi
 L(shl_0_mem_bwd_less_64bytes):
-	cmp $0x20, %rdx
-	jb L(shl_0_mem_bwd_less_32bytes)
-	movdqa -0x10(%rsi), %xmm0
-	sub $0x20, %rdx
-	movdqa -0x20(%rsi), %xmm1
-	sub $0x20, %rsi
-	movdqa %xmm0, -0x10(%rdi)
-	movdqa %xmm1, -0x20(%rdi)
-	sub $0x20, %rdi
+    cmp $0x20, %rdx
+    jb L(shl_0_mem_bwd_less_32bytes)
+    movdqa -0x10(%rsi), %xmm0
+    sub $0x20, %rdx
+    movdqa -0x20(%rsi), %xmm1
+    sub $0x20, %rsi
+    movdqa %xmm0, -0x10(%rdi)
+    movdqa %xmm1, -0x20(%rdi)
+    sub $0x20, %rdi
 L(shl_0_mem_bwd_less_32bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_1):
-	lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x01(%rsi), %xmm1
-	jb L(L1_fwd)
-	lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
+    lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x01(%rsi), %xmm1
+    jb L(L1_fwd)
+    lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
 L(L1_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_1_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_1_loop_L1):
-	sub $64, %rdx
-	movaps 0x0f(%rsi), %xmm2
-	movaps 0x1f(%rsi), %xmm3
-	movaps 0x2f(%rsi), %xmm4
-	movaps 0x3f(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $1, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $1, %xmm3, %xmm4
-	palignr $1, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $1, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_1_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0f(%rsi), %xmm2
+    movaps 0x1f(%rsi), %xmm3
+    movaps 0x2f(%rsi), %xmm4
+    movaps 0x3f(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $1, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $1, %xmm3, %xmm4
+    palignr $1, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $1, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_1_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_1_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_1_bwd):
-	lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x01(%rsi), %xmm1
-	jb L(L1_bwd)
-	lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x01(%rsi), %xmm1
+    jb L(L1_bwd)
+    lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
 L(L1_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_1_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_1_bwd_loop_L1):
-	movaps -0x11(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x21(%rsi), %xmm3
-	movaps -0x31(%rsi), %xmm4
-	movaps -0x41(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $1, %xmm2, %xmm1
-	palignr $1, %xmm3, %xmm2
-	palignr $1, %xmm4, %xmm3
-	palignr $1, %xmm5, %xmm4
+    movaps -0x11(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x21(%rsi), %xmm3
+    movaps -0x31(%rsi), %xmm4
+    movaps -0x41(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $1, %xmm2, %xmm1
+    palignr $1, %xmm3, %xmm2
+    palignr $1, %xmm4, %xmm3
+    palignr $1, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_1_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_1_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_1_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_2):
-	lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x02(%rsi), %xmm1
-	jb L(L2_fwd)
-	lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
+    lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x02(%rsi), %xmm1
+    jb L(L2_fwd)
+    lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
 L(L2_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_2_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_2_loop_L1):
-	sub $64, %rdx
-	movaps 0x0e(%rsi), %xmm2
-	movaps 0x1e(%rsi), %xmm3
-	movaps 0x2e(%rsi), %xmm4
-	movaps 0x3e(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $2, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $2, %xmm3, %xmm4
-	palignr $2, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $2, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_2_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0e(%rsi), %xmm2
+    movaps 0x1e(%rsi), %xmm3
+    movaps 0x2e(%rsi), %xmm4
+    movaps 0x3e(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $2, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $2, %xmm3, %xmm4
+    palignr $2, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $2, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_2_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_2_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_2_bwd):
-	lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x02(%rsi), %xmm1
-	jb L(L2_bwd)
-	lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x02(%rsi), %xmm1
+    jb L(L2_bwd)
+    lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
 L(L2_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_2_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_2_bwd_loop_L1):
-	movaps -0x12(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x22(%rsi), %xmm3
-	movaps -0x32(%rsi), %xmm4
-	movaps -0x42(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $2, %xmm2, %xmm1
-	palignr $2, %xmm3, %xmm2
-	palignr $2, %xmm4, %xmm3
-	palignr $2, %xmm5, %xmm4
+    movaps -0x12(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x22(%rsi), %xmm3
+    movaps -0x32(%rsi), %xmm4
+    movaps -0x42(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $2, %xmm2, %xmm1
+    palignr $2, %xmm3, %xmm2
+    palignr $2, %xmm4, %xmm3
+    palignr $2, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_2_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_2_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_2_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_3):
-	lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x03(%rsi), %xmm1
-	jb L(L3_fwd)
-	lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
+    lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x03(%rsi), %xmm1
+    jb L(L3_fwd)
+    lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
 L(L3_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_3_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_3_loop_L1):
-	sub $64, %rdx
-	movaps 0x0d(%rsi), %xmm2
-	movaps 0x1d(%rsi), %xmm3
-	movaps 0x2d(%rsi), %xmm4
-	movaps 0x3d(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $3, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $3, %xmm3, %xmm4
-	palignr $3, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $3, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_3_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0d(%rsi), %xmm2
+    movaps 0x1d(%rsi), %xmm3
+    movaps 0x2d(%rsi), %xmm4
+    movaps 0x3d(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $3, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $3, %xmm3, %xmm4
+    palignr $3, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $3, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_3_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_3_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_3_bwd):
-	lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x03(%rsi), %xmm1
-	jb L(L3_bwd)
-	lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x03(%rsi), %xmm1
+    jb L(L3_bwd)
+    lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
 L(L3_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_3_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_3_bwd_loop_L1):
-	movaps -0x13(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x23(%rsi), %xmm3
-	movaps -0x33(%rsi), %xmm4
-	movaps -0x43(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $3, %xmm2, %xmm1
-	palignr $3, %xmm3, %xmm2
-	palignr $3, %xmm4, %xmm3
-	palignr $3, %xmm5, %xmm4
+    movaps -0x13(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x23(%rsi), %xmm3
+    movaps -0x33(%rsi), %xmm4
+    movaps -0x43(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $3, %xmm2, %xmm1
+    palignr $3, %xmm3, %xmm2
+    palignr $3, %xmm4, %xmm3
+    palignr $3, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_3_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_3_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_3_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_4):
-	lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x04(%rsi), %xmm1
-	jb L(L4_fwd)
-	lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
+    lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x04(%rsi), %xmm1
+    jb L(L4_fwd)
+    lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
 L(L4_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_4_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_4_loop_L1):
-	sub $64, %rdx
-	movaps 0x0c(%rsi), %xmm2
-	movaps 0x1c(%rsi), %xmm3
-	movaps 0x2c(%rsi), %xmm4
-	movaps 0x3c(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $4, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $4, %xmm3, %xmm4
-	palignr $4, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $4, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_4_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0c(%rsi), %xmm2
+    movaps 0x1c(%rsi), %xmm3
+    movaps 0x2c(%rsi), %xmm4
+    movaps 0x3c(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $4, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $4, %xmm3, %xmm4
+    palignr $4, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $4, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_4_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_4_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_4_bwd):
-	lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x04(%rsi), %xmm1
-	jb L(L4_bwd)
-	lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x04(%rsi), %xmm1
+    jb L(L4_bwd)
+    lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
 L(L4_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_4_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_4_bwd_loop_L1):
-	movaps -0x14(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x24(%rsi), %xmm3
-	movaps -0x34(%rsi), %xmm4
-	movaps -0x44(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $4, %xmm2, %xmm1
-	palignr $4, %xmm3, %xmm2
-	palignr $4, %xmm4, %xmm3
-	palignr $4, %xmm5, %xmm4
+    movaps -0x14(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x24(%rsi), %xmm3
+    movaps -0x34(%rsi), %xmm4
+    movaps -0x44(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $4, %xmm2, %xmm1
+    palignr $4, %xmm3, %xmm2
+    palignr $4, %xmm4, %xmm3
+    palignr $4, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_4_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_4_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_4_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_5):
-	lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x05(%rsi), %xmm1
-	jb L(L5_fwd)
-	lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
+    lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x05(%rsi), %xmm1
+    jb L(L5_fwd)
+    lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
 L(L5_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_5_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_5_loop_L1):
-	sub $64, %rdx
-	movaps 0x0b(%rsi), %xmm2
-	movaps 0x1b(%rsi), %xmm3
-	movaps 0x2b(%rsi), %xmm4
-	movaps 0x3b(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $5, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $5, %xmm3, %xmm4
-	palignr $5, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $5, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_5_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0b(%rsi), %xmm2
+    movaps 0x1b(%rsi), %xmm3
+    movaps 0x2b(%rsi), %xmm4
+    movaps 0x3b(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $5, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $5, %xmm3, %xmm4
+    palignr $5, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $5, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_5_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_5_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_5_bwd):
-	lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x05(%rsi), %xmm1
-	jb L(L5_bwd)
-	lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x05(%rsi), %xmm1
+    jb L(L5_bwd)
+    lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
 L(L5_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_5_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_5_bwd_loop_L1):
-	movaps -0x15(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x25(%rsi), %xmm3
-	movaps -0x35(%rsi), %xmm4
-	movaps -0x45(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $5, %xmm2, %xmm1
-	palignr $5, %xmm3, %xmm2
-	palignr $5, %xmm4, %xmm3
-	palignr $5, %xmm5, %xmm4
+    movaps -0x15(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x25(%rsi), %xmm3
+    movaps -0x35(%rsi), %xmm4
+    movaps -0x45(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $5, %xmm2, %xmm1
+    palignr $5, %xmm3, %xmm2
+    palignr $5, %xmm4, %xmm3
+    palignr $5, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_5_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_5_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_5_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_6):
-	lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x06(%rsi), %xmm1
-	jb L(L6_fwd)
-	lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
+    lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x06(%rsi), %xmm1
+    jb L(L6_fwd)
+    lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
 L(L6_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_6_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_6_loop_L1):
-	sub $64, %rdx
-	movaps 0x0a(%rsi), %xmm2
-	movaps 0x1a(%rsi), %xmm3
-	movaps 0x2a(%rsi), %xmm4
-	movaps 0x3a(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $6, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $6, %xmm3, %xmm4
-	palignr $6, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
64(%rdi), %rdi - palignr $6, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_6_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + sub $64, %rdx + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $6, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $6, %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $6, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_6_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_6_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_6_bwd): - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_bwd) - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 + lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_bwd) + lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 L(L6_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 L(shl_6_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) + prefetchnta -0x1c0(%rsi) L(shl_6_bwd_loop_L1): - movaps -0x16(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x26(%rsi), %xmm3 - movaps -0x36(%rsi), %xmm4 - movaps -0x46(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $6, %xmm2, %xmm1 - palignr $6, %xmm3, %xmm2 - palignr $6, %xmm4, %xmm3 - palignr $6, %xmm5, %xmm4 + movaps -0x16(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x26(%rsi), %xmm3 + movaps -0x36(%rsi), %xmm4 + movaps -0x46(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $6, %xmm2, %xmm1 + palignr $6, %xmm3, %xmm2 + palignr $6, %xmm4, %xmm3 + palignr $6, %xmm5, %xmm4 - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi - movaps %xmm3, 0x10(%rdi) - jb L(shl_6_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + movaps %xmm3, 0x10(%rdi) + jb L(shl_6_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_6_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_7): - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_fwd) - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 + lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_fwd) + lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 L(L7_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 L(shl_7_loop_L2): - prefetchnta 0x1c0(%rsi) + prefetchnta 0x1c0(%rsi) L(shl_7_loop_L1): - sub $64, %rdx - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 
0x39(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $7, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $7, %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $7, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_7_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + sub $64, %rdx + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $7, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $7, %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $7, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_7_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_7_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_7_bwd): - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_bwd) - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 + lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_bwd) + lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 L(L7_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 L(shl_7_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) + prefetchnta -0x1c0(%rsi) L(shl_7_bwd_loop_L1): - movaps -0x17(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x27(%rsi), %xmm3 - movaps -0x37(%rsi), %xmm4 - movaps -0x47(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $7, %xmm2, %xmm1 - palignr $7, %xmm3, %xmm2 - palignr $7, %xmm4, %xmm3 - palignr $7, %xmm5, %xmm4 + movaps -0x17(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x27(%rsi), %xmm3 + movaps -0x37(%rsi), %xmm4 + movaps -0x47(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $7, %xmm2, %xmm1 + palignr $7, %xmm3, %xmm2 + palignr $7, %xmm4, %xmm3 + palignr $7, %xmm5, %xmm4 - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi - movaps %xmm3, 0x10(%rdi) - jb L(shl_7_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + movaps %xmm3, 0x10(%rdi) + jb L(shl_7_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_7_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_8): - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_fwd) - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 + lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_fwd) + lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 L(L8_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 L(shl_8_loop_L2): - prefetchnta 0x1c0(%rsi) + prefetchnta 
0x1c0(%rsi) L(shl_8_loop_L1): - sub $64, %rdx - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $8, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $8, %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $8, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_8_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 - .p2align 4 + sub $64, %rdx + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $8, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $8, %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $8, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_8_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 + .p2align 4 L(shl_8_end): - lea 64(%rdx), %rdx - movaps %xmm4, -0x20(%rdi) - add %rdx, %rsi - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + lea 64(%rdx), %rdx + movaps %xmm4, -0x20(%rdi) + add %rdx, %rsi + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_8_bwd): - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_bwd) - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 + lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_bwd) + lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 L(L8_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 L(shl_8_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) + prefetchnta -0x1c0(%rsi) L(shl_8_bwd_loop_L1): - movaps -0x18(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x28(%rsi), %xmm3 - movaps -0x38(%rsi), %xmm4 - movaps -0x48(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $8, %xmm2, %xmm1 - palignr $8, %xmm3, %xmm2 - palignr $8, %xmm4, %xmm3 - palignr $8, %xmm5, %xmm4 + movaps -0x18(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x28(%rsi), %xmm3 + movaps -0x38(%rsi), %xmm4 + movaps -0x48(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $8, %xmm2, %xmm1 + palignr $8, %xmm3, %xmm2 + palignr $8, %xmm4, %xmm3 + palignr $8, %xmm5, %xmm4 - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi - movaps %xmm3, 0x10(%rdi) - jb L(shl_8_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + movaps %xmm3, 0x10(%rdi) + jb L(shl_8_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_8_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_9): - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_fwd) - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 + lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_fwd) + lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 
 L(L9_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_9_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_9_loop_L1):
-	sub $64, %rdx
-	movaps 0x07(%rsi), %xmm2
-	movaps 0x17(%rsi), %xmm3
-	movaps 0x27(%rsi), %xmm4
-	movaps 0x37(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $9, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $9, %xmm3, %xmm4
-	palignr $9, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $9, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_9_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x07(%rsi), %xmm2
+    movaps 0x17(%rsi), %xmm3
+    movaps 0x27(%rsi), %xmm4
+    movaps 0x37(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $9, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $9, %xmm3, %xmm4
+    palignr $9, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $9, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_9_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_9_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_9_bwd):
-	lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x09(%rsi), %xmm1
-	jb L(L9_bwd)
-	lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x09(%rsi), %xmm1
+    jb L(L9_bwd)
+    lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
 L(L9_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_9_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_9_bwd_loop_L1):
-	movaps -0x19(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x29(%rsi), %xmm3
-	movaps -0x39(%rsi), %xmm4
-	movaps -0x49(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $9, %xmm2, %xmm1
-	palignr $9, %xmm3, %xmm2
-	palignr $9, %xmm4, %xmm3
-	palignr $9, %xmm5, %xmm4
+    movaps -0x19(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x29(%rsi), %xmm3
+    movaps -0x39(%rsi), %xmm4
+    movaps -0x49(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $9, %xmm2, %xmm1
+    palignr $9, %xmm3, %xmm2
+    palignr $9, %xmm4, %xmm3
+    palignr $9, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_9_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_9_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_9_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_10):
-	lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0a(%rsi), %xmm1
-	jb L(L10_fwd)
-	lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
+    lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0a(%rsi), %xmm1
+    jb L(L10_fwd)
+    lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
 L(L10_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_10_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_10_loop_L1):
-	sub $64, %rdx
-	movaps 0x06(%rsi), %xmm2
-	movaps 0x16(%rsi), %xmm3
-	movaps 0x26(%rsi), %xmm4
-	movaps 0x36(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $10, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $10, %xmm3, %xmm4
-	palignr $10, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $10, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_10_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x06(%rsi), %xmm2
+    movaps 0x16(%rsi), %xmm3
+    movaps 0x26(%rsi), %xmm4
+    movaps 0x36(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $10, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $10, %xmm3, %xmm4
+    palignr $10, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $10, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_10_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_10_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_10_bwd):
-	lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0a(%rsi), %xmm1
-	jb L(L10_bwd)
-	lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0a(%rsi), %xmm1
+    jb L(L10_bwd)
+    lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
 L(L10_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_10_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_10_bwd_loop_L1):
-	movaps -0x1a(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2a(%rsi), %xmm3
-	movaps -0x3a(%rsi), %xmm4
-	movaps -0x4a(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $10, %xmm2, %xmm1
-	palignr $10, %xmm3, %xmm2
-	palignr $10, %xmm4, %xmm3
-	palignr $10, %xmm5, %xmm4
+    movaps -0x1a(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2a(%rsi), %xmm3
+    movaps -0x3a(%rsi), %xmm4
+    movaps -0x4a(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $10, %xmm2, %xmm1
+    palignr $10, %xmm3, %xmm2
+    palignr $10, %xmm4, %xmm3
+    palignr $10, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_10_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_10_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_10_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_11):
-	lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0b(%rsi), %xmm1
-	jb L(L11_fwd)
-	lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
+    lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0b(%rsi), %xmm1
+    jb L(L11_fwd)
+    lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
 L(L11_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_11_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_11_loop_L1):
-	sub $64, %rdx
-	movaps 0x05(%rsi), %xmm2
-	movaps 0x15(%rsi), %xmm3
-	movaps 0x25(%rsi), %xmm4
-	movaps 0x35(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $11, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $11, %xmm3, %xmm4
-	palignr $11, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $11, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_11_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x05(%rsi), %xmm2
+    movaps 0x15(%rsi), %xmm3
+    movaps 0x25(%rsi), %xmm4
+    movaps 0x35(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $11, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $11, %xmm3, %xmm4
+    palignr $11, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $11, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_11_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_11_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_11_bwd):
-	lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0b(%rsi), %xmm1
-	jb L(L11_bwd)
-	lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0b(%rsi), %xmm1
+    jb L(L11_bwd)
+    lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
 L(L11_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_11_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_11_bwd_loop_L1):
-	movaps -0x1b(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2b(%rsi), %xmm3
-	movaps -0x3b(%rsi), %xmm4
-	movaps -0x4b(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $11, %xmm2, %xmm1
-	palignr $11, %xmm3, %xmm2
-	palignr $11, %xmm4, %xmm3
-	palignr $11, %xmm5, %xmm4
+    movaps -0x1b(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2b(%rsi), %xmm3
+    movaps -0x3b(%rsi), %xmm4
+    movaps -0x4b(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $11, %xmm2, %xmm1
+    palignr $11, %xmm3, %xmm2
+    palignr $11, %xmm4, %xmm3
+    palignr $11, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_11_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_11_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_11_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_12):
-	lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0c(%rsi), %xmm1
-	jb L(L12_fwd)
-	lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
+    lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0c(%rsi), %xmm1
+    jb L(L12_fwd)
+    lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
 L(L12_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_12_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_12_loop_L1):
-	sub $64, %rdx
-	movaps 0x04(%rsi), %xmm2
-	movaps 0x14(%rsi), %xmm3
-	movaps 0x24(%rsi), %xmm4
-	movaps 0x34(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $12, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $12, %xmm3, %xmm4
-	palignr $12, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $12, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_12_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x04(%rsi), %xmm2
+    movaps 0x14(%rsi), %xmm3
+    movaps 0x24(%rsi), %xmm4
+    movaps 0x34(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $12, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $12, %xmm3, %xmm4
+    palignr $12, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $12, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_12_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_12_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_12_bwd):
-	lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0c(%rsi), %xmm1
-	jb L(L12_bwd)
-	lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0c(%rsi), %xmm1
+    jb L(L12_bwd)
+    lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
 L(L12_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_12_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_12_bwd_loop_L1):
-	movaps -0x1c(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2c(%rsi), %xmm3
-	movaps -0x3c(%rsi), %xmm4
-	movaps -0x4c(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $12, %xmm2, %xmm1
-	palignr $12, %xmm3, %xmm2
-	palignr $12, %xmm4, %xmm3
-	palignr $12, %xmm5, %xmm4
+    movaps -0x1c(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2c(%rsi), %xmm3
+    movaps -0x3c(%rsi), %xmm4
+    movaps -0x4c(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $12, %xmm2, %xmm1
+    palignr $12, %xmm3, %xmm2
+    palignr $12, %xmm4, %xmm3
+    palignr $12, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_12_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_12_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_12_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_13):
-	lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0d(%rsi), %xmm1
-	jb L(L13_fwd)
-	lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
+    lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0d(%rsi), %xmm1
+    jb L(L13_fwd)
+    lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
 L(L13_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_13_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_13_loop_L1):
-	sub $64, %rdx
-	movaps 0x03(%rsi), %xmm2
-	movaps 0x13(%rsi), %xmm3
-	movaps 0x23(%rsi), %xmm4
-	movaps 0x33(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $13, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $13, %xmm3, %xmm4
-	palignr $13, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $13, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_13_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x03(%rsi), %xmm2
+    movaps 0x13(%rsi), %xmm3
+    movaps 0x23(%rsi), %xmm4
+    movaps 0x33(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $13, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $13, %xmm3, %xmm4
+    palignr $13, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $13, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_13_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_13_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_13_bwd):
-	lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0d(%rsi), %xmm1
-	jb L(L13_bwd)
-	lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0d(%rsi), %xmm1
+    jb L(L13_bwd)
+    lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
 L(L13_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_13_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_13_bwd_loop_L1):
-	movaps -0x1d(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2d(%rsi), %xmm3
-	movaps -0x3d(%rsi), %xmm4
-	movaps -0x4d(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $13, %xmm2, %xmm1
-	palignr $13, %xmm3, %xmm2
-	palignr $13, %xmm4, %xmm3
-	palignr $13, %xmm5, %xmm4
+    movaps -0x1d(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2d(%rsi), %xmm3
+    movaps -0x3d(%rsi), %xmm4
+    movaps -0x4d(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $13, %xmm2, %xmm1
+    palignr $13, %xmm3, %xmm2
+    palignr $13, %xmm4, %xmm3
+    palignr $13, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_13_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_13_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_13_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_14):
-	lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0e(%rsi), %xmm1
-	jb L(L14_fwd)
-	lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
+    lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0e(%rsi), %xmm1
+    jb L(L14_fwd)
+    lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
 L(L14_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_14_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_14_loop_L1):
-	sub $64, %rdx
-	movaps 0x02(%rsi), %xmm2
-	movaps 0x12(%rsi), %xmm3
-	movaps 0x22(%rsi), %xmm4
-	movaps 0x32(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $14, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $14, %xmm3, %xmm4
-	palignr $14, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $14, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_14_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x02(%rsi), %xmm2
+    movaps 0x12(%rsi), %xmm3
+    movaps 0x22(%rsi), %xmm4
+    movaps 0x32(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $14, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $14, %xmm3, %xmm4
+    palignr $14, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $14, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_14_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_14_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_14_bwd):
-	lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0e(%rsi), %xmm1
-	jb L(L14_bwd)
-	lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0e(%rsi), %xmm1
+    jb L(L14_bwd)
+    lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
 L(L14_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_14_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_14_bwd_loop_L1):
-	movaps -0x1e(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2e(%rsi), %xmm3
-	movaps -0x3e(%rsi), %xmm4
-	movaps -0x4e(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $14, %xmm2, %xmm1
-	palignr $14, %xmm3, %xmm2
-	palignr $14, %xmm4, %xmm3
-	palignr $14, %xmm5, %xmm4
+    movaps -0x1e(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2e(%rsi), %xmm3
+    movaps -0x3e(%rsi), %xmm4
+    movaps -0x4e(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $14, %xmm2, %xmm1
+    palignr $14, %xmm3, %xmm2
+    palignr $14, %xmm4, %xmm3
+    palignr $14, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_14_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_14_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_14_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_15):
-	lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0f(%rsi), %xmm1
-	jb L(L15_fwd)
-	lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
+    lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0f(%rsi), %xmm1
+    jb L(L15_fwd)
+    lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
 L(L15_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_15_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_15_loop_L1):
-	sub $64, %rdx
-	movaps 0x01(%rsi), %xmm2
-	movaps 0x11(%rsi), %xmm3
-	movaps 0x21(%rsi), %xmm4
-	movaps 0x31(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $15, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $15, %xmm3, %xmm4
-	palignr $15, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $15, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_15_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x01(%rsi), %xmm2
+    movaps 0x11(%rsi), %xmm3
+    movaps 0x21(%rsi), %xmm4
+    movaps 0x31(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $15, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $15, %xmm3, %xmm4
+    palignr $15, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $15, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_15_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_15_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_15_bwd):
-	lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0f(%rsi), %xmm1
-	jb L(L15_bwd)
-	lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0f(%rsi), %xmm1
+    jb L(L15_bwd)
+    lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
 L(L15_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_15_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_15_bwd_loop_L1):
-	movaps -0x1f(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2f(%rsi), %xmm3
-	movaps -0x3f(%rsi), %xmm4
-	movaps -0x4f(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $15, %xmm2, %xmm1
-	palignr $15, %xmm3, %xmm2
-	palignr $15, %xmm4, %xmm3
-	palignr $15, %xmm5, %xmm4
+    movaps -0x1f(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2f(%rsi), %xmm3
+    movaps -0x3f(%rsi), %xmm4
+    movaps -0x4f(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $15, %xmm2, %xmm1
+    palignr $15, %xmm3, %xmm2
+    palignr $15, %xmm4, %xmm3
+    palignr $15, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_15_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_15_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_15_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(write_72bytes):
-	movdqu -72(%rsi), %xmm0
-	movdqu -56(%rsi), %xmm1
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rcx
-	movdqu %xmm0, -72(%rdi)
-	movdqu %xmm1, -56(%rdi)
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rcx, -8(%rdi)
-	ret
+    movdqu -72(%rsi), %xmm0
+    movdqu -56(%rsi), %xmm1
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rcx
+    movdqu %xmm0, -72(%rdi)
+    movdqu %xmm1, -56(%rdi)
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rcx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_64bytes):
-	movdqu -64(%rsi), %xmm0
-	mov -48(%rsi), %rcx
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -64(%rdi)
-	mov %rcx, -48(%rdi)
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -64(%rsi), %xmm0
+    mov -48(%rsi), %rcx
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -64(%rdi)
+    mov %rcx, -48(%rdi)
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_56bytes):
-	movdqu -56(%rsi), %xmm0
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rcx
-	movdqu %xmm0, -56(%rdi)
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rcx, -8(%rdi)
-	ret
+    movdqu -56(%rsi), %xmm0
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rcx
+    movdqu %xmm0, -56(%rdi)
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rcx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_48bytes):
-	mov -48(%rsi), %rcx
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %rcx, -48(%rdi)
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -48(%rsi), %rcx
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %rcx, -48(%rdi)
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_40bytes):
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_32bytes):
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_24bytes):
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_16bytes):
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_8bytes):
-	mov -8(%rsi), %rdx
-	mov %rdx, -8(%rdi)
+    mov -8(%rsi), %rdx
+    mov %rdx, -8(%rdi)
 L(write_0bytes):
-	ret
+    ret

-	.p2align 4
+    .p2align 4
 L(write_73bytes):
-	movdqu -73(%rsi), %xmm0
-	movdqu -57(%rsi), %xmm1
-	mov -41(%rsi), %rcx
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %r8
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -73(%rdi)
-	movdqu %xmm1, -57(%rdi)
-	mov %rcx, -41(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %r8, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -73(%rsi), %xmm0
+    movdqu -57(%rsi), %xmm1
+    mov -41(%rsi), %rcx
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %r8
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -73(%rdi)
+    movdqu %xmm1, -57(%rdi)
+    mov %rcx, -41(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %r8, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_65bytes):
-	movdqu -65(%rsi), %xmm0
-	movdqu -49(%rsi), %xmm1
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -65(%rdi)
-	movdqu %xmm1, -49(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -65(%rsi), %xmm0
+    movdqu -49(%rsi), %xmm1
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -65(%rdi)
+    movdqu %xmm1, -49(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_57bytes):
-	movdqu -57(%rsi), %xmm0
-	mov -41(%rsi), %r8
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -57(%rdi)
-	mov %r8, -41(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -57(%rsi), %xmm0
+    mov -41(%rsi), %r8
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -57(%rdi)
+    mov %r8, -41(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_49bytes):
-	movdqu -49(%rsi), %xmm0
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -49(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -49(%rsi), %xmm0
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -49(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_41bytes):
-	mov -41(%rsi), %r8
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -1(%rsi), %dl
-	mov %r8, -41(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %dl, -1(%rdi)
-	ret
+    mov -41(%rsi), %r8
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -1(%rsi), %dl
+    mov %r8, -41(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %dl, -1(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_33bytes):
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -1(%rsi), %dl
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %dl, -1(%rdi)
-	ret
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -1(%rsi), %dl
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %dl, -1(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_25bytes):
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -1(%rsi), %dl
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %dl, -1(%rdi)
-	ret
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -1(%rsi), %dl
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %dl, -1(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_17bytes):
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_9bytes):
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_1bytes):
-	mov -1(%rsi), %dl
-	mov %dl, -1(%rdi)
-	ret
+    mov -1(%rsi), %dl
+    mov %dl, -1(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_74bytes):
-	movdqu -74(%rsi), %xmm0
-	movdqu -58(%rsi), %xmm1
-	mov -42(%rsi), %r8
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -74(%rdi)
-	movdqu %xmm1, -58(%rdi)
-	mov %r8, -42(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -74(%rsi), %xmm0
+    movdqu -58(%rsi), %xmm1
+    mov -42(%rsi), %r8
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -74(%rdi)
+    movdqu %xmm1, -58(%rdi)
+    mov %r8, -42(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_66bytes):
-	movdqu -66(%rsi), %xmm0
-	movdqu -50(%rsi), %xmm1
-	mov -42(%rsi), %r8
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -66(%rdi)
-	movdqu %xmm1, -50(%rdi)
-	mov %r8, -42(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -66(%rsi), %xmm0
+    movdqu -50(%rsi), %xmm1
+    mov -42(%rsi), %r8
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -66(%rdi)
+    movdqu %xmm1, -50(%rdi)
+    mov %r8, -42(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_58bytes):
-	movdqu -58(%rsi), %xmm1
-	mov -42(%rsi), %r8
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm1, -58(%rdi)
-	mov %r8, -42(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -58(%rsi), %xmm1
+    mov -42(%rsi), %r8
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm1, -58(%rdi)
+    mov %r8, -42(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_50bytes):
-	movdqu -50(%rsi), %xmm0
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -50(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -50(%rsi), %xmm0
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -50(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_42bytes):
-	mov -42(%rsi), %r8
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r8, -42(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -42(%rsi), %r8
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r8, -42(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_34bytes):
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_26bytes):
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_18bytes):
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_10bytes):
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_2bytes):
-	mov -2(%rsi), %dx
-	mov %dx, -2(%rdi)
-	ret
+    mov -2(%rsi), %dx
+    mov %dx, -2(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_75bytes):
-	movdqu -75(%rsi), %xmm0
-	movdqu -59(%rsi), %xmm1
-	mov -43(%rsi), %r8
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -75(%rdi)
-	movdqu %xmm1, -59(%rdi)
-	mov %r8, -43(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -75(%rsi), %xmm0
+    movdqu -59(%rsi), %xmm1
+    mov -43(%rsi), %r8
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -75(%rdi)
+    movdqu %xmm1, -59(%rdi)
+    mov %r8, -43(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_67bytes):
-	movdqu -67(%rsi), %xmm0
-	movdqu -59(%rsi), %xmm1
-	mov -43(%rsi), %r8
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -67(%rdi)
-	movdqu %xmm1, -59(%rdi)
-	mov %r8, -43(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -67(%rsi), %xmm0
+    movdqu -59(%rsi), %xmm1
+    mov -43(%rsi), %r8
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -67(%rdi)
+    movdqu %xmm1, -59(%rdi)
+    mov %r8, -43(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_59bytes):
-	movdqu -59(%rsi), %xmm0
-	mov -43(%rsi), %r8
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -59(%rdi)
-	mov %r8, -43(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -59(%rsi), %xmm0
+    mov -43(%rsi), %r8
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -59(%rdi)
+    mov %r8, -43(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_51bytes):
-	movdqu -51(%rsi), %xmm0
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -51(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -51(%rsi), %xmm0
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -51(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_43bytes):
-	mov -43(%rsi), %r8
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r8, -43(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -43(%rsi), %r8
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r8, -43(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_35bytes):
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_27bytes):
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_19bytes):
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_11bytes):
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_3bytes):
-	mov -3(%rsi), %dx
-	mov -2(%rsi), %cx
-	mov %dx, -3(%rdi)
-	mov %cx, -2(%rdi)
-	ret
+    mov -3(%rsi), %dx
+    mov -2(%rsi), %cx
+    mov %dx, -3(%rdi)
+    mov %cx, -2(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_76bytes):
-	movdqu -76(%rsi), %xmm0
-	movdqu -60(%rsi), %xmm1
-	mov -44(%rsi), %r8
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -76(%rdi)
-	movdqu %xmm1, -60(%rdi)
-	mov %r8, -44(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -76(%rsi), %xmm0
+    movdqu -60(%rsi), %xmm1
+    mov -44(%rsi), %r8
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -76(%rdi)
+    movdqu %xmm1, -60(%rdi)
+    mov %r8, -44(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_68bytes):
-	movdqu -68(%rsi), %xmm0
-	movdqu -52(%rsi), %xmm1
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -68(%rdi)
-	movdqu %xmm1, -52(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -68(%rsi), %xmm0
+    movdqu -52(%rsi), %xmm1
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -68(%rdi)
+    movdqu %xmm1, -52(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_60bytes):
-	movdqu -60(%rsi), %xmm0
-	mov -44(%rsi), %r8
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -60(%rdi)
-	mov %r8, -44(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -60(%rsi), %xmm0
+    mov -44(%rsi), %r8
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -60(%rdi)
+    mov %r8, -44(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_52bytes):
-	movdqu -52(%rsi), %xmm0
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -52(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -52(%rsi), %xmm0
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -52(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_44bytes):
-	mov -44(%rsi), %r8
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r8, -44(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -44(%rsi), %r8
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r8, -44(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_36bytes):
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_28bytes):
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_20bytes):
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_12bytes):
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_4bytes):
-	mov -4(%rsi), %edx
-	mov %edx, -4(%rdi)
-	ret
+    mov -4(%rsi), %edx
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_77bytes):
-	movdqu -77(%rsi), %xmm0
-	movdqu -61(%rsi), %xmm1
-	mov -45(%rsi), %r8
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -77(%rdi)
-	movdqu %xmm1, -61(%rdi)
-	mov %r8, -45(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -77(%rsi), %xmm0
+    movdqu -61(%rsi), %xmm1
+    mov -45(%rsi), %r8
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -77(%rdi)
+    movdqu %xmm1, -61(%rdi)
+    mov %r8, -45(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_69bytes):
-	movdqu -69(%rsi), %xmm0
-	movdqu -53(%rsi), %xmm1
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -69(%rdi)
-	movdqu %xmm1, -53(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -69(%rsi), %xmm0
+    movdqu -53(%rsi), %xmm1
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -69(%rdi)
+    movdqu %xmm1, -53(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_61bytes):
-	movdqu -61(%rsi), %xmm0
-	mov -45(%rsi), %r8
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -61(%rdi)
-	mov %r8, -45(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -61(%rsi), %xmm0
+    mov -45(%rsi), %r8
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -61(%rdi)
+    mov %r8, -45(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_53bytes):
-	movdqu -53(%rsi), %xmm0
-	mov -45(%rsi), %r8
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -53(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -53(%rsi), %xmm0
+    mov -45(%rsi), %r8
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -53(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_45bytes):
-	mov -45(%rsi), %r8
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %r8, -45(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -45(%rsi), %r8
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %r8, -45(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_37bytes):
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_29bytes):
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_21bytes):
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_13bytes):
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_5bytes):
-	mov -5(%rsi), %edx
-	mov -4(%rsi), %ecx
-	mov %edx, -5(%rdi)
-	mov %ecx, -4(%rdi)
-	ret
+    mov -5(%rsi), %edx
+    mov -4(%rsi), %ecx
+    mov %edx, -5(%rdi)
+    mov %ecx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_78bytes):
-	movdqu -78(%rsi), %xmm0
-	movdqu -62(%rsi), %xmm1
-	mov -46(%rsi), %r8
-	mov -38(%rsi), %r9
-	mov -30(%rsi), %r10
-	mov -22(%rsi), %r11
-	mov -14(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -78(%rdi)
-	movdqu %xmm1, -62(%rdi)
-	mov %r8, -46(%rdi)
-	mov %r9, -38(%rdi)
-	mov %r10, -30(%rdi)
-	mov %r11, -22(%rdi)
-	mov %rcx, -14(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -78(%rsi), %xmm0
+    movdqu -62(%rsi), %xmm1
+    mov -46(%rsi), %r8
+    mov -38(%rsi), %r9
+    mov -30(%rsi), %r10
+    mov -22(%rsi), %r11
+    mov -14(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -78(%rdi)
+    movdqu %xmm1, -62(%rdi)
+    mov %r8, -46(%rdi)
+    mov %r9, -38(%rdi)
+    mov %r10, -30(%rdi)
+    mov %r11, -22(%rdi)
+    mov %rcx, -14(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_70bytes):
-	movdqu -70(%rsi), %xmm0
-	movdqu -54(%rsi), %xmm1
-	mov -38(%rsi), %r9
-	mov -30(%rsi), %r10
-	mov -22(%rsi), %r11
-	mov -14(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -70(%rdi)
-	movdqu %xmm1, -54(%rdi)
-	mov %r9, -38(%rdi)
-	mov %r10, -30(%rdi)
-	mov %r11, -22(%rdi)
-	mov %rcx, -14(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -70(%rsi), %xmm0
+    movdqu -54(%rsi), %xmm1
+    mov -38(%rsi), %r9
+    mov -30(%rsi), %r10
+    mov -22(%rsi), %r11
+    mov -14(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -70(%rdi)
+    movdqu %xmm1, -54(%rdi)
+    mov %r9, -38(%rdi)
+    mov %r10, -30(%rdi)
+    mov %r11, -22(%rdi)
+    mov %rcx, -14(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_62bytes):
-	movdqu -62(%rsi), %xmm0
-	mov -46(%rsi), %r8
-	mov -38(%rsi), %r9
-	mov -30(%rsi), %r10
-	mov -22(%rsi), %r11
-	mov -14(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -62(%rdi)
-	mov %r8, -46(%rdi)
-	mov %r9, -38(%rdi)
-	mov %r10, -30(%rdi)
-	mov %r11, -22(%rdi)
-	mov %rcx, -14(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -62(%rsi), %xmm0
+    mov -46(%rsi), %r8
+    mov -38(%rsi), %r9
+    mov -30(%rsi), %r10
+    mov -22(%rsi), %r11
+    mov -14(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -62(%rdi)
+    mov %r8, -46(%rdi)
+    mov %r9, -38(%rdi)
+    mov %r10, -30(%rdi)
+    mov %r11, -22(%rdi)
+    mov %rcx, -14(%rdi)
+    mov %rdx,
-8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_54bytes): - movdqu -54(%rsi), %xmm0 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -54(%rsi), %xmm0 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_46bytes): - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_38bytes): - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_30bytes): - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_22bytes): - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_14bytes): - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(write_79bytes): - movdqu -79(%rsi), %xmm0 - movdqu -63(%rsi), %xmm1 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -79(%rdi) - movdqu %xmm1, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -79(%rsi), %xmm0 + movdqu -63(%rsi), %xmm1 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -79(%rdi) + movdqu %xmm1, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov 
%r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_71bytes): - movdqu -71(%rsi), %xmm0 - movdqu -55(%rsi), %xmm1 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -71(%rdi) - movdqu %xmm1, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -71(%rsi), %xmm0 + movdqu -55(%rsi), %xmm1 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -71(%rdi) + movdqu %xmm1, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_63bytes): - movdqu -63(%rsi), %xmm0 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -63(%rsi), %xmm0 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_55bytes): - movdqu -55(%rsi), %xmm0 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -55(%rsi), %xmm0 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_47bytes): - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_39bytes): - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_31bytes): - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_23bytes): - mov -23(%rsi), 
%r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_15bytes): - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(large_page_fwd): - movdqu (%rsi), %xmm1 - lea 16(%rsi), %rsi - movdqu %xmm0, (%r8) - movntdq %xmm1, (%rdi) - lea 16(%rdi), %rdi - lea -0x90(%rdx), %rdx + movdqu (%rsi), %xmm1 + lea 16(%rsi), %rsi + movdqu %xmm0, (%r8) + movntdq %xmm1, (%rdi) + lea 16(%rdi), %rdi + lea -0x90(%rdx), %rdx #ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - shl $2, %rcx - cmp %rcx, %rdx - jb L(ll_cache_copy_fwd_start) + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + shl $2, %rcx + cmp %rcx, %rdx + jb L(ll_cache_copy_fwd_start) L(memmove_is_memcpy_fwd): #endif L(large_page_loop): - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(large_page_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_64bytes) + sub $0x80, %rdx + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(large_page_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_64bytes) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx L(large_page_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + add %rdx, %rsi + add %rdx, %rdi + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #ifdef USE_AS_MEMMOVE - .p2align 4 + .p2align 4 L(ll_cache_copy_fwd_start): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x200(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), 
%xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi + prefetcht0 0x1c0(%rsi) + prefetcht0 0x200(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(ll_cache_copy_fwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_fwd_64bytes) + sub $0x80, %rdx + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(ll_cache_copy_fwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_fwd_64bytes) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx L(large_page_ll_less_fwd_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #endif - .p2align 4 + .p2align 4 L(large_page_bwd): - movdqu -0x10(%rsi), %xmm1 - lea -16(%rsi), %rsi - movdqu %xmm0, (%r8) - movdqa %xmm1, -0x10(%rdi) - lea -16(%rdi), %rdi - lea -0x90(%rdx), %rdx + movdqu -0x10(%rsi), %xmm1 + lea -16(%rsi), %rsi + movdqu %xmm0, (%r8) + movdqa %xmm1, -0x10(%rdi) + lea -16(%rdi), %rdi + lea -0x90(%rdx), %rdx #ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jb L(ll_cache_copy_bwd_start) + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jb L(ll_cache_copy_bwd_start) L(memmove_is_memcpy_bwd): #endif L(large_page_bwd_loop): - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - movntdq %xmm4, -0x50(%rdi) - movntdq %xmm5, -0x60(%rdi) - movntdq %xmm6, -0x70(%rdi) - movntdq %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(large_page_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_bwd_64bytes) + sub $0x80, %rdx + movntdq %xmm0, -0x10(%rdi) + 
movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + movntdq %xmm4, -0x50(%rdi) + movntdq %xmm5, -0x60(%rdi) + movntdq %xmm6, -0x70(%rdi) + movntdq %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(large_page_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_bwd_64bytes) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx L(large_page_less_bwd_64bytes): - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #ifdef USE_AS_MEMMOVE - .p2align 4 + .p2align 4 L(ll_cache_copy_bwd_start): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x200(%rsi) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x200(%rsi) + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(ll_cache_copy_bwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_bwd_64bytes) + sub $0x80, %rdx + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(ll_cache_copy_bwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_bwd_64bytes) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx L(large_page_ll_less_bwd_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #endif END (MEMCPY) - .section .rodata.ssse3,"a",@progbits - .p2align 3 + .section .rodata.ssse3,"a",@progbits + .p2align 3 L(table_less_80bytes): - .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_2bytes), 
L(table_less_80bytes)) - .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) - .int 
JMPTBL (L(write_66bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_49bytes), 
L(table_less_80bytes)) + .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) - .p2align 3 + .p2align 3 L(shl_table): - .int JMPTBL (L(shl_0), L(shl_table)) - .int JMPTBL (L(shl_1), L(shl_table)) - .int JMPTBL (L(shl_2), L(shl_table)) - .int JMPTBL (L(shl_3), L(shl_table)) - .int JMPTBL (L(shl_4), L(shl_table)) - .int JMPTBL (L(shl_5), L(shl_table)) - .int JMPTBL (L(shl_6), L(shl_table)) - .int JMPTBL (L(shl_7), L(shl_table)) - .int JMPTBL (L(shl_8), L(shl_table)) - .int JMPTBL (L(shl_9), L(shl_table)) - .int JMPTBL (L(shl_10), L(shl_table)) - .int JMPTBL (L(shl_11), L(shl_table)) - .int JMPTBL (L(shl_12), L(shl_table)) - .int JMPTBL (L(shl_13), L(shl_table)) - .int JMPTBL (L(shl_14), L(shl_table)) - .int JMPTBL (L(shl_15), L(shl_table)) + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) - .p2align 3 + .p2align 3 L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), 
L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) #endif diff --git a/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S index 9ee6f0a71c3..2de73b29a85 100644 --- a/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S +++ b/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S @@ -1,12 +1,12 @@ #if 1 -# define VEC_SIZE 32 -# define VEC(i) ymm##i -# define VMOVNT vmovntdq -# define VMOVU vmovdqu -# define VMOVA vmovdqa +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa -# define SECTION(p) p##.avx -# define MEMMOVE_SYMBOL(p,s) p##_avx_##s +# define SECTION(p) p##.avx +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s # include "memmove-vec-unaligned-erms.S" #endif diff --git a/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S b/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S index b14d92fd6a8..3effa845274 100644 --- a/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S +++ b/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S @@ -22,396 +22,396 @@ # include "asm-syntax.h" - .section .text.avx512,"ax",@progbits + .section .text.avx512,"ax",@progbits ENTRY (__mempcpy_chk_avx512_no_vzeroupper) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (__mempcpy_chk_avx512_no_vzeroupper) ENTRY (__mempcpy_avx512_no_vzeroupper) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) END (__mempcpy_avx512_no_vzeroupper) ENTRY (__memmove_chk_avx512_no_vzeroupper) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (__memmove_chk_avx512_no_vzeroupper) ENTRY (__memmove_avx512_no_vzeroupper) - mov %RDI_LP, %RAX_LP + mov %RDI_LP, %RAX_LP # ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP + add %RDX_LP, %RAX_LP # endif L(start): # ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx # endif - lea (%rsi, %rdx), %rcx - lea (%rdi, %rdx), %r9 - cmp $512, %rdx - ja L(512bytesormore) + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx + ja L(512bytesormore) L(check): - cmp $16, %rdx - jbe L(less_16bytes) - cmp $256, %rdx - jb L(less_256bytes) - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups 0x80(%rsi), %zmm2 - vmovups 0xC0(%rsi), %zmm3 - vmovups -0x100(%rcx), %zmm4 - vmovups -0xC0(%rcx), %zmm5 - vmovups -0x80(%rcx), %zmm6 - vmovups -0x40(%rcx), %zmm7 - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, 0x80(%rdi) - vmovups %zmm3, 0xC0(%rdi) - vmovups %zmm4, -0x100(%r9) - vmovups %zmm5, -0xC0(%r9) - vmovups %zmm6, -0x80(%r9) - vmovups %zmm7, -0x40(%r9) - ret + cmp $16, %rdx + jbe L(less_16bytes) + cmp $256, %rdx + jb L(less_256bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups -0x100(%rcx), %zmm4 + vmovups -0xC0(%rcx), %zmm5 + vmovups -0x80(%rcx), %zmm6 + vmovups -0x40(%rcx), %zmm7 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, -0x100(%r9) + vmovups %zmm5, -0xC0(%r9) + vmovups %zmm6, -0x80(%r9) + vmovups %zmm7, -0x40(%r9) + ret L(less_256bytes): - cmp $128, %dl - jb L(less_128bytes) - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups -0x80(%rcx), %zmm2 - vmovups -0x40(%rcx), %zmm3 - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, -0x80(%r9) - vmovups %zmm3, -0x40(%r9) - ret + cmp $128, %dl + jb L(less_128bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups -0x80(%rcx), %zmm2 + vmovups -0x40(%rcx), %zmm3 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, -0x80(%r9) + vmovups %zmm3, -0x40(%r9) + ret L(less_128bytes): - cmp $64, %dl - jb L(less_64bytes) - vmovdqu (%rsi), %ymm0 - vmovdqu 0x20(%rsi), %ymm1 - vmovdqu -0x40(%rcx), %ymm2 - vmovdqu -0x20(%rcx), %ymm3 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, 0x20(%rdi) - vmovdqu %ymm2, -0x40(%r9) - vmovdqu %ymm3, -0x20(%r9) - ret + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu -0x40(%rcx), %ymm2 + vmovdqu -0x20(%rcx), %ymm3 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 0x20(%rdi) + vmovdqu %ymm2, -0x40(%r9) + vmovdqu %ymm3, -0x20(%r9) + ret L(less_64bytes): - cmp $32, %dl - jb L(less_32bytes) - vmovdqu (%rsi), %ymm0 - vmovdqu -0x20(%rcx), %ymm1 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, -0x20(%r9) - ret + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu -0x20(%rcx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -0x20(%r9) + ret L(less_32bytes): - vmovdqu (%rsi), %xmm0 - vmovdqu -0x10(%rcx), %xmm1 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, -0x10(%r9) - ret + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rcx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -0x10(%r9) + ret L(less_16bytes): - cmp $8, %dl - jb L(less_8bytes) - movq (%rsi), %rsi - movq -0x8(%rcx), %rcx - movq %rsi, (%rdi) - movq %rcx, -0x8(%r9) - ret + cmp $8, %dl + jb L(less_8bytes) + movq (%rsi), %rsi + movq -0x8(%rcx), %rcx + movq %rsi, (%rdi) + movq %rcx, -0x8(%r9) + ret L(less_8bytes): - cmp $4, %dl - jb L(less_4bytes) - mov (%rsi), %esi - mov -0x4(%rcx), %ecx - mov %esi, (%rdi) - mov %ecx, -0x4(%r9) - ret + cmp $4, %dl + jb L(less_4bytes) + mov (%rsi), %esi + mov -0x4(%rcx), %ecx + mov %esi, (%rdi) + mov %ecx, -0x4(%r9) + ret L(less_4bytes): - cmp $2, %dl - jb L(less_2bytes) - mov (%rsi), %si - mov -0x2(%rcx), %cx - mov %si, (%rdi) - mov 
%cx, -0x2(%r9) - ret + cmp $2, %dl + jb L(less_2bytes) + mov (%rsi), %si + mov -0x2(%rcx), %cx + mov %si, (%rdi) + mov %cx, -0x2(%r9) + ret L(less_2bytes): - cmp $1, %dl - jb L(less_1bytes) - mov (%rsi), %cl - mov %cl, (%rdi) + cmp $1, %dl + jb L(less_1bytes) + mov (%rsi), %cl + mov %cl, (%rdi) L(less_1bytes): - ret + ret L(512bytesormore): # ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %r8 + mov $SHARED_CACHE_SIZE_HALF, %r8 # else - mov __x86_shared_cache_size_half(%rip), %r8 + mov __x86_shared_cache_size_half(%rip), %r8 # endif - cmp %r8, %rdx - jae L(preloop_large) - cmp $1024, %rdx - ja L(1024bytesormore) - prefetcht1 (%rsi) - prefetcht1 0x40(%rsi) - prefetcht1 0x80(%rsi) - prefetcht1 0xC0(%rsi) - prefetcht1 0x100(%rsi) - prefetcht1 0x140(%rsi) - prefetcht1 0x180(%rsi) - prefetcht1 0x1C0(%rsi) - prefetcht1 -0x200(%rcx) - prefetcht1 -0x1C0(%rcx) - prefetcht1 -0x180(%rcx) - prefetcht1 -0x140(%rcx) - prefetcht1 -0x100(%rcx) - prefetcht1 -0xC0(%rcx) - prefetcht1 -0x80(%rcx) - prefetcht1 -0x40(%rcx) - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups 0x80(%rsi), %zmm2 - vmovups 0xC0(%rsi), %zmm3 - vmovups 0x100(%rsi), %zmm4 - vmovups 0x140(%rsi), %zmm5 - vmovups 0x180(%rsi), %zmm6 - vmovups 0x1C0(%rsi), %zmm7 - vmovups -0x200(%rcx), %zmm8 - vmovups -0x1C0(%rcx), %zmm9 - vmovups -0x180(%rcx), %zmm10 - vmovups -0x140(%rcx), %zmm11 - vmovups -0x100(%rcx), %zmm12 - vmovups -0xC0(%rcx), %zmm13 - vmovups -0x80(%rcx), %zmm14 - vmovups -0x40(%rcx), %zmm15 - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, 0x80(%rdi) - vmovups %zmm3, 0xC0(%rdi) - vmovups %zmm4, 0x100(%rdi) - vmovups %zmm5, 0x140(%rdi) - vmovups %zmm6, 0x180(%rdi) - vmovups %zmm7, 0x1C0(%rdi) - vmovups %zmm8, -0x200(%r9) - vmovups %zmm9, -0x1C0(%r9) - vmovups %zmm10, -0x180(%r9) - vmovups %zmm11, -0x140(%r9) - vmovups %zmm12, -0x100(%r9) - vmovups %zmm13, -0xC0(%r9) - vmovups %zmm14, -0x80(%r9) - vmovups %zmm15, -0x40(%r9) - ret + cmp %r8, %rdx + jae L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + prefetcht1 -0x200(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0x40(%rcx) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + vmovups %zmm8, -0x200(%r9) + vmovups %zmm9, -0x1C0(%r9) + vmovups %zmm10, -0x180(%r9) + vmovups %zmm11, -0x140(%r9) + vmovups %zmm12, -0x100(%r9) + vmovups %zmm13, -0xC0(%r9) + vmovups %zmm14, -0x80(%r9) + vmovups %zmm15, -0x40(%r9) + ret L(1024bytesormore): - cmp %rsi, %rdi - ja L(1024bytesormore_bkw) - sub $512, %r9 - vmovups -0x200(%rcx), %zmm8 - vmovups -0x1C0(%rcx), %zmm9 - vmovups -0x180(%rcx), %zmm10 - vmovups 
-0x140(%rcx), %zmm11 - vmovups -0x100(%rcx), %zmm12 - vmovups -0xC0(%rcx), %zmm13 - vmovups -0x80(%rcx), %zmm14 - vmovups -0x40(%rcx), %zmm15 - prefetcht1 (%rsi) - prefetcht1 0x40(%rsi) - prefetcht1 0x80(%rsi) - prefetcht1 0xC0(%rsi) - prefetcht1 0x100(%rsi) - prefetcht1 0x140(%rsi) - prefetcht1 0x180(%rsi) - prefetcht1 0x1C0(%rsi) + cmp %rsi, %rdi + ja L(1024bytesormore_bkw) + sub $512, %r9 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) /* Loop with unaligned memory access. */ L(gobble_512bytes_loop): - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups 0x80(%rsi), %zmm2 - vmovups 0xC0(%rsi), %zmm3 - vmovups 0x100(%rsi), %zmm4 - vmovups 0x140(%rsi), %zmm5 - vmovups 0x180(%rsi), %zmm6 - vmovups 0x1C0(%rsi), %zmm7 - add $512, %rsi - prefetcht1 (%rsi) - prefetcht1 0x40(%rsi) - prefetcht1 0x80(%rsi) - prefetcht1 0xC0(%rsi) - prefetcht1 0x100(%rsi) - prefetcht1 0x140(%rsi) - prefetcht1 0x180(%rsi) - prefetcht1 0x1C0(%rsi) - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, 0x80(%rdi) - vmovups %zmm3, 0xC0(%rdi) - vmovups %zmm4, 0x100(%rdi) - vmovups %zmm5, 0x140(%rdi) - vmovups %zmm6, 0x180(%rdi) - vmovups %zmm7, 0x1C0(%rdi) - add $512, %rdi - cmp %r9, %rdi - jb L(gobble_512bytes_loop) - vmovups %zmm8, (%r9) - vmovups %zmm9, 0x40(%r9) - vmovups %zmm10, 0x80(%r9) - vmovups %zmm11, 0xC0(%r9) - vmovups %zmm12, 0x100(%r9) - vmovups %zmm13, 0x140(%r9) - vmovups %zmm14, 0x180(%r9) - vmovups %zmm15, 0x1C0(%r9) - ret + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + add $512, %rsi + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + add $512, %rdi + cmp %r9, %rdi + jb L(gobble_512bytes_loop) + vmovups %zmm8, (%r9) + vmovups %zmm9, 0x40(%r9) + vmovups %zmm10, 0x80(%r9) + vmovups %zmm11, 0xC0(%r9) + vmovups %zmm12, 0x100(%r9) + vmovups %zmm13, 0x140(%r9) + vmovups %zmm14, 0x180(%r9) + vmovups %zmm15, 0x1C0(%r9) + ret L(1024bytesormore_bkw): - add $512, %rdi - vmovups 0x1C0(%rsi), %zmm8 - vmovups 0x180(%rsi), %zmm9 - vmovups 0x140(%rsi), %zmm10 - vmovups 0x100(%rsi), %zmm11 - vmovups 0xC0(%rsi), %zmm12 - vmovups 0x80(%rsi), %zmm13 - vmovups 0x40(%rsi), %zmm14 - vmovups (%rsi), %zmm15 - prefetcht1 -0x40(%rcx) - prefetcht1 -0x80(%rcx) - prefetcht1 -0xC0(%rcx) - prefetcht1 -0x100(%rcx) - prefetcht1 -0x140(%rcx) - prefetcht1 -0x180(%rcx) - prefetcht1 -0x1C0(%rcx) - prefetcht1 -0x200(%rcx) + add $512, %rdi + vmovups 0x1C0(%rsi), %zmm8 + vmovups 0x180(%rsi), %zmm9 + vmovups 0x140(%rsi), %zmm10 + vmovups 0x100(%rsi), %zmm11 + vmovups 0xC0(%rsi), %zmm12 + vmovups 0x80(%rsi), %zmm13 + vmovups 0x40(%rsi), %zmm14 + vmovups (%rsi), %zmm15 + prefetcht1 -0x40(%rcx) + prefetcht1 
-0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) /* Backward loop with unaligned memory access. */ L(gobble_512bytes_loop_bkw): - vmovups -0x40(%rcx), %zmm0 - vmovups -0x80(%rcx), %zmm1 - vmovups -0xC0(%rcx), %zmm2 - vmovups -0x100(%rcx), %zmm3 - vmovups -0x140(%rcx), %zmm4 - vmovups -0x180(%rcx), %zmm5 - vmovups -0x1C0(%rcx), %zmm6 - vmovups -0x200(%rcx), %zmm7 - sub $512, %rcx - prefetcht1 -0x40(%rcx) - prefetcht1 -0x80(%rcx) - prefetcht1 -0xC0(%rcx) - prefetcht1 -0x100(%rcx) - prefetcht1 -0x140(%rcx) - prefetcht1 -0x180(%rcx) - prefetcht1 -0x1C0(%rcx) - prefetcht1 -0x200(%rcx) - vmovups %zmm0, -0x40(%r9) - vmovups %zmm1, -0x80(%r9) - vmovups %zmm2, -0xC0(%r9) - vmovups %zmm3, -0x100(%r9) - vmovups %zmm4, -0x140(%r9) - vmovups %zmm5, -0x180(%r9) - vmovups %zmm6, -0x1C0(%r9) - vmovups %zmm7, -0x200(%r9) - sub $512, %r9 - cmp %rdi, %r9 - ja L(gobble_512bytes_loop_bkw) - vmovups %zmm8, -0x40(%rdi) - vmovups %zmm9, -0x80(%rdi) - vmovups %zmm10, -0xC0(%rdi) - vmovups %zmm11, -0x100(%rdi) - vmovups %zmm12, -0x140(%rdi) - vmovups %zmm13, -0x180(%rdi) - vmovups %zmm14, -0x1C0(%rdi) - vmovups %zmm15, -0x200(%rdi) - ret + vmovups -0x40(%rcx), %zmm0 + vmovups -0x80(%rcx), %zmm1 + vmovups -0xC0(%rcx), %zmm2 + vmovups -0x100(%rcx), %zmm3 + vmovups -0x140(%rcx), %zmm4 + vmovups -0x180(%rcx), %zmm5 + vmovups -0x1C0(%rcx), %zmm6 + vmovups -0x200(%rcx), %zmm7 + sub $512, %rcx + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + vmovups %zmm0, -0x40(%r9) + vmovups %zmm1, -0x80(%r9) + vmovups %zmm2, -0xC0(%r9) + vmovups %zmm3, -0x100(%r9) + vmovups %zmm4, -0x140(%r9) + vmovups %zmm5, -0x180(%r9) + vmovups %zmm6, -0x1C0(%r9) + vmovups %zmm7, -0x200(%r9) + sub $512, %r9 + cmp %rdi, %r9 + ja L(gobble_512bytes_loop_bkw) + vmovups %zmm8, -0x40(%rdi) + vmovups %zmm9, -0x80(%rdi) + vmovups %zmm10, -0xC0(%rdi) + vmovups %zmm11, -0x100(%rdi) + vmovups %zmm12, -0x140(%rdi) + vmovups %zmm13, -0x180(%rdi) + vmovups %zmm14, -0x1C0(%rdi) + vmovups %zmm15, -0x200(%rdi) + ret L(preloop_large): - cmp %rsi, %rdi - ja L(preloop_large_bkw) - vmovups (%rsi), %zmm4 - vmovups 0x40(%rsi), %zmm5 + cmp %rsi, %rdi + ja L(preloop_large_bkw) + vmovups (%rsi), %zmm4 + vmovups 0x40(%rsi), %zmm5 - mov %rdi, %r11 + mov %rdi, %r11 /* Align destination for access with non-temporal stores in the loop. 
*/
- mov %rdi, %r8
- and $-0x80, %rdi
- add $0x80, %rdi
- sub %rdi, %r8
- sub %r8, %rsi
- add %r8, %rdx
+ mov %rdi, %r8
+ and $-0x80, %rdi
+ add $0x80, %rdi
+ sub %rdi, %r8
+ sub %r8, %rsi
+ add %r8, %rdx
L(gobble_256bytes_nt_loop):
- prefetcht1 0x200(%rsi)
- prefetcht1 0x240(%rsi)
- prefetcht1 0x280(%rsi)
- prefetcht1 0x2C0(%rsi)
- prefetcht1 0x300(%rsi)
- prefetcht1 0x340(%rsi)
- prefetcht1 0x380(%rsi)
- prefetcht1 0x3C0(%rsi)
- vmovdqu64 (%rsi), %zmm0
- vmovdqu64 0x40(%rsi), %zmm1
- vmovdqu64 0x80(%rsi), %zmm2
- vmovdqu64 0xC0(%rsi), %zmm3
- vmovntdq %zmm0, (%rdi)
- vmovntdq %zmm1, 0x40(%rdi)
- vmovntdq %zmm2, 0x80(%rdi)
- vmovntdq %zmm3, 0xC0(%rdi)
- sub $256, %rdx
- add $256, %rsi
- add $256, %rdi
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop)
- sfence
- vmovups %zmm4, (%r11)
- vmovups %zmm5, 0x40(%r11)
- jmp L(check)
+ prefetcht1 0x200(%rsi)
+ prefetcht1 0x240(%rsi)
+ prefetcht1 0x280(%rsi)
+ prefetcht1 0x2C0(%rsi)
+ prefetcht1 0x300(%rsi)
+ prefetcht1 0x340(%rsi)
+ prefetcht1 0x380(%rsi)
+ prefetcht1 0x3C0(%rsi)
+ vmovdqu64 (%rsi), %zmm0
+ vmovdqu64 0x40(%rsi), %zmm1
+ vmovdqu64 0x80(%rsi), %zmm2
+ vmovdqu64 0xC0(%rsi), %zmm3
+ vmovntdq %zmm0, (%rdi)
+ vmovntdq %zmm1, 0x40(%rdi)
+ vmovntdq %zmm2, 0x80(%rdi)
+ vmovntdq %zmm3, 0xC0(%rdi)
+ sub $256, %rdx
+ add $256, %rsi
+ add $256, %rdi
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop)
+ sfence
+ vmovups %zmm4, (%r11)
+ vmovups %zmm5, 0x40(%r11)
+ jmp L(check)
L(preloop_large_bkw):
- vmovups -0x80(%rcx), %zmm4
- vmovups -0x40(%rcx), %zmm5
+ vmovups -0x80(%rcx), %zmm4
+ vmovups -0x40(%rcx), %zmm5
/* Align end of destination for access with non-temporal stores. */
- mov %r9, %r8
- and $-0x80, %r9
- sub %r9, %r8
- sub %r8, %rcx
- sub %r8, %rdx
- add %r9, %r8
+ mov %r9, %r8
+ and $-0x80, %r9
+ sub %r9, %r8
+ sub %r8, %rcx
+ sub %r8, %rdx
+ add %r9, %r8
L(gobble_256bytes_nt_loop_bkw):
- prefetcht1 -0x400(%rcx)
- prefetcht1 -0x3C0(%rcx)
- prefetcht1 -0x380(%rcx)
- prefetcht1 -0x340(%rcx)
- prefetcht1 -0x300(%rcx)
- prefetcht1 -0x2C0(%rcx)
- prefetcht1 -0x280(%rcx)
- prefetcht1 -0x240(%rcx)
- vmovdqu64 -0x100(%rcx), %zmm0
- vmovdqu64 -0xC0(%rcx), %zmm1
- vmovdqu64 -0x80(%rcx), %zmm2
- vmovdqu64 -0x40(%rcx), %zmm3
- vmovntdq %zmm0, -0x100(%r9)
- vmovntdq %zmm1, -0xC0(%r9)
- vmovntdq %zmm2, -0x80(%r9)
- vmovntdq %zmm3, -0x40(%r9)
- sub $256, %rdx
- sub $256, %rcx
- sub $256, %r9
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop_bkw)
- sfence
- vmovups %zmm4, -0x80(%r8)
- vmovups %zmm5, -0x40(%r8)
- jmp L(check)
+ prefetcht1 -0x400(%rcx)
+ prefetcht1 -0x3C0(%rcx)
+ prefetcht1 -0x380(%rcx)
+ prefetcht1 -0x340(%rcx)
+ prefetcht1 -0x300(%rcx)
+ prefetcht1 -0x2C0(%rcx)
+ prefetcht1 -0x280(%rcx)
+ prefetcht1 -0x240(%rcx)
+ vmovdqu64 -0x100(%rcx), %zmm0
+ vmovdqu64 -0xC0(%rcx), %zmm1
+ vmovdqu64 -0x80(%rcx), %zmm2
+ vmovdqu64 -0x40(%rcx), %zmm3
+ vmovntdq %zmm0, -0x100(%r9)
+ vmovntdq %zmm1, -0xC0(%r9)
+ vmovntdq %zmm2, -0x80(%r9)
+ vmovntdq %zmm3, -0x40(%r9)
+ sub $256, %rdx
+ sub $256, %rcx
+ sub $256, %r9
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop_bkw)
+ sfence
+ vmovups %zmm4, -0x80(%r8)
+ vmovups %zmm5, -0x40(%r8)
+ jmp L(check)
END (__memmove_avx512_no_vzeroupper)
strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
diff --git a/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S
index db70fdf1b4e..9666b05f1c5 100644
--- a/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S
+++ b/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S
@@ -1,12 +1,12 @@
#if 1
-# define VEC_SIZE 64
-# define VEC(i) zmm##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VEC_SIZE 64
+# define VEC(i) zmm##i
+# define VMOVNT vmovntdq
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
-# define SECTION(p) p##.avx512
-# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
+# define SECTION(p) p##.avx512
+# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
# include "memmove-vec-unaligned-erms.S"
#endif
diff --git a/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S
index 17b4f861621..ad405be479e 100644
--- a/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S
+++ b/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S
@@ -17,7 +17,7 @@
. */
#if 1
-# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
+# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
#else
weak_alias (__mempcpy, mempcpy)
#endif
diff --git a/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S
index 21be351b4e7..097ff6ca617 100644
--- a/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S
+++ b/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S
@@ -37,15 +37,15 @@
#include "sysdep.h"
#ifndef MEMCPY_SYMBOL
-# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef MEMPCPY_SYMBOL
-# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef MEMMOVE_CHK_SYMBOL
-# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef VZEROUPPER
@@ -70,17 +70,17 @@
#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base)
+ PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE)base)
+ PREFETCH ((offset)base); \
+ PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
+ PREFETCH ((offset)base); \
+ PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
+ PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
+ PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
# error Unsupported PREFETCHED_LOAD_SIZE!
# endif
@@ -92,100 +92,100 @@
# error SECTION is not defined!
#endif - .section SECTION(.text),"ax",@progbits + .section SECTION(.text),"ax",@progbits #if defined SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) #endif ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) #if defined SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) #endif ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) - movq %rdi, %rax + movq %rdi, %rax L(start): # ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx + /* Clear the upper 32 bits. */ + movl %edx, %edx # endif - cmp $VEC_SIZE, %RDX_LP - jb L(less_vec) - cmp $(VEC_SIZE * 2), %RDX_LP - ja L(more_2x_vec) + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) #if !defined USE_MULTIARCH L(last_2x_vec): #endif - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ - VMOVU (%rsi), %VEC(0) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) - VZEROUPPER + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VZEROUPPER #if !defined USE_MULTIARCH L(nop): #endif - ret + ret #if defined USE_MULTIARCH END (MEMMOVE_SYMBOL (__memmove, unaligned)) # if VEC_SIZE == 16 ENTRY (__mempcpy_chk_erms) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (__mempcpy_chk_erms) /* Only used to measure performance of REP MOVSB. */ ENTRY (__mempcpy_erms) - mov %RDI_LP, %RAX_LP - /* Skip zero length. */ - test %RDX_LP, %RDX_LP - jz 2f - add %RDX_LP, %RAX_LP - jmp L(start_movsb) + mov %RDI_LP, %RAX_LP + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f + add %RDX_LP, %RAX_LP + jmp L(start_movsb) END (__mempcpy_erms) ENTRY (__memmove_chk_erms) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (__memmove_chk_erms) ENTRY (__memmove_erms) - movq %rdi, %rax - /* Skip zero length. */ - test %RDX_LP, %RDX_LP - jz 2f + movq %rdi, %rax + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f L(start_movsb): - mov %RDX_LP, %RCX_LP - cmp %RSI_LP, %RDI_LP - jb 1f - /* Source == destination is less common. */ - je 2f - lea (%rsi,%rcx), %RDX_LP - cmp %RDX_LP, %RDI_LP - jb L(movsb_backward) + mov %RDX_LP, %RCX_LP + cmp %RSI_LP, %RDI_LP + jb 1f + /* Source == destination is less common. 
*/ + je 2f + lea (%rsi,%rcx), %RDX_LP + cmp %RDX_LP, %RDI_LP + jb L(movsb_backward) 1: - rep movsb + rep movsb 2: - ret + ret L(movsb_backward): - leaq -1(%rdi,%rcx), %rdi - leaq -1(%rsi,%rcx), %rsi - std - rep movsb - cld - ret + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret END (__memmove_erms) strong_alias (__memmove_erms, __memcpy_erms) strong_alias (__memmove_chk_erms, __memcpy_chk_erms) @@ -193,367 +193,367 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms) # ifdef SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) # endif ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start_erms) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start_erms) END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) # ifdef SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) # endif ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) - movq %rdi, %rax + movq %rdi, %rax L(start_erms): # ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx + /* Clear the upper 32 bits. */ + movl %edx, %edx # endif - cmp $VEC_SIZE, %RDX_LP - jb L(less_vec) - cmp $(VEC_SIZE * 2), %RDX_LP - ja L(movsb_more_2x_vec) + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(movsb_more_2x_vec) L(last_2x_vec): - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ - VMOVU (%rsi), %VEC(0) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) L(return): - VZEROUPPER - ret + VZEROUPPER + ret L(movsb): - cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP - jae L(more_8x_vec) - cmpq %rsi, %rdi - jb 1f - /* Source == destination is less common. */ - je L(nop) - leaq (%rsi,%rdx), %r9 - cmpq %r9, %rdi - /* Avoid slow backward REP MOVSB. */ - jb L(more_8x_vec_backward) + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je L(nop) + leaq (%rsi,%rdx), %r9 + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) 1: - mov %RDX_LP, %RCX_LP - rep movsb + mov %RDX_LP, %RCX_LP + rep movsb L(nop): - ret + ret #endif L(less_vec): - /* Less than 1 VEC. */ + /* Less than 1 VEC. */ #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 # error Unsupported VEC_SIZE! #endif #if VEC_SIZE > 32 - cmpb $32, %dl - jae L(between_32_63) + cmpb $32, %dl + jae L(between_32_63) #endif #if VEC_SIZE > 16 - cmpb $16, %dl - jae L(between_16_31) + cmpb $16, %dl + jae L(between_16_31) #endif - cmpb $8, %dl - jae L(between_8_15) - cmpb $4, %dl - jae L(between_4_7) - cmpb $1, %dl - ja L(between_2_3) - jb 1f - movzbl (%rsi), %ecx - movb %cl, (%rdi) + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movzbl (%rsi), %ecx + movb %cl, (%rdi) 1: - ret + ret #if VEC_SIZE > 32 L(between_32_63): - /* From 32 to 63. No branch when size == 32. 
*/ - vmovdqu (%rsi), %ymm0 - vmovdqu -32(%rsi,%rdx), %ymm1 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, -32(%rdi,%rdx) - VZEROUPPER - ret + /* From 32 to 63. No branch when size == 32. */ + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -32(%rdi,%rdx) + VZEROUPPER + ret #endif #if VEC_SIZE > 16 - /* From 16 to 31. No branch when size == 16. */ + /* From 16 to 31. No branch when size == 16. */ L(between_16_31): - vmovdqu (%rsi), %xmm0 - vmovdqu -16(%rsi,%rdx), %xmm1 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, -16(%rdi,%rdx) - ret + vmovdqu (%rsi), %xmm0 + vmovdqu -16(%rsi,%rdx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -16(%rdi,%rdx) + ret #endif L(between_8_15): - /* From 8 to 15. No branch when size == 8. */ - movq -8(%rsi,%rdx), %rcx - movq (%rsi), %rsi - movq %rcx, -8(%rdi,%rdx) - movq %rsi, (%rdi) - ret + /* From 8 to 15. No branch when size == 8. */ + movq -8(%rsi,%rdx), %rcx + movq (%rsi), %rsi + movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) + ret L(between_4_7): - /* From 4 to 7. No branch when size == 4. */ - movl -4(%rsi,%rdx), %ecx - movl (%rsi), %esi - movl %ecx, -4(%rdi,%rdx) - movl %esi, (%rdi) - ret + /* From 4 to 7. No branch when size == 4. */ + movl -4(%rsi,%rdx), %ecx + movl (%rsi), %esi + movl %ecx, -4(%rdi,%rdx) + movl %esi, (%rdi) + ret L(between_2_3): - /* From 2 to 3. No branch when size == 2. */ - movzwl -2(%rsi,%rdx), %ecx - movzwl (%rsi), %esi - movw %cx, -2(%rdi,%rdx) - movw %si, (%rdi) - ret + /* From 2 to 3. No branch when size == 2. */ + movzwl -2(%rsi,%rdx), %ecx + movzwl (%rsi), %esi + movw %cx, -2(%rdi,%rdx) + movw %si, (%rdi) + ret #if defined USE_MULTIARCH L(movsb_more_2x_vec): - cmp $REP_MOSB_THRESHOLD, %RDX_LP - ja L(movsb) + cmp $REP_MOSB_THRESHOLD, %RDX_LP + ja L(movsb) #endif L(more_2x_vec): - /* More than 2 * VEC and there may be overlap between destination - and source. */ - cmpq $(VEC_SIZE * 8), %rdx - ja L(more_8x_vec) - cmpq $(VEC_SIZE * 4), %rdx - jb L(last_4x_vec) - /* Copy from 4 * VEC to 8 * VEC, inclusively. */ - VMOVU (%rsi), %VEC(0) - VMOVU VEC_SIZE(%rsi), %VEC(1) - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) - VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) - VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), VEC_SIZE(%rdi) - VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) - VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) - VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) - VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) - VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) - VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) - VZEROUPPER - ret + /* More than 2 * VEC and there may be overlap between destination + and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + /* Copy from 4 * VEC to 8 * VEC, inclusively. 
*/ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) + VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) + VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER + ret L(last_4x_vec): - /* Copy from 2 * VEC to 4 * VEC. */ - VMOVU (%rsi), %VEC(0) - VMOVU VEC_SIZE(%rsi), %VEC(1) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), VEC_SIZE(%rdi) - VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) - VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) - VZEROUPPER - ret + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + VZEROUPPER + ret L(more_8x_vec): - cmpq %rsi, %rdi - ja L(more_8x_vec_backward) - /* Source == destination is less common. */ - je L(nop) - /* Load the first VEC and last 4 * VEC to support overlapping - addresses. */ - VMOVU (%rsi), %VEC(4) - VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) - VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) - VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) - /* Save start and stop of the destination buffer. */ - movq %rdi, %r11 - leaq -VEC_SIZE(%rdi, %rdx), %rcx - /* Align destination for aligned stores in the loop. Compute - how much destination is misaligned. */ - movq %rdi, %r8 - andq $(VEC_SIZE - 1), %r8 - /* Get the negative of offset for alignment. */ - subq $VEC_SIZE, %r8 - /* Adjust source. */ - subq %r8, %rsi - /* Adjust destination which should be aligned now. */ - subq %r8, %rdi - /* Adjust length. */ - addq %r8, %rdx + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ + je L(nop) + /* Load the first VEC and last 4 * VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) + /* Save start and stop of the destination buffer. */ + movq %rdi, %r11 + leaq -VEC_SIZE(%rdi, %rdx), %rcx + /* Align destination for aligned stores in the loop. Compute + how much destination is misaligned. */ + movq %rdi, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. */ + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx #if (defined USE_MULTIARCH || VEC_SIZE == 16) - /* Check non-temporal store threshold. */ - cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP - ja L(large_forward) + /* Check non-temporal store threshold. */ + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + ja L(large_forward) #endif L(loop_4x_vec_forward): - /* Copy 4 * VEC a time forward. 
*/ - VMOVU (%rsi), %VEC(0) - VMOVU VEC_SIZE(%rsi), %VEC(1) - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - addq $(VEC_SIZE * 4), %rsi - subq $(VEC_SIZE * 4), %rdx - VMOVA %VEC(0), (%rdi) - VMOVA %VEC(1), VEC_SIZE(%rdi) - VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) - VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) - addq $(VEC_SIZE * 4), %rdi - cmpq $(VEC_SIZE * 4), %rdx - ja L(loop_4x_vec_forward) - /* Store the last 4 * VEC. */ - VMOVU %VEC(5), (%rcx) - VMOVU %VEC(6), -VEC_SIZE(%rcx) - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) - /* Store the first VEC. */ - VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + /* Copy 4 * VEC a time forward. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $(VEC_SIZE * 4), %rsi + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret L(more_8x_vec_backward): - /* Load the first 4 * VEC and last VEC to support overlapping - addresses. */ - VMOVU (%rsi), %VEC(4) - VMOVU VEC_SIZE(%rsi), %VEC(5) - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) - /* Save stop of the destination buffer. */ - leaq -VEC_SIZE(%rdi, %rdx), %r11 - /* Align destination end for aligned stores in the loop. Compute - how much destination end is misaligned. */ - leaq -VEC_SIZE(%rsi, %rdx), %rcx - movq %r11, %r9 - movq %r11, %r8 - andq $(VEC_SIZE - 1), %r8 - /* Adjust source. */ - subq %r8, %rcx - /* Adjust the end of destination which should be aligned now. */ - subq %r8, %r9 - /* Adjust length. */ - subq %r8, %rdx + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) + /* Save stop of the destination buffer. */ + leaq -VEC_SIZE(%rdi, %rdx), %r11 + /* Align destination end for aligned stores in the loop. Compute + how much destination end is misaligned. */ + leaq -VEC_SIZE(%rsi, %rdx), %rcx + movq %r11, %r9 + movq %r11, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Adjust source. */ + subq %r8, %rcx + /* Adjust the end of destination which should be aligned now. */ + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx #if (defined USE_MULTIARCH || VEC_SIZE == 16) - /* Check non-temporal store threshold. */ - cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP - ja L(large_backward) + /* Check non-temporal store threshold. */ + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + ja L(large_backward) #endif L(loop_4x_vec_backward): - /* Copy 4 * VEC a time backward. */ - VMOVU (%rcx), %VEC(0) - VMOVU -VEC_SIZE(%rcx), %VEC(1) - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) - subq $(VEC_SIZE * 4), %rcx - subq $(VEC_SIZE * 4), %rdx - VMOVA %VEC(0), (%r9) - VMOVA %VEC(1), -VEC_SIZE(%r9) - VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) - VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) - subq $(VEC_SIZE * 4), %r9 - cmpq $(VEC_SIZE * 4), %rdx - ja L(loop_4x_vec_backward) - /* Store the first 4 * VEC. 
*/ - VMOVU %VEC(4), (%rdi) - VMOVU %VEC(5), VEC_SIZE(%rdi) - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) - /* Store the last VEC. */ - VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $(VEC_SIZE * 4), %rcx + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret #if (defined USE_MULTIARCH || VEC_SIZE == 16) L(large_forward): - /* Don't use non-temporal store if there is overlap between - destination and source since destination may be in cache - when source is loaded. */ - leaq (%rdi, %rdx), %r10 - cmpq %r10, %rsi - jb L(loop_4x_vec_forward) + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rdi, %rdx), %r10 + cmpq %r10, %rsi + jb L(loop_4x_vec_forward) L(loop_large_forward): - /* Copy 4 * VEC a time forward with non-temporal stores. */ - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) - VMOVU (%rsi), %VEC(0) - VMOVU VEC_SIZE(%rsi), %VEC(1) - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - addq $PREFETCHED_LOAD_SIZE, %rsi - subq $PREFETCHED_LOAD_SIZE, %rdx - VMOVNT %VEC(0), (%rdi) - VMOVNT %VEC(1), VEC_SIZE(%rdi) - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) - addq $PREFETCHED_LOAD_SIZE, %rdi - cmpq $PREFETCHED_LOAD_SIZE, %rdx - ja L(loop_large_forward) - sfence - /* Store the last 4 * VEC. */ - VMOVU %VEC(5), (%rcx) - VMOVU %VEC(6), -VEC_SIZE(%rcx) - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) - /* Store the first VEC. */ - VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + /* Copy 4 * VEC a time forward with non-temporal stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $PREFETCHED_LOAD_SIZE, %rsi + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%rdi) + VMOVNT %VEC(1), VEC_SIZE(%rdi) + VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $PREFETCHED_LOAD_SIZE, %rdi + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_forward) + sfence + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret L(large_backward): - /* Don't use non-temporal store if there is overlap between - destination and source since destination may be in cache - when source is loaded. */ - leaq (%rcx, %rdx), %r10 - cmpq %r10, %r9 - jb L(loop_4x_vec_backward) + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. 
*/ + leaq (%rcx, %rdx), %r10 + cmpq %r10, %r9 + jb L(loop_4x_vec_backward) L(loop_large_backward): - /* Copy 4 * VEC a time backward with non-temporal stores. */ - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) - VMOVU (%rcx), %VEC(0) - VMOVU -VEC_SIZE(%rcx), %VEC(1) - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) - subq $PREFETCHED_LOAD_SIZE, %rcx - subq $PREFETCHED_LOAD_SIZE, %rdx - VMOVNT %VEC(0), (%r9) - VMOVNT %VEC(1), -VEC_SIZE(%r9) - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) - subq $PREFETCHED_LOAD_SIZE, %r9 - cmpq $PREFETCHED_LOAD_SIZE, %rdx - ja L(loop_large_backward) - sfence - /* Store the first 4 * VEC. */ - VMOVU %VEC(4), (%rdi) - VMOVU %VEC(5), VEC_SIZE(%rdi) - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) - /* Store the last VEC. */ - VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + /* Copy 4 * VEC a time backward with non-temporal stores. */ + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $PREFETCHED_LOAD_SIZE, %rcx + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%r9) + VMOVNT %VEC(1), -VEC_SIZE(%r9) + VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $PREFETCHED_LOAD_SIZE, %r9 + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_backward) + sfence + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) #if 1 # ifdef USE_MULTIARCH strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), - MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) + MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) # ifdef SHARED strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), - MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) + MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) # endif # endif # ifdef SHARED strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), - MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) # endif #endif strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), - MEMCPY_SYMBOL (__memcpy, unaligned)) + MEMCPY_SYMBOL (__memcpy, unaligned)) diff --git a/utils/memcpy-bench/glibc/memmove.S b/utils/memcpy-bench/glibc/memmove.S index 97e735facff..7bd47b9a03f 100644 --- a/utils/memcpy-bench/glibc/memmove.S +++ b/utils/memcpy-bench/glibc/memmove.S @@ -18,33 +18,33 @@ #include "sysdep.h" -#define VEC_SIZE 16 -#define VEC(i) xmm##i -#define PREFETCHNT prefetchnta -#define VMOVNT movntdq +#define VEC_SIZE 16 +#define VEC(i) xmm##i +#define PREFETCHNT prefetchnta +#define VMOVNT movntdq /* Use movups and movaps for smaller code sizes. 
*/ -#define VMOVU movups -#define VMOVA movaps +#define VMOVU movups +#define VMOVA movaps -#define SECTION(p) p +#define SECTION(p) p #ifdef USE_MULTIARCH # if 0 -# define MEMCPY_SYMBOL(p,s) memcpy +# define MEMCPY_SYMBOL(p,s) memcpy # endif #else # if defined SHARED -# define MEMCPY_SYMBOL(p,s) __memcpy +# define MEMCPY_SYMBOL(p,s) __memcpy # else -# define MEMCPY_SYMBOL(p,s) memcpy +# define MEMCPY_SYMBOL(p,s) memcpy # endif #endif #if !defined USE_MULTIARCH -# define MEMPCPY_SYMBOL(p,s) __mempcpy +# define MEMPCPY_SYMBOL(p,s) __mempcpy #endif #ifndef MEMMOVE_SYMBOL -# define MEMMOVE_CHK_SYMBOL(p,s) p -# define MEMMOVE_SYMBOL(p,s) memmove +# define MEMMOVE_CHK_SYMBOL(p,s) p +# define MEMMOVE_SYMBOL(p,s) memmove #endif #include "memmove-vec-unaligned-erms.S" diff --git a/utils/memcpy-bench/glibc/sysdep.h b/utils/memcpy-bench/glibc/sysdep.h index 099134b2a2f..e255e7488da 100644 --- a/utils/memcpy-bench/glibc/sysdep.h +++ b/utils/memcpy-bench/glibc/sysdep.h @@ -21,7 +21,7 @@ #include "sysdep_x86.h" -#ifdef __ASSEMBLER__ +#ifdef __ASSEMBLER__ /* Syntactic details of assembler. */ @@ -29,11 +29,11 @@ the register as saved relative to %rsp instead of relative to the CFA. Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset from %rsp. */ -#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ - 0x77, off & 0x7F | 0x80, off >> 7 +#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ + 0x77, off & 0x7F | 0x80, off >> 7 /* If compiled for profiling, call `mcount' at the start of each function. */ -#ifdef PROF +#ifdef PROF /* The mcount code relies on a normal frame pointer being on the stack to locate our caller, so push one just for its benefit. */ #define CALL_MCOUNT \ @@ -45,31 +45,31 @@ popq %rbp; \ cfi_def_cfa(rsp,8); #else -#define CALL_MCOUNT /* Do nothing. */ +#define CALL_MCOUNT /* Do nothing. */ #endif -#define PSEUDO(name, syscall_name, args) \ -lose: \ - jmp JUMPTARGET(syscall_error) \ - .globl syscall_error; \ - ENTRY (name) \ - DO_CALL (syscall_name, args); \ +#define PSEUDO(name, syscall_name, args) \ +lose: \ + jmp JUMPTARGET(syscall_error) \ + .globl syscall_error; \ + ENTRY (name) \ + DO_CALL (syscall_name, args); \ jb lose #undef JUMPTARGET #ifdef SHARED # ifdef BIND_NOW -# define JUMPTARGET(name) *name##@GOTPCREL(%rip) +# define JUMPTARGET(name) *name##@GOTPCREL(%rip) # else -# define JUMPTARGET(name) name##@PLT +# define JUMPTARGET(name) name##@PLT # endif #else /* For static archives, branch to target directly. */ -# define JUMPTARGET(name) name +# define JUMPTARGET(name) name #endif /* Long and pointer size in bytes. */ -#define LP_SIZE 8 +#define LP_SIZE 8 /* Instruction to operate on long and pointer. */ #define LP_OP(insn) insn##q @@ -78,24 +78,24 @@ lose: \ #define ASM_ADDR .quad /* Registers to hold long and pointer. 
*/ -#define RAX_LP rax -#define RBP_LP rbp -#define RBX_LP rbx -#define RCX_LP rcx -#define RDI_LP rdi -#define RDX_LP rdx -#define RSI_LP rsi -#define RSP_LP rsp -#define R8_LP r8 -#define R9_LP r9 -#define R10_LP r10 -#define R11_LP r11 -#define R12_LP r12 -#define R13_LP r13 -#define R14_LP r14 -#define R15_LP r15 +#define RAX_LP rax +#define RBP_LP rbp +#define RBX_LP rbx +#define RCX_LP rcx +#define RDI_LP rdi +#define RDX_LP rdx +#define RSI_LP rsi +#define RSP_LP rsp +#define R8_LP r8 +#define R9_LP r9 +#define R10_LP r10 +#define R11_LP r11 +#define R12_LP r12 +#define R13_LP r13 +#define R14_LP r14 +#define R15_LP r15 -#else /* __ASSEMBLER__ */ +#else /* __ASSEMBLER__ */ /* Long and pointer size in bytes. */ #define LP_SIZE "8" @@ -107,23 +107,23 @@ lose: \ #define ASM_ADDR ".quad" /* Registers to hold long and pointer. */ -#define RAX_LP "rax" -#define RBP_LP "rbp" -#define RBX_LP "rbx" -#define RCX_LP "rcx" -#define RDI_LP "rdi" -#define RDX_LP "rdx" -#define RSI_LP "rsi" -#define RSP_LP "rsp" -#define R8_LP "r8" -#define R9_LP "r9" -#define R10_LP "r10" -#define R11_LP "r11" -#define R12_LP "r12" -#define R13_LP "r13" -#define R14_LP "r14" -#define R15_LP "r15" +#define RAX_LP "rax" +#define RBP_LP "rbp" +#define RBX_LP "rbx" +#define RCX_LP "rcx" +#define RDI_LP "rdi" +#define RDX_LP "rdx" +#define RSI_LP "rsi" +#define RSP_LP "rsp" +#define R8_LP "r8" +#define R9_LP "r9" +#define R10_LP "r10" +#define R11_LP "r11" +#define R12_LP "r12" +#define R13_LP "r13" +#define R14_LP "r14" +#define R15_LP "r15" -#endif /* __ASSEMBLER__ */ +#endif /* __ASSEMBLER__ */ -#endif /* _X86_64_SYSDEP_H */ +#endif /* _X86_64_SYSDEP_H */ diff --git a/utils/memcpy-bench/glibc/sysdep_generic.h b/utils/memcpy-bench/glibc/sysdep_generic.h index 91f78e1b04d..afecea8c356 100644 --- a/utils/memcpy-bench/glibc/sysdep_generic.h +++ b/utils/memcpy-bench/glibc/sysdep_generic.h @@ -28,14 +28,14 @@ #define ASM_LINE_SEP ; -#define strong_alias(original, alias) \ - .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \ +#define strong_alias(original, alias) \ + .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \ C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original) #ifndef C_LABEL /* Define a macro we can use to construct the asm name for a C symbol. */ -# define C_LABEL(name) name##: +# define C_LABEL(name) name##: #endif @@ -47,38 +47,38 @@ # endif # ifndef JUMPTARGET -# define JUMPTARGET(sym) sym +# define JUMPTARGET(sym) sym # endif #endif -/* Makros to generate eh_frame unwind information. */ +/* Macros to generate eh_frame unwind information. 
*/ #ifdef __ASSEMBLER__ -# define cfi_startproc .cfi_startproc -# define cfi_endproc .cfi_endproc -# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off -# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg -# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off -# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -# define cfi_offset(reg, off) .cfi_offset reg, off -# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -# define cfi_register(r1, r2) .cfi_register r1, r2 -# define cfi_return_column(reg) .cfi_return_column reg -# define cfi_restore(reg) .cfi_restore reg -# define cfi_same_value(reg) .cfi_same_value reg -# define cfi_undefined(reg) .cfi_undefined reg -# define cfi_remember_state .cfi_remember_state -# define cfi_restore_state .cfi_restore_state -# define cfi_window_save .cfi_window_save -# define cfi_personality(enc, exp) .cfi_personality enc, exp -# define cfi_lsda(enc, exp) .cfi_lsda enc, exp +# define cfi_startproc .cfi_startproc +# define cfi_endproc .cfi_endproc +# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off +# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg +# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +# define cfi_offset(reg, off) .cfi_offset reg, off +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +# define cfi_register(r1, r2) .cfi_register r1, r2 +# define cfi_return_column(reg) .cfi_return_column reg +# define cfi_restore(reg) .cfi_restore reg +# define cfi_same_value(reg) .cfi_same_value reg +# define cfi_undefined(reg) .cfi_undefined reg +# define cfi_remember_state .cfi_remember_state +# define cfi_restore_state .cfi_restore_state +# define cfi_window_save .cfi_window_save +# define cfi_personality(enc, exp) .cfi_personality enc, exp +# define cfi_lsda(enc, exp) .cfi_lsda enc, exp #else /* ! ASSEMBLER */ # define CFI_STRINGIFY(Name) CFI_STRINGIFY2 (Name) # define CFI_STRINGIFY2(Name) #Name -# define CFI_STARTPROC ".cfi_startproc" -# define CFI_ENDPROC ".cfi_endproc" -# define CFI_DEF_CFA(reg, off) \ +# define CFI_STARTPROC ".cfi_startproc" +# define CFI_ENDPROC ".cfi_endproc" +# define CFI_DEF_CFA(reg, off) \ ".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) # define CFI_DEF_CFA_REGISTER(reg) \ ".cfi_def_cfa_register " CFI_STRINGIFY(reg) diff --git a/utils/memcpy-bench/glibc/sysdep_x86.h b/utils/memcpy-bench/glibc/sysdep_x86.h index a3fecd01268..7abb350242f 100644 --- a/utils/memcpy-bench/glibc/sysdep_x86.h +++ b/utils/memcpy-bench/glibc/sysdep_x86.h @@ -34,18 +34,18 @@ enum cf_protection_level */ /* Set if CF_BRANCH (IBT) is enabled. */ -#define X86_FEATURE_1_IBT (1U << 0) +#define X86_FEATURE_1_IBT (1U << 0) /* Set if CF_RETURN (SHSTK) is enabled. */ -#define X86_FEATURE_1_SHSTK (1U << 1) +#define X86_FEATURE_1_SHSTK (1U << 1) #ifdef __CET__ -# define CET_ENABLED 1 -# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT) -# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK) +# define CET_ENABLED 1 +# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT) +# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK) #else -# define CET_ENABLED 0 -# define IBT_ENABLED 0 -# define SHSTK_ENABLED 0 +# define CET_ENABLED 0 +# define IBT_ENABLED 0 +# define SHSTK_ENABLED 0 #endif /* Offset for fxsave/xsave area used by _dl_runtime_resolve. 
Also need @@ -57,7 +57,7 @@ enum cf_protection_level #define STATE_SAVE_MASK \ ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) -#ifdef __ASSEMBLER__ +#ifdef __ASSEMBLER__ /* Syntactic details of assembler. */ @@ -73,18 +73,18 @@ enum cf_protection_level #define ASM_SIZE_DIRECTIVE(name) .size name,.-name; /* Define an entry point visible from C. */ -#define ENTRY(name) \ - .globl C_SYMBOL_NAME(name); \ - .type C_SYMBOL_NAME(name),@function; \ - .align ALIGNARG(4); \ - C_LABEL(name) \ - cfi_startproc; \ - _CET_ENDBR; \ +#define ENTRY(name) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ + .align ALIGNARG(4); \ + C_LABEL(name) \ + cfi_startproc; \ + _CET_ENDBR; \ CALL_MCOUNT -#undef END -#define END(name) \ - cfi_endproc; \ +#undef END +#define END(name) \ + cfi_endproc; \ ASM_SIZE_DIRECTIVE(name) #define ENTRY_CHK(name) ENTRY (name) @@ -93,21 +93,21 @@ enum cf_protection_level /* Since C identifiers are not normally prefixed with an underscore on this system, the asm identifier `syscall_error' intrudes on the C name space. Make sure we use an innocuous name. */ -#define syscall_error __syscall_error -#define mcount _mcount +#define syscall_error __syscall_error +#define mcount _mcount -#undef PSEUDO_END -#define PSEUDO_END(name) \ +#undef PSEUDO_END +#define PSEUDO_END(name) \ END (name) /* Local label name for asm code. */ #ifndef L /* ELF-like local names start with `.L'. */ -# define L(name) .L##name +# define L(name) .L##name #endif #define atom_text_section .section ".text.atom", "ax" -#endif /* __ASSEMBLER__ */ +#endif /* __ASSEMBLER__ */ -#endif /* _X86_SYSDEP_H */ +#endif /* _X86_SYSDEP_H */ From 8c2d65242a81b68f9ca520cf015e53933a52eaca Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 23:24:22 +0300 Subject: [PATCH 3/6] Fix style --- utils/memcpy-bench/glibc/asm-syntax.h | 2 ++ utils/memcpy-bench/glibc/dwarf2.h | 4 +++- utils/memcpy-bench/glibc/sysdep.h | 2 ++ utils/memcpy-bench/glibc/sysdep_generic.h | 2 ++ utils/memcpy-bench/glibc/sysdep_x86.h | 2 ++ 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/utils/memcpy-bench/glibc/asm-syntax.h b/utils/memcpy-bench/glibc/asm-syntax.h index 6e299c1fec2..9d65213ba30 100644 --- a/utils/memcpy-bench/glibc/asm-syntax.h +++ b/utils/memcpy-bench/glibc/asm-syntax.h @@ -1,3 +1,5 @@ +#pragma once + /* Definitions for x86 syntax variations. Copyright (C) 1992-2020 Free Software Foundation, Inc. This file is part of the GNU C Library. Its master source is NOT part of diff --git a/utils/memcpy-bench/glibc/dwarf2.h b/utils/memcpy-bench/glibc/dwarf2.h index 2be827f00ae..b0536c97e5e 100644 --- a/utils/memcpy-bench/glibc/dwarf2.h +++ b/utils/memcpy-bench/glibc/dwarf2.h @@ -1,3 +1,5 @@ +#pragma once + /* Declarations and definitions of codes relating to the DWARF2 symbolic debugging information format. Copyright (C) 1992-2020 Free Software Foundation, Inc. @@ -563,7 +565,7 @@ enum dwarf_macinfo_record_type }; #endif /* !ASSEMBLER */ - + /* @@@ For use with GNU frame unwind information. */ #define DW_EH_PE_absptr 0x00 diff --git a/utils/memcpy-bench/glibc/sysdep.h b/utils/memcpy-bench/glibc/sysdep.h index e255e7488da..2f43d688df9 100644 --- a/utils/memcpy-bench/glibc/sysdep.h +++ b/utils/memcpy-bench/glibc/sysdep.h @@ -1,3 +1,5 @@ +#pragma once + /* Assembler macros for x86-64. Copyright (C) 2001-2020 Free Software Foundation, Inc. This file is part of the GNU C Library. 
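[Note on this patch: it prepends "#pragma once" to each vendored glibc header while keeping the upstream "#ifndef"-style include guards, so the two idioms coexist. A minimal sketch of the resulting header layout, reusing the _DWARF2_H guard visible above; the sketch is illustrative only and is not a hunk from the patch:

    #pragma once             /* non-standard but widely supported; dedupes by file identity */

    #ifndef _DWARF2_H        /* classic guard kept verbatim from upstream glibc */
    #define _DWARF2_H 1

    /* ... declarations ... */

    #endif /* _DWARF2_H */

Either mechanism alone prevents double inclusion; keeping both minimizes divergence from upstream while presumably satisfying the style check that the "Fix style" subject refers to.]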
diff --git a/utils/memcpy-bench/glibc/sysdep_generic.h b/utils/memcpy-bench/glibc/sysdep_generic.h index afecea8c356..0cb5bca4102 100644 --- a/utils/memcpy-bench/glibc/sysdep_generic.h +++ b/utils/memcpy-bench/glibc/sysdep_generic.h @@ -1,3 +1,5 @@ +#pragma once + /* Generic asm macros used on many machines. Copyright (C) 1991-2020 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/utils/memcpy-bench/glibc/sysdep_x86.h b/utils/memcpy-bench/glibc/sysdep_x86.h index 7abb350242f..4469ed2e885 100644 --- a/utils/memcpy-bench/glibc/sysdep_x86.h +++ b/utils/memcpy-bench/glibc/sysdep_x86.h @@ -1,3 +1,5 @@ +#pragma once + /* Assembler macros for x86. Copyright (C) 2017-2020 Free Software Foundation, Inc. This file is part of the GNU C Library. From 637f6a29a649ee46360848c5a8013fb040050589 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Mar 2021 11:16:15 +0300 Subject: [PATCH 4/6] Add penalty --- utils/memcpy-bench/memcpy-bench.cpp | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/utils/memcpy-bench/memcpy-bench.cpp b/utils/memcpy-bench/memcpy-bench.cpp index 365abe1f01e..dc510af0dbf 100644 --- a/utils/memcpy-bench/memcpy-bench.cpp +++ b/utils/memcpy-bench/memcpy-bench.cpp @@ -33,6 +33,9 @@ void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_d dst += bytes_to_copy; src += bytes_to_copy; size -= bytes_to_copy; + + /// Execute at least one SSE instruction as a penalty after running AVX code. + __asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7"); } } @@ -76,16 +79,9 @@ uint64_t test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size uint64_t elapsed_ns = watch.elapsed(); /// Validation - size_t sum = 0; - size_t reference = 0; for (size_t i = 0; i < size; ++i) - { - sum += dst[i]; - reference += uint8_t(i); - } - - if (sum != reference) - throw std::logic_error("Incorrect result"); + if (dst[i] != uint8_t(i)) + throw std::logic_error("Incorrect result"); std::cout << name; return elapsed_ns; @@ -676,11 +672,9 @@ done | tee result.tsv } else { - iterations = 10000000000ULL * num_threads / size; + iterations = 10000000000ULL / size; if (generator_variant == 1) - iterations /= 100; - if (generator_variant == 2) iterations /= 10; } From 1f6b05cd85d34c2d6f71b057c16d95b83f7d8853 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Mar 2021 11:18:11 +0300 Subject: [PATCH 5/6] Add example --- utils/memcpy-bench/memcpy-bench.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/utils/memcpy-bench/memcpy-bench.cpp b/utils/memcpy-bench/memcpy-bench.cpp index dc510af0dbf..cd769640017 100644 --- a/utils/memcpy-bench/memcpy-bench.cpp +++ b/utils/memcpy-bench/memcpy-bench.cpp @@ -655,6 +655,24 @@ for size in 4096 16384 50000 65536 100000 1000000 10000000 100000000; do done; done | tee result.tsv +clickhouse-local --structure ' + name String, + size UInt64, + iterations UInt64, + threads UInt16, + generator UInt8, + memcpy UInt8, + elapsed UInt64 +' --query " + SELECT + size, name, + avg(1000 * elapsed / size / iterations) AS s, + count() AS c + FROM table + GROUP BY size, name + ORDER BY size ASC, s DESC +" --output-format PrettyCompact < result.tsv + )" << std::endl; std::cout << desc << std::endl; return 1; From 9bea10d9f94206671c89b8faf196725ed47e0d5e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Mar 2021 11:49:56 +0300 Subject: [PATCH 6/6] Fix style --- utils/memcpy-bench/glibc/asm-syntax.h | 28 +++--- utils/memcpy-bench/glibc/dwarf2.h | 114 
+++++++++++----------- utils/memcpy-bench/glibc/sysdep.h | 58 +++++------ utils/memcpy-bench/glibc/sysdep_generic.h | 62 ++++++------ utils/memcpy-bench/glibc/sysdep_x86.h | 66 ++++++------- 5 files changed, 164 insertions(+), 164 deletions(-) diff --git a/utils/memcpy-bench/glibc/asm-syntax.h b/utils/memcpy-bench/glibc/asm-syntax.h index 9d65213ba30..0879f2606c7 100644 --- a/utils/memcpy-bench/glibc/asm-syntax.h +++ b/utils/memcpy-bench/glibc/asm-syntax.h @@ -1,23 +1,23 @@ #pragma once /* Definitions for x86 syntax variations. - Copyright (C) 1992-2020 Free Software Foundation, Inc. - This file is part of the GNU C Library. Its master source is NOT part of - the C library, however. The master source lives in the GNU MP Library. + Copyright (C) 1992-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. Its master source is NOT part of + the C library, however. The master source lives in the GNU MP Library. - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #undef ALIGN #define ALIGN(log) .align 1<. */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #ifndef _DWARF2_H #define _DWARF2_H 1 /* This file is derived from the DWARF specification (a public document) - Revision 2.0.0 (July 27, 1993) developed by the UNIX International - Programming Languages Special Interest Group (UI/PLSIG) and distributed - by UNIX International. Copies of this specification are available from - UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */ + Revision 2.0.0 (July 27, 1993) developed by the UNIX International + Programming Languages Special Interest Group (UI/PLSIG) and distributed + by UNIX International. Copies of this specification are available from + UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */ /* This file is shared between GCC and GDB, and should not contain - prototypes. */ + prototypes. */ #ifndef __ASSEMBLER__ /* Tag names and codes. 
*/ enum dwarf_tag - { + { DW_TAG_padding = 0x00, DW_TAG_array_type = 0x01, DW_TAG_class_type = 0x02, @@ -95,7 +95,7 @@ enum dwarf_tag DW_TAG_class_template = 0x4103, /* for C++ */ DW_TAG_GNU_BINCL = 0x4104, DW_TAG_GNU_EINCL = 0x4105 - }; + }; #define DW_TAG_lo_user 0x4080 #define DW_TAG_hi_user 0xffff @@ -106,7 +106,7 @@ enum dwarf_tag /* Form names and codes. */ enum dwarf_form - { + { DW_FORM_addr = 0x01, DW_FORM_block2 = 0x03, DW_FORM_block4 = 0x04, @@ -128,12 +128,12 @@ enum dwarf_form DW_FORM_ref8 = 0x14, DW_FORM_ref_udata = 0x15, DW_FORM_indirect = 0x16 - }; + }; /* Attribute names and codes. */ enum dwarf_attribute - { + { DW_AT_sibling = 0x01, DW_AT_location = 0x02, DW_AT_name = 0x03, @@ -215,7 +215,7 @@ enum dwarf_attribute DW_AT_src_coords = 0x2104, DW_AT_body_begin = 0x2105, DW_AT_body_end = 0x2106 - }; + }; #define DW_AT_lo_user 0x2000 /* implementation-defined range start */ #define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */ @@ -223,7 +223,7 @@ enum dwarf_attribute /* Location atom names and codes. */ enum dwarf_location_atom - { + { DW_OP_addr = 0x03, DW_OP_deref = 0x06, DW_OP_const1u = 0x08, @@ -369,7 +369,7 @@ enum dwarf_location_atom DW_OP_deref_size = 0x94, DW_OP_xderef_size = 0x95, DW_OP_nop = 0x96 - }; + }; #define DW_OP_lo_user 0x80 /* implementation-defined range start */ #define DW_OP_hi_user 0xff /* implementation-defined range end */ @@ -377,7 +377,7 @@ enum dwarf_location_atom /* Type encodings. */ enum dwarf_type - { + { DW_ATE_void = 0x0, DW_ATE_address = 0x1, DW_ATE_boolean = 0x2, @@ -387,81 +387,81 @@ enum dwarf_type DW_ATE_signed_char = 0x6, DW_ATE_unsigned = 0x7, DW_ATE_unsigned_char = 0x8 - }; + }; #define DW_ATE_lo_user 0x80 #define DW_ATE_hi_user 0xff /* Array ordering names and codes. */ enum dwarf_array_dim_ordering - { + { DW_ORD_row_major = 0, DW_ORD_col_major = 1 - }; + }; /* access attribute */ enum dwarf_access_attribute - { + { DW_ACCESS_public = 1, DW_ACCESS_protected = 2, DW_ACCESS_private = 3 - }; + }; /* visibility */ enum dwarf_visibility_attribute - { + { DW_VIS_local = 1, DW_VIS_exported = 2, DW_VIS_qualified = 3 - }; + }; /* virtuality */ enum dwarf_virtuality_attribute - { + { DW_VIRTUALITY_none = 0, DW_VIRTUALITY_virtual = 1, DW_VIRTUALITY_pure_virtual = 2 - }; + }; /* case sensitivity */ enum dwarf_id_case - { + { DW_ID_case_sensitive = 0, DW_ID_up_case = 1, DW_ID_down_case = 2, DW_ID_case_insensitive = 3 - }; + }; /* calling convention */ enum dwarf_calling_convention - { + { DW_CC_normal = 0x1, DW_CC_program = 0x2, DW_CC_nocall = 0x3 - }; + }; #define DW_CC_lo_user 0x40 #define DW_CC_hi_user 0xff /* inline attribute */ enum dwarf_inline_attribute - { + { DW_INL_not_inlined = 0, DW_INL_inlined = 1, DW_INL_declared_not_inlined = 2, DW_INL_declared_inlined = 3 - }; + }; /* discriminant lists */ enum dwarf_discrim_list - { + { DW_DSC_label = 0, DW_DSC_range = 1 - }; + }; /* line number opcodes */ enum dwarf_line_number_ops - { + { DW_LNS_extended_op = 0, DW_LNS_copy = 1, DW_LNS_advance_pc = 2, @@ -472,19 +472,19 @@ enum dwarf_line_number_ops DW_LNS_set_basic_block = 7, DW_LNS_const_add_pc = 8, DW_LNS_fixed_advance_pc = 9 - }; + }; /* line number extended opcodes */ enum dwarf_line_number_x_ops - { + { DW_LNE_end_sequence = 1, DW_LNE_set_address = 2, DW_LNE_define_file = 3 - }; + }; /* call frame information */ enum dwarf_call_frame_info - { + { DW_CFA_advance_loc = 0x40, DW_CFA_offset = 0x80, DW_CFA_restore = 0xc0, @@ -517,7 +517,7 @@ enum dwarf_call_frame_info DW_CFA_GNU_window_save = 0x2d, DW_CFA_GNU_args_size = 0x2e, 
DW_CFA_GNU_negative_offset_extended = 0x2f - }; + }; #define DW_CIE_ID 0xffffffff #define DW_CIE_VERSION 1 @@ -534,7 +534,7 @@ enum dwarf_call_frame_info /* Source language names and codes. */ enum dwarf_source_language - { + { DW_LANG_C89 = 0x0001, DW_LANG_C = 0x0002, DW_LANG_Ada83 = 0x0003, @@ -547,7 +547,7 @@ enum dwarf_source_language DW_LANG_Modula2 = 0x000a, DW_LANG_Java = 0x000b, DW_LANG_Mips_Assembler = 0x8001 - }; + }; #define DW_LANG_lo_user 0x8000 /* implementation-defined range start */ @@ -556,13 +556,13 @@ enum dwarf_source_language /* Names and codes for macro information. */ enum dwarf_macinfo_record_type - { + { DW_MACINFO_define = 1, DW_MACINFO_undef = 2, DW_MACINFO_start_file = 3, DW_MACINFO_end_file = 4, DW_MACINFO_vendor_ext = 255 - }; + }; #endif /* !ASSEMBLER */ diff --git a/utils/memcpy-bench/glibc/sysdep.h b/utils/memcpy-bench/glibc/sysdep.h index 2f43d688df9..82b1e747fbe 100644 --- a/utils/memcpy-bench/glibc/sysdep.h +++ b/utils/memcpy-bench/glibc/sysdep.h @@ -1,22 +1,22 @@ #pragma once /* Assembler macros for x86-64. - Copyright (C) 2001-2020 Free Software Foundation, Inc. - This file is part of the GNU C Library. + Copyright (C) 2001-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #ifndef _X86_64_SYSDEP_H #define _X86_64_SYSDEP_H 1 @@ -28,35 +28,35 @@ /* Syntactic details of assembler. */ /* This macro is for setting proper CFI with DW_CFA_expression describing - the register as saved relative to %rsp instead of relative to the CFA. - Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset - from %rsp. */ + the register as saved relative to %rsp instead of relative to the CFA. + Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset + from %rsp. */ #define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ 0x77, off & 0x7F | 0x80, off >> 7 /* If compiled for profiling, call `mcount' at the start of each function. */ #ifdef PROF /* The mcount code relies on a normal frame pointer being on the stack - to locate our caller, so push one just for its benefit. */ + to locate our caller, so push one just for its benefit. 
*/ #define CALL_MCOUNT \ - pushq %rbp; \ - cfi_adjust_cfa_offset(8); \ - movq %rsp, %rbp; \ - cfi_def_cfa_register(%rbp); \ - call JUMPTARGET(mcount); \ - popq %rbp; \ - cfi_def_cfa(rsp,8); + pushq %rbp; \ + cfi_adjust_cfa_offset(8); \ + movq %rsp, %rbp; \ + cfi_def_cfa_register(%rbp); \ + call JUMPTARGET(mcount); \ + popq %rbp; \ + cfi_def_cfa(rsp,8); #else #define CALL_MCOUNT /* Do nothing. */ #endif #define PSEUDO(name, syscall_name, args) \ lose: \ - jmp JUMPTARGET(syscall_error) \ - .globl syscall_error; \ - ENTRY (name) \ - DO_CALL (syscall_name, args); \ - jb lose + jmp JUMPTARGET(syscall_error) \ + .globl syscall_error; \ + ENTRY (name) \ + DO_CALL (syscall_name, args); \ + jb lose #undef JUMPTARGET #ifdef SHARED diff --git a/utils/memcpy-bench/glibc/sysdep_generic.h b/utils/memcpy-bench/glibc/sysdep_generic.h index 0cb5bca4102..e6183d72792 100644 --- a/utils/memcpy-bench/glibc/sysdep_generic.h +++ b/utils/memcpy-bench/glibc/sysdep_generic.h @@ -1,22 +1,22 @@ #pragma once /* Generic asm macros used on many machines. - Copyright (C) 1991-2020 Free Software Foundation, Inc. - This file is part of the GNU C Library. + Copyright (C) 1991-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #define C_SYMBOL_NAME(name) name #define HIDDEN_JUMPTARGET(name) 0x0 @@ -31,8 +31,8 @@ #define ASM_LINE_SEP ; #define strong_alias(original, alias) \ - .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \ - C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original) + .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \ + C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original) #ifndef C_LABEL @@ -43,7 +43,7 @@ #ifdef __ASSEMBLER__ /* Mark the end of function named SYM. This is used on some platforms - to generate correct debugging information. */ + to generate correct debugging information. 
*/ # ifndef END # define END(sym) # endif @@ -81,35 +81,35 @@ # define CFI_STARTPROC ".cfi_startproc" # define CFI_ENDPROC ".cfi_endproc" # define CFI_DEF_CFA(reg, off) \ - ".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) + ".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) # define CFI_DEF_CFA_REGISTER(reg) \ - ".cfi_def_cfa_register " CFI_STRINGIFY(reg) + ".cfi_def_cfa_register " CFI_STRINGIFY(reg) # define CFI_DEF_CFA_OFFSET(off) \ - ".cfi_def_cfa_offset " CFI_STRINGIFY(off) + ".cfi_def_cfa_offset " CFI_STRINGIFY(off) # define CFI_ADJUST_CFA_OFFSET(off) \ - ".cfi_adjust_cfa_offset " CFI_STRINGIFY(off) + ".cfi_adjust_cfa_offset " CFI_STRINGIFY(off) # define CFI_OFFSET(reg, off) \ - ".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) + ".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) # define CFI_REL_OFFSET(reg, off) \ - ".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) + ".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) # define CFI_REGISTER(r1, r2) \ - ".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2) + ".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2) # define CFI_RETURN_COLUMN(reg) \ - ".cfi_return_column " CFI_STRINGIFY(reg) + ".cfi_return_column " CFI_STRINGIFY(reg) # define CFI_RESTORE(reg) \ - ".cfi_restore " CFI_STRINGIFY(reg) + ".cfi_restore " CFI_STRINGIFY(reg) # define CFI_UNDEFINED(reg) \ - ".cfi_undefined " CFI_STRINGIFY(reg) + ".cfi_undefined " CFI_STRINGIFY(reg) # define CFI_REMEMBER_STATE \ - ".cfi_remember_state" + ".cfi_remember_state" # define CFI_RESTORE_STATE \ - ".cfi_restore_state" + ".cfi_restore_state" # define CFI_WINDOW_SAVE \ - ".cfi_window_save" + ".cfi_window_save" # define CFI_PERSONALITY(enc, exp) \ - ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) + ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) # define CFI_LSDA(enc, exp) \ - ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) + ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) #endif #include "dwarf2.h" diff --git a/utils/memcpy-bench/glibc/sysdep_x86.h b/utils/memcpy-bench/glibc/sysdep_x86.h index 4469ed2e885..1c482cfabb7 100644 --- a/utils/memcpy-bench/glibc/sysdep_x86.h +++ b/utils/memcpy-bench/glibc/sysdep_x86.h @@ -1,22 +1,22 @@ #pragma once /* Assembler macros for x86. - Copyright (C) 2017-2020 Free Software Foundation, Inc. - This file is part of the GNU C Library. + Copyright (C) 2017-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
- You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #ifndef _X86_SYSDEP_H #define _X86_SYSDEP_H 1 @@ -27,11 +27,11 @@ enum cf_protection_level { - CF_NONE = 0, - CF_BRANCH = 1 << 0, - CF_RETURN = 1 << 1, - CF_FULL = CF_BRANCH | CF_RETURN, - CF_SET = 1 << 2 + CF_NONE = 0, + CF_BRANCH = 1 << 0, + CF_RETURN = 1 << 1, + CF_FULL = CF_BRANCH | CF_RETURN, + CF_SET = 1 << 2 }; */ @@ -51,13 +51,13 @@ enum cf_protection_level #endif /* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need - space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be - aligned to 16 bytes for fxsave and 64 bytes for xsave. */ + space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be + aligned to 16 bytes for fxsave and 64 bytes for xsave. */ #define STATE_SAVE_OFFSET (8 * 7 + 8) /* Save SSE, AVX, AVX512, mask and bound registers. */ #define STATE_SAVE_MASK \ - ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) + ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) #ifdef __ASSEMBLER__ @@ -76,31 +76,31 @@ enum cf_protection_level /* Define an entry point visible from C. */ #define ENTRY(name) \ - .globl C_SYMBOL_NAME(name); \ - .type C_SYMBOL_NAME(name),@function; \ - .align ALIGNARG(4); \ - C_LABEL(name) \ - cfi_startproc; \ - _CET_ENDBR; \ - CALL_MCOUNT + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ + .align ALIGNARG(4); \ + C_LABEL(name) \ + cfi_startproc; \ + _CET_ENDBR; \ + CALL_MCOUNT #undef END #define END(name) \ - cfi_endproc; \ - ASM_SIZE_DIRECTIVE(name) + cfi_endproc; \ + ASM_SIZE_DIRECTIVE(name) #define ENTRY_CHK(name) ENTRY (name) #define END_CHK(name) END (name) /* Since C identifiers are not normally prefixed with an underscore - on this system, the asm identifier `syscall_error' intrudes on the - C name space. Make sure we use an innocuous name. */ + on this system, the asm identifier `syscall_error' intrudes on the + C name space. Make sure we use an innocuous name. */ #define syscall_error __syscall_error #define mcount _mcount #undef PSEUDO_END #define PSEUDO_END(name) \ - END (name) + END (name) /* Local label name for asm code. */ #ifndef L
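[Closing note: the ENTRY/END and L() macros reindented in this last patch are the scaffolding every .S file in the benchmark builds on. A minimal sketch of how they compose, assuming only the definitions shown in sysdep_x86.h and sysdep_generic.h above; the function name copy_nothing is hypothetical and appears nowhere in the patches:

    #include "sysdep.h"

    ENTRY (copy_nothing)            /* emits .globl/.type/.align, the label,
                                       cfi_startproc and _CET_ENDBR */
        mov     %RDI_LP, %RAX_LP    /* return dst, as the mempcpy entry points do */
        test    %RDX_LP, %RDX_LP    /* size == 0? */
        jz      L(done)             /* L(done) assembles to the local label .Ldone */
    L(done):
        ret
    END (copy_nothing)              /* cfi_endproc plus the .size directive */

Under the LP macros above, RDI_LP, RAX_LP and RDX_LP expand to the full 64-bit rdi, rax and rdx register names, so the sketch assembles to a handful of instructions plus the symbol and CFI bookkeeping that the real __memmove and __mempcpy variants rely on.]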