From 1bc21789d239cd3f90703711629b6d15905d86c1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 19:52:51 +0300 Subject: [PATCH 1/6] Add more variants --- utils/memcpy-bench/CMakeLists.txt | 23 +- utils/memcpy-bench/FastMemcpy.cpp | 1 + utils/memcpy-bench/FastMemcpy.h | 2 +- utils/memcpy-bench/FastMemcpy_Avx.cpp | 1 + utils/memcpy-bench/glibc/asm-syntax.h | 24 + utils/memcpy-bench/glibc/dwarf2.h | 590 +++ utils/memcpy-bench/glibc/memcpy-ssse3-back.S | 3182 +++++++++++++++++ utils/memcpy-bench/glibc/memcpy-ssse3.S | 3152 ++++++++++++++++ .../glibc/memmove-avx-unaligned-erms.S | 12 + .../glibc/memmove-avx512-no-vzeroupper.S | 419 +++ .../glibc/memmove-avx512-unaligned-erms.S | 12 + .../glibc/memmove-sse2-unaligned-erms.S | 33 + .../glibc/memmove-vec-unaligned-erms.S | 559 +++ utils/memcpy-bench/glibc/memmove.S | 71 + utils/memcpy-bench/glibc/sysdep.h | 129 + utils/memcpy-bench/glibc/sysdep_generic.h | 113 + utils/memcpy-bench/glibc/sysdep_x86.h | 113 + utils/memcpy-bench/memcpy-bench.cpp | 208 +- 18 files changed, 8585 insertions(+), 59 deletions(-) create mode 100644 utils/memcpy-bench/FastMemcpy.cpp create mode 100644 utils/memcpy-bench/FastMemcpy_Avx.cpp create mode 100644 utils/memcpy-bench/glibc/asm-syntax.h create mode 100644 utils/memcpy-bench/glibc/dwarf2.h create mode 100644 utils/memcpy-bench/glibc/memcpy-ssse3-back.S create mode 100644 utils/memcpy-bench/glibc/memcpy-ssse3.S create mode 100644 utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S create mode 100644 utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S create mode 100644 utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S create mode 100644 utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S create mode 100644 utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S create mode 100644 utils/memcpy-bench/glibc/memmove.S create mode 100644 utils/memcpy-bench/glibc/sysdep.h create mode 100644 utils/memcpy-bench/glibc/sysdep_generic.h create mode 100644 utils/memcpy-bench/glibc/sysdep_x86.h diff --git a/utils/memcpy-bench/CMakeLists.txt b/utils/memcpy-bench/CMakeLists.txt index 54dd0398912..5fcde231688 100644 --- a/utils/memcpy-bench/CMakeLists.txt +++ b/utils/memcpy-bench/CMakeLists.txt @@ -1,5 +1,22 @@ enable_language(ASM) -add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S) -#target_compile_options(memcpy-bench PRIVATE -mavx) -target_link_libraries(memcpy-bench PRIVATE dbms) + +add_executable (memcpy-bench + memcpy-bench.cpp + FastMemcpy.cpp + FastMemcpy_Avx.cpp + memcpy_jart.S + glibc/memcpy-ssse3.S + glibc/memcpy-ssse3-back.S + glibc/memmove-sse2-unaligned-erms.S + glibc/memmove-avx-unaligned-erms.S + glibc/memmove-avx512-unaligned-erms.S + glibc/memmove-avx512-no-vzeroupper.S + ) + +add_compile_options(memcpy-bench PRIVATE -fno-tree-loop-distribute-patterns) + +set_source_files_properties(FastMemcpy.cpp PROPERTIES COMPILE_FLAGS "-Wno-old-style-cast") +set_source_files_properties(FastMemcpy_Avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -Wno-old-style-cast -Wno-cast-qual -Wno-cast-align") + +target_link_libraries(memcpy-bench PRIVATE dbms boost::program_options) diff --git a/utils/memcpy-bench/FastMemcpy.cpp b/utils/memcpy-bench/FastMemcpy.cpp new file mode 100644 index 00000000000..9a50caba2b1 --- /dev/null +++ b/utils/memcpy-bench/FastMemcpy.cpp @@ -0,0 +1 @@ +#include "FastMemcpy.h" diff --git a/utils/memcpy-bench/FastMemcpy.h b/utils/memcpy-bench/FastMemcpy.h index 9c37524443a..85d09c5f53e 100644 --- a/utils/memcpy-bench/FastMemcpy.h +++ b/utils/memcpy-bench/FastMemcpy.h @@ -93,7 
+93,7 @@ static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restric
 /// Attribute is used to avoid an error with undefined behaviour sanitizer
 /// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
 /// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
-__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
+__attribute__((__no_sanitize__("undefined"))) inline void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
 {
     unsigned char *dd = ((unsigned char*)dst) + size;
     const unsigned char *ss = ((const unsigned char*)src) + size;
diff --git a/utils/memcpy-bench/FastMemcpy_Avx.cpp b/utils/memcpy-bench/FastMemcpy_Avx.cpp
new file mode 100644
index 00000000000..8cef0f89507
--- /dev/null
+++ b/utils/memcpy-bench/FastMemcpy_Avx.cpp
@@ -0,0 +1 @@
+#include "FastMemcpy_Avx.h"
diff --git a/utils/memcpy-bench/glibc/asm-syntax.h b/utils/memcpy-bench/glibc/asm-syntax.h
new file mode 100644
index 00000000000..6e299c1fec2
--- /dev/null
+++ b/utils/memcpy-bench/glibc/asm-syntax.h
@@ -0,0 +1,24 @@
+/* Definitions for x86 syntax variations.
+   Copyright (C) 1992-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library. Its master source is NOT part of
+   the C library, however. The master source lives in the GNU MP Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#undef ALIGN
+#define ALIGN(log) .align 1<<log
+
+#undef L
+#define L(body) .L##body
diff --git a/utils/memcpy-bench/glibc/dwarf2.h b/utils/memcpy-bench/glibc/dwarf2.h
new file mode 100644
--- /dev/null
+++ b/utils/memcpy-bench/glibc/dwarf2.h
@@ -0,0 +1,590 @@
+/* Declarations and definitions of codes relating to the DWARF2 symbolic
+   debugging information format.
+   Copyright (C) 1992-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#ifndef _DWARF2_H
+#define _DWARF2_H 1
+
+/* This file is derived from the DWARF specification (a public document)
+   Revision 2.0.0 (July 27, 1993) developed by the UNIX International
+   Programming Languages Special Interest Group (UI/PLSIG) and distributed
+   by UNIX International. Copies of this specification are available from
+   UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */
+
+/* This file is shared between GCC and GDB, and should not contain
+   prototypes. */
+
+#ifndef __ASSEMBLER__
+/* Tag names and codes.
*/ + +enum dwarf_tag + { + DW_TAG_padding = 0x00, + DW_TAG_array_type = 0x01, + DW_TAG_class_type = 0x02, + DW_TAG_entry_point = 0x03, + DW_TAG_enumeration_type = 0x04, + DW_TAG_formal_parameter = 0x05, + DW_TAG_imported_declaration = 0x08, + DW_TAG_label = 0x0a, + DW_TAG_lexical_block = 0x0b, + DW_TAG_member = 0x0d, + DW_TAG_pointer_type = 0x0f, + DW_TAG_reference_type = 0x10, + DW_TAG_compile_unit = 0x11, + DW_TAG_string_type = 0x12, + DW_TAG_structure_type = 0x13, + DW_TAG_subroutine_type = 0x15, + DW_TAG_typedef = 0x16, + DW_TAG_union_type = 0x17, + DW_TAG_unspecified_parameters = 0x18, + DW_TAG_variant = 0x19, + DW_TAG_common_block = 0x1a, + DW_TAG_common_inclusion = 0x1b, + DW_TAG_inheritance = 0x1c, + DW_TAG_inlined_subroutine = 0x1d, + DW_TAG_module = 0x1e, + DW_TAG_ptr_to_member_type = 0x1f, + DW_TAG_set_type = 0x20, + DW_TAG_subrange_type = 0x21, + DW_TAG_with_stmt = 0x22, + DW_TAG_access_declaration = 0x23, + DW_TAG_base_type = 0x24, + DW_TAG_catch_block = 0x25, + DW_TAG_const_type = 0x26, + DW_TAG_constant = 0x27, + DW_TAG_enumerator = 0x28, + DW_TAG_file_type = 0x29, + DW_TAG_friend = 0x2a, + DW_TAG_namelist = 0x2b, + DW_TAG_namelist_item = 0x2c, + DW_TAG_packed_type = 0x2d, + DW_TAG_subprogram = 0x2e, + DW_TAG_template_type_param = 0x2f, + DW_TAG_template_value_param = 0x30, + DW_TAG_thrown_type = 0x31, + DW_TAG_try_block = 0x32, + DW_TAG_variant_part = 0x33, + DW_TAG_variable = 0x34, + DW_TAG_volatile_type = 0x35, + /* SGI/MIPS Extensions */ + DW_TAG_MIPS_loop = 0x4081, + /* GNU extensions */ + DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */ + DW_TAG_function_template = 0x4102, /* for C++ */ + DW_TAG_class_template = 0x4103, /* for C++ */ + DW_TAG_GNU_BINCL = 0x4104, + DW_TAG_GNU_EINCL = 0x4105 + }; + +#define DW_TAG_lo_user 0x4080 +#define DW_TAG_hi_user 0xffff + +/* flag that tells whether entry has a child or not */ +#define DW_children_no 0 +#define DW_children_yes 1 + +/* Form names and codes. */ +enum dwarf_form + { + DW_FORM_addr = 0x01, + DW_FORM_block2 = 0x03, + DW_FORM_block4 = 0x04, + DW_FORM_data2 = 0x05, + DW_FORM_data4 = 0x06, + DW_FORM_data8 = 0x07, + DW_FORM_string = 0x08, + DW_FORM_block = 0x09, + DW_FORM_block1 = 0x0a, + DW_FORM_data1 = 0x0b, + DW_FORM_flag = 0x0c, + DW_FORM_sdata = 0x0d, + DW_FORM_strp = 0x0e, + DW_FORM_udata = 0x0f, + DW_FORM_ref_addr = 0x10, + DW_FORM_ref1 = 0x11, + DW_FORM_ref2 = 0x12, + DW_FORM_ref4 = 0x13, + DW_FORM_ref8 = 0x14, + DW_FORM_ref_udata = 0x15, + DW_FORM_indirect = 0x16 + }; + +/* Attribute names and codes. 
*/ + +enum dwarf_attribute + { + DW_AT_sibling = 0x01, + DW_AT_location = 0x02, + DW_AT_name = 0x03, + DW_AT_ordering = 0x09, + DW_AT_subscr_data = 0x0a, + DW_AT_byte_size = 0x0b, + DW_AT_bit_offset = 0x0c, + DW_AT_bit_size = 0x0d, + DW_AT_element_list = 0x0f, + DW_AT_stmt_list = 0x10, + DW_AT_low_pc = 0x11, + DW_AT_high_pc = 0x12, + DW_AT_language = 0x13, + DW_AT_member = 0x14, + DW_AT_discr = 0x15, + DW_AT_discr_value = 0x16, + DW_AT_visibility = 0x17, + DW_AT_import = 0x18, + DW_AT_string_length = 0x19, + DW_AT_common_reference = 0x1a, + DW_AT_comp_dir = 0x1b, + DW_AT_const_value = 0x1c, + DW_AT_containing_type = 0x1d, + DW_AT_default_value = 0x1e, + DW_AT_inline = 0x20, + DW_AT_is_optional = 0x21, + DW_AT_lower_bound = 0x22, + DW_AT_producer = 0x25, + DW_AT_prototyped = 0x27, + DW_AT_return_addr = 0x2a, + DW_AT_start_scope = 0x2c, + DW_AT_stride_size = 0x2e, + DW_AT_upper_bound = 0x2f, + DW_AT_abstract_origin = 0x31, + DW_AT_accessibility = 0x32, + DW_AT_address_class = 0x33, + DW_AT_artificial = 0x34, + DW_AT_base_types = 0x35, + DW_AT_calling_convention = 0x36, + DW_AT_count = 0x37, + DW_AT_data_member_location = 0x38, + DW_AT_decl_column = 0x39, + DW_AT_decl_file = 0x3a, + DW_AT_decl_line = 0x3b, + DW_AT_declaration = 0x3c, + DW_AT_discr_list = 0x3d, + DW_AT_encoding = 0x3e, + DW_AT_external = 0x3f, + DW_AT_frame_base = 0x40, + DW_AT_friend = 0x41, + DW_AT_identifier_case = 0x42, + DW_AT_macro_info = 0x43, + DW_AT_namelist_items = 0x44, + DW_AT_priority = 0x45, + DW_AT_segment = 0x46, + DW_AT_specification = 0x47, + DW_AT_static_link = 0x48, + DW_AT_type = 0x49, + DW_AT_use_location = 0x4a, + DW_AT_variable_parameter = 0x4b, + DW_AT_virtuality = 0x4c, + DW_AT_vtable_elem_location = 0x4d, + /* SGI/MIPS Extensions */ + DW_AT_MIPS_fde = 0x2001, + DW_AT_MIPS_loop_begin = 0x2002, + DW_AT_MIPS_tail_loop_begin = 0x2003, + DW_AT_MIPS_epilog_begin = 0x2004, + DW_AT_MIPS_loop_unroll_factor = 0x2005, + DW_AT_MIPS_software_pipeline_depth = 0x2006, + DW_AT_MIPS_linkage_name = 0x2007, + DW_AT_MIPS_stride = 0x2008, + DW_AT_MIPS_abstract_name = 0x2009, + DW_AT_MIPS_clone_origin = 0x200a, + DW_AT_MIPS_has_inlines = 0x200b, + /* GNU extensions. */ + DW_AT_sf_names = 0x2101, + DW_AT_src_info = 0x2102, + DW_AT_mac_info = 0x2103, + DW_AT_src_coords = 0x2104, + DW_AT_body_begin = 0x2105, + DW_AT_body_end = 0x2106 + }; + +#define DW_AT_lo_user 0x2000 /* implementation-defined range start */ +#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */ + +/* Location atom names and codes. 
*/ + +enum dwarf_location_atom + { + DW_OP_addr = 0x03, + DW_OP_deref = 0x06, + DW_OP_const1u = 0x08, + DW_OP_const1s = 0x09, + DW_OP_const2u = 0x0a, + DW_OP_const2s = 0x0b, + DW_OP_const4u = 0x0c, + DW_OP_const4s = 0x0d, + DW_OP_const8u = 0x0e, + DW_OP_const8s = 0x0f, + DW_OP_constu = 0x10, + DW_OP_consts = 0x11, + DW_OP_dup = 0x12, + DW_OP_drop = 0x13, + DW_OP_over = 0x14, + DW_OP_pick = 0x15, + DW_OP_swap = 0x16, + DW_OP_rot = 0x17, + DW_OP_xderef = 0x18, + DW_OP_abs = 0x19, + DW_OP_and = 0x1a, + DW_OP_div = 0x1b, + DW_OP_minus = 0x1c, + DW_OP_mod = 0x1d, + DW_OP_mul = 0x1e, + DW_OP_neg = 0x1f, + DW_OP_not = 0x20, + DW_OP_or = 0x21, + DW_OP_plus = 0x22, + DW_OP_plus_uconst = 0x23, + DW_OP_shl = 0x24, + DW_OP_shr = 0x25, + DW_OP_shra = 0x26, + DW_OP_xor = 0x27, + DW_OP_bra = 0x28, + DW_OP_eq = 0x29, + DW_OP_ge = 0x2a, + DW_OP_gt = 0x2b, + DW_OP_le = 0x2c, + DW_OP_lt = 0x2d, + DW_OP_ne = 0x2e, + DW_OP_skip = 0x2f, + DW_OP_lit0 = 0x30, + DW_OP_lit1 = 0x31, + DW_OP_lit2 = 0x32, + DW_OP_lit3 = 0x33, + DW_OP_lit4 = 0x34, + DW_OP_lit5 = 0x35, + DW_OP_lit6 = 0x36, + DW_OP_lit7 = 0x37, + DW_OP_lit8 = 0x38, + DW_OP_lit9 = 0x39, + DW_OP_lit10 = 0x3a, + DW_OP_lit11 = 0x3b, + DW_OP_lit12 = 0x3c, + DW_OP_lit13 = 0x3d, + DW_OP_lit14 = 0x3e, + DW_OP_lit15 = 0x3f, + DW_OP_lit16 = 0x40, + DW_OP_lit17 = 0x41, + DW_OP_lit18 = 0x42, + DW_OP_lit19 = 0x43, + DW_OP_lit20 = 0x44, + DW_OP_lit21 = 0x45, + DW_OP_lit22 = 0x46, + DW_OP_lit23 = 0x47, + DW_OP_lit24 = 0x48, + DW_OP_lit25 = 0x49, + DW_OP_lit26 = 0x4a, + DW_OP_lit27 = 0x4b, + DW_OP_lit28 = 0x4c, + DW_OP_lit29 = 0x4d, + DW_OP_lit30 = 0x4e, + DW_OP_lit31 = 0x4f, + DW_OP_reg0 = 0x50, + DW_OP_reg1 = 0x51, + DW_OP_reg2 = 0x52, + DW_OP_reg3 = 0x53, + DW_OP_reg4 = 0x54, + DW_OP_reg5 = 0x55, + DW_OP_reg6 = 0x56, + DW_OP_reg7 = 0x57, + DW_OP_reg8 = 0x58, + DW_OP_reg9 = 0x59, + DW_OP_reg10 = 0x5a, + DW_OP_reg11 = 0x5b, + DW_OP_reg12 = 0x5c, + DW_OP_reg13 = 0x5d, + DW_OP_reg14 = 0x5e, + DW_OP_reg15 = 0x5f, + DW_OP_reg16 = 0x60, + DW_OP_reg17 = 0x61, + DW_OP_reg18 = 0x62, + DW_OP_reg19 = 0x63, + DW_OP_reg20 = 0x64, + DW_OP_reg21 = 0x65, + DW_OP_reg22 = 0x66, + DW_OP_reg23 = 0x67, + DW_OP_reg24 = 0x68, + DW_OP_reg25 = 0x69, + DW_OP_reg26 = 0x6a, + DW_OP_reg27 = 0x6b, + DW_OP_reg28 = 0x6c, + DW_OP_reg29 = 0x6d, + DW_OP_reg30 = 0x6e, + DW_OP_reg31 = 0x6f, + DW_OP_breg0 = 0x70, + DW_OP_breg1 = 0x71, + DW_OP_breg2 = 0x72, + DW_OP_breg3 = 0x73, + DW_OP_breg4 = 0x74, + DW_OP_breg5 = 0x75, + DW_OP_breg6 = 0x76, + DW_OP_breg7 = 0x77, + DW_OP_breg8 = 0x78, + DW_OP_breg9 = 0x79, + DW_OP_breg10 = 0x7a, + DW_OP_breg11 = 0x7b, + DW_OP_breg12 = 0x7c, + DW_OP_breg13 = 0x7d, + DW_OP_breg14 = 0x7e, + DW_OP_breg15 = 0x7f, + DW_OP_breg16 = 0x80, + DW_OP_breg17 = 0x81, + DW_OP_breg18 = 0x82, + DW_OP_breg19 = 0x83, + DW_OP_breg20 = 0x84, + DW_OP_breg21 = 0x85, + DW_OP_breg22 = 0x86, + DW_OP_breg23 = 0x87, + DW_OP_breg24 = 0x88, + DW_OP_breg25 = 0x89, + DW_OP_breg26 = 0x8a, + DW_OP_breg27 = 0x8b, + DW_OP_breg28 = 0x8c, + DW_OP_breg29 = 0x8d, + DW_OP_breg30 = 0x8e, + DW_OP_breg31 = 0x8f, + DW_OP_regx = 0x90, + DW_OP_fbreg = 0x91, + DW_OP_bregx = 0x92, + DW_OP_piece = 0x93, + DW_OP_deref_size = 0x94, + DW_OP_xderef_size = 0x95, + DW_OP_nop = 0x96 + }; + +#define DW_OP_lo_user 0x80 /* implementation-defined range start */ +#define DW_OP_hi_user 0xff /* implementation-defined range end */ + +/* Type encodings. 
*/ + +enum dwarf_type + { + DW_ATE_void = 0x0, + DW_ATE_address = 0x1, + DW_ATE_boolean = 0x2, + DW_ATE_complex_float = 0x3, + DW_ATE_float = 0x4, + DW_ATE_signed = 0x5, + DW_ATE_signed_char = 0x6, + DW_ATE_unsigned = 0x7, + DW_ATE_unsigned_char = 0x8 + }; + +#define DW_ATE_lo_user 0x80 +#define DW_ATE_hi_user 0xff + +/* Array ordering names and codes. */ +enum dwarf_array_dim_ordering + { + DW_ORD_row_major = 0, + DW_ORD_col_major = 1 + }; + +/* access attribute */ +enum dwarf_access_attribute + { + DW_ACCESS_public = 1, + DW_ACCESS_protected = 2, + DW_ACCESS_private = 3 + }; + +/* visibility */ +enum dwarf_visibility_attribute + { + DW_VIS_local = 1, + DW_VIS_exported = 2, + DW_VIS_qualified = 3 + }; + +/* virtuality */ +enum dwarf_virtuality_attribute + { + DW_VIRTUALITY_none = 0, + DW_VIRTUALITY_virtual = 1, + DW_VIRTUALITY_pure_virtual = 2 + }; + +/* case sensitivity */ +enum dwarf_id_case + { + DW_ID_case_sensitive = 0, + DW_ID_up_case = 1, + DW_ID_down_case = 2, + DW_ID_case_insensitive = 3 + }; + +/* calling convention */ +enum dwarf_calling_convention + { + DW_CC_normal = 0x1, + DW_CC_program = 0x2, + DW_CC_nocall = 0x3 + }; + +#define DW_CC_lo_user 0x40 +#define DW_CC_hi_user 0xff + +/* inline attribute */ +enum dwarf_inline_attribute + { + DW_INL_not_inlined = 0, + DW_INL_inlined = 1, + DW_INL_declared_not_inlined = 2, + DW_INL_declared_inlined = 3 + }; + +/* discriminant lists */ +enum dwarf_discrim_list + { + DW_DSC_label = 0, + DW_DSC_range = 1 + }; + +/* line number opcodes */ +enum dwarf_line_number_ops + { + DW_LNS_extended_op = 0, + DW_LNS_copy = 1, + DW_LNS_advance_pc = 2, + DW_LNS_advance_line = 3, + DW_LNS_set_file = 4, + DW_LNS_set_column = 5, + DW_LNS_negate_stmt = 6, + DW_LNS_set_basic_block = 7, + DW_LNS_const_add_pc = 8, + DW_LNS_fixed_advance_pc = 9 + }; + +/* line number extended opcodes */ +enum dwarf_line_number_x_ops + { + DW_LNE_end_sequence = 1, + DW_LNE_set_address = 2, + DW_LNE_define_file = 3 + }; + +/* call frame information */ +enum dwarf_call_frame_info + { + DW_CFA_advance_loc = 0x40, + DW_CFA_offset = 0x80, + DW_CFA_restore = 0xc0, + DW_CFA_nop = 0x00, + DW_CFA_set_loc = 0x01, + DW_CFA_advance_loc1 = 0x02, + DW_CFA_advance_loc2 = 0x03, + DW_CFA_advance_loc4 = 0x04, + DW_CFA_offset_extended = 0x05, + DW_CFA_restore_extended = 0x06, + DW_CFA_undefined = 0x07, + DW_CFA_same_value = 0x08, + DW_CFA_register = 0x09, + DW_CFA_remember_state = 0x0a, + DW_CFA_restore_state = 0x0b, + DW_CFA_def_cfa = 0x0c, + DW_CFA_def_cfa_register = 0x0d, + DW_CFA_def_cfa_offset = 0x0e, + DW_CFA_def_cfa_expression = 0x0f, + DW_CFA_expression = 0x10, + /* Dwarf 2.1 */ + DW_CFA_offset_extended_sf = 0x11, + DW_CFA_def_cfa_sf = 0x12, + DW_CFA_def_cfa_offset_sf = 0x13, + + /* SGI/MIPS specific */ + DW_CFA_MIPS_advance_loc8 = 0x1d, + + /* GNU extensions */ + DW_CFA_GNU_window_save = 0x2d, + DW_CFA_GNU_args_size = 0x2e, + DW_CFA_GNU_negative_offset_extended = 0x2f + }; + +#define DW_CIE_ID 0xffffffff +#define DW_CIE_VERSION 1 + +#define DW_CFA_extended 0 +#define DW_CFA_low_user 0x1c +#define DW_CFA_high_user 0x3f + +#define DW_CHILDREN_no 0x00 +#define DW_CHILDREN_yes 0x01 + +#define DW_ADDR_none 0 + +/* Source language names and codes. 
*/
+
+enum dwarf_source_language
+  {
+    DW_LANG_C89 = 0x0001,
+    DW_LANG_C = 0x0002,
+    DW_LANG_Ada83 = 0x0003,
+    DW_LANG_C_plus_plus = 0x0004,
+    DW_LANG_Cobol74 = 0x0005,
+    DW_LANG_Cobol85 = 0x0006,
+    DW_LANG_Fortran77 = 0x0007,
+    DW_LANG_Fortran90 = 0x0008,
+    DW_LANG_Pascal83 = 0x0009,
+    DW_LANG_Modula2 = 0x000a,
+    DW_LANG_Java = 0x000b,
+    DW_LANG_Mips_Assembler = 0x8001
+  };
+
+
+#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */
+#define DW_LANG_hi_user 0xffff /* implementation-defined range end */
+
+/* Names and codes for macro information. */
+
+enum dwarf_macinfo_record_type
+  {
+    DW_MACINFO_define = 1,
+    DW_MACINFO_undef = 2,
+    DW_MACINFO_start_file = 3,
+    DW_MACINFO_end_file = 4,
+    DW_MACINFO_vendor_ext = 255
+  };
+
+#endif /* !__ASSEMBLER__ */
+
+/* @@@ For use with GNU frame unwind information. */
+
+#define DW_EH_PE_absptr 0x00
+#define DW_EH_PE_omit 0xff
+
+#define DW_EH_PE_uleb128 0x01
+#define DW_EH_PE_udata2 0x02
+#define DW_EH_PE_udata4 0x03
+#define DW_EH_PE_udata8 0x04
+#define DW_EH_PE_sleb128 0x09
+#define DW_EH_PE_sdata2 0x0A
+#define DW_EH_PE_sdata4 0x0B
+#define DW_EH_PE_sdata8 0x0C
+#define DW_EH_PE_signed 0x08
+
+#define DW_EH_PE_pcrel 0x10
+#define DW_EH_PE_textrel 0x20
+#define DW_EH_PE_datarel 0x30
+#define DW_EH_PE_funcrel 0x40
+#define DW_EH_PE_aligned 0x50
+
+#define DW_EH_PE_indirect 0x80
+
+#endif /* dwarf2.h */
diff --git a/utils/memcpy-bench/glibc/memcpy-ssse3-back.S b/utils/memcpy-bench/glibc/memcpy-ssse3-back.S
new file mode 100644
index 00000000000..1492dd38e73
--- /dev/null
+++ b/utils/memcpy-bench/glibc/memcpy-ssse3-back.S
@@ -0,0 +1,3182 @@
+/* memcpy with SSSE3 and REP string
+   Copyright (C) 2010-2020 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+
+#if 1
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3_back
+# define MEMCPY_CHK __memcpy_chk_ssse3_back
+# define MEMPCPY __mempcpy_ssse3_back
+# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
+#endif
+
+#define JMPTBL(I, B) I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register that contains the index into the
+   jump table.  SCALE is the scale of INDEX.
*/ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + _CET_NOTRACK jmp *INDEX; \ + ud2 + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) +END (MEMPCPY) +#endif + +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %RDI_LP, %RAX_LP +#ifdef USE_AS_MEMPCPY + add %RDX_LP, %RAX_LP +#endif + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +#endif + +#ifdef USE_AS_MEMMOVE + cmp %rsi, %rdi + jb L(copy_forward) + je L(bwd_write_0bytes) + cmp $144, %rdx + jae L(copy_backward) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +L(copy_forward): +#endif +L(start): + cmp $144, %rdx + jae L(144bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jbe L(bk_write) +#endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +#endif + + .p2align 4 +L(144bytesormore): + +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jle L(copy_backward) +#endif + movdqu (%rsi), %xmm0 + mov %rdi, %r8 + and $-16, %rdi + add $16, %rdi + mov %rdi, %r9 + sub %r8, %r9 + sub %r9, %rdx + add %r9, %rsi + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0) +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + cmp %rcx, %rdx + jae L(gobble_mem_fwd) + lea L(shl_table_fwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 + + .p2align 4 +L(copy_backward): +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + shl $1, %rcx + cmp %rcx, %rdx + ja L(gobble_mem_bwd) + + add %rdx, %rdi + add %rdx, %rsi + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $0xf, %r9 + xor %r9, %rdi + sub %r9, %rsi + sub %r9, %rdx + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0_bwd) + lea L(shl_table_bwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 + + .p2align 4 +L(shl_0): + + mov %rdx, %r9 + shr $8, %r9 + add %rdx, %r9 +#ifdef DATA_CACHE_SIZE + cmp $DATA_CACHE_SIZE_HALF, %R9_LP +#else + cmp __x86_data_cache_size_half(%rip), %R9_LP +#endif + jae L(gobble_mem_fwd) + sub $0x80, %rdx + .p2align 4 +L(shl_0_loop): + movdqa (%rsi), %xmm1 + movdqa %xmm1, (%rdi) + movaps 0x10(%rsi), %xmm2 + movaps %xmm2, 0x10(%rdi) + movaps 0x20(%rsi), %xmm3 + movaps %xmm3, 0x20(%rdi) + movaps 0x30(%rsi), %xmm4 + movaps %xmm4, 0x30(%rdi) + movaps 0x40(%rsi), %xmm1 + movaps %xmm1, 0x40(%rdi) + movaps 0x50(%rsi), %xmm2 + movaps %xmm2, 0x50(%rdi) + movaps 0x60(%rsi), %xmm3 + movaps %xmm3, 0x60(%rdi) + movaps 0x70(%rsi), %xmm4 + movaps %xmm4, 0x70(%rdi) + sub $0x80, %rdx + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(shl_0_loop) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $0x80, %rdx +L(copy_backward_loop): + movaps -0x10(%rsi), %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps -0x20(%rsi), %xmm2 + movaps %xmm2, 
-0x20(%rdi) + movaps -0x30(%rsi), %xmm3 + movaps %xmm3, -0x30(%rdi) + movaps -0x40(%rsi), %xmm4 + movaps %xmm4, -0x40(%rdi) + movaps -0x50(%rsi), %xmm5 + movaps %xmm5, -0x50(%rdi) + movaps -0x60(%rsi), %xmm5 + movaps %xmm5, -0x60(%rdi) + movaps -0x70(%rsi), %xmm5 + movaps %xmm5, -0x70(%rdi) + movaps -0x80(%rsi), %xmm5 + movaps %xmm5, -0x80(%rdi) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(copy_backward_loop) + + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_1): + sub $0x80, %rdx + movaps -0x01(%rsi), %xmm1 + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movaps 0x4f(%rsi), %xmm6 + movaps 0x5f(%rsi), %xmm7 + movaps 0x6f(%rsi), %xmm8 + movaps 0x7f(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $1, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $1, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $1, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $1, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $1, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $1, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $1, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_1) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + movaps -0x01(%rsi), %xmm1 + + movaps -0x11(%rsi), %xmm2 + palignr $1, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x21(%rsi), %xmm3 + palignr $1, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x31(%rsi), %xmm4 + palignr $1, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x41(%rsi), %xmm5 + palignr $1, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x51(%rsi), %xmm6 + palignr $1, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x61(%rsi), %xmm7 + palignr $1, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x71(%rsi), %xmm8 + palignr $1, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x81(%rsi), %xmm9 + palignr $1, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_1_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_2): + sub $0x80, %rdx + movaps -0x02(%rsi), %xmm1 + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movaps 0x4e(%rsi), %xmm6 + movaps 0x5e(%rsi), %xmm7 + movaps 0x6e(%rsi), %xmm8 + movaps 0x7e(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $2, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $2, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $2, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $2, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $2, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $2, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $2, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_2) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + movaps -0x02(%rsi), %xmm1 + + movaps -0x12(%rsi), %xmm2 + palignr $2, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x22(%rsi), %xmm3 + palignr $2, %xmm3, %xmm2 
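Aside: every L(shl_N) / L(shl_N_bwd) block in this file is one instance of the same realignment pattern. The source pointer sits N bytes past a 16-byte boundary, only aligned 16-byte loads and stores touch memory, and palignr stitches each output block together from two adjacent input blocks. A minimal C++ intrinsics sketch of one forward pass, not part of the patch; the names and signature are invented, and it assumes dst and src - Shift are 16-byte aligned with n a multiple of 16:

    #include <tmmintrin.h> // SSSE3: _mm_alignr_epi8 (build with -mssse3)
    #include <cstddef>
    #include <cstdint>

    template <int Shift>
    void copy_shifted(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t n)
    {
        const __m128i * s = reinterpret_cast<const __m128i *>(src - Shift); // aligned
        __m128i * d = reinterpret_cast<__m128i *>(dst);                     // aligned
        __m128i prev = _mm_load_si128(s);
        for (size_t i = 0; i != n / 16; ++i)
        {
            __m128i next = _mm_load_si128(s + i + 1);
            // bytes Shift..15 of `prev` followed by bytes 0..Shift-1 of `next`,
            // i.e. 16 consecutive source bytes, exactly what `palignr $Shift` produces
            _mm_store_si128(d + i, _mm_alignr_epi8(next, prev, Shift));
            prev = next;
        }
    }

Shift must be a template (compile-time) parameter because palignr takes an immediate, which is why the assembly needs sixteen near-identical L(shl_0)..L(shl_15) copies selected through L(shl_table_fwd).
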
+ movaps %xmm2, -0x20(%rdi) + + movaps -0x32(%rsi), %xmm4 + palignr $2, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x42(%rsi), %xmm5 + palignr $2, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x52(%rsi), %xmm6 + palignr $2, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x62(%rsi), %xmm7 + palignr $2, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x72(%rsi), %xmm8 + palignr $2, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x82(%rsi), %xmm9 + palignr $2, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_2_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_3): + sub $0x80, %rdx + movaps -0x03(%rsi), %xmm1 + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movaps 0x4d(%rsi), %xmm6 + movaps 0x5d(%rsi), %xmm7 + movaps 0x6d(%rsi), %xmm8 + movaps 0x7d(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $3, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $3, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $3, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $3, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $3, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $3, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $3, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_3) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + movaps -0x03(%rsi), %xmm1 + + movaps -0x13(%rsi), %xmm2 + palignr $3, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x23(%rsi), %xmm3 + palignr $3, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x33(%rsi), %xmm4 + palignr $3, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x43(%rsi), %xmm5 + palignr $3, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x53(%rsi), %xmm6 + palignr $3, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x63(%rsi), %xmm7 + palignr $3, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x73(%rsi), %xmm8 + palignr $3, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x83(%rsi), %xmm9 + palignr $3, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_3_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_4): + sub $0x80, %rdx + movaps -0x04(%rsi), %xmm1 + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movaps 0x4c(%rsi), %xmm6 + movaps 0x5c(%rsi), %xmm7 + movaps 0x6c(%rsi), %xmm8 + movaps 0x7c(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $4, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $4, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $4, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $4, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $4, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $4, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $4, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 
+L(shl_4_bwd): + movaps -0x04(%rsi), %xmm1 + + movaps -0x14(%rsi), %xmm2 + palignr $4, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x24(%rsi), %xmm3 + palignr $4, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x34(%rsi), %xmm4 + palignr $4, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x44(%rsi), %xmm5 + palignr $4, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x54(%rsi), %xmm6 + palignr $4, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x64(%rsi), %xmm7 + palignr $4, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x74(%rsi), %xmm8 + palignr $4, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x84(%rsi), %xmm9 + palignr $4, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_4_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_5): + sub $0x80, %rdx + movaps -0x05(%rsi), %xmm1 + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movaps 0x4b(%rsi), %xmm6 + movaps 0x5b(%rsi), %xmm7 + movaps 0x6b(%rsi), %xmm8 + movaps 0x7b(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $5, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $5, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $5, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $5, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $5, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $5, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $5, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_5) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + movaps -0x05(%rsi), %xmm1 + + movaps -0x15(%rsi), %xmm2 + palignr $5, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x25(%rsi), %xmm3 + palignr $5, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x35(%rsi), %xmm4 + palignr $5, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x45(%rsi), %xmm5 + palignr $5, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x55(%rsi), %xmm6 + palignr $5, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x65(%rsi), %xmm7 + palignr $5, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x75(%rsi), %xmm8 + palignr $5, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x85(%rsi), %xmm9 + palignr $5, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_5_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_6): + sub $0x80, %rdx + movaps -0x06(%rsi), %xmm1 + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movaps 0x4a(%rsi), %xmm6 + movaps 0x5a(%rsi), %xmm7 + movaps 0x6a(%rsi), %xmm8 + movaps 0x7a(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $6, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $6, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $6, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $6, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $6, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $6, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $6, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + 
lea 0x80(%rdi), %rdi + jae L(shl_6) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + movaps -0x06(%rsi), %xmm1 + + movaps -0x16(%rsi), %xmm2 + palignr $6, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x26(%rsi), %xmm3 + palignr $6, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x36(%rsi), %xmm4 + palignr $6, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x46(%rsi), %xmm5 + palignr $6, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x56(%rsi), %xmm6 + palignr $6, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x66(%rsi), %xmm7 + palignr $6, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x76(%rsi), %xmm8 + palignr $6, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x86(%rsi), %xmm9 + palignr $6, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_6_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_7): + sub $0x80, %rdx + movaps -0x07(%rsi), %xmm1 + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movaps 0x49(%rsi), %xmm6 + movaps 0x59(%rsi), %xmm7 + movaps 0x69(%rsi), %xmm8 + movaps 0x79(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $7, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $7, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $7, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $7, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $7, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $7, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $7, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_7) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + movaps -0x07(%rsi), %xmm1 + + movaps -0x17(%rsi), %xmm2 + palignr $7, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x27(%rsi), %xmm3 + palignr $7, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x37(%rsi), %xmm4 + palignr $7, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x47(%rsi), %xmm5 + palignr $7, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x57(%rsi), %xmm6 + palignr $7, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x67(%rsi), %xmm7 + palignr $7, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x77(%rsi), %xmm8 + palignr $7, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x87(%rsi), %xmm9 + palignr $7, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_7_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_8): + sub $0x80, %rdx + movaps -0x08(%rsi), %xmm1 + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movaps 0x48(%rsi), %xmm6 + movaps 0x58(%rsi), %xmm7 + movaps 0x68(%rsi), %xmm8 + movaps 0x78(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $8, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $8, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $8, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $8, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $8, %xmm4, %xmm5 + 
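Aside: each of these unrolled loops hands its residual 0..144 bytes to BRANCH_TO_JMPTBL_ENTRY, which indexes a table of self-relative 32-bit offsets (lea the table, movslq the entry, add, indirect jump). A hedged C++ analogue of that dispatch, with function pointers standing in for the branch targets and every name invented:

    #include <cstddef>
    #include <cstring>

    using TailCopier = void (*)(unsigned char * dst_end, const unsigned char * src_end);

    template <size_t N>
    static void copy_n(unsigned char * dst_end, const unsigned char * src_end)
    {
        std::memcpy(dst_end - N, src_end - N, N); // end-relative, like L(fwd_write_Nbytes)
    }

    // Dense table indexed by residual size; the real L(table_144_bytes_fwd)
    // holds 145 label offsets, this sketch covers only sizes 0..3.
    static constexpr TailCopier jump_table[] = {copy_n<0>, copy_n<1>, copy_n<2>, copy_n<3>};

    void copy_tail(unsigned char * dst, const unsigned char * src, size_t n) // requires n < 4
    {
        // mirrors: add %rdx, %rsi ; add %rdx, %rdi ; jmp *table(, %rdx, 4)
        jump_table[n](dst + n, src + n);
    }
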
movaps %xmm5, 0x30(%rdi) + palignr $8, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $8, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_8) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + movaps -0x08(%rsi), %xmm1 + + movaps -0x18(%rsi), %xmm2 + palignr $8, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x28(%rsi), %xmm3 + palignr $8, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x38(%rsi), %xmm4 + palignr $8, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x48(%rsi), %xmm5 + palignr $8, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x58(%rsi), %xmm6 + palignr $8, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x68(%rsi), %xmm7 + palignr $8, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x78(%rsi), %xmm8 + palignr $8, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x88(%rsi), %xmm9 + palignr $8, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_8_bwd) +L(shl_8_end_bwd): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_9): + sub $0x80, %rdx + movaps -0x09(%rsi), %xmm1 + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movaps 0x47(%rsi), %xmm6 + movaps 0x57(%rsi), %xmm7 + movaps 0x67(%rsi), %xmm8 + movaps 0x77(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $9, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $9, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $9, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $9, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $9, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $9, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $9, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_9) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + movaps -0x09(%rsi), %xmm1 + + movaps -0x19(%rsi), %xmm2 + palignr $9, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x29(%rsi), %xmm3 + palignr $9, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x39(%rsi), %xmm4 + palignr $9, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x49(%rsi), %xmm5 + palignr $9, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x59(%rsi), %xmm6 + palignr $9, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x69(%rsi), %xmm7 + palignr $9, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x79(%rsi), %xmm8 + palignr $9, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x89(%rsi), %xmm9 + palignr $9, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_9_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_10): + sub $0x80, %rdx + movaps -0x0a(%rsi), %xmm1 + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movaps 0x46(%rsi), %xmm6 + movaps 0x56(%rsi), %xmm7 + movaps 0x66(%rsi), %xmm8 + movaps 0x76(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $10, %xmm8, %xmm9 + movaps %xmm9, 
0x70(%rdi) + palignr $10, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $10, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $10, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $10, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $10, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $10, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_10) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + movaps -0x0a(%rsi), %xmm1 + + movaps -0x1a(%rsi), %xmm2 + palignr $10, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2a(%rsi), %xmm3 + palignr $10, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3a(%rsi), %xmm4 + palignr $10, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4a(%rsi), %xmm5 + palignr $10, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5a(%rsi), %xmm6 + palignr $10, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6a(%rsi), %xmm7 + palignr $10, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7a(%rsi), %xmm8 + palignr $10, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8a(%rsi), %xmm9 + palignr $10, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_10_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_11): + sub $0x80, %rdx + movaps -0x0b(%rsi), %xmm1 + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movaps 0x45(%rsi), %xmm6 + movaps 0x55(%rsi), %xmm7 + movaps 0x65(%rsi), %xmm8 + movaps 0x75(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $11, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $11, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $11, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $11, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $11, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $11, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $11, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_11) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + movaps -0x0b(%rsi), %xmm1 + + movaps -0x1b(%rsi), %xmm2 + palignr $11, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2b(%rsi), %xmm3 + palignr $11, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3b(%rsi), %xmm4 + palignr $11, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4b(%rsi), %xmm5 + palignr $11, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5b(%rsi), %xmm6 + palignr $11, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6b(%rsi), %xmm7 + palignr $11, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7b(%rsi), %xmm8 + palignr $11, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8b(%rsi), %xmm9 + palignr $11, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_11_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_12): + sub $0x80, %rdx + movdqa -0x0c(%rsi), %xmm1 + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + 
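Aside: the *_bwd twin of every block exists for the USE_AS_MEMMOVE build. When the destination overlaps the source from above, a forward pass would clobber source bytes before reading them, so L(copy_backward) runs the copy from the end downward. A scalar C++ sketch of just that direction choice (illustrative, not the patch's code):

    #include <cstddef>

    void * overlap_safe_copy(void * dst, const void * src, size_t n)
    {
        unsigned char * d = static_cast<unsigned char *>(dst);
        const unsigned char * s = static_cast<const unsigned char *>(src);
        if (d <= s || d >= s + n)
            for (size_t i = 0; i < n; ++i)      // forward, cf. L(copy_forward)
                d[i] = s[i];
        else
            for (size_t i = n; i > 0; --i)      // backward, cf. L(copy_backward)
                d[i - 1] = s[i - 1];
        return dst;
    }
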
movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movaps 0x44(%rsi), %xmm6 + movaps 0x54(%rsi), %xmm7 + movaps 0x64(%rsi), %xmm8 + movaps 0x74(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $12, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $12, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $12, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $12, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $12, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $12, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $12, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + + lea 0x80(%rdi), %rdi + jae L(shl_12) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + movaps -0x0c(%rsi), %xmm1 + + movaps -0x1c(%rsi), %xmm2 + palignr $12, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2c(%rsi), %xmm3 + palignr $12, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3c(%rsi), %xmm4 + palignr $12, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4c(%rsi), %xmm5 + palignr $12, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5c(%rsi), %xmm6 + palignr $12, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6c(%rsi), %xmm7 + palignr $12, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7c(%rsi), %xmm8 + palignr $12, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8c(%rsi), %xmm9 + palignr $12, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_12_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_13): + sub $0x80, %rdx + movaps -0x0d(%rsi), %xmm1 + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movaps 0x43(%rsi), %xmm6 + movaps 0x53(%rsi), %xmm7 + movaps 0x63(%rsi), %xmm8 + movaps 0x73(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $13, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $13, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $13, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $13, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $13, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $13, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $13, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_13) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + movaps -0x0d(%rsi), %xmm1 + + movaps -0x1d(%rsi), %xmm2 + palignr $13, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2d(%rsi), %xmm3 + palignr $13, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3d(%rsi), %xmm4 + palignr $13, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4d(%rsi), %xmm5 + palignr $13, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5d(%rsi), %xmm6 + palignr $13, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6d(%rsi), %xmm7 + palignr $13, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7d(%rsi), %xmm8 + palignr $13, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8d(%rsi), %xmm9 + palignr $13, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_13_bwd) + movdqu %xmm0, (%r8) + add $0x80, 
%rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_14): + sub $0x80, %rdx + movaps -0x0e(%rsi), %xmm1 + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movaps 0x42(%rsi), %xmm6 + movaps 0x52(%rsi), %xmm7 + movaps 0x62(%rsi), %xmm8 + movaps 0x72(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $14, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $14, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $14, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $14, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $14, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $14, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $14, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_14) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + movaps -0x0e(%rsi), %xmm1 + + movaps -0x1e(%rsi), %xmm2 + palignr $14, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2e(%rsi), %xmm3 + palignr $14, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3e(%rsi), %xmm4 + palignr $14, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4e(%rsi), %xmm5 + palignr $14, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5e(%rsi), %xmm6 + palignr $14, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6e(%rsi), %xmm7 + palignr $14, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7e(%rsi), %xmm8 + palignr $14, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8e(%rsi), %xmm9 + palignr $14, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_14_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_15): + sub $0x80, %rdx + movaps -0x0f(%rsi), %xmm1 + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movaps 0x41(%rsi), %xmm6 + movaps 0x51(%rsi), %xmm7 + movaps 0x61(%rsi), %xmm8 + movaps 0x71(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $15, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $15, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $15, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $15, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $15, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $15, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $15, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_15) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + movaps -0x0f(%rsi), %xmm1 + + movaps -0x1f(%rsi), %xmm2 + palignr $15, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2f(%rsi), %xmm3 + palignr $15, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3f(%rsi), %xmm4 + palignr $15, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4f(%rsi), %xmm5 + palignr $15, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5f(%rsi), %xmm6 + palignr $15, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6f(%rsi), %xmm7 + palignr $15, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7f(%rsi), %xmm8 + palignr $15, %xmm8, %xmm7 + movaps %xmm7, 
-0x70(%rdi) + + movaps -0x8f(%rsi), %xmm9 + palignr $15, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_15_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_fwd): + movdqu (%rsi), %xmm1 + movdqu %xmm0, (%r8) + movdqa %xmm1, (%rdi) + sub $16, %rdx + add $16, %rsi + add $16, %rdi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif + cmp %rcx, %rdx + ja L(bigger_in_fwd) + mov %rdx, %rcx +L(bigger_in_fwd): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy_fwd) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy_fwd) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy_fwd): + sub $0x80, %rdx +L(gobble_mem_fwd_loop): + sub $0x80, %rdx + prefetcht0 0x200(%rsi) + prefetcht0 0x300(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lfence + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_mem_fwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_fwd_end) + add $0x80, %rdx +L(ll_cache_copy_fwd): + add %rcx, %rdx +L(ll_cache_copy_fwd_start): + sub $0x80, %rdx +L(gobble_ll_loop_fwd): + prefetchnta 0x1c0(%rsi) + prefetchnta 0x280(%rsi) + prefetchnta 0x1c0(%rdi) + prefetchnta 0x280(%rdi) + sub $0x80, %rdx + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_ll_loop_fwd) +L(gobble_mem_fwd_end): + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_bwd): + add %rdx, %rsi + add %rdx, %rdi + + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $-16, %rdi + sub %rdi, %r9 + sub %r9, %rsi + sub %r9, %rdx + + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif + cmp %rcx, %rdx + ja L(bigger) + mov %rdx, %rcx +L(bigger): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy): + sub $0x80, %rdx +L(gobble_mem_bwd_loop): + sub $0x80, %rdx + prefetcht0 -0x200(%rsi) + prefetcht0 -0x300(%rsi) + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), 
%xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + lfence + movntdq %xmm1, -0x10(%rdi) + movntdq %xmm2, -0x20(%rdi) + movntdq %xmm3, -0x30(%rdi) + movntdq %xmm4, -0x40(%rdi) + movntdq %xmm5, -0x50(%rdi) + movntdq %xmm6, -0x60(%rdi) + movntdq %xmm7, -0x70(%rdi) + movntdq %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_mem_bwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_bwd_end) + add $0x80, %rdx +L(ll_cache_copy): + add %rcx, %rdx +L(ll_cache_copy_bwd_start): + sub $0x80, %rdx +L(gobble_ll_loop): + prefetchnta -0x1c0(%rsi) + prefetchnta -0x280(%rsi) + prefetchnta -0x1c0(%rdi) + prefetchnta -0x280(%rdi) + sub $0x80, %rdx + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + movdqa %xmm1, -0x10(%rdi) + movdqa %xmm2, -0x20(%rdi) + movdqa %xmm3, -0x30(%rdi) + movdqa %xmm4, -0x40(%rdi) + movdqa %xmm5, -0x50(%rdi) + movdqa %xmm6, -0x60(%rdi) + movdqa %xmm7, -0x70(%rdi) + movdqa %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_ll_loop) +L(gobble_mem_bwd_end): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rsi + sub %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(fwd_write_128bytes): + lddqu -128(%rsi), %xmm0 + movdqu %xmm0, -128(%rdi) +L(fwd_write_112bytes): + lddqu -112(%rsi), %xmm0 + movdqu %xmm0, -112(%rdi) +L(fwd_write_96bytes): + lddqu -96(%rsi), %xmm0 + movdqu %xmm0, -96(%rdi) +L(fwd_write_80bytes): + lddqu -80(%rsi), %xmm0 + movdqu %xmm0, -80(%rdi) +L(fwd_write_64bytes): + lddqu -64(%rsi), %xmm0 + movdqu %xmm0, -64(%rdi) +L(fwd_write_48bytes): + lddqu -48(%rsi), %xmm0 + movdqu %xmm0, -48(%rdi) +L(fwd_write_32bytes): + lddqu -32(%rsi), %xmm0 + movdqu %xmm0, -32(%rdi) +L(fwd_write_16bytes): + lddqu -16(%rsi), %xmm0 + movdqu %xmm0, -16(%rdi) +L(fwd_write_0bytes): + ret + + + .p2align 4 +L(fwd_write_143bytes): + lddqu -143(%rsi), %xmm0 + movdqu %xmm0, -143(%rdi) +L(fwd_write_127bytes): + lddqu -127(%rsi), %xmm0 + movdqu %xmm0, -127(%rdi) +L(fwd_write_111bytes): + lddqu -111(%rsi), %xmm0 + movdqu %xmm0, -111(%rdi) +L(fwd_write_95bytes): + lddqu -95(%rsi), %xmm0 + movdqu %xmm0, -95(%rdi) +L(fwd_write_79bytes): + lddqu -79(%rsi), %xmm0 + movdqu %xmm0, -79(%rdi) +L(fwd_write_63bytes): + lddqu -63(%rsi), %xmm0 + movdqu %xmm0, -63(%rdi) +L(fwd_write_47bytes): + lddqu -47(%rsi), %xmm0 + movdqu %xmm0, -47(%rdi) +L(fwd_write_31bytes): + lddqu -31(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -31(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_15bytes): + mov -15(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -15(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_142bytes): + lddqu -142(%rsi), %xmm0 + movdqu %xmm0, -142(%rdi) +L(fwd_write_126bytes): + lddqu -126(%rsi), %xmm0 + movdqu %xmm0, -126(%rdi) +L(fwd_write_110bytes): + lddqu -110(%rsi), %xmm0 + movdqu %xmm0, -110(%rdi) +L(fwd_write_94bytes): + lddqu -94(%rsi), %xmm0 + movdqu %xmm0, -94(%rdi) +L(fwd_write_78bytes): + lddqu -78(%rsi), %xmm0 + movdqu %xmm0, -78(%rdi) +L(fwd_write_62bytes): + lddqu -62(%rsi), %xmm0 + movdqu %xmm0, -62(%rdi) +L(fwd_write_46bytes): + lddqu -46(%rsi), %xmm0 + movdqu %xmm0, -46(%rdi) +L(fwd_write_30bytes): + lddqu -30(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, 
-30(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_14bytes): + mov -14(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -14(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_141bytes): + lddqu -141(%rsi), %xmm0 + movdqu %xmm0, -141(%rdi) +L(fwd_write_125bytes): + lddqu -125(%rsi), %xmm0 + movdqu %xmm0, -125(%rdi) +L(fwd_write_109bytes): + lddqu -109(%rsi), %xmm0 + movdqu %xmm0, -109(%rdi) +L(fwd_write_93bytes): + lddqu -93(%rsi), %xmm0 + movdqu %xmm0, -93(%rdi) +L(fwd_write_77bytes): + lddqu -77(%rsi), %xmm0 + movdqu %xmm0, -77(%rdi) +L(fwd_write_61bytes): + lddqu -61(%rsi), %xmm0 + movdqu %xmm0, -61(%rdi) +L(fwd_write_45bytes): + lddqu -45(%rsi), %xmm0 + movdqu %xmm0, -45(%rdi) +L(fwd_write_29bytes): + lddqu -29(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -29(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_13bytes): + mov -13(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -13(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_140bytes): + lddqu -140(%rsi), %xmm0 + movdqu %xmm0, -140(%rdi) +L(fwd_write_124bytes): + lddqu -124(%rsi), %xmm0 + movdqu %xmm0, -124(%rdi) +L(fwd_write_108bytes): + lddqu -108(%rsi), %xmm0 + movdqu %xmm0, -108(%rdi) +L(fwd_write_92bytes): + lddqu -92(%rsi), %xmm0 + movdqu %xmm0, -92(%rdi) +L(fwd_write_76bytes): + lddqu -76(%rsi), %xmm0 + movdqu %xmm0, -76(%rdi) +L(fwd_write_60bytes): + lddqu -60(%rsi), %xmm0 + movdqu %xmm0, -60(%rdi) +L(fwd_write_44bytes): + lddqu -44(%rsi), %xmm0 + movdqu %xmm0, -44(%rdi) +L(fwd_write_28bytes): + lddqu -28(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -28(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_12bytes): + mov -12(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -12(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_139bytes): + lddqu -139(%rsi), %xmm0 + movdqu %xmm0, -139(%rdi) +L(fwd_write_123bytes): + lddqu -123(%rsi), %xmm0 + movdqu %xmm0, -123(%rdi) +L(fwd_write_107bytes): + lddqu -107(%rsi), %xmm0 + movdqu %xmm0, -107(%rdi) +L(fwd_write_91bytes): + lddqu -91(%rsi), %xmm0 + movdqu %xmm0, -91(%rdi) +L(fwd_write_75bytes): + lddqu -75(%rsi), %xmm0 + movdqu %xmm0, -75(%rdi) +L(fwd_write_59bytes): + lddqu -59(%rsi), %xmm0 + movdqu %xmm0, -59(%rdi) +L(fwd_write_43bytes): + lddqu -43(%rsi), %xmm0 + movdqu %xmm0, -43(%rdi) +L(fwd_write_27bytes): + lddqu -27(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -27(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_11bytes): + mov -11(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -11(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_138bytes): + lddqu -138(%rsi), %xmm0 + movdqu %xmm0, -138(%rdi) +L(fwd_write_122bytes): + lddqu -122(%rsi), %xmm0 + movdqu %xmm0, -122(%rdi) +L(fwd_write_106bytes): + lddqu -106(%rsi), %xmm0 + movdqu %xmm0, -106(%rdi) +L(fwd_write_90bytes): + lddqu -90(%rsi), %xmm0 + movdqu %xmm0, -90(%rdi) +L(fwd_write_74bytes): + lddqu -74(%rsi), %xmm0 + movdqu %xmm0, -74(%rdi) +L(fwd_write_58bytes): + lddqu -58(%rsi), %xmm0 + movdqu %xmm0, -58(%rdi) +L(fwd_write_42bytes): + lddqu -42(%rsi), %xmm0 + movdqu %xmm0, -42(%rdi) +L(fwd_write_26bytes): + lddqu -26(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -26(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_10bytes): + mov -10(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -10(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_137bytes): + lddqu -137(%rsi), %xmm0 + movdqu %xmm0, -137(%rdi) +L(fwd_write_121bytes): + lddqu -121(%rsi), %xmm0 + movdqu %xmm0, -121(%rdi) 
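+/* Each L(fwd_write_Nbytes) entry in these ladders falls through the
+   entries below it, copying 16 bytes per label with an unaligned
+   lddqu load and movdqu store addressed back from the end of the
+   block, so tails under 144 bytes need no loop or counter.  */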
+L(fwd_write_105bytes): + lddqu -105(%rsi), %xmm0 + movdqu %xmm0, -105(%rdi) +L(fwd_write_89bytes): + lddqu -89(%rsi), %xmm0 + movdqu %xmm0, -89(%rdi) +L(fwd_write_73bytes): + lddqu -73(%rsi), %xmm0 + movdqu %xmm0, -73(%rdi) +L(fwd_write_57bytes): + lddqu -57(%rsi), %xmm0 + movdqu %xmm0, -57(%rdi) +L(fwd_write_41bytes): + lddqu -41(%rsi), %xmm0 + movdqu %xmm0, -41(%rdi) +L(fwd_write_25bytes): + lddqu -25(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -25(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_9bytes): + mov -9(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -9(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_136bytes): + lddqu -136(%rsi), %xmm0 + movdqu %xmm0, -136(%rdi) +L(fwd_write_120bytes): + lddqu -120(%rsi), %xmm0 + movdqu %xmm0, -120(%rdi) +L(fwd_write_104bytes): + lddqu -104(%rsi), %xmm0 + movdqu %xmm0, -104(%rdi) +L(fwd_write_88bytes): + lddqu -88(%rsi), %xmm0 + movdqu %xmm0, -88(%rdi) +L(fwd_write_72bytes): + lddqu -72(%rsi), %xmm0 + movdqu %xmm0, -72(%rdi) +L(fwd_write_56bytes): + lddqu -56(%rsi), %xmm0 + movdqu %xmm0, -56(%rdi) +L(fwd_write_40bytes): + lddqu -40(%rsi), %xmm0 + movdqu %xmm0, -40(%rdi) +L(fwd_write_24bytes): + lddqu -24(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -24(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_135bytes): + lddqu -135(%rsi), %xmm0 + movdqu %xmm0, -135(%rdi) +L(fwd_write_119bytes): + lddqu -119(%rsi), %xmm0 + movdqu %xmm0, -119(%rdi) +L(fwd_write_103bytes): + lddqu -103(%rsi), %xmm0 + movdqu %xmm0, -103(%rdi) +L(fwd_write_87bytes): + lddqu -87(%rsi), %xmm0 + movdqu %xmm0, -87(%rdi) +L(fwd_write_71bytes): + lddqu -71(%rsi), %xmm0 + movdqu %xmm0, -71(%rdi) +L(fwd_write_55bytes): + lddqu -55(%rsi), %xmm0 + movdqu %xmm0, -55(%rdi) +L(fwd_write_39bytes): + lddqu -39(%rsi), %xmm0 + movdqu %xmm0, -39(%rdi) +L(fwd_write_23bytes): + lddqu -23(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -23(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_134bytes): + lddqu -134(%rsi), %xmm0 + movdqu %xmm0, -134(%rdi) +L(fwd_write_118bytes): + lddqu -118(%rsi), %xmm0 + movdqu %xmm0, -118(%rdi) +L(fwd_write_102bytes): + lddqu -102(%rsi), %xmm0 + movdqu %xmm0, -102(%rdi) +L(fwd_write_86bytes): + lddqu -86(%rsi), %xmm0 + movdqu %xmm0, -86(%rdi) +L(fwd_write_70bytes): + lddqu -70(%rsi), %xmm0 + movdqu %xmm0, -70(%rdi) +L(fwd_write_54bytes): + lddqu -54(%rsi), %xmm0 + movdqu %xmm0, -54(%rdi) +L(fwd_write_38bytes): + lddqu -38(%rsi), %xmm0 + movdqu %xmm0, -38(%rdi) +L(fwd_write_22bytes): + lddqu -22(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -22(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_133bytes): + lddqu -133(%rsi), %xmm0 + movdqu %xmm0, -133(%rdi) +L(fwd_write_117bytes): + lddqu -117(%rsi), %xmm0 + movdqu %xmm0, -117(%rdi) +L(fwd_write_101bytes): + lddqu -101(%rsi), %xmm0 + movdqu %xmm0, -101(%rdi) +L(fwd_write_85bytes): + lddqu -85(%rsi), %xmm0 + movdqu %xmm0, -85(%rdi) +L(fwd_write_69bytes): + lddqu -69(%rsi), %xmm0 + movdqu %xmm0, -69(%rdi) +L(fwd_write_53bytes): + lddqu -53(%rsi), %xmm0 + movdqu %xmm0, -53(%rdi) +L(fwd_write_37bytes): + lddqu -37(%rsi), %xmm0 + movdqu %xmm0, -37(%rdi) +L(fwd_write_21bytes): 
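+ /* The two stores below overlap by 11 bytes; both registers are
+    loaded before either store is issued, so the overlap is harmless
+    and avoids a branch on the odd-sized tail.  */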
+ lddqu -21(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -21(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_132bytes): + lddqu -132(%rsi), %xmm0 + movdqu %xmm0, -132(%rdi) +L(fwd_write_116bytes): + lddqu -116(%rsi), %xmm0 + movdqu %xmm0, -116(%rdi) +L(fwd_write_100bytes): + lddqu -100(%rsi), %xmm0 + movdqu %xmm0, -100(%rdi) +L(fwd_write_84bytes): + lddqu -84(%rsi), %xmm0 + movdqu %xmm0, -84(%rdi) +L(fwd_write_68bytes): + lddqu -68(%rsi), %xmm0 + movdqu %xmm0, -68(%rdi) +L(fwd_write_52bytes): + lddqu -52(%rsi), %xmm0 + movdqu %xmm0, -52(%rdi) +L(fwd_write_36bytes): + lddqu -36(%rsi), %xmm0 + movdqu %xmm0, -36(%rdi) +L(fwd_write_20bytes): + lddqu -20(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -20(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_131bytes): + lddqu -131(%rsi), %xmm0 + movdqu %xmm0, -131(%rdi) +L(fwd_write_115bytes): + lddqu -115(%rsi), %xmm0 + movdqu %xmm0, -115(%rdi) +L(fwd_write_99bytes): + lddqu -99(%rsi), %xmm0 + movdqu %xmm0, -99(%rdi) +L(fwd_write_83bytes): + lddqu -83(%rsi), %xmm0 + movdqu %xmm0, -83(%rdi) +L(fwd_write_67bytes): + lddqu -67(%rsi), %xmm0 + movdqu %xmm0, -67(%rdi) +L(fwd_write_51bytes): + lddqu -51(%rsi), %xmm0 + movdqu %xmm0, -51(%rdi) +L(fwd_write_35bytes): + lddqu -35(%rsi), %xmm0 + movdqu %xmm0, -35(%rdi) +L(fwd_write_19bytes): + lddqu -19(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -19(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_130bytes): + lddqu -130(%rsi), %xmm0 + movdqu %xmm0, -130(%rdi) +L(fwd_write_114bytes): + lddqu -114(%rsi), %xmm0 + movdqu %xmm0, -114(%rdi) +L(fwd_write_98bytes): + lddqu -98(%rsi), %xmm0 + movdqu %xmm0, -98(%rdi) +L(fwd_write_82bytes): + lddqu -82(%rsi), %xmm0 + movdqu %xmm0, -82(%rdi) +L(fwd_write_66bytes): + lddqu -66(%rsi), %xmm0 + movdqu %xmm0, -66(%rdi) +L(fwd_write_50bytes): + lddqu -50(%rsi), %xmm0 + movdqu %xmm0, -50(%rdi) +L(fwd_write_34bytes): + lddqu -34(%rsi), %xmm0 + movdqu %xmm0, -34(%rdi) +L(fwd_write_18bytes): + lddqu -18(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -18(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_2bytes): + movzwl -2(%rsi), %edx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_129bytes): + lddqu -129(%rsi), %xmm0 + movdqu %xmm0, -129(%rdi) +L(fwd_write_113bytes): + lddqu -113(%rsi), %xmm0 + movdqu %xmm0, -113(%rdi) +L(fwd_write_97bytes): + lddqu -97(%rsi), %xmm0 + movdqu %xmm0, -97(%rdi) +L(fwd_write_81bytes): + lddqu -81(%rsi), %xmm0 + movdqu %xmm0, -81(%rdi) +L(fwd_write_65bytes): + lddqu -65(%rsi), %xmm0 + movdqu %xmm0, -65(%rdi) +L(fwd_write_49bytes): + lddqu -49(%rsi), %xmm0 + movdqu %xmm0, -49(%rdi) +L(fwd_write_33bytes): + lddqu -33(%rsi), %xmm0 + movdqu %xmm0, -33(%rdi) +L(fwd_write_17bytes): + lddqu -17(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -17(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_1bytes): + movzbl -1(%rsi), %edx + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(bwd_write_128bytes): + lddqu 112(%rsi), %xmm0 + movdqu %xmm0, 112(%rdi) +L(bwd_write_112bytes): + lddqu 96(%rsi), %xmm0 + movdqu %xmm0, 96(%rdi) +L(bwd_write_96bytes): + lddqu 80(%rsi), %xmm0 + movdqu %xmm0, 80(%rdi) 
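+/* The L(bwd_write_Nbytes) ladder mirrors the forward one, but the
+   backward dispatch code left %rsi and %rdi pointing at the start of
+   the remaining block, so these entries address it with ascending
+   offsets from zero instead of negative offsets from the end.  */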
+L(bwd_write_80bytes): + lddqu 64(%rsi), %xmm0 + movdqu %xmm0, 64(%rdi) +L(bwd_write_64bytes): + lddqu 48(%rsi), %xmm0 + movdqu %xmm0, 48(%rdi) +L(bwd_write_48bytes): + lddqu 32(%rsi), %xmm0 + movdqu %xmm0, 32(%rdi) +L(bwd_write_32bytes): + lddqu 16(%rsi), %xmm0 + movdqu %xmm0, 16(%rdi) +L(bwd_write_16bytes): + lddqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +L(bwd_write_0bytes): + ret + + .p2align 4 +L(bwd_write_143bytes): + lddqu 127(%rsi), %xmm0 + movdqu %xmm0, 127(%rdi) +L(bwd_write_127bytes): + lddqu 111(%rsi), %xmm0 + movdqu %xmm0, 111(%rdi) +L(bwd_write_111bytes): + lddqu 95(%rsi), %xmm0 + movdqu %xmm0, 95(%rdi) +L(bwd_write_95bytes): + lddqu 79(%rsi), %xmm0 + movdqu %xmm0, 79(%rdi) +L(bwd_write_79bytes): + lddqu 63(%rsi), %xmm0 + movdqu %xmm0, 63(%rdi) +L(bwd_write_63bytes): + lddqu 47(%rsi), %xmm0 + movdqu %xmm0, 47(%rdi) +L(bwd_write_47bytes): + lddqu 31(%rsi), %xmm0 + movdqu %xmm0, 31(%rdi) +L(bwd_write_31bytes): + lddqu 15(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 15(%rdi) + movdqu %xmm1, (%rdi) + ret + + + .p2align 4 +L(bwd_write_15bytes): + mov 7(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 7(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_142bytes): + lddqu 126(%rsi), %xmm0 + movdqu %xmm0, 126(%rdi) +L(bwd_write_126bytes): + lddqu 110(%rsi), %xmm0 + movdqu %xmm0, 110(%rdi) +L(bwd_write_110bytes): + lddqu 94(%rsi), %xmm0 + movdqu %xmm0, 94(%rdi) +L(bwd_write_94bytes): + lddqu 78(%rsi), %xmm0 + movdqu %xmm0, 78(%rdi) +L(bwd_write_78bytes): + lddqu 62(%rsi), %xmm0 + movdqu %xmm0, 62(%rdi) +L(bwd_write_62bytes): + lddqu 46(%rsi), %xmm0 + movdqu %xmm0, 46(%rdi) +L(bwd_write_46bytes): + lddqu 30(%rsi), %xmm0 + movdqu %xmm0, 30(%rdi) +L(bwd_write_30bytes): + lddqu 14(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 14(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_14bytes): + mov 6(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 6(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_141bytes): + lddqu 125(%rsi), %xmm0 + movdqu %xmm0, 125(%rdi) +L(bwd_write_125bytes): + lddqu 109(%rsi), %xmm0 + movdqu %xmm0, 109(%rdi) +L(bwd_write_109bytes): + lddqu 93(%rsi), %xmm0 + movdqu %xmm0, 93(%rdi) +L(bwd_write_93bytes): + lddqu 77(%rsi), %xmm0 + movdqu %xmm0, 77(%rdi) +L(bwd_write_77bytes): + lddqu 61(%rsi), %xmm0 + movdqu %xmm0, 61(%rdi) +L(bwd_write_61bytes): + lddqu 45(%rsi), %xmm0 + movdqu %xmm0, 45(%rdi) +L(bwd_write_45bytes): + lddqu 29(%rsi), %xmm0 + movdqu %xmm0, 29(%rdi) +L(bwd_write_29bytes): + lddqu 13(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 13(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_13bytes): + mov 5(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 5(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_140bytes): + lddqu 124(%rsi), %xmm0 + movdqu %xmm0, 124(%rdi) +L(bwd_write_124bytes): + lddqu 108(%rsi), %xmm0 + movdqu %xmm0, 108(%rdi) +L(bwd_write_108bytes): + lddqu 92(%rsi), %xmm0 + movdqu %xmm0, 92(%rdi) +L(bwd_write_92bytes): + lddqu 76(%rsi), %xmm0 + movdqu %xmm0, 76(%rdi) +L(bwd_write_76bytes): + lddqu 60(%rsi), %xmm0 + movdqu %xmm0, 60(%rdi) +L(bwd_write_60bytes): + lddqu 44(%rsi), %xmm0 + movdqu %xmm0, 44(%rdi) +L(bwd_write_44bytes): + lddqu 28(%rsi), %xmm0 + movdqu %xmm0, 28(%rdi) +L(bwd_write_28bytes): + lddqu 12(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 12(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_12bytes): + mov 4(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 4(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_139bytes): + lddqu 123(%rsi), %xmm0 + movdqu %xmm0, 
123(%rdi) +L(bwd_write_123bytes): + lddqu 107(%rsi), %xmm0 + movdqu %xmm0, 107(%rdi) +L(bwd_write_107bytes): + lddqu 91(%rsi), %xmm0 + movdqu %xmm0, 91(%rdi) +L(bwd_write_91bytes): + lddqu 75(%rsi), %xmm0 + movdqu %xmm0, 75(%rdi) +L(bwd_write_75bytes): + lddqu 59(%rsi), %xmm0 + movdqu %xmm0, 59(%rdi) +L(bwd_write_59bytes): + lddqu 43(%rsi), %xmm0 + movdqu %xmm0, 43(%rdi) +L(bwd_write_43bytes): + lddqu 27(%rsi), %xmm0 + movdqu %xmm0, 27(%rdi) +L(bwd_write_27bytes): + lddqu 11(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 11(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_11bytes): + mov 3(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 3(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_138bytes): + lddqu 122(%rsi), %xmm0 + movdqu %xmm0, 122(%rdi) +L(bwd_write_122bytes): + lddqu 106(%rsi), %xmm0 + movdqu %xmm0, 106(%rdi) +L(bwd_write_106bytes): + lddqu 90(%rsi), %xmm0 + movdqu %xmm0, 90(%rdi) +L(bwd_write_90bytes): + lddqu 74(%rsi), %xmm0 + movdqu %xmm0, 74(%rdi) +L(bwd_write_74bytes): + lddqu 58(%rsi), %xmm0 + movdqu %xmm0, 58(%rdi) +L(bwd_write_58bytes): + lddqu 42(%rsi), %xmm0 + movdqu %xmm0, 42(%rdi) +L(bwd_write_42bytes): + lddqu 26(%rsi), %xmm0 + movdqu %xmm0, 26(%rdi) +L(bwd_write_26bytes): + lddqu 10(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 10(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_10bytes): + mov 2(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 2(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_137bytes): + lddqu 121(%rsi), %xmm0 + movdqu %xmm0, 121(%rdi) +L(bwd_write_121bytes): + lddqu 105(%rsi), %xmm0 + movdqu %xmm0, 105(%rdi) +L(bwd_write_105bytes): + lddqu 89(%rsi), %xmm0 + movdqu %xmm0, 89(%rdi) +L(bwd_write_89bytes): + lddqu 73(%rsi), %xmm0 + movdqu %xmm0, 73(%rdi) +L(bwd_write_73bytes): + lddqu 57(%rsi), %xmm0 + movdqu %xmm0, 57(%rdi) +L(bwd_write_57bytes): + lddqu 41(%rsi), %xmm0 + movdqu %xmm0, 41(%rdi) +L(bwd_write_41bytes): + lddqu 25(%rsi), %xmm0 + movdqu %xmm0, 25(%rdi) +L(bwd_write_25bytes): + lddqu 9(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 9(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_9bytes): + mov 1(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 1(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_136bytes): + lddqu 120(%rsi), %xmm0 + movdqu %xmm0, 120(%rdi) +L(bwd_write_120bytes): + lddqu 104(%rsi), %xmm0 + movdqu %xmm0, 104(%rdi) +L(bwd_write_104bytes): + lddqu 88(%rsi), %xmm0 + movdqu %xmm0, 88(%rdi) +L(bwd_write_88bytes): + lddqu 72(%rsi), %xmm0 + movdqu %xmm0, 72(%rdi) +L(bwd_write_72bytes): + lddqu 56(%rsi), %xmm0 + movdqu %xmm0, 56(%rdi) +L(bwd_write_56bytes): + lddqu 40(%rsi), %xmm0 + movdqu %xmm0, 40(%rdi) +L(bwd_write_40bytes): + lddqu 24(%rsi), %xmm0 + movdqu %xmm0, 24(%rdi) +L(bwd_write_24bytes): + lddqu 8(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 8(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_8bytes): + mov (%rsi), %rdx + mov %rdx, (%rdi) + ret + + .p2align 4 +L(bwd_write_135bytes): + lddqu 119(%rsi), %xmm0 + movdqu %xmm0, 119(%rdi) +L(bwd_write_119bytes): + lddqu 103(%rsi), %xmm0 + movdqu %xmm0, 103(%rdi) +L(bwd_write_103bytes): + lddqu 87(%rsi), %xmm0 + movdqu %xmm0, 87(%rdi) +L(bwd_write_87bytes): + lddqu 71(%rsi), %xmm0 + movdqu %xmm0, 71(%rdi) +L(bwd_write_71bytes): + lddqu 55(%rsi), %xmm0 + movdqu %xmm0, 55(%rdi) +L(bwd_write_55bytes): + lddqu 39(%rsi), %xmm0 + movdqu %xmm0, 39(%rdi) +L(bwd_write_39bytes): + lddqu 23(%rsi), %xmm0 + movdqu %xmm0, 23(%rdi) +L(bwd_write_23bytes): + lddqu 7(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + 
movdqu %xmm0, 7(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_7bytes): + mov 3(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 3(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_134bytes): + lddqu 118(%rsi), %xmm0 + movdqu %xmm0, 118(%rdi) +L(bwd_write_118bytes): + lddqu 102(%rsi), %xmm0 + movdqu %xmm0, 102(%rdi) +L(bwd_write_102bytes): + lddqu 86(%rsi), %xmm0 + movdqu %xmm0, 86(%rdi) +L(bwd_write_86bytes): + lddqu 70(%rsi), %xmm0 + movdqu %xmm0, 70(%rdi) +L(bwd_write_70bytes): + lddqu 54(%rsi), %xmm0 + movdqu %xmm0, 54(%rdi) +L(bwd_write_54bytes): + lddqu 38(%rsi), %xmm0 + movdqu %xmm0, 38(%rdi) +L(bwd_write_38bytes): + lddqu 22(%rsi), %xmm0 + movdqu %xmm0, 22(%rdi) +L(bwd_write_22bytes): + lddqu 6(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 6(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_6bytes): + mov 2(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 2(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_133bytes): + lddqu 117(%rsi), %xmm0 + movdqu %xmm0, 117(%rdi) +L(bwd_write_117bytes): + lddqu 101(%rsi), %xmm0 + movdqu %xmm0, 101(%rdi) +L(bwd_write_101bytes): + lddqu 85(%rsi), %xmm0 + movdqu %xmm0, 85(%rdi) +L(bwd_write_85bytes): + lddqu 69(%rsi), %xmm0 + movdqu %xmm0, 69(%rdi) +L(bwd_write_69bytes): + lddqu 53(%rsi), %xmm0 + movdqu %xmm0, 53(%rdi) +L(bwd_write_53bytes): + lddqu 37(%rsi), %xmm0 + movdqu %xmm0, 37(%rdi) +L(bwd_write_37bytes): + lddqu 21(%rsi), %xmm0 + movdqu %xmm0, 21(%rdi) +L(bwd_write_21bytes): + lddqu 5(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 5(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_5bytes): + mov 1(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 1(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_132bytes): + lddqu 116(%rsi), %xmm0 + movdqu %xmm0, 116(%rdi) +L(bwd_write_116bytes): + lddqu 100(%rsi), %xmm0 + movdqu %xmm0, 100(%rdi) +L(bwd_write_100bytes): + lddqu 84(%rsi), %xmm0 + movdqu %xmm0, 84(%rdi) +L(bwd_write_84bytes): + lddqu 68(%rsi), %xmm0 + movdqu %xmm0, 68(%rdi) +L(bwd_write_68bytes): + lddqu 52(%rsi), %xmm0 + movdqu %xmm0, 52(%rdi) +L(bwd_write_52bytes): + lddqu 36(%rsi), %xmm0 + movdqu %xmm0, 36(%rdi) +L(bwd_write_36bytes): + lddqu 20(%rsi), %xmm0 + movdqu %xmm0, 20(%rdi) +L(bwd_write_20bytes): + lddqu 4(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 4(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_4bytes): + mov (%rsi), %edx + mov %edx, (%rdi) + ret + + .p2align 4 +L(bwd_write_131bytes): + lddqu 115(%rsi), %xmm0 + movdqu %xmm0, 115(%rdi) +L(bwd_write_115bytes): + lddqu 99(%rsi), %xmm0 + movdqu %xmm0, 99(%rdi) +L(bwd_write_99bytes): + lddqu 83(%rsi), %xmm0 + movdqu %xmm0, 83(%rdi) +L(bwd_write_83bytes): + lddqu 67(%rsi), %xmm0 + movdqu %xmm0, 67(%rdi) +L(bwd_write_67bytes): + lddqu 51(%rsi), %xmm0 + movdqu %xmm0, 51(%rdi) +L(bwd_write_51bytes): + lddqu 35(%rsi), %xmm0 + movdqu %xmm0, 35(%rdi) +L(bwd_write_35bytes): + lddqu 19(%rsi), %xmm0 + movdqu %xmm0, 19(%rdi) +L(bwd_write_19bytes): + lddqu 3(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 3(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_3bytes): + mov 1(%rsi), %dx + mov (%rsi), %cx + mov %dx, 1(%rdi) + mov %cx, (%rdi) + ret + + .p2align 4 +L(bwd_write_130bytes): + lddqu 114(%rsi), %xmm0 + movdqu %xmm0, 114(%rdi) +L(bwd_write_114bytes): + lddqu 98(%rsi), %xmm0 + movdqu %xmm0, 98(%rdi) +L(bwd_write_98bytes): + lddqu 82(%rsi), %xmm0 + movdqu %xmm0, 82(%rdi) +L(bwd_write_82bytes): + lddqu 66(%rsi), %xmm0 + movdqu %xmm0, 66(%rdi) +L(bwd_write_66bytes): + lddqu 50(%rsi), %xmm0 + 
movdqu %xmm0, 50(%rdi) +L(bwd_write_50bytes): + lddqu 34(%rsi), %xmm0 + movdqu %xmm0, 34(%rdi) +L(bwd_write_34bytes): + lddqu 18(%rsi), %xmm0 + movdqu %xmm0, 18(%rdi) +L(bwd_write_18bytes): + lddqu 2(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 2(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_2bytes): + movzwl (%rsi), %edx + mov %dx, (%rdi) + ret + + .p2align 4 +L(bwd_write_129bytes): + lddqu 113(%rsi), %xmm0 + movdqu %xmm0, 113(%rdi) +L(bwd_write_113bytes): + lddqu 97(%rsi), %xmm0 + movdqu %xmm0, 97(%rdi) +L(bwd_write_97bytes): + lddqu 81(%rsi), %xmm0 + movdqu %xmm0, 81(%rdi) +L(bwd_write_81bytes): + lddqu 65(%rsi), %xmm0 + movdqu %xmm0, 65(%rdi) +L(bwd_write_65bytes): + lddqu 49(%rsi), %xmm0 + movdqu %xmm0, 49(%rdi) +L(bwd_write_49bytes): + lddqu 33(%rsi), %xmm0 + movdqu %xmm0, 33(%rdi) +L(bwd_write_33bytes): + lddqu 17(%rsi), %xmm0 + movdqu %xmm0, 17(%rdi) +L(bwd_write_17bytes): + lddqu 1(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 1(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_1bytes): + movzbl (%rsi), %edx + mov %dl, (%rdi) + ret + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_144_bytes_bwd): + .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_38bytes), 
L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_96bytes), 
L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) + + .p2align 3 +L(table_144_bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), 
L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_67bytes), 
L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) + .int JMPTBL 
(L(fwd_write_125bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) + + .p2align 3 +L(shl_table_fwd): + .int JMPTBL (L(shl_0), L(shl_table_fwd)) + .int JMPTBL (L(shl_1), L(shl_table_fwd)) + .int JMPTBL (L(shl_2), L(shl_table_fwd)) + .int JMPTBL (L(shl_3), L(shl_table_fwd)) + .int JMPTBL (L(shl_4), L(shl_table_fwd)) + .int JMPTBL (L(shl_5), L(shl_table_fwd)) + .int JMPTBL (L(shl_6), L(shl_table_fwd)) + .int JMPTBL (L(shl_7), L(shl_table_fwd)) + .int JMPTBL (L(shl_8), L(shl_table_fwd)) + .int JMPTBL (L(shl_9), L(shl_table_fwd)) + .int JMPTBL (L(shl_10), L(shl_table_fwd)) + .int JMPTBL (L(shl_11), L(shl_table_fwd)) + .int JMPTBL (L(shl_12), L(shl_table_fwd)) + .int JMPTBL (L(shl_13), L(shl_table_fwd)) + .int JMPTBL (L(shl_14), L(shl_table_fwd)) + .int JMPTBL (L(shl_15), L(shl_table_fwd)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/utils/memcpy-bench/glibc/memcpy-ssse3.S b/utils/memcpy-bench/glibc/memcpy-ssse3.S new file mode 100644 index 00000000000..2fd26651645 --- /dev/null +++ b/utils/memcpy-bench/glibc/memcpy-ssse3.S @@ -0,0 +1,3152 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2020 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "sysdep.h" + +#if 1 + +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# define MEMPCPY __mempcpy_ssse3 +# define MEMPCPY_CHK __mempcpy_chk_ssse3 +#endif + +#define JMPTBL(I, B) I - B + +/* Branch to an entry in a jump table. TABLE is a jump table with + relative offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + _CET_NOTRACK jmp *INDEX; \ + ud2 + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) +END (MEMPCPY) +#endif + +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %RDI_LP, %RAX_LP +#ifdef USE_AS_MEMPCPY + add %RDX_LP, %RAX_LP +#endif + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +#endif + +#ifdef USE_AS_MEMMOVE + cmp %rsi, %rdi + jb L(copy_forward) + je L(write_0bytes) + cmp $79, %rdx + jbe L(copy_forward) + jmp L(copy_backward) +L(copy_forward): +#endif +L(start): + cmp $79, %rdx + lea L(table_less_80bytes)(%rip), %r11 + ja L(80bytesormore) + movslq (%r11, %rdx, 4), %r9 + add %rdx, %rsi + add %rdx, %rdi + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 + + .p2align 4 +L(80bytesormore): +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jle L(copy_backward) +#endif + + movdqu (%rsi), %xmm0 + mov %rdi, %rcx + and $-16, %rdi + add $16, %rdi + mov %rcx, %r8 + sub %rdi, %rcx + add %rcx, %rdx + sub %rcx, %rsi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif + cmp %rcx, %rdx + mov %rsi, %r9 + ja L(large_page_fwd) + and $0xf, %r9 + jz L(shl_0) +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_data_cache_size_half(%rip), %RCX_LP +#endif + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) + + .p2align 4 +L(copy_backward): + movdqu -16(%rsi, %rdx), %xmm0 + add %rdx, %rsi + lea -16(%rdi, %rdx), %r8 + add %rdx, %rdi + + mov %rdi, %rcx + and $0xf, %rcx + xor %rcx, %rdi + sub %rcx, %rdx + sub %rcx, %rsi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif + + cmp %rcx, %rdx + mov %rsi, %r9 + ja L(large_page_bwd) + and $0xf, %r9 + jz L(shl_0_bwd) +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_data_cache_size_half(%rip), %RCX_LP +#endif + BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) + + .p2align 4 +L(shl_0): + sub $16, %rdx + movdqa (%rsi), %xmm1 + add $16, %rsi + movdqa %xmm1, (%rdi) + add $16, %rdi + cmp $128, %rdx + movdqu %xmm0, (%r8) + ja L(shl_0_gobble) + cmp $64, %rdx + jb L(shl_0_less_64bytes) + movaps (%rsi), %xmm4 + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + movaps %xmm4, (%rdi) + movaps %xmm1, 16(%rdi) + movaps %xmm2, 32(%rdi) + movaps %xmm3, 48(%rdi) + sub $64, %rdx + add $64, %rsi + add $64, %rdi 
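+/* Fewer than 64 bytes remain.  Advancing both pointers past the tail
+   lets the entry picked from L(table_less_80bytes) copy it with
+   negative offsets from the end, the same trick the 144-byte tables
+   use.  */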
+L(shl_0_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_loop) +L(shl_0_gobble_cache_loop): + movdqa (%rsi), %xmm4 + movaps 0x10(%rsi), %xmm1 + movaps 0x20(%rsi), %xmm2 + movaps 0x30(%rsi), %xmm3 + + movdqa %xmm4, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + + sub $128, %rdx + movaps 0x40(%rsi), %xmm4 + movaps 0x50(%rsi), %xmm5 + movaps 0x60(%rsi), %xmm6 + movaps 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_cache_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_cache_less_64bytes) + + movdqa (%rsi), %xmm4 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm4, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm4 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm4, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_cache_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x280(%rsi) + + movdqa (%rsi), %xmm0 + movdqa 0x10(%rsi), %xmm1 + movdqa 0x20(%rsi), %xmm2 + movdqa 0x30(%rsi), %xmm3 + movdqa 0x40(%rsi), %xmm4 + movdqa 0x50(%rsi), %xmm5 + movdqa 0x60(%rsi), %xmm6 + movdqa 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_less_64bytes) + + movdqa (%rsi), %xmm0 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm0 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm0, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_mem_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_less_32bytes) + movdqa (%rsi), %xmm0 + sub $0x20, %rdx + movdqa 0x10(%rsi), %xmm1 + add $0x20, %rsi + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + add $0x20, %rdi +L(shl_0_mem_less_32bytes): + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $16, %rdx + movdqa -0x10(%rsi), %xmm1 + sub $16, %rsi + movdqa %xmm1, -0x10(%rdi) + sub $16, %rdi + cmp $0x80, %rdx + movdqu %xmm0, (%r8) + ja L(shl_0_gobble_bwd) + cmp $64, %rdx + jb L(shl_0_less_64bytes_bwd) + movaps -0x10(%rsi), %xmm0 + movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + sub $64, %rdx + sub $0x40, %rsi + sub $0x40, %rdi +L(shl_0_less_64bytes_bwd): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_bwd): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_bwd_loop) +L(shl_0_gobble_bwd_loop): + movdqa -0x10(%rsi), %xmm0 + 
movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + + movdqa %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + + sub $0x80, %rdx + movaps -0x50(%rsi), %xmm4 + movaps -0x60(%rsi), %xmm5 + movaps -0x70(%rsi), %xmm6 + movaps -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_gobble_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_gobble_bwd_less_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_bwd_loop): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x280(%rsi) + movdqa -0x10(%rsi), %xmm0 + movdqa -0x20(%rsi), %xmm1 + movdqa -0x30(%rsi), %xmm2 + movdqa -0x40(%rsi), %xmm3 + movdqa -0x50(%rsi), %xmm4 + movdqa -0x60(%rsi), %xmm5 + movdqa -0x70(%rsi), %xmm6 + movdqa -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + movdqa %xmm2, -0x30(%rdi) + movdqa %xmm3, -0x40(%rdi) + movdqa %xmm4, -0x50(%rdi) + movdqa %xmm5, -0x60(%rdi) + movdqa %xmm6, -0x70(%rdi) + movdqa %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_mem_bwd_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_bwd_less_32bytes) + movdqa -0x10(%rsi), %xmm0 + sub $0x20, %rdx + movdqa -0x20(%rsi), %xmm1 + sub $0x20, %rsi + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + sub $0x20, %rdi +L(shl_0_mem_bwd_less_32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1): + lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_fwd) + lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 +L(L1_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_1_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_1_loop_L1): + sub $64, %rdx + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $1, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $1, %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $1, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_1_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_1_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_bwd) + lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 +L(L1_bwd): + lea 
-64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_1_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_1_bwd_loop_L1): + movaps -0x11(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x21(%rsi), %xmm3 + movaps -0x31(%rsi), %xmm4 + movaps -0x41(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $1, %xmm2, %xmm1 + palignr $1, %xmm3, %xmm2 + palignr $1, %xmm4, %xmm3 + palignr $1, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_1_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_1_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2): + lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_fwd) + lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 +L(L2_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_2_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_2_loop_L1): + sub $64, %rdx + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $2, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $2, %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $2, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_2_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_2_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_bwd) + lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 +L(L2_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_2_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_2_bwd_loop_L1): + movaps -0x12(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x22(%rsi), %xmm3 + movaps -0x32(%rsi), %xmm4 + movaps -0x42(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $2, %xmm2, %xmm1 + palignr $2, %xmm3, %xmm2 + palignr $2, %xmm4, %xmm3 + palignr $2, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_2_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_2_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3): + lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x03(%rsi), %xmm1 + jb L(L3_fwd) + lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 +L(L3_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_3_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_3_loop_L1): + sub $64, %rdx + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $3, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $3, %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $3, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_3_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_3_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, 
-0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x03(%rsi), %xmm1 + jb L(L3_bwd) + lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 +L(L3_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_3_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_3_bwd_loop_L1): + movaps -0x13(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x23(%rsi), %xmm3 + movaps -0x33(%rsi), %xmm4 + movaps -0x43(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $3, %xmm2, %xmm1 + palignr $3, %xmm3, %xmm2 + palignr $3, %xmm4, %xmm3 + palignr $3, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_3_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_3_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4): + lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_fwd) + lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 +L(L4_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_4_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_4_loop_L1): + sub $64, %rdx + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $4, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $4, %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $4, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_4_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_4_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4_bwd): + lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_bwd) + lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 +L(L4_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_4_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_4_bwd_loop_L1): + movaps -0x14(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x24(%rsi), %xmm3 + movaps -0x34(%rsi), %xmm4 + movaps -0x44(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $4, %xmm2, %xmm1 + palignr $4, %xmm3, %xmm2 + palignr $4, %xmm4, %xmm3 + palignr $4, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_4_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_4_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5): + lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_fwd) + lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 +L(L5_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_5_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_5_loop_L1): + sub $64, %rdx + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $5, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $5, %xmm3, %xmm4 
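+ /* Each palignr $5 concatenates two neighboring 16-byte source
+    blocks and extracts the 16 bytes starting at byte 5, so four
+    aligned movaps loads of a source that is 5 bytes out of phase
+    with the destination become four aligned stores.  */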
+ palignr $5, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $5, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_5_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_5_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_bwd) + lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 +L(L5_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_5_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_5_bwd_loop_L1): + movaps -0x15(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x25(%rsi), %xmm3 + movaps -0x35(%rsi), %xmm4 + movaps -0x45(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $5, %xmm2, %xmm1 + palignr $5, %xmm3, %xmm2 + palignr $5, %xmm4, %xmm3 + palignr $5, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_5_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_5_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6): + lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_fwd) + lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 +L(L6_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_6_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_6_loop_L1): + sub $64, %rdx + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $6, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $6, %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $6, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_6_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_6_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_bwd) + lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 +L(L6_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_6_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_6_bwd_loop_L1): + movaps -0x16(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x26(%rsi), %xmm3 + movaps -0x36(%rsi), %xmm4 + movaps -0x46(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $6, %xmm2, %xmm1 + palignr $6, %xmm3, %xmm2 + palignr $6, %xmm4, %xmm3 + palignr $6, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_6_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_6_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7): + lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_fwd) + lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 +L(L7_fwd): 
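+ /* %r9 was pointed at L(shl_7_loop_L1) by the shl dispatch; for copies at
+    least as large as the bound in %rcx (a cache-size derived threshold set
+    earlier) it was advanced to L(shl_7_loop_L2), the same loop entered
+    through an extra prefetchnta.  The computed jump below enters the
+    chosen variant; the trailing ud2 guarantees a fault on any
+    fall-through past it. */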
+ lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_7_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_7_loop_L1): + sub $64, %rdx + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $7, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $7, %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $7, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_7_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_7_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_bwd) + lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 +L(L7_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_7_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_7_bwd_loop_L1): + movaps -0x17(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x27(%rsi), %xmm3 + movaps -0x37(%rsi), %xmm4 + movaps -0x47(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $7, %xmm2, %xmm1 + palignr $7, %xmm3, %xmm2 + palignr $7, %xmm4, %xmm3 + palignr $7, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_7_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_7_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8): + lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_fwd) + lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 +L(L8_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 +L(shl_8_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_8_loop_L1): + sub $64, %rdx + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $8, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $8, %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $8, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_8_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 + .p2align 4 +L(shl_8_end): + lea 64(%rdx), %rdx + movaps %xmm4, -0x20(%rdi) + add %rdx, %rsi + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_bwd) + lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 +L(L8_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_8_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_8_bwd_loop_L1): + movaps -0x18(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x28(%rsi), %xmm3 + movaps -0x38(%rsi), %xmm4 + movaps -0x48(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $8, %xmm2, %xmm1 + palignr $8, %xmm3, %xmm2 + palignr $8, %xmm4, %xmm3 + palignr $8, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_8_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp 
*%r9 + ud2 +L(shl_8_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9): + lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_fwd) + lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 +L(L9_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_9_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_9_loop_L1): + sub $64, %rdx + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $9, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $9, %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $9, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_9_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_9_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_bwd) + lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 +L(L9_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_9_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_9_bwd_loop_L1): + movaps -0x19(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x29(%rsi), %xmm3 + movaps -0x39(%rsi), %xmm4 + movaps -0x49(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $9, %xmm2, %xmm1 + palignr $9, %xmm3, %xmm2 + palignr $9, %xmm4, %xmm3 + palignr $9, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_9_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_9_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10): + lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_fwd) + lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 +L(L10_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_10_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_10_loop_L1): + sub $64, %rdx + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $10, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $10, %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $10, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_10_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_10_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_bwd) + lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 +L(L10_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_10_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_10_bwd_loop_L1): + movaps -0x1a(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2a(%rsi), %xmm3 + movaps -0x3a(%rsi), 
%xmm4 + movaps -0x4a(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $10, %xmm2, %xmm1 + palignr $10, %xmm3, %xmm2 + palignr $10, %xmm4, %xmm3 + palignr $10, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_10_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_10_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11): + lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0b(%rsi), %xmm1 + jb L(L11_fwd) + lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 +L(L11_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_11_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_11_loop_L1): + sub $64, %rdx + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $11, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $11, %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $11, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_11_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_11_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0b(%rsi), %xmm1 + jb L(L11_bwd) + lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 +L(L11_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_11_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_11_bwd_loop_L1): + movaps -0x1b(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2b(%rsi), %xmm3 + movaps -0x3b(%rsi), %xmm4 + movaps -0x4b(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $11, %xmm2, %xmm1 + palignr $11, %xmm3, %xmm2 + palignr $11, %xmm4, %xmm3 + palignr $11, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_11_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_11_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12): + lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_fwd) + lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 +L(L12_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_12_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_12_loop_L1): + sub $64, %rdx + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $12, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $12, %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $12, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_12_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_12_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + lea 
(L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_bwd) + lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 +L(L12_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_12_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_12_bwd_loop_L1): + movaps -0x1c(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2c(%rsi), %xmm3 + movaps -0x3c(%rsi), %xmm4 + movaps -0x4c(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $12, %xmm2, %xmm1 + palignr $12, %xmm3, %xmm2 + palignr $12, %xmm4, %xmm3 + palignr $12, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_12_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_12_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13): + lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_fwd) + lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 +L(L13_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_13_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_13_loop_L1): + sub $64, %rdx + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $13, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $13, %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $13, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_13_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_13_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_bwd) + lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 +L(L13_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_13_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_13_bwd_loop_L1): + movaps -0x1d(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2d(%rsi), %xmm3 + movaps -0x3d(%rsi), %xmm4 + movaps -0x4d(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $13, %xmm2, %xmm1 + palignr $13, %xmm3, %xmm2 + palignr $13, %xmm4, %xmm3 + palignr $13, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_13_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_13_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14): + lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_fwd) + lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 +L(L14_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_14_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_14_loop_L1): + sub $64, %rdx + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $14, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $14, %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $14, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + 
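+ /* %xmm6 holds the last source block unshifted; seeding %xmm1 with it
+    carries the overlap into the next iteration's palignr chain. */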
movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_14_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_14_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_bwd) + lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 +L(L14_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_14_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_14_bwd_loop_L1): + movaps -0x1e(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2e(%rsi), %xmm3 + movaps -0x3e(%rsi), %xmm4 + movaps -0x4e(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $14, %xmm2, %xmm1 + palignr $14, %xmm3, %xmm2 + palignr $14, %xmm4, %xmm3 + palignr $14, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_14_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_14_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15): + lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_fwd) + lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 +L(L15_fwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_15_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_15_loop_L1): + sub $64, %rdx + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $15, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $15, %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $15, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_15_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_15_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_bwd) + lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 +L(L15_bwd): + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_15_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_15_bwd_loop_L1): + movaps -0x1f(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2f(%rsi), %xmm3 + movaps -0x3f(%rsi), %xmm4 + movaps -0x4f(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $15, %xmm2, %xmm1 + palignr $15, %xmm3, %xmm2 + palignr $15, %xmm4, %xmm3 + palignr $15, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_15_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 +L(shl_15_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(write_72bytes): + movdqu -72(%rsi), %xmm0 + movdqu -56(%rsi), %xmm1 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -72(%rdi) + movdqu %xmm1, -56(%rdi) 
+ mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_64bytes): + movdqu -64(%rsi), %xmm0 + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + movdqu %xmm0, -64(%rdi) + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_56bytes): + movdqu -56(%rsi), %xmm0 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -56(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_48bytes): + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_40bytes): + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_32bytes): + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_24bytes): + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_16bytes): + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) +L(write_0bytes): + ret + + .p2align 4 +L(write_73bytes): + movdqu -73(%rsi), %xmm0 + movdqu -57(%rsi), %xmm1 + mov -41(%rsi), %rcx + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %r8 + mov -4(%rsi), %edx + movdqu %xmm0, -73(%rdi) + movdqu %xmm1, -57(%rdi) + mov %rcx, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %r8, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_65bytes): + movdqu -65(%rsi), %xmm0 + movdqu -49(%rsi), %xmm1 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -65(%rdi) + movdqu %xmm1, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_57bytes): + movdqu -57(%rsi), %xmm0 + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -57(%rdi) + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_49bytes): + movdqu -49(%rsi), %xmm0 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_41bytes): + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, 
-25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_33bytes): + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_25bytes): + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_17bytes): + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_9bytes): + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_1bytes): + mov -1(%rsi), %dl + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_74bytes): + movdqu -74(%rsi), %xmm0 + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -74(%rdi) + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_66bytes): + movdqu -66(%rsi), %xmm0 + movdqu -50(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -66(%rdi) + movdqu %xmm1, -50(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_58bytes): + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_50bytes): + movdqu -50(%rsi), %xmm0 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -50(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_42bytes): + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_34bytes): + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_26bytes): + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_18bytes): + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_10bytes): + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_2bytes): + mov -2(%rsi), %dx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(write_75bytes): + 
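+ /* As in every L(write_NNbytes) case: %rsi and %rdi point just past the
+    end of the copy, so exactly NN bytes are moved with a few wide loads
+    and stores at fixed negative offsets, overlapping where NN is not a
+    multiple of the access size.  These cases are reached through the
+    L(table_less_80bytes) jump table, indexed by the remaining count. */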
movdqu -75(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -75(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_67bytes): + movdqu -67(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -67(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_59bytes): + movdqu -59(%rsi), %xmm0 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_51bytes): + movdqu -51(%rsi), %xmm0 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -51(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_43bytes): + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_35bytes): + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_27bytes): + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_19bytes): + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_11bytes): + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(write_76bytes): + movdqu -76(%rsi), %xmm0 + movdqu -60(%rsi), %xmm1 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -76(%rdi) + movdqu %xmm1, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_68bytes): + movdqu -68(%rsi), %xmm0 + movdqu -52(%rsi), %xmm1 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -68(%rdi) + movdqu %xmm1, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_60bytes): + movdqu -60(%rsi), %xmm0 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), 
%edx + movdqu %xmm0, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_52bytes): + movdqu -52(%rsi), %xmm0 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_44bytes): + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_36bytes): + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_28bytes): + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_20bytes): + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_12bytes): + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_77bytes): + movdqu -77(%rsi), %xmm0 + movdqu -61(%rsi), %xmm1 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -77(%rdi) + movdqu %xmm1, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_69bytes): + movdqu -69(%rsi), %xmm0 + movdqu -53(%rsi), %xmm1 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -69(%rdi) + movdqu %xmm1, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_61bytes): + movdqu -61(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_53bytes): + movdqu -53(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_45bytes): + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_37bytes): + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, 
-21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_29bytes): + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_21bytes): + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_13bytes): + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_78bytes): + movdqu -78(%rsi), %xmm0 + movdqu -62(%rsi), %xmm1 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -78(%rdi) + movdqu %xmm1, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_70bytes): + movdqu -70(%rsi), %xmm0 + movdqu -54(%rsi), %xmm1 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -70(%rdi) + movdqu %xmm1, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_62bytes): + movdqu -62(%rsi), %xmm0 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_54bytes): + movdqu -54(%rsi), %xmm0 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_46bytes): + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_38bytes): + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_30bytes): + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_22bytes): + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_14bytes): + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_79bytes): + movdqu -79(%rsi), %xmm0 + movdqu -63(%rsi), %xmm1 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -79(%rdi) + movdqu 
%xmm1, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_71bytes): + movdqu -71(%rsi), %xmm0 + movdqu -55(%rsi), %xmm1 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -71(%rdi) + movdqu %xmm1, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_63bytes): + movdqu -63(%rsi), %xmm0 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_55bytes): + movdqu -55(%rsi), %xmm0 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_47bytes): + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_39bytes): + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_31bytes): + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_23bytes): + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_15bytes): + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(large_page_fwd): + movdqu (%rsi), %xmm1 + lea 16(%rsi), %rsi + movdqu %xmm0, (%r8) + movntdq %xmm1, (%rdi) + lea 16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + shl $2, %rcx + cmp %rcx, %rdx + jb L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif +L(large_page_loop): + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(large_page_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 
0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_fwd_start): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x200(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(ll_cache_copy_fwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_fwd_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_fwd_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#endif + .p2align 4 +L(large_page_bwd): + movdqu -0x10(%rsi), %xmm1 + lea -16(%rsi), %rsi + movdqu %xmm0, (%r8) + movdqa %xmm1, -0x10(%rdi) + lea -16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jb L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif +L(large_page_bwd_loop): + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + movntdq %xmm4, -0x50(%rdi) + movntdq %xmm5, -0x60(%rdi) + movntdq %xmm6, -0x70(%rdi) + movntdq %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(large_page_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_bwd_64bytes): + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_bwd_start): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x200(%rsi) + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(ll_cache_copy_bwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu 
-0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_bwd_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) +#endif + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_less_80bytes): + .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_55bytes), 
L(table_less_80bytes)) + .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) + + .p2align 3 +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S new file mode 100644 index 00000000000..9ee6f0a71c3 --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S @@ -0,0 +1,12 @@ +#if 1 +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define SECTION(p) p##.avx +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S b/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S new file mode 100644 index 00000000000..b14d92fd6a8 --- /dev/null +++ 
b/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S @@ -0,0 +1,419 @@ +/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware. + Copyright (C) 2016-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "sysdep.h" + +#if 1 + +# include "asm-syntax.h" + + .section .text.avx512,"ax",@progbits +ENTRY (__mempcpy_chk_avx512_no_vzeroupper) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_avx512_no_vzeroupper) + +ENTRY (__mempcpy_avx512_no_vzeroupper) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) +END (__mempcpy_avx512_no_vzeroupper) + +ENTRY (__memmove_chk_avx512_no_vzeroupper) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_avx512_no_vzeroupper) + +ENTRY (__memmove_avx512_no_vzeroupper) + mov %RDI_LP, %RAX_LP +# ifdef USE_AS_MEMPCPY + add %RDX_LP, %RAX_LP +# endif +L(start): +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +# endif + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx + ja L(512bytesormore) + +L(check): + cmp $16, %rdx + jbe L(less_16bytes) + cmp $256, %rdx + jb L(less_256bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups -0x100(%rcx), %zmm4 + vmovups -0xC0(%rcx), %zmm5 + vmovups -0x80(%rcx), %zmm6 + vmovups -0x40(%rcx), %zmm7 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, -0x100(%r9) + vmovups %zmm5, -0xC0(%r9) + vmovups %zmm6, -0x80(%r9) + vmovups %zmm7, -0x40(%r9) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups -0x80(%rcx), %zmm2 + vmovups -0x40(%rcx), %zmm3 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, -0x80(%r9) + vmovups %zmm3, -0x40(%r9) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu -0x40(%rcx), %ymm2 + vmovdqu -0x20(%rcx), %ymm3 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 0x20(%rdi) + vmovdqu %ymm2, -0x40(%r9) + vmovdqu %ymm3, -0x20(%r9) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu -0x20(%rcx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -0x20(%r9) + ret + +L(less_32bytes): + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rcx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -0x10(%r9) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + movq (%rsi), %rsi + movq -0x8(%rcx), %rcx + movq %rsi, (%rdi) + movq %rcx, -0x8(%r9) + ret + +L(less_8bytes): + cmp $4, %dl + jb L(less_4bytes) + mov (%rsi), %esi + mov -0x4(%rcx), %ecx + mov %esi, (%rdi) + mov %ecx, -0x4(%r9) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov (%rsi), %si + mov -0x2(%rcx), %cx + mov %si, (%rdi) + mov %cx, -0x2(%r9) + ret + 
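+ /* All the size buckets above pair a load from the head (%rsi) with a
+    load from the tail (%rcx = src + len) and store both through %rdi and
+    %r9 = dst + len; the two halves overlap as needed, so every length in
+    the bucket is covered without a byte loop.  Only lengths 0 and 1
+    remain below. */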
+L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov (%rsi), %cl + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): +# ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %r8 +# else + mov __x86_shared_cache_size_half(%rip), %r8 +# endif + cmp %r8, %rdx + jae L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + prefetcht1 -0x200(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0x40(%rcx) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + vmovups %zmm8, -0x200(%r9) + vmovups %zmm9, -0x1C0(%r9) + vmovups %zmm10, -0x180(%r9) + vmovups %zmm11, -0x140(%r9) + vmovups %zmm12, -0x100(%r9) + vmovups %zmm13, -0xC0(%r9) + vmovups %zmm14, -0x80(%r9) + vmovups %zmm15, -0x40(%r9) + ret + +L(1024bytesormore): + cmp %rsi, %rdi + ja L(1024bytesormore_bkw) + sub $512, %r9 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + +/* Loop with unaligned memory access. 
*/ +L(gobble_512bytes_loop): + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + add $512, %rsi + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + add $512, %rdi + cmp %r9, %rdi + jb L(gobble_512bytes_loop) + vmovups %zmm8, (%r9) + vmovups %zmm9, 0x40(%r9) + vmovups %zmm10, 0x80(%r9) + vmovups %zmm11, 0xC0(%r9) + vmovups %zmm12, 0x100(%r9) + vmovups %zmm13, 0x140(%r9) + vmovups %zmm14, 0x180(%r9) + vmovups %zmm15, 0x1C0(%r9) + ret + +L(1024bytesormore_bkw): + add $512, %rdi + vmovups 0x1C0(%rsi), %zmm8 + vmovups 0x180(%rsi), %zmm9 + vmovups 0x140(%rsi), %zmm10 + vmovups 0x100(%rsi), %zmm11 + vmovups 0xC0(%rsi), %zmm12 + vmovups 0x80(%rsi), %zmm13 + vmovups 0x40(%rsi), %zmm14 + vmovups (%rsi), %zmm15 + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + +/* Backward loop with unaligned memory access. */ +L(gobble_512bytes_loop_bkw): + vmovups -0x40(%rcx), %zmm0 + vmovups -0x80(%rcx), %zmm1 + vmovups -0xC0(%rcx), %zmm2 + vmovups -0x100(%rcx), %zmm3 + vmovups -0x140(%rcx), %zmm4 + vmovups -0x180(%rcx), %zmm5 + vmovups -0x1C0(%rcx), %zmm6 + vmovups -0x200(%rcx), %zmm7 + sub $512, %rcx + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + vmovups %zmm0, -0x40(%r9) + vmovups %zmm1, -0x80(%r9) + vmovups %zmm2, -0xC0(%r9) + vmovups %zmm3, -0x100(%r9) + vmovups %zmm4, -0x140(%r9) + vmovups %zmm5, -0x180(%r9) + vmovups %zmm6, -0x1C0(%r9) + vmovups %zmm7, -0x200(%r9) + sub $512, %r9 + cmp %rdi, %r9 + ja L(gobble_512bytes_loop_bkw) + vmovups %zmm8, -0x40(%rdi) + vmovups %zmm9, -0x80(%rdi) + vmovups %zmm10, -0xC0(%rdi) + vmovups %zmm11, -0x100(%rdi) + vmovups %zmm12, -0x140(%rdi) + vmovups %zmm13, -0x180(%rdi) + vmovups %zmm14, -0x1C0(%rdi) + vmovups %zmm15, -0x200(%rdi) + ret + +L(preloop_large): + cmp %rsi, %rdi + ja L(preloop_large_bkw) + vmovups (%rsi), %zmm4 + vmovups 0x40(%rsi), %zmm5 + + mov %rdi, %r11 +/* Align destination for access with non-temporal stores in the loop. 
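The non-temporal zmm stores in the loop need an aligned destination, so %rdi is rounded up to the next 128-byte boundary and the source pointer and length are shifted by the same displacement; the skipped head (at most 128 bytes) was saved in zmm4/zmm5 and is stored to the original destination after the loop.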
*/ + mov %rdi, %r8 + and $-0x80, %rdi + add $0x80, %rdi + sub %rdi, %r8 + sub %r8, %rsi + add %r8, %rdx +L(gobble_256bytes_nt_loop): + prefetcht1 0x200(%rsi) + prefetcht1 0x240(%rsi) + prefetcht1 0x280(%rsi) + prefetcht1 0x2C0(%rsi) + prefetcht1 0x300(%rsi) + prefetcht1 0x340(%rsi) + prefetcht1 0x380(%rsi) + prefetcht1 0x3C0(%rsi) + vmovdqu64 (%rsi), %zmm0 + vmovdqu64 0x40(%rsi), %zmm1 + vmovdqu64 0x80(%rsi), %zmm2 + vmovdqu64 0xC0(%rsi), %zmm3 + vmovntdq %zmm0, (%rdi) + vmovntdq %zmm1, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm3, 0xC0(%rdi) + sub $256, %rdx + add $256, %rsi + add $256, %rdi + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop) + sfence + vmovups %zmm4, (%r11) + vmovups %zmm5, 0x40(%r11) + jmp L(check) + +L(preloop_large_bkw): + vmovups -0x80(%rcx), %zmm4 + vmovups -0x40(%rcx), %zmm5 + +/* Align end of destination for access with non-temporal stores. */ + mov %r9, %r8 + and $-0x80, %r9 + sub %r9, %r8 + sub %r8, %rcx + sub %r8, %rdx + add %r9, %r8 +L(gobble_256bytes_nt_loop_bkw): + prefetcht1 -0x400(%rcx) + prefetcht1 -0x3C0(%rcx) + prefetcht1 -0x380(%rcx) + prefetcht1 -0x340(%rcx) + prefetcht1 -0x300(%rcx) + prefetcht1 -0x2C0(%rcx) + prefetcht1 -0x280(%rcx) + prefetcht1 -0x240(%rcx) + vmovdqu64 -0x100(%rcx), %zmm0 + vmovdqu64 -0xC0(%rcx), %zmm1 + vmovdqu64 -0x80(%rcx), %zmm2 + vmovdqu64 -0x40(%rcx), %zmm3 + vmovntdq %zmm0, -0x100(%r9) + vmovntdq %zmm1, -0xC0(%r9) + vmovntdq %zmm2, -0x80(%r9) + vmovntdq %zmm3, -0x40(%r9) + sub $256, %rdx + sub $256, %rcx + sub $256, %r9 + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop_bkw) + sfence + vmovups %zmm4, -0x80(%r8) + vmovups %zmm5, -0x40(%r8) + jmp L(check) +END (__memmove_avx512_no_vzeroupper) + +strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper) +strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper) +#endif diff --git a/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S new file mode 100644 index 00000000000..db70fdf1b4e --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S @@ -0,0 +1,12 @@ +#if 1 +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define SECTION(p) p##.avx512 +# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S new file mode 100644 index 00000000000..17b4f861621 --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S @@ -0,0 +1,33 @@ +/* memmove with SSE2. + Copyright (C) 2017-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#if 1 +# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s +#else +weak_alias (__mempcpy, mempcpy) +#endif + +#include "memmove.S" + +#if defined SHARED +# include +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +/* Use __memmove_sse2_unaligned to support overlapping addresses. */ +compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5); +# endif +#endif diff --git a/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S new file mode 100644 index 00000000000..21be351b4e7 --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S @@ -0,0 +1,559 @@ +/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb + Copyright (C) 2016-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* memmove/memcpy/mempcpy is implemented as: + 1. Use overlapping load and store to avoid branch. + 2. Load all sources into registers and store them together to avoid + possible address overlap between source and destination. + 3. If size is 8 * VEC_SIZE or less, load all sources into registers + and store them together. + 4. If address of destination > address of source, backward copy + 4 * VEC_SIZE at a time with unaligned load and aligned store. + Load the first 4 * VEC and last VEC before the loop and store + them after the loop to support overlapping addresses. + 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned + load and aligned store. Load the last 4 * VEC and first VEC + before the loop and store them after the loop to support + overlapping addresses. + 6. If size >= __x86_shared_non_temporal_threshold and there is no + overlap between destination and source, use non-temporal store + instead of aligned store. */ + +#include "sysdep.h" + +#ifndef MEMCPY_SYMBOL +# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMPCPY_SYMBOL +# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMMOVE_CHK_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +#ifndef PREFETCH +# define PREFETCH(addr) prefetcht0 addr +#endif + +/* Assume 64-byte prefetch size. 
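A single prefetch instruction pulls in one 64-byte cache line, so PREFETCH_ONE_SET below expands to as many prefetches as are needed to cover the PREFETCHED_LOAD_SIZE (4 * VEC_SIZE) bytes consumed per loop iteration.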
*/ +#ifndef PREFETCH_SIZE +# define PREFETCH_SIZE 64 +#endif + +#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) + +#if PREFETCH_SIZE == 64 +# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base) +# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base) +# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) +# else +# error Unsupported PREFETCHED_LOAD_SIZE! +# endif +#else +# error Unsupported PREFETCH_SIZE! +#endif + +#ifndef SECTION +# error SECTION is not defined! +#endif + + .section SECTION(.text),"ax",@progbits +#if defined SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) +#endif + +ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) +END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + +#if defined SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) +#endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) + movq %rdi, %rax +L(start): +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) +#if !defined USE_MULTIARCH +L(last_2x_vec): +#endif + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VZEROUPPER +#if !defined USE_MULTIARCH +L(nop): +#endif + ret +#if defined USE_MULTIARCH +END (MEMMOVE_SYMBOL (__memmove, unaligned)) + +# if VEC_SIZE == 16 +ENTRY (__mempcpy_chk_erms) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_erms) + +/* Only used to measure performance of REP MOVSB. */ +ENTRY (__mempcpy_erms) + mov %RDI_LP, %RAX_LP + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f + add %RDX_LP, %RAX_LP + jmp L(start_movsb) +END (__mempcpy_erms) + +ENTRY (__memmove_chk_erms) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_erms) + +ENTRY (__memmove_erms) + movq %rdi, %rax + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f +L(start_movsb): + mov %RDX_LP, %RCX_LP + cmp %RSI_LP, %RDI_LP + jb 1f + /* Source == destination is less common. 
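It is therefore tested only after the more likely dst < src case has already branched away; a copy of a buffer onto itself simply returns.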
*/ + je 2f + lea (%rsi,%rcx), %RDX_LP + cmp %RDX_LP, %RDI_LP + jb L(movsb_backward) +1: + rep movsb +2: + ret +L(movsb_backward): + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret +END (__memmove_erms) +strong_alias (__memmove_erms, __memcpy_erms) +strong_alias (__memmove_chk_erms, __memcpy_chk_erms) +# endif + +# ifdef SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start_erms) +END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + +# ifdef SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax +L(start_erms): +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(movsb_more_2x_vec) +L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(movsb): + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je L(nop) + leaq (%rsi,%rdx), %r9 + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) +1: + mov %RDX_LP, %RCX_LP + rep movsb +L(nop): + ret +#endif + +L(less_vec): + /* Less than 1 VEC. */ +#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +#endif +#if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +#endif +#if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +#endif + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movzbl (%rsi), %ecx + movb %cl, (%rdi) +1: + ret +#if VEC_SIZE > 32 +L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -32(%rdi,%rdx) + VZEROUPPER + ret +#endif +#if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu (%rsi), %xmm0 + vmovdqu -16(%rsi,%rdx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -16(%rdi,%rdx) + ret +#endif +L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ + movq -8(%rsi,%rdx), %rcx + movq (%rsi), %rsi + movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl -4(%rsi,%rdx), %ecx + movl (%rsi), %esi + movl %ecx, -4(%rdi,%rdx) + movl %esi, (%rdi) + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movzwl -2(%rsi,%rdx), %ecx + movzwl (%rsi), %esi + movw %cx, -2(%rdi,%rdx) + movw %si, (%rdi) + ret + +#if defined USE_MULTIARCH +L(movsb_more_2x_vec): + cmp $REP_MOSB_THRESHOLD, %RDX_LP + ja L(movsb) +#endif +L(more_2x_vec): + /* More than 2 * VEC and there may be overlap between destination + and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + /* Copy from 4 * VEC to 8 * VEC, inclusively. 
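All eight vectors are loaded before any store is issued, so the copy remains correct even when the source and destination ranges overlap.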
*/ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) + VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) + VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER + ret +L(last_4x_vec): + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + VZEROUPPER + ret + +L(more_8x_vec): + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ + je L(nop) + /* Load the first VEC and last 4 * VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) + /* Save start and stop of the destination buffer. */ + movq %rdi, %r11 + leaq -VEC_SIZE(%rdi, %rdx), %rcx + /* Align destination for aligned stores in the loop. Compute + how much destination is misaligned. */ + movq %rdi, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. */ + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) + /* Check non-temporal store threshold. */ + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + ja L(large_forward) +#endif +L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $(VEC_SIZE * 4), %rsi + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) + /* Save stop of the destination buffer. */ + leaq -VEC_SIZE(%rdi, %rdx), %r11 + /* Align destination end for aligned stores in the loop. Compute + how much destination end is misaligned. */ + leaq -VEC_SIZE(%rsi, %rdx), %rcx + movq %r11, %r9 + movq %r11, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Adjust source. */ + subq %r8, %rcx + /* Adjust the end of destination which should be aligned now. */ + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) + /* Check non-temporal store threshold. 
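Copies larger than SHARED_NON_TEMPORAL_THRESHOLD (4 MiB in this benchmark port, see sysdep_generic.h) switch to non-temporal stores so that a huge copy does not evict the entire cache.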
*/ + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + ja L(large_backward) +#endif +L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $(VEC_SIZE * 4), %rcx + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret + +#if (defined USE_MULTIARCH || VEC_SIZE == 16) +L(large_forward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rdi, %rdx), %r10 + cmpq %r10, %rsi + jb L(loop_4x_vec_forward) +L(loop_large_forward): + /* Copy 4 * VEC a time forward with non-temporal stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $PREFETCHED_LOAD_SIZE, %rsi + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%rdi) + VMOVNT %VEC(1), VEC_SIZE(%rdi) + VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $PREFETCHED_LOAD_SIZE, %rdi + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_forward) + sfence + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(large_backward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rcx, %rdx), %r10 + cmpq %r10, %r9 + jb L(loop_4x_vec_backward) +L(loop_large_backward): + /* Copy 4 * VEC a time backward with non-temporal stores. */ + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $PREFETCHED_LOAD_SIZE, %rcx + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%r9) + VMOVNT %VEC(1), -VEC_SIZE(%r9) + VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $PREFETCHED_LOAD_SIZE, %r9 + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_backward) + sfence + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. 
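VEC(8) was loaded from the end of the source before the loop started, so an overlapping copy cannot have clobbered it.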
*/ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret +#endif +END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + +#if 1 +# ifdef USE_MULTIARCH +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) +# ifdef SHARED +strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) +# endif +# endif +# ifdef SHARED +strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) +# endif +#endif +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), + MEMCPY_SYMBOL (__memcpy, unaligned)) diff --git a/utils/memcpy-bench/glibc/memmove.S b/utils/memcpy-bench/glibc/memmove.S new file mode 100644 index 00000000000..97e735facff --- /dev/null +++ b/utils/memcpy-bench/glibc/memmove.S @@ -0,0 +1,71 @@ +/* Optimized memmove for x86-64. + Copyright (C) 2016-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "sysdep.h" + +#define VEC_SIZE 16 +#define VEC(i) xmm##i +#define PREFETCHNT prefetchnta +#define VMOVNT movntdq +/* Use movups and movaps for smaller code sizes. */ +#define VMOVU movups +#define VMOVA movaps + +#define SECTION(p) p + +#ifdef USE_MULTIARCH +# if 0 +# define MEMCPY_SYMBOL(p,s) memcpy +# endif +#else +# if defined SHARED +# define MEMCPY_SYMBOL(p,s) __memcpy +# else +# define MEMCPY_SYMBOL(p,s) memcpy +# endif +#endif +#if !defined USE_MULTIARCH +# define MEMPCPY_SYMBOL(p,s) __mempcpy +#endif +#ifndef MEMMOVE_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) p +# define MEMMOVE_SYMBOL(p,s) memmove +#endif + +#include "memmove-vec-unaligned-erms.S" + +#ifndef USE_MULTIARCH +libc_hidden_builtin_def (memmove) +# if defined SHARED && IS_IN (libc) +strong_alias (memmove, __memcpy) +libc_hidden_ver (memmove, memcpy) +# endif +libc_hidden_def (__mempcpy) +weak_alias (__mempcpy, mempcpy) +libc_hidden_builtin_def (mempcpy) + +# if defined SHARED && IS_IN (libc) +# undef memcpy +# include +versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14); + +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5); +# endif +# endif +#endif diff --git a/utils/memcpy-bench/glibc/sysdep.h b/utils/memcpy-bench/glibc/sysdep.h new file mode 100644 index 00000000000..099134b2a2f --- /dev/null +++ b/utils/memcpy-bench/glibc/sysdep.h @@ -0,0 +1,129 @@ +/* Assembler macros for x86-64. + Copyright (C) 2001-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _X86_64_SYSDEP_H +#define _X86_64_SYSDEP_H 1 + +#include "sysdep_x86.h" + +#ifdef __ASSEMBLER__ + +/* Syntactic details of assembler. */ + +/* This macro is for setting proper CFI with DW_CFA_expression describing + the register as saved relative to %rsp instead of relative to the CFA. + Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset + from %rsp. */ +#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ + 0x77, off & 0x7F | 0x80, off >> 7 + +/* If compiled for profiling, call `mcount' at the start of each function. */ +#ifdef PROF +/* The mcount code relies on a normal frame pointer being on the stack + to locate our caller, so push one just for its benefit. */ +#define CALL_MCOUNT \ + pushq %rbp; \ + cfi_adjust_cfa_offset(8); \ + movq %rsp, %rbp; \ + cfi_def_cfa_register(%rbp); \ + call JUMPTARGET(mcount); \ + popq %rbp; \ + cfi_def_cfa(rsp,8); +#else +#define CALL_MCOUNT /* Do nothing. */ +#endif + +#define PSEUDO(name, syscall_name, args) \ +lose: \ + jmp JUMPTARGET(syscall_error) \ + .globl syscall_error; \ + ENTRY (name) \ + DO_CALL (syscall_name, args); \ + jb lose + +#undef JUMPTARGET +#ifdef SHARED +# ifdef BIND_NOW +# define JUMPTARGET(name) *name##@GOTPCREL(%rip) +# else +# define JUMPTARGET(name) name##@PLT +# endif +#else +/* For static archives, branch to target directly. */ +# define JUMPTARGET(name) name +#endif + +/* Long and pointer size in bytes. */ +#define LP_SIZE 8 + +/* Instruction to operate on long and pointer. */ +#define LP_OP(insn) insn##q + +/* Assembler address directive. */ +#define ASM_ADDR .quad + +/* Registers to hold long and pointer. */ +#define RAX_LP rax +#define RBP_LP rbp +#define RBX_LP rbx +#define RCX_LP rcx +#define RDI_LP rdi +#define RDX_LP rdx +#define RSI_LP rsi +#define RSP_LP rsp +#define R8_LP r8 +#define R9_LP r9 +#define R10_LP r10 +#define R11_LP r11 +#define R12_LP r12 +#define R13_LP r13 +#define R14_LP r14 +#define R15_LP r15 + +#else /* __ASSEMBLER__ */ + +/* Long and pointer size in bytes. */ +#define LP_SIZE "8" + +/* Instruction to operate on long and pointer. */ +#define LP_OP(insn) #insn "q" + +/* Assembler address directive. */ +#define ASM_ADDR ".quad" + +/* Registers to hold long and pointer. */ +#define RAX_LP "rax" +#define RBP_LP "rbp" +#define RBX_LP "rbx" +#define RCX_LP "rcx" +#define RDI_LP "rdi" +#define RDX_LP "rdx" +#define RSI_LP "rsi" +#define RSP_LP "rsp" +#define R8_LP "r8" +#define R9_LP "r9" +#define R10_LP "r10" +#define R11_LP "r11" +#define R12_LP "r12" +#define R13_LP "r13" +#define R14_LP "r14" +#define R15_LP "r15" + +#endif /* __ASSEMBLER__ */ + +#endif /* _X86_64_SYSDEP_H */ diff --git a/utils/memcpy-bench/glibc/sysdep_generic.h b/utils/memcpy-bench/glibc/sysdep_generic.h new file mode 100644 index 00000000000..91f78e1b04d --- /dev/null +++ b/utils/memcpy-bench/glibc/sysdep_generic.h @@ -0,0 +1,113 @@ +/* Generic asm macros used on many machines. + Copyright (C) 1991-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define C_SYMBOL_NAME(name) name
+#define HIDDEN_JUMPTARGET(name) 0x0
+#define SHARED_CACHE_SIZE_HALF (1024*1024)
+#define DATA_CACHE_SIZE_HALF (1024*32/2)
+#define DATA_CACHE_SIZE (1024*32)
+#define SHARED_NON_TEMPORAL_THRESHOLD (1024*1024*4)
+#define REP_MOSB_THRESHOLD 1024
+
+#define USE_MULTIARCH
+
+#define ASM_LINE_SEP ;
+
+#define strong_alias(original, alias) \
+    .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \
+    C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original)
+
+#ifndef C_LABEL
+
+/* Define a macro we can use to construct the asm name for a C symbol. */
+# define C_LABEL(name) name##:
+
+#endif
+
+#ifdef __ASSEMBLER__
+/* Mark the end of function named SYM. This is used on some platforms
+   to generate correct debugging information. */
+# ifndef END
+# define END(sym)
+# endif
+
+# ifndef JUMPTARGET
+# define JUMPTARGET(sym) sym
+# endif
+#endif
+
+/* Macros to generate eh_frame unwind information. */
+#ifdef __ASSEMBLER__
+# define cfi_startproc .cfi_startproc
+# define cfi_endproc .cfi_endproc
+# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off
+# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg
+# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+# define cfi_offset(reg, off) .cfi_offset reg, off
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+# define cfi_register(r1, r2) .cfi_register r1, r2
+# define cfi_return_column(reg) .cfi_return_column reg
+# define cfi_restore(reg) .cfi_restore reg
+# define cfi_same_value(reg) .cfi_same_value reg
+# define cfi_undefined(reg) .cfi_undefined reg
+# define cfi_remember_state .cfi_remember_state
+# define cfi_restore_state .cfi_restore_state
+# define cfi_window_save .cfi_window_save
+# define cfi_personality(enc, exp) .cfi_personality enc, exp
+# define cfi_lsda(enc, exp) .cfi_lsda enc, exp
+
+#else /* ! 
ASSEMBLER */ + +# define CFI_STRINGIFY(Name) CFI_STRINGIFY2 (Name) +# define CFI_STRINGIFY2(Name) #Name +# define CFI_STARTPROC ".cfi_startproc" +# define CFI_ENDPROC ".cfi_endproc" +# define CFI_DEF_CFA(reg, off) \ + ".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) +# define CFI_DEF_CFA_REGISTER(reg) \ + ".cfi_def_cfa_register " CFI_STRINGIFY(reg) +# define CFI_DEF_CFA_OFFSET(off) \ + ".cfi_def_cfa_offset " CFI_STRINGIFY(off) +# define CFI_ADJUST_CFA_OFFSET(off) \ + ".cfi_adjust_cfa_offset " CFI_STRINGIFY(off) +# define CFI_OFFSET(reg, off) \ + ".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) +# define CFI_REL_OFFSET(reg, off) \ + ".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) +# define CFI_REGISTER(r1, r2) \ + ".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2) +# define CFI_RETURN_COLUMN(reg) \ + ".cfi_return_column " CFI_STRINGIFY(reg) +# define CFI_RESTORE(reg) \ + ".cfi_restore " CFI_STRINGIFY(reg) +# define CFI_UNDEFINED(reg) \ + ".cfi_undefined " CFI_STRINGIFY(reg) +# define CFI_REMEMBER_STATE \ + ".cfi_remember_state" +# define CFI_RESTORE_STATE \ + ".cfi_restore_state" +# define CFI_WINDOW_SAVE \ + ".cfi_window_save" +# define CFI_PERSONALITY(enc, exp) \ + ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) +# define CFI_LSDA(enc, exp) \ + ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) +#endif + +#include "dwarf2.h" diff --git a/utils/memcpy-bench/glibc/sysdep_x86.h b/utils/memcpy-bench/glibc/sysdep_x86.h new file mode 100644 index 00000000000..a3fecd01268 --- /dev/null +++ b/utils/memcpy-bench/glibc/sysdep_x86.h @@ -0,0 +1,113 @@ +/* Assembler macros for x86. + Copyright (C) 2017-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _X86_SYSDEP_H +#define _X86_SYSDEP_H 1 + +#include "sysdep_generic.h" + +/* __CET__ is defined by GCC with Control-Flow Protection values: + +enum cf_protection_level +{ + CF_NONE = 0, + CF_BRANCH = 1 << 0, + CF_RETURN = 1 << 1, + CF_FULL = CF_BRANCH | CF_RETURN, + CF_SET = 1 << 2 +}; +*/ + +/* Set if CF_BRANCH (IBT) is enabled. */ +#define X86_FEATURE_1_IBT (1U << 0) +/* Set if CF_RETURN (SHSTK) is enabled. */ +#define X86_FEATURE_1_SHSTK (1U << 1) + +#ifdef __CET__ +# define CET_ENABLED 1 +# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT) +# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK) +#else +# define CET_ENABLED 0 +# define IBT_ENABLED 0 +# define SHSTK_ENABLED 0 +#endif + +/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need + space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be + aligned to 16 bytes for fxsave and 64 bytes for xsave. */ +#define STATE_SAVE_OFFSET (8 * 7 + 8) + +/* Save SSE, AVX, AVX512, mask and bound registers. 
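These are XSAVE state components 1-3 and 5-7: SSE, AVX, the MPX bound registers, and the AVX-512 opmask and upper-ZMM state, i.e. a mask of 0xee.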
*/ +#define STATE_SAVE_MASK \ + ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) + +#ifdef __ASSEMBLER__ + +/* Syntactic details of assembler. */ + +#ifdef _CET_ENDBR +# define _CET_NOTRACK notrack +#else +# define _CET_ENDBR +# define _CET_NOTRACK +#endif + +/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */ +#define ALIGNARG(log2) 1< #include +#include #include #include #include @@ -14,15 +15,11 @@ #include -#pragma GCC diagnostic ignored "-Wold-style-cast" -#pragma GCC diagnostic ignored "-Wcast-align" -#pragma GCC diagnostic ignored "-Wcast-qual" -#include "FastMemcpy.h" -//#include "FastMemcpy_Avx.h" - #include #include +#include + template void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl) @@ -47,7 +44,7 @@ size_t generatorUniform(RNG & rng) { return rng() % N; }; template -void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl) +uint64_t test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl, const char * name) { Stopwatch watch; @@ -76,15 +73,22 @@ void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t n for (auto & thread : threads) thread.join(); - double elapsed_ns = watch.elapsed(); + uint64_t elapsed_ns = watch.elapsed(); /// Validation size_t sum = 0; + size_t reference = 0; for (size_t i = 0; i < size; ++i) + { sum += dst[i]; + reference += uint8_t(i); + } - std::cerr << std::fixed << std::setprecision(3) - << "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n"; + if (sum != reference) + throw std::logic_error("Incorrect result"); + + std::cout << name; + return elapsed_ns; } @@ -101,9 +105,30 @@ static void * memcpy_erms(void * dst, const void * src, size_t size) return dst; } +static void * memcpy_trivial(void * __restrict dst_, const void * __restrict src_, size_t size) +{ + char * __restrict dst = reinterpret_cast(dst_); + const char * __restrict src = reinterpret_cast(src_); + void * ret = dst; + + while (size > 0) + { + *dst = *src; + ++dst; + ++src; + --size; + } + + return ret; +} + extern "C" void * memcpy_jart(void * dst, const void * src, size_t size); extern "C" void MemCpy(void * dst, const void * src, size_t size); +void * memcpy_fast_sse(void * dst, const void * src, size_t size); +void * memcpy_fast_avx(void * dst, const void * src, size_t size); +void * memcpy_tiny(void * dst, const void * src, size_t size); + static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size) { @@ -329,7 +354,7 @@ void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t * if (padding > 0) { __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); - _mm256_storeu_si256((__m256i*)dst, head); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head); dst += padding; src += padding; size -= padding; @@ -539,70 +564,125 @@ tail: return ret; } +extern "C" void * __memcpy_erms(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_sse2_unaligned(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_ssse3(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_ssse3_back(void * __restrict destination, const void * __restrict source, 
size_t size); +extern "C" void * __memcpy_avx_unaligned(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_avx_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_avx512_unaligned(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_avx512_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size); +extern "C" void * __memcpy_avx512_no_vzeroupper(void * __restrict destination, const void * __restrict source, size_t size); + + +#define VARIANT(N, NAME) \ + if (memcpy_variant == N) \ + return test(dst, src, size, iterations, num_threads, std::forward(generator), NAME, #NAME); template -void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator) +uint64_t dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator) { - memcpy_type memcpy_libc = reinterpret_cast(dlsym(RTLD_NEXT, "memcpy")); + memcpy_type memcpy_libc_old = reinterpret_cast(dlsym(RTLD_NEXT, "memcpy")); - if (memcpy_variant == 1) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy); - if (memcpy_variant == 2) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_libc); - if (memcpy_variant == 3) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_erms); - if (memcpy_variant == 4) - test(dst, src, size, iterations, num_threads, std::forward(generator), MemCpy); - if (memcpy_variant == 5) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2); - if (memcpy_variant == 6) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2Unrolled2); - if (memcpy_variant == 7) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2Unrolled4); - if (memcpy_variant == 8) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2Unrolled8); -// if (memcpy_variant == 9) -// test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_fast_avx); - if (memcpy_variant == 10) - test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_my); + VARIANT(1, memcpy) + VARIANT(2, memcpy_trivial) + VARIANT(3, memcpy_libc_old) + VARIANT(4, memcpy_erms) + VARIANT(5, MemCpy) + VARIANT(6, memcpySSE2) + VARIANT(7, memcpySSE2Unrolled2) + VARIANT(8, memcpySSE2Unrolled4) + VARIANT(9, memcpySSE2Unrolled8) + VARIANT(10, memcpy_fast_sse) + VARIANT(11, memcpy_fast_avx) + VARIANT(12, memcpy_my) + + VARIANT(21, __memcpy_erms) + VARIANT(22, __memcpy_sse2_unaligned) + VARIANT(23, __memcpy_ssse3) + VARIANT(24, __memcpy_ssse3_back) + VARIANT(25, __memcpy_avx_unaligned) + VARIANT(26, __memcpy_avx_unaligned_erms) + VARIANT(27, __memcpy_avx512_unaligned) + VARIANT(28, __memcpy_avx512_unaligned_erms) + VARIANT(29, __memcpy_avx512_no_vzeroupper) + + return 0; } -void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads) +uint64_t dispatchVariants( + size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads) { if (generator_variant == 1) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, 
generatorUniform<16>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>); if (generator_variant == 2) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>); if (generator_variant == 3) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>); if (generator_variant == 4) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>); if (generator_variant == 5) - dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>); + return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>); + + return 0; } int main(int argc, char ** argv) { - size_t size = 1000000000; - if (argc >= 2) - size = std::stoull(argv[1]); + boost::program_options::options_description desc("Allowed options"); + desc.add_options()("help,h", "produce help message") + ("size", boost::program_options::value()->default_value(1000000), "Bytes to copy on every iteration") + ("iterations", boost::program_options::value(), "Number of iterations") + ("threads", boost::program_options::value()->default_value(1), "Number of copying threads") + ("distribution", boost::program_options::value()->default_value(4), "Distribution of chunk sizes to perform copy") + ("variant", boost::program_options::value(), "Variant of memcpy implementation") + ("tsv", "Print result in tab-separated format") + ; - size_t iterations = 10; - if (argc >= 3) - iterations = std::stoull(argv[2]); + boost::program_options::variables_map options; + boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); - size_t num_threads = 1; - if (argc >= 4) - num_threads = std::stoull(argv[3]); + if (options.count("help") || !options.count("variant")) + { + std::cout << R"(Usage: - size_t memcpy_variant = 1; - if (argc >= 5) - memcpy_variant = std::stoull(argv[4]); +for size in 4096 16384 50000 65536 100000 1000000 10000000 100000000; do + for threads in 1 2 4 $(($(nproc) / 2)) $(nproc); do + for distribution in 1 2 3 4 5; do + for variant in {1..12} {21..29}; do + for i in {1..10}; do + ./memcpy-bench --tsv --size $size --variant $variant --threads $threads --distribution $distribution; + done; + done; + done; + done; +done | tee result.tsv - size_t generator_variant = 1; - if (argc >= 6) - generator_variant = std::stoull(argv[5]); +)" << std::endl; + std::cout << desc << std::endl; + return 1; + } + + size_t size = options["size"].as(); + size_t num_threads = options["threads"].as(); + size_t memcpy_variant = options["variant"].as(); + size_t generator_variant = options["distribution"].as(); + + size_t iterations; + if (options.count("iterations")) + { + iterations = options["iterations"].as(); + } + else + { + iterations = 10000000000ULL * num_threads / size; + + if (generator_variant == 1) + iterations /= 100; + if (generator_variant == 2) + iterations /= 10; + } std::unique_ptr src(new uint8_t[size]); std::unique_ptr dst(new uint8_t[size]); @@ -614,7 +694,25 @@ int main(int argc, char ** 
argv) /// Fill dst to avoid page faults. memset(dst.get(), 0, size); - dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads); + uint64_t elapsed_ns = dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads); + + std::cout << std::fixed << std::setprecision(3); + + if (options.count("tsv")) + { + std::cout + << '\t' << size + << '\t' << iterations + << '\t' << num_threads + << '\t' << generator_variant + << '\t' << memcpy_variant + << '\t' << elapsed_ns + << '\n'; + } + else + { + std::cout << ": processed in " << (elapsed_ns / 1e9) << " sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec\n"; + } return 0; } From 145116bfb64a9a135cd60a8d5b5ebdd6d8310676 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 23:22:12 +0300 Subject: [PATCH 2/6] Fix style --- utils/memcpy-bench/glibc/dwarf2.h | 74 +- utils/memcpy-bench/glibc/memcpy-ssse3-back.S | 4988 ++++++++-------- utils/memcpy-bench/glibc/memcpy-ssse3.S | 5176 ++++++++--------- .../glibc/memmove-avx-unaligned-erms.S | 14 +- .../glibc/memmove-avx512-no-vzeroupper.S | 662 +-- .../glibc/memmove-avx512-unaligned-erms.S | 14 +- .../glibc/memmove-sse2-unaligned-erms.S | 2 +- .../glibc/memmove-vec-unaligned-erms.S | 694 +-- utils/memcpy-bench/glibc/memmove.S | 26 +- utils/memcpy-bench/glibc/sysdep.h | 100 +- utils/memcpy-bench/glibc/sysdep_generic.h | 52 +- utils/memcpy-bench/glibc/sysdep_x86.h | 52 +- 12 files changed, 5927 insertions(+), 5927 deletions(-) diff --git a/utils/memcpy-bench/glibc/dwarf2.h b/utils/memcpy-bench/glibc/dwarf2.h index 4c7de0d8737..2be827f00ae 100644 --- a/utils/memcpy-bench/glibc/dwarf2.h +++ b/utils/memcpy-bench/glibc/dwarf2.h @@ -21,7 +21,7 @@ . */ #ifndef _DWARF2_H -#define _DWARF2_H 1 +#define _DWARF2_H 1 /* This file is derived from the DWARF specification (a public document) Revision 2.0.0 (July 27, 1993) developed by the UNIX International @@ -88,19 +88,19 @@ enum dwarf_tag /* SGI/MIPS Extensions */ DW_TAG_MIPS_loop = 0x4081, /* GNU extensions */ - DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */ - DW_TAG_function_template = 0x4102, /* for C++ */ - DW_TAG_class_template = 0x4103, /* for C++ */ + DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */ + DW_TAG_function_template = 0x4102, /* for C++ */ + DW_TAG_class_template = 0x4103, /* for C++ */ DW_TAG_GNU_BINCL = 0x4104, DW_TAG_GNU_EINCL = 0x4105 }; -#define DW_TAG_lo_user 0x4080 -#define DW_TAG_hi_user 0xffff +#define DW_TAG_lo_user 0x4080 +#define DW_TAG_hi_user 0xffff /* flag that tells whether entry has a child or not */ #define DW_children_no 0 -#define DW_children_yes 1 +#define DW_children_yes 1 /* Form names and codes. */ enum dwarf_form @@ -215,8 +215,8 @@ enum dwarf_attribute DW_AT_body_end = 0x2106 }; -#define DW_AT_lo_user 0x2000 /* implementation-defined range start */ -#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */ +#define DW_AT_lo_user 0x2000 /* implementation-defined range start */ +#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */ /* Location atom names and codes. */ @@ -369,8 +369,8 @@ enum dwarf_location_atom DW_OP_nop = 0x96 }; -#define DW_OP_lo_user 0x80 /* implementation-defined range start */ -#define DW_OP_hi_user 0xff /* implementation-defined range end */ +#define DW_OP_lo_user 0x80 /* implementation-defined range start */ +#define DW_OP_hi_user 0xff /* implementation-defined range end */ /* Type encodings. 
*/ @@ -387,8 +387,8 @@ enum dwarf_type DW_ATE_unsigned_char = 0x8 }; -#define DW_ATE_lo_user 0x80 -#define DW_ATE_hi_user 0xff +#define DW_ATE_lo_user 0x80 +#define DW_ATE_hi_user 0xff /* Array ordering names and codes. */ enum dwarf_array_dim_ordering @@ -517,17 +517,17 @@ enum dwarf_call_frame_info DW_CFA_GNU_negative_offset_extended = 0x2f }; -#define DW_CIE_ID 0xffffffff -#define DW_CIE_VERSION 1 +#define DW_CIE_ID 0xffffffff +#define DW_CIE_VERSION 1 #define DW_CFA_extended 0 #define DW_CFA_low_user 0x1c #define DW_CFA_high_user 0x3f -#define DW_CHILDREN_no 0x00 -#define DW_CHILDREN_yes 0x01 +#define DW_CHILDREN_no 0x00 +#define DW_CHILDREN_yes 0x01 -#define DW_ADDR_none 0 +#define DW_ADDR_none 0 /* Source language names and codes. */ @@ -548,8 +548,8 @@ enum dwarf_source_language }; -#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */ -#define DW_LANG_hi_user 0xffff /* implementation-defined range start */ +#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */ +#define DW_LANG_hi_user 0xffff /* implementation-defined range start */ /* Names and codes for macro information. */ @@ -566,25 +566,25 @@ enum dwarf_macinfo_record_type /* @@@ For use with GNU frame unwind information. */ -#define DW_EH_PE_absptr 0x00 -#define DW_EH_PE_omit 0xff +#define DW_EH_PE_absptr 0x00 +#define DW_EH_PE_omit 0xff -#define DW_EH_PE_uleb128 0x01 -#define DW_EH_PE_udata2 0x02 -#define DW_EH_PE_udata4 0x03 -#define DW_EH_PE_udata8 0x04 -#define DW_EH_PE_sleb128 0x09 -#define DW_EH_PE_sdata2 0x0A -#define DW_EH_PE_sdata4 0x0B -#define DW_EH_PE_sdata8 0x0C -#define DW_EH_PE_signed 0x08 +#define DW_EH_PE_uleb128 0x01 +#define DW_EH_PE_udata2 0x02 +#define DW_EH_PE_udata4 0x03 +#define DW_EH_PE_udata8 0x04 +#define DW_EH_PE_sleb128 0x09 +#define DW_EH_PE_sdata2 0x0A +#define DW_EH_PE_sdata4 0x0B +#define DW_EH_PE_sdata8 0x0C +#define DW_EH_PE_signed 0x08 -#define DW_EH_PE_pcrel 0x10 -#define DW_EH_PE_textrel 0x20 -#define DW_EH_PE_datarel 0x30 -#define DW_EH_PE_funcrel 0x40 -#define DW_EH_PE_aligned 0x50 +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 -#define DW_EH_PE_indirect 0x80 +#define DW_EH_PE_indirect 0x80 #endif /* dwarf2.h */ diff --git a/utils/memcpy-bench/glibc/memcpy-ssse3-back.S b/utils/memcpy-bench/glibc/memcpy-ssse3-back.S index 1492dd38e73..c5257592efa 100644 --- a/utils/memcpy-bench/glibc/memcpy-ssse3-back.S +++ b/utils/memcpy-bench/glibc/memcpy-ssse3-back.S @@ -24,3159 +24,3159 @@ #include "asm-syntax.h" #ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_back -# define MEMCPY_CHK __memcpy_chk_ssse3_back -# define MEMPCPY __mempcpy_ssse3_back -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back +# define MEMCPY __memcpy_ssse3_back +# define MEMCPY_CHK __memcpy_chk_ssse3_back +# define MEMPCPY __mempcpy_ssse3_back +# define MEMPCPY_CHK __mempcpy_chk_ssse3_back #endif -#define JMPTBL(I, B) I - B +#define JMPTBL(I, B) I - B /* Branch to an entry in a jump table. TABLE is a jump table with relative offsets. INDEX is a register contains the index into the jump table. SCALE is the scale of INDEX. 
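The table entries are 32-bit offsets relative to the start of the table itself; the macro rebases the loaded offset on the table address and then jumps to it indirectly.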
*/ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + _CET_NOTRACK jmp *INDEX; \ ud2 - .section .text.ssse3,"ax",@progbits + .section .text.ssse3,"ax",@progbits #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMPCPY_CHK) ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) END (MEMPCPY) #endif #if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMCPY_CHK) #endif ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP + mov %RDI_LP, %RAX_LP #ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP + add %RDX_LP, %RAX_LP #endif #ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx + /* Clear the upper 32 bits. */ + mov %edx, %edx #endif #ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(bwd_write_0bytes) - cmp $144, %rdx - jae L(copy_backward) - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + cmp %rsi, %rdi + jb L(copy_forward) + je L(bwd_write_0bytes) + cmp $144, %rdx + jae L(copy_backward) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) L(copy_forward): #endif L(start): - cmp $144, %rdx - jae L(144bytesormore) + cmp $144, %rdx + jae L(144bytesormore) L(fwd_write_less32bytes): #ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jbe L(bk_write) + cmp %dil, %sil + jbe L(bk_write) #endif - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) #ifndef USE_AS_MEMMOVE L(bk_write): - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) #endif - .p2align 4 + .p2align 4 L(144bytesormore): #ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) + cmp %dil, %sil + jle L(copy_backward) #endif - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0) + movdqu (%rsi), %xmm0 + mov %rdi, %r8 + and $-16, %rdi + add $16, %rdi + mov %rdi, %r9 + sub %r8, %r9 + sub %r9, %rdx + add %r9, %rsi + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0) #ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP + mov $DATA_CACHE_SIZE, %RCX_LP #else - mov __x86_data_cache_size(%rip), %RCX_LP + mov __x86_data_cache_size(%rip), %RCX_LP #endif - cmp %rcx, %rdx - jae L(gobble_mem_fwd) - lea L(shl_table_fwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 + cmp %rcx, %rdx + jae L(gobble_mem_fwd) + lea L(shl_table_fwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 - .p2align 4 + .p2align 4 L(copy_backward): #ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP + mov $DATA_CACHE_SIZE, %RCX_LP #else - mov __x86_data_cache_size(%rip), %RCX_LP + mov __x86_data_cache_size(%rip), %RCX_LP #endif - shl $1, %rcx - cmp %rcx, %rdx - ja L(gobble_mem_bwd) + shl $1, %rcx + cmp %rcx, %rdx 
+ ja L(gobble_mem_bwd) - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0_bwd) - lea L(shl_table_bwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 + add %rdx, %rdi + add %rdx, %rsi + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $0xf, %r9 + xor %r9, %rdi + sub %r9, %rsi + sub %r9, %rdx + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0_bwd) + lea L(shl_table_bwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + _CET_NOTRACK jmp *%r9 + ud2 - .p2align 4 + .p2align 4 L(shl_0): - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 + mov %rdx, %r9 + shr $8, %r9 + add %rdx, %r9 #ifdef DATA_CACHE_SIZE - cmp $DATA_CACHE_SIZE_HALF, %R9_LP + cmp $DATA_CACHE_SIZE_HALF, %R9_LP #else - cmp __x86_data_cache_size_half(%rip), %R9_LP + cmp __x86_data_cache_size_half(%rip), %R9_LP #endif - jae L(gobble_mem_fwd) - sub $0x80, %rdx - .p2align 4 + jae L(gobble_mem_fwd) + sub $0x80, %rdx + .p2align 4 L(shl_0_loop): - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), %xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(shl_0_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + movdqa (%rsi), %xmm1 + movdqa %xmm1, (%rdi) + movaps 0x10(%rsi), %xmm2 + movaps %xmm2, 0x10(%rdi) + movaps 0x20(%rsi), %xmm3 + movaps %xmm3, 0x20(%rdi) + movaps 0x30(%rsi), %xmm4 + movaps %xmm4, 0x30(%rdi) + movaps 0x40(%rsi), %xmm1 + movaps %xmm1, 0x40(%rdi) + movaps 0x50(%rsi), %xmm2 + movaps %xmm2, 0x50(%rdi) + movaps 0x60(%rsi), %xmm3 + movaps %xmm3, 0x60(%rdi) + movaps 0x70(%rsi), %xmm4 + movaps %xmm4, 0x70(%rdi) + sub $0x80, %rdx + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(shl_0_loop) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_0_bwd): - sub $0x80, %rdx + sub $0x80, %rdx L(copy_backward_loop): - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, -0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(copy_backward_loop) + movaps -0x10(%rsi), %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps -0x20(%rsi), %xmm2 + movaps %xmm2, -0x20(%rdi) + movaps -0x30(%rsi), %xmm3 + movaps %xmm3, -0x30(%rdi) + movaps -0x40(%rsi), %xmm4 + movaps %xmm4, -0x40(%rdi) + movaps -0x50(%rsi), %xmm5 + movaps %xmm5, -0x50(%rdi) + movaps -0x60(%rsi), %xmm5 + movaps %xmm5, -0x60(%rdi) + movaps -0x70(%rsi), %xmm5 + movaps %xmm5, -0x70(%rdi) + movaps -0x80(%rsi), %xmm5 + movaps %xmm5, -0x80(%rdi) + sub $0x80, %rdx + lea 
-0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(copy_backward_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_1): - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_1) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x01(%rsi), %xmm1 + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movaps 0x4f(%rsi), %xmm6 + movaps 0x5f(%rsi), %xmm7 + movaps 0x6f(%rsi), %xmm8 + movaps 0x7f(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $1, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $1, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $1, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $1, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $1, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $1, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $1, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_1) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_1_bwd): - movaps -0x01(%rsi), %xmm1 + movaps -0x01(%rsi), %xmm1 - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x11(%rsi), %xmm2 + palignr $1, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x21(%rsi), %xmm3 + palignr $1, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x31(%rsi), %xmm4 + palignr $1, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x41(%rsi), %xmm5 + palignr $1, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x51(%rsi), %xmm6 + palignr $1, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x61(%rsi), %xmm7 + palignr $1, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x71(%rsi), %xmm8 + palignr $1, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x81(%rsi), %xmm9 + palignr $1, %xmm9, %xmm8 + movaps %xmm8, 
-0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_1_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_1_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_2): - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_2) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x02(%rsi), %xmm1 + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movaps 0x4e(%rsi), %xmm6 + movaps 0x5e(%rsi), %xmm7 + movaps 0x6e(%rsi), %xmm8 + movaps 0x7e(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $2, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $2, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $2, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $2, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $2, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $2, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $2, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_2) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_2_bwd): - movaps -0x02(%rsi), %xmm1 + movaps -0x02(%rsi), %xmm1 - movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x12(%rsi), %xmm2 + palignr $2, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x22(%rsi), %xmm3 + palignr $2, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x32(%rsi), %xmm4 + palignr $2, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x42(%rsi), %xmm5 + palignr $2, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x52(%rsi), %xmm6 + palignr $2, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x62(%rsi), %xmm7 + palignr $2, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x72(%rsi), %xmm8 + palignr $2, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x82(%rsi), %xmm9 - palignr $2, 
%xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x82(%rsi), %xmm9 + palignr $2, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_2_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_2_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_3): - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_3) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x03(%rsi), %xmm1 + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movaps 0x4d(%rsi), %xmm6 + movaps 0x5d(%rsi), %xmm7 + movaps 0x6d(%rsi), %xmm8 + movaps 0x7d(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $3, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $3, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $3, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $3, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $3, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $3, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $3, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_3) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_3_bwd): - movaps -0x03(%rsi), %xmm1 + movaps -0x03(%rsi), %xmm1 - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x13(%rsi), %xmm2 + palignr $3, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x23(%rsi), %xmm3 + palignr $3, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x33(%rsi), %xmm4 + palignr $3, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x43(%rsi), %xmm5 + palignr $3, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x53(%rsi), %xmm6 + palignr $3, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x63(%rsi), %xmm7 + palignr $3, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps 
-0x73(%rsi), %xmm8 + palignr $3, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x83(%rsi), %xmm9 + palignr $3, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_3_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_3_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_4): - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_4) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x04(%rsi), %xmm1 + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movaps 0x4c(%rsi), %xmm6 + movaps 0x5c(%rsi), %xmm7 + movaps 0x6c(%rsi), %xmm8 + movaps 0x7c(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $4, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $4, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $4, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $4, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $4, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $4, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $4, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_4_bwd): - movaps -0x04(%rsi), %xmm1 + movaps -0x04(%rsi), %xmm1 - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x14(%rsi), %xmm2 + palignr $4, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x24(%rsi), %xmm3 + palignr $4, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x34(%rsi), %xmm4 + palignr $4, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x44(%rsi), %xmm5 + palignr $4, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x54(%rsi), %xmm6 + palignr $4, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x64(%rsi), %xmm7 + palignr $4, %xmm7, %xmm6 + movaps 
%xmm6, -0x60(%rdi) - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x74(%rsi), %xmm8 + palignr $4, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x84(%rsi), %xmm9 + palignr $4, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_4_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_4_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_5): - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_5) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x05(%rsi), %xmm1 + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movaps 0x4b(%rsi), %xmm6 + movaps 0x5b(%rsi), %xmm7 + movaps 0x6b(%rsi), %xmm8 + movaps 0x7b(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $5, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $5, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $5, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $5, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $5, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $5, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $5, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_5) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_5_bwd): - movaps -0x05(%rsi), %xmm1 + movaps -0x05(%rsi), %xmm1 - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x15(%rsi), %xmm2 + palignr $5, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x25(%rsi), %xmm3 + palignr $5, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x35(%rsi), %xmm4 + palignr $5, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x45(%rsi), %xmm5 + palignr $5, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x55(%rsi), %xmm6 + palignr $5, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x65(%rsi), %xmm7 - palignr 
$5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x65(%rsi), %xmm7 + palignr $5, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x75(%rsi), %xmm8 + palignr $5, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x85(%rsi), %xmm9 + palignr $5, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_5_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_5_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_6): - sub $0x80, %rdx - movaps -0x06(%rsi), %xmm1 - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movaps 0x4a(%rsi), %xmm6 - movaps 0x5a(%rsi), %xmm7 - movaps 0x6a(%rsi), %xmm8 - movaps 0x7a(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $6, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $6, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $6, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $6, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $6, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $6, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $6, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_6) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x06(%rsi), %xmm1 + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movaps 0x4a(%rsi), %xmm6 + movaps 0x5a(%rsi), %xmm7 + movaps 0x6a(%rsi), %xmm8 + movaps 0x7a(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $6, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $6, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $6, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $6, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $6, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $6, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $6, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_6) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_6_bwd): - movaps -0x06(%rsi), %xmm1 + movaps -0x06(%rsi), %xmm1 - movaps -0x16(%rsi), %xmm2 - palignr $6, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x16(%rsi), %xmm2 + palignr $6, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x26(%rsi), %xmm3 - palignr $6, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x26(%rsi), %xmm3 + palignr $6, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x36(%rsi), %xmm4 - palignr $6, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x36(%rsi), %xmm4 + palignr $6, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x46(%rsi), %xmm5 - palignr $6, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x46(%rsi), %xmm5 + palignr $6, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x56(%rsi), %xmm6 - palignr $6, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps 
-0x56(%rsi), %xmm6 + palignr $6, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x66(%rsi), %xmm7 - palignr $6, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x66(%rsi), %xmm7 + palignr $6, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x76(%rsi), %xmm8 - palignr $6, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x76(%rsi), %xmm8 + palignr $6, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x86(%rsi), %xmm9 - palignr $6, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x86(%rsi), %xmm9 + palignr $6, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_6_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_6_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_7): - sub $0x80, %rdx - movaps -0x07(%rsi), %xmm1 - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movaps 0x49(%rsi), %xmm6 - movaps 0x59(%rsi), %xmm7 - movaps 0x69(%rsi), %xmm8 - movaps 0x79(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $7, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $7, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $7, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $7, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $7, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $7, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $7, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_7) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x07(%rsi), %xmm1 + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movaps 0x49(%rsi), %xmm6 + movaps 0x59(%rsi), %xmm7 + movaps 0x69(%rsi), %xmm8 + movaps 0x79(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $7, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $7, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $7, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $7, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $7, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $7, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $7, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_7) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_7_bwd): - movaps -0x07(%rsi), %xmm1 + movaps -0x07(%rsi), %xmm1 - movaps -0x17(%rsi), %xmm2 - palignr $7, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x17(%rsi), %xmm2 + palignr $7, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x27(%rsi), %xmm3 - palignr $7, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x27(%rsi), %xmm3 + palignr $7, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x37(%rsi), %xmm4 - palignr $7, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x37(%rsi), %xmm4 + palignr $7, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x47(%rsi), %xmm5 - palignr $7, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x47(%rsi), %xmm5 + palignr $7, %xmm5, %xmm4 + movaps 
%xmm4, -0x40(%rdi) - movaps -0x57(%rsi), %xmm6 - palignr $7, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x57(%rsi), %xmm6 + palignr $7, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x67(%rsi), %xmm7 - palignr $7, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x67(%rsi), %xmm7 + palignr $7, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x77(%rsi), %xmm8 - palignr $7, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x77(%rsi), %xmm8 + palignr $7, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x87(%rsi), %xmm9 - palignr $7, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x87(%rsi), %xmm9 + palignr $7, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_7_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_7_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_8): - sub $0x80, %rdx - movaps -0x08(%rsi), %xmm1 - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movaps 0x48(%rsi), %xmm6 - movaps 0x58(%rsi), %xmm7 - movaps 0x68(%rsi), %xmm8 - movaps 0x78(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $8, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $8, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $8, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $8, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $8, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $8, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $8, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_8) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x08(%rsi), %xmm1 + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movaps 0x48(%rsi), %xmm6 + movaps 0x58(%rsi), %xmm7 + movaps 0x68(%rsi), %xmm8 + movaps 0x78(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $8, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $8, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $8, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $8, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $8, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $8, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $8, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_8) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_8_bwd): - movaps -0x08(%rsi), %xmm1 + movaps -0x08(%rsi), %xmm1 - movaps -0x18(%rsi), %xmm2 - palignr $8, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x18(%rsi), %xmm2 + palignr $8, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x28(%rsi), %xmm3 - palignr $8, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x28(%rsi), %xmm3 + palignr $8, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x38(%rsi), %xmm4 - palignr $8, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x38(%rsi), %xmm4 + palignr $8, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x48(%rsi), %xmm5 - palignr 
$8, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x48(%rsi), %xmm5 + palignr $8, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x58(%rsi), %xmm6 - palignr $8, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x58(%rsi), %xmm6 + palignr $8, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x68(%rsi), %xmm7 - palignr $8, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x68(%rsi), %xmm7 + palignr $8, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x78(%rsi), %xmm8 - palignr $8, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x78(%rsi), %xmm8 + palignr $8, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x88(%rsi), %xmm9 - palignr $8, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x88(%rsi), %xmm9 + palignr $8, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_8_bwd) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_8_bwd) L(shl_8_end_bwd): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_9): - sub $0x80, %rdx - movaps -0x09(%rsi), %xmm1 - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movaps 0x47(%rsi), %xmm6 - movaps 0x57(%rsi), %xmm7 - movaps 0x67(%rsi), %xmm8 - movaps 0x77(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $9, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $9, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $9, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $9, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $9, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $9, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $9, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_9) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x09(%rsi), %xmm1 + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movaps 0x47(%rsi), %xmm6 + movaps 0x57(%rsi), %xmm7 + movaps 0x67(%rsi), %xmm8 + movaps 0x77(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $9, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $9, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $9, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $9, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $9, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $9, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $9, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_9) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_9_bwd): - movaps -0x09(%rsi), %xmm1 + movaps -0x09(%rsi), %xmm1 - movaps -0x19(%rsi), %xmm2 - palignr $9, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x19(%rsi), %xmm2 + palignr $9, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x29(%rsi), %xmm3 - palignr $9, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x29(%rsi), %xmm3 + palignr $9, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x39(%rsi), %xmm4 - palignr $9, %xmm4, %xmm3 - movaps %xmm3, 
-0x30(%rdi) + movaps -0x39(%rsi), %xmm4 + palignr $9, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x49(%rsi), %xmm5 - palignr $9, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x49(%rsi), %xmm5 + palignr $9, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x59(%rsi), %xmm6 - palignr $9, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x59(%rsi), %xmm6 + palignr $9, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x69(%rsi), %xmm7 - palignr $9, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x69(%rsi), %xmm7 + palignr $9, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x79(%rsi), %xmm8 - palignr $9, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x79(%rsi), %xmm8 + palignr $9, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x89(%rsi), %xmm9 - palignr $9, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x89(%rsi), %xmm9 + palignr $9, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_9_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_9_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_10): - sub $0x80, %rdx - movaps -0x0a(%rsi), %xmm1 - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movaps 0x46(%rsi), %xmm6 - movaps 0x56(%rsi), %xmm7 - movaps 0x66(%rsi), %xmm8 - movaps 0x76(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $10, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $10, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $10, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $10, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $10, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $10, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $10, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_10) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0a(%rsi), %xmm1 + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movaps 0x46(%rsi), %xmm6 + movaps 0x56(%rsi), %xmm7 + movaps 0x66(%rsi), %xmm8 + movaps 0x76(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $10, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $10, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $10, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $10, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $10, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $10, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $10, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_10) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_10_bwd): - movaps -0x0a(%rsi), %xmm1 + movaps -0x0a(%rsi), %xmm1 - movaps -0x1a(%rsi), %xmm2 - palignr $10, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1a(%rsi), %xmm2 + palignr $10, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2a(%rsi), %xmm3 - palignr $10, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2a(%rsi), 
%xmm3 + palignr $10, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3a(%rsi), %xmm4 - palignr $10, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3a(%rsi), %xmm4 + palignr $10, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4a(%rsi), %xmm5 - palignr $10, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4a(%rsi), %xmm5 + palignr $10, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5a(%rsi), %xmm6 - palignr $10, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5a(%rsi), %xmm6 + palignr $10, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6a(%rsi), %xmm7 - palignr $10, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6a(%rsi), %xmm7 + palignr $10, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7a(%rsi), %xmm8 - palignr $10, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7a(%rsi), %xmm8 + palignr $10, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8a(%rsi), %xmm9 - palignr $10, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8a(%rsi), %xmm9 + palignr $10, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_10_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_10_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_11): - sub $0x80, %rdx - movaps -0x0b(%rsi), %xmm1 - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movaps 0x45(%rsi), %xmm6 - movaps 0x55(%rsi), %xmm7 - movaps 0x65(%rsi), %xmm8 - movaps 0x75(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $11, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $11, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $11, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $11, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $11, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $11, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $11, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_11) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0b(%rsi), %xmm1 + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movaps 0x45(%rsi), %xmm6 + movaps 0x55(%rsi), %xmm7 + movaps 0x65(%rsi), %xmm8 + movaps 0x75(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $11, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $11, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $11, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $11, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $11, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $11, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $11, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_11) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_11_bwd): - movaps -0x0b(%rsi), %xmm1 + movaps -0x0b(%rsi), %xmm1 - movaps -0x1b(%rsi), %xmm2 - palignr $11, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1b(%rsi), %xmm2 + palignr $11, 
%xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2b(%rsi), %xmm3 - palignr $11, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2b(%rsi), %xmm3 + palignr $11, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3b(%rsi), %xmm4 - palignr $11, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3b(%rsi), %xmm4 + palignr $11, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4b(%rsi), %xmm5 - palignr $11, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4b(%rsi), %xmm5 + palignr $11, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5b(%rsi), %xmm6 - palignr $11, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5b(%rsi), %xmm6 + palignr $11, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6b(%rsi), %xmm7 - palignr $11, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6b(%rsi), %xmm7 + palignr $11, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7b(%rsi), %xmm8 - palignr $11, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7b(%rsi), %xmm8 + palignr $11, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8b(%rsi), %xmm9 - palignr $11, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8b(%rsi), %xmm9 + palignr $11, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_11_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_11_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_12): - sub $0x80, %rdx - movdqa -0x0c(%rsi), %xmm1 - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movaps 0x44(%rsi), %xmm6 - movaps 0x54(%rsi), %xmm7 - movaps 0x64(%rsi), %xmm8 - movaps 0x74(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $12, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $12, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $12, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $12, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $12, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $12, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $12, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdi) + sub $0x80, %rdx + movdqa -0x0c(%rsi), %xmm1 + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movaps 0x44(%rsi), %xmm6 + movaps 0x54(%rsi), %xmm7 + movaps 0x64(%rsi), %xmm8 + movaps 0x74(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $12, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $12, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $12, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $12, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $12, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $12, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $12, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_12) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + lea 0x80(%rdi), %rdi + jae L(shl_12) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_12_bwd): - movaps -0x0c(%rsi), %xmm1 + 
movaps -0x0c(%rsi), %xmm1 - movaps -0x1c(%rsi), %xmm2 - palignr $12, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1c(%rsi), %xmm2 + palignr $12, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2c(%rsi), %xmm3 - palignr $12, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2c(%rsi), %xmm3 + palignr $12, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3c(%rsi), %xmm4 - palignr $12, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3c(%rsi), %xmm4 + palignr $12, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4c(%rsi), %xmm5 - palignr $12, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4c(%rsi), %xmm5 + palignr $12, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5c(%rsi), %xmm6 - palignr $12, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5c(%rsi), %xmm6 + palignr $12, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6c(%rsi), %xmm7 - palignr $12, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6c(%rsi), %xmm7 + palignr $12, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7c(%rsi), %xmm8 - palignr $12, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7c(%rsi), %xmm8 + palignr $12, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8c(%rsi), %xmm9 - palignr $12, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8c(%rsi), %xmm9 + palignr $12, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_12_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_12_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_13): - sub $0x80, %rdx - movaps -0x0d(%rsi), %xmm1 - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movaps 0x43(%rsi), %xmm6 - movaps 0x53(%rsi), %xmm7 - movaps 0x63(%rsi), %xmm8 - movaps 0x73(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $13, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $13, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $13, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $13, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $13, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $13, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $13, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_13) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0d(%rsi), %xmm1 + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movaps 0x43(%rsi), %xmm6 + movaps 0x53(%rsi), %xmm7 + movaps 0x63(%rsi), %xmm8 + movaps 0x73(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $13, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $13, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $13, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $13, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $13, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $13, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $13, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_13) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi 
+ add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_13_bwd): - movaps -0x0d(%rsi), %xmm1 + movaps -0x0d(%rsi), %xmm1 - movaps -0x1d(%rsi), %xmm2 - palignr $13, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1d(%rsi), %xmm2 + palignr $13, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2d(%rsi), %xmm3 - palignr $13, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2d(%rsi), %xmm3 + palignr $13, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3d(%rsi), %xmm4 - palignr $13, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3d(%rsi), %xmm4 + palignr $13, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4d(%rsi), %xmm5 - palignr $13, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4d(%rsi), %xmm5 + palignr $13, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5d(%rsi), %xmm6 - palignr $13, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5d(%rsi), %xmm6 + palignr $13, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6d(%rsi), %xmm7 - palignr $13, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6d(%rsi), %xmm7 + palignr $13, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7d(%rsi), %xmm8 - palignr $13, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7d(%rsi), %xmm8 + palignr $13, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8d(%rsi), %xmm9 - palignr $13, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8d(%rsi), %xmm9 + palignr $13, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_13_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_13_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_14): - sub $0x80, %rdx - movaps -0x0e(%rsi), %xmm1 - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movaps 0x42(%rsi), %xmm6 - movaps 0x52(%rsi), %xmm7 - movaps 0x62(%rsi), %xmm8 - movaps 0x72(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $14, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $14, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $14, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $14, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $14, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $14, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $14, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_14) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0e(%rsi), %xmm1 + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movaps 0x42(%rsi), %xmm6 + movaps 0x52(%rsi), %xmm7 + movaps 0x62(%rsi), %xmm8 + movaps 0x72(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $14, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $14, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $14, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $14, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $14, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $14, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $14, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) 
+ palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_14) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_14_bwd): - movaps -0x0e(%rsi), %xmm1 + movaps -0x0e(%rsi), %xmm1 - movaps -0x1e(%rsi), %xmm2 - palignr $14, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1e(%rsi), %xmm2 + palignr $14, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2e(%rsi), %xmm3 - palignr $14, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2e(%rsi), %xmm3 + palignr $14, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3e(%rsi), %xmm4 - palignr $14, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3e(%rsi), %xmm4 + palignr $14, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4e(%rsi), %xmm5 - palignr $14, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4e(%rsi), %xmm5 + palignr $14, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5e(%rsi), %xmm6 - palignr $14, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5e(%rsi), %xmm6 + palignr $14, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6e(%rsi), %xmm7 - palignr $14, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6e(%rsi), %xmm7 + palignr $14, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7e(%rsi), %xmm8 - palignr $14, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7e(%rsi), %xmm8 + palignr $14, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8e(%rsi), %xmm9 - palignr $14, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8e(%rsi), %xmm9 + palignr $14, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_14_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_14_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_15): - sub $0x80, %rdx - movaps -0x0f(%rsi), %xmm1 - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movaps 0x41(%rsi), %xmm6 - movaps 0x51(%rsi), %xmm7 - movaps 0x61(%rsi), %xmm8 - movaps 0x71(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $15, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $15, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $15, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $15, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $15, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $15, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $15, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_15) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + sub $0x80, %rdx + movaps -0x0f(%rsi), %xmm1 + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movaps 0x41(%rsi), %xmm6 + movaps 0x51(%rsi), %xmm7 + movaps 0x61(%rsi), %xmm8 + movaps 0x71(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $15, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $15, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $15, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $15, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $15, 
%xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $15, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $15, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_15) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_15_bwd): - movaps -0x0f(%rsi), %xmm1 + movaps -0x0f(%rsi), %xmm1 - movaps -0x1f(%rsi), %xmm2 - palignr $15, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) + movaps -0x1f(%rsi), %xmm2 + palignr $15, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) - movaps -0x2f(%rsi), %xmm3 - palignr $15, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) + movaps -0x2f(%rsi), %xmm3 + palignr $15, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) - movaps -0x3f(%rsi), %xmm4 - palignr $15, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) + movaps -0x3f(%rsi), %xmm4 + palignr $15, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) - movaps -0x4f(%rsi), %xmm5 - palignr $15, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) + movaps -0x4f(%rsi), %xmm5 + palignr $15, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) - movaps -0x5f(%rsi), %xmm6 - palignr $15, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) + movaps -0x5f(%rsi), %xmm6 + palignr $15, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) - movaps -0x6f(%rsi), %xmm7 - palignr $15, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) + movaps -0x6f(%rsi), %xmm7 + palignr $15, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) - movaps -0x7f(%rsi), %xmm8 - palignr $15, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) + movaps -0x7f(%rsi), %xmm8 + palignr $15, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) - movaps -0x8f(%rsi), %xmm9 - palignr $15, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) + movaps -0x8f(%rsi), %xmm9 + palignr $15, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_15_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_15_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(gobble_mem_fwd): - movdqu (%rsi), %xmm1 - movdqu %xmm0, (%r8) - movdqa %xmm1, (%rdi) - sub $16, %rdx - add $16, %rsi - add $16, %rdi + movdqu (%rsi), %xmm1 + movdqu %xmm0, (%r8) + movdqa %xmm1, (%rdi) + sub $16, %rdx + add $16, %rsi + add $16, %rdi #ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP #else - mov __x86_shared_cache_size_half(%rip), %RCX_LP + mov __x86_shared_cache_size_half(%rip), %RCX_LP #endif #ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_fwd_start) + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_fwd_start) L(memmove_is_memcpy_fwd): #endif - cmp %rcx, %rdx - ja L(bigger_in_fwd) - mov %rdx, %rcx + cmp %rcx, %rdx + ja L(bigger_in_fwd) + mov %rdx, %rcx L(bigger_in_fwd): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy_fwd) + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy_fwd) - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy_fwd) - add %rcx, %rdx - xor %rcx, %rcx + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy_fwd) + add %rcx, %rdx + xor %rcx, %rcx L(2steps_copy_fwd): - sub 
$0x80, %rdx + sub $0x80, %rdx L(gobble_mem_fwd_loop): - sub $0x80, %rdx - prefetcht0 0x200(%rsi) - prefetcht0 0x300(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lfence - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_mem_fwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_fwd_end) - add $0x80, %rdx + sub $0x80, %rdx + prefetcht0 0x200(%rsi) + prefetcht0 0x300(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lfence + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_mem_fwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_fwd_end) + add $0x80, %rdx L(ll_cache_copy_fwd): - add %rcx, %rdx + add %rcx, %rdx L(ll_cache_copy_fwd_start): - sub $0x80, %rdx + sub $0x80, %rdx L(gobble_ll_loop_fwd): - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - prefetchnta 0x1c0(%rdi) - prefetchnta 0x280(%rdi) - sub $0x80, %rdx - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_ll_loop_fwd) + prefetchnta 0x1c0(%rsi) + prefetchnta 0x280(%rsi) + prefetchnta 0x1c0(%rdi) + prefetchnta 0x280(%rdi) + sub $0x80, %rdx + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_ll_loop_fwd) L(gobble_mem_fwd_end): - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - .p2align 4 + .p2align 4 L(gobble_mem_bwd): - add %rdx, %rsi - add %rdx, %rdi + add %rdx, %rsi + add %rdx, %rdi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $-16, %rdi - sub %rdi, %r9 - sub %r9, %rsi - sub %r9, %rdx + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $-16, %rdi + sub %rdi, %r9 + sub %r9, %rsi + sub %r9, %rdx #ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP #else - mov __x86_shared_cache_size_half(%rip), %RCX_LP + mov 
__x86_shared_cache_size_half(%rip), %RCX_LP #endif #ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_bwd_start) + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_bwd_start) L(memmove_is_memcpy_bwd): #endif - cmp %rcx, %rdx - ja L(bigger) - mov %rdx, %rcx + cmp %rcx, %rdx + ja L(bigger) + mov %rdx, %rcx L(bigger): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy) + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy) - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy) - add %rcx, %rdx - xor %rcx, %rcx + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy) + add %rcx, %rdx + xor %rcx, %rcx L(2steps_copy): - sub $0x80, %rdx + sub $0x80, %rdx L(gobble_mem_bwd_loop): - sub $0x80, %rdx - prefetcht0 -0x200(%rsi) - prefetcht0 -0x300(%rsi) - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - lfence - movntdq %xmm1, -0x10(%rdi) - movntdq %xmm2, -0x20(%rdi) - movntdq %xmm3, -0x30(%rdi) - movntdq %xmm4, -0x40(%rdi) - movntdq %xmm5, -0x50(%rdi) - movntdq %xmm6, -0x60(%rdi) - movntdq %xmm7, -0x70(%rdi) - movntdq %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_mem_bwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_bwd_end) - add $0x80, %rdx + sub $0x80, %rdx + prefetcht0 -0x200(%rsi) + prefetcht0 -0x300(%rsi) + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + lfence + movntdq %xmm1, -0x10(%rdi) + movntdq %xmm2, -0x20(%rdi) + movntdq %xmm3, -0x30(%rdi) + movntdq %xmm4, -0x40(%rdi) + movntdq %xmm5, -0x50(%rdi) + movntdq %xmm6, -0x60(%rdi) + movntdq %xmm7, -0x70(%rdi) + movntdq %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_mem_bwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_bwd_end) + add $0x80, %rdx L(ll_cache_copy): - add %rcx, %rdx + add %rcx, %rdx L(ll_cache_copy_bwd_start): - sub $0x80, %rdx + sub $0x80, %rdx L(gobble_ll_loop): - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - prefetchnta -0x1c0(%rdi) - prefetchnta -0x280(%rdi) - sub $0x80, %rdx - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - movdqa %xmm1, -0x10(%rdi) - movdqa %xmm2, -0x20(%rdi) - movdqa %xmm3, -0x30(%rdi) - movdqa %xmm4, -0x40(%rdi) - movdqa %xmm5, -0x50(%rdi) - movdqa %xmm6, -0x60(%rdi) - movdqa %xmm7, -0x70(%rdi) - movdqa %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_ll_loop) + prefetchnta -0x1c0(%rsi) + prefetchnta -0x280(%rsi) + prefetchnta -0x1c0(%rdi) + prefetchnta -0x280(%rdi) + sub $0x80, %rdx + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + movdqa %xmm1, -0x10(%rdi) + movdqa %xmm2, -0x20(%rdi) + movdqa %xmm3, -0x30(%rdi) + movdqa %xmm4, -0x40(%rdi) + movdqa %xmm5, -0x50(%rdi) + movdqa %xmm6, -0x60(%rdi) + movdqa %xmm7, 
-0x70(%rdi) + movdqa %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_ll_loop) L(gobble_mem_bwd_end): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rsi - sub %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rsi + sub %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - .p2align 4 + .p2align 4 L(fwd_write_128bytes): - lddqu -128(%rsi), %xmm0 - movdqu %xmm0, -128(%rdi) + lddqu -128(%rsi), %xmm0 + movdqu %xmm0, -128(%rdi) L(fwd_write_112bytes): - lddqu -112(%rsi), %xmm0 - movdqu %xmm0, -112(%rdi) + lddqu -112(%rsi), %xmm0 + movdqu %xmm0, -112(%rdi) L(fwd_write_96bytes): - lddqu -96(%rsi), %xmm0 - movdqu %xmm0, -96(%rdi) + lddqu -96(%rsi), %xmm0 + movdqu %xmm0, -96(%rdi) L(fwd_write_80bytes): - lddqu -80(%rsi), %xmm0 - movdqu %xmm0, -80(%rdi) + lddqu -80(%rsi), %xmm0 + movdqu %xmm0, -80(%rdi) L(fwd_write_64bytes): - lddqu -64(%rsi), %xmm0 - movdqu %xmm0, -64(%rdi) + lddqu -64(%rsi), %xmm0 + movdqu %xmm0, -64(%rdi) L(fwd_write_48bytes): - lddqu -48(%rsi), %xmm0 - movdqu %xmm0, -48(%rdi) + lddqu -48(%rsi), %xmm0 + movdqu %xmm0, -48(%rdi) L(fwd_write_32bytes): - lddqu -32(%rsi), %xmm0 - movdqu %xmm0, -32(%rdi) + lddqu -32(%rsi), %xmm0 + movdqu %xmm0, -32(%rdi) L(fwd_write_16bytes): - lddqu -16(%rsi), %xmm0 - movdqu %xmm0, -16(%rdi) + lddqu -16(%rsi), %xmm0 + movdqu %xmm0, -16(%rdi) L(fwd_write_0bytes): - ret + ret - .p2align 4 + .p2align 4 L(fwd_write_143bytes): - lddqu -143(%rsi), %xmm0 - movdqu %xmm0, -143(%rdi) + lddqu -143(%rsi), %xmm0 + movdqu %xmm0, -143(%rdi) L(fwd_write_127bytes): - lddqu -127(%rsi), %xmm0 - movdqu %xmm0, -127(%rdi) + lddqu -127(%rsi), %xmm0 + movdqu %xmm0, -127(%rdi) L(fwd_write_111bytes): - lddqu -111(%rsi), %xmm0 - movdqu %xmm0, -111(%rdi) + lddqu -111(%rsi), %xmm0 + movdqu %xmm0, -111(%rdi) L(fwd_write_95bytes): - lddqu -95(%rsi), %xmm0 - movdqu %xmm0, -95(%rdi) + lddqu -95(%rsi), %xmm0 + movdqu %xmm0, -95(%rdi) L(fwd_write_79bytes): - lddqu -79(%rsi), %xmm0 - movdqu %xmm0, -79(%rdi) + lddqu -79(%rsi), %xmm0 + movdqu %xmm0, -79(%rdi) L(fwd_write_63bytes): - lddqu -63(%rsi), %xmm0 - movdqu %xmm0, -63(%rdi) + lddqu -63(%rsi), %xmm0 + movdqu %xmm0, -63(%rdi) L(fwd_write_47bytes): - lddqu -47(%rsi), %xmm0 - movdqu %xmm0, -47(%rdi) + lddqu -47(%rsi), %xmm0 + movdqu %xmm0, -47(%rdi) L(fwd_write_31bytes): - lddqu -31(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -31(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -31(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -31(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_15bytes): - mov -15(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -15(%rdi) - mov %rcx, -8(%rdi) - ret + mov -15(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -15(%rdi) + mov %rcx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_142bytes): - lddqu -142(%rsi), %xmm0 - movdqu %xmm0, -142(%rdi) + lddqu -142(%rsi), %xmm0 + movdqu %xmm0, -142(%rdi) L(fwd_write_126bytes): - lddqu -126(%rsi), %xmm0 - movdqu %xmm0, -126(%rdi) + lddqu -126(%rsi), %xmm0 + movdqu %xmm0, -126(%rdi) L(fwd_write_110bytes): - lddqu -110(%rsi), %xmm0 - movdqu %xmm0, -110(%rdi) + lddqu -110(%rsi), %xmm0 + movdqu %xmm0, -110(%rdi) L(fwd_write_94bytes): - lddqu -94(%rsi), %xmm0 - movdqu %xmm0, -94(%rdi) + lddqu -94(%rsi), %xmm0 + movdqu %xmm0, -94(%rdi) L(fwd_write_78bytes): - lddqu -78(%rsi), %xmm0 - movdqu %xmm0, -78(%rdi) + lddqu -78(%rsi), %xmm0 + movdqu %xmm0, -78(%rdi) L(fwd_write_62bytes): - lddqu -62(%rsi), %xmm0 - 
movdqu %xmm0, -62(%rdi) + lddqu -62(%rsi), %xmm0 + movdqu %xmm0, -62(%rdi) L(fwd_write_46bytes): - lddqu -46(%rsi), %xmm0 - movdqu %xmm0, -46(%rdi) + lddqu -46(%rsi), %xmm0 + movdqu %xmm0, -46(%rdi) L(fwd_write_30bytes): - lddqu -30(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -30(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -30(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -30(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_14bytes): - mov -14(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -14(%rdi) - mov %rcx, -8(%rdi) - ret + mov -14(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -14(%rdi) + mov %rcx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_141bytes): - lddqu -141(%rsi), %xmm0 - movdqu %xmm0, -141(%rdi) + lddqu -141(%rsi), %xmm0 + movdqu %xmm0, -141(%rdi) L(fwd_write_125bytes): - lddqu -125(%rsi), %xmm0 - movdqu %xmm0, -125(%rdi) + lddqu -125(%rsi), %xmm0 + movdqu %xmm0, -125(%rdi) L(fwd_write_109bytes): - lddqu -109(%rsi), %xmm0 - movdqu %xmm0, -109(%rdi) + lddqu -109(%rsi), %xmm0 + movdqu %xmm0, -109(%rdi) L(fwd_write_93bytes): - lddqu -93(%rsi), %xmm0 - movdqu %xmm0, -93(%rdi) + lddqu -93(%rsi), %xmm0 + movdqu %xmm0, -93(%rdi) L(fwd_write_77bytes): - lddqu -77(%rsi), %xmm0 - movdqu %xmm0, -77(%rdi) + lddqu -77(%rsi), %xmm0 + movdqu %xmm0, -77(%rdi) L(fwd_write_61bytes): - lddqu -61(%rsi), %xmm0 - movdqu %xmm0, -61(%rdi) + lddqu -61(%rsi), %xmm0 + movdqu %xmm0, -61(%rdi) L(fwd_write_45bytes): - lddqu -45(%rsi), %xmm0 - movdqu %xmm0, -45(%rdi) + lddqu -45(%rsi), %xmm0 + movdqu %xmm0, -45(%rdi) L(fwd_write_29bytes): - lddqu -29(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -29(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -29(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -29(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_13bytes): - mov -13(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -13(%rdi) - mov %rcx, -8(%rdi) - ret + mov -13(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -13(%rdi) + mov %rcx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_140bytes): - lddqu -140(%rsi), %xmm0 - movdqu %xmm0, -140(%rdi) + lddqu -140(%rsi), %xmm0 + movdqu %xmm0, -140(%rdi) L(fwd_write_124bytes): - lddqu -124(%rsi), %xmm0 - movdqu %xmm0, -124(%rdi) + lddqu -124(%rsi), %xmm0 + movdqu %xmm0, -124(%rdi) L(fwd_write_108bytes): - lddqu -108(%rsi), %xmm0 - movdqu %xmm0, -108(%rdi) + lddqu -108(%rsi), %xmm0 + movdqu %xmm0, -108(%rdi) L(fwd_write_92bytes): - lddqu -92(%rsi), %xmm0 - movdqu %xmm0, -92(%rdi) + lddqu -92(%rsi), %xmm0 + movdqu %xmm0, -92(%rdi) L(fwd_write_76bytes): - lddqu -76(%rsi), %xmm0 - movdqu %xmm0, -76(%rdi) + lddqu -76(%rsi), %xmm0 + movdqu %xmm0, -76(%rdi) L(fwd_write_60bytes): - lddqu -60(%rsi), %xmm0 - movdqu %xmm0, -60(%rdi) + lddqu -60(%rsi), %xmm0 + movdqu %xmm0, -60(%rdi) L(fwd_write_44bytes): - lddqu -44(%rsi), %xmm0 - movdqu %xmm0, -44(%rdi) + lddqu -44(%rsi), %xmm0 + movdqu %xmm0, -44(%rdi) L(fwd_write_28bytes): - lddqu -28(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -28(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -28(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -28(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_12bytes): - mov -12(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -12(%rdi) - mov %ecx, -4(%rdi) - ret + mov -12(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -12(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_139bytes): - lddqu -139(%rsi), %xmm0 - movdqu %xmm0, -139(%rdi) + lddqu -139(%rsi), %xmm0 
+ movdqu %xmm0, -139(%rdi) L(fwd_write_123bytes): - lddqu -123(%rsi), %xmm0 - movdqu %xmm0, -123(%rdi) + lddqu -123(%rsi), %xmm0 + movdqu %xmm0, -123(%rdi) L(fwd_write_107bytes): - lddqu -107(%rsi), %xmm0 - movdqu %xmm0, -107(%rdi) + lddqu -107(%rsi), %xmm0 + movdqu %xmm0, -107(%rdi) L(fwd_write_91bytes): - lddqu -91(%rsi), %xmm0 - movdqu %xmm0, -91(%rdi) + lddqu -91(%rsi), %xmm0 + movdqu %xmm0, -91(%rdi) L(fwd_write_75bytes): - lddqu -75(%rsi), %xmm0 - movdqu %xmm0, -75(%rdi) + lddqu -75(%rsi), %xmm0 + movdqu %xmm0, -75(%rdi) L(fwd_write_59bytes): - lddqu -59(%rsi), %xmm0 - movdqu %xmm0, -59(%rdi) + lddqu -59(%rsi), %xmm0 + movdqu %xmm0, -59(%rdi) L(fwd_write_43bytes): - lddqu -43(%rsi), %xmm0 - movdqu %xmm0, -43(%rdi) + lddqu -43(%rsi), %xmm0 + movdqu %xmm0, -43(%rdi) L(fwd_write_27bytes): - lddqu -27(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -27(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -27(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -27(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_11bytes): - mov -11(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -11(%rdi) - mov %ecx, -4(%rdi) - ret + mov -11(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -11(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_138bytes): - lddqu -138(%rsi), %xmm0 - movdqu %xmm0, -138(%rdi) + lddqu -138(%rsi), %xmm0 + movdqu %xmm0, -138(%rdi) L(fwd_write_122bytes): - lddqu -122(%rsi), %xmm0 - movdqu %xmm0, -122(%rdi) + lddqu -122(%rsi), %xmm0 + movdqu %xmm0, -122(%rdi) L(fwd_write_106bytes): - lddqu -106(%rsi), %xmm0 - movdqu %xmm0, -106(%rdi) + lddqu -106(%rsi), %xmm0 + movdqu %xmm0, -106(%rdi) L(fwd_write_90bytes): - lddqu -90(%rsi), %xmm0 - movdqu %xmm0, -90(%rdi) + lddqu -90(%rsi), %xmm0 + movdqu %xmm0, -90(%rdi) L(fwd_write_74bytes): - lddqu -74(%rsi), %xmm0 - movdqu %xmm0, -74(%rdi) + lddqu -74(%rsi), %xmm0 + movdqu %xmm0, -74(%rdi) L(fwd_write_58bytes): - lddqu -58(%rsi), %xmm0 - movdqu %xmm0, -58(%rdi) + lddqu -58(%rsi), %xmm0 + movdqu %xmm0, -58(%rdi) L(fwd_write_42bytes): - lddqu -42(%rsi), %xmm0 - movdqu %xmm0, -42(%rdi) + lddqu -42(%rsi), %xmm0 + movdqu %xmm0, -42(%rdi) L(fwd_write_26bytes): - lddqu -26(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -26(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -26(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -26(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_10bytes): - mov -10(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -10(%rdi) - mov %ecx, -4(%rdi) - ret + mov -10(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -10(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_137bytes): - lddqu -137(%rsi), %xmm0 - movdqu %xmm0, -137(%rdi) + lddqu -137(%rsi), %xmm0 + movdqu %xmm0, -137(%rdi) L(fwd_write_121bytes): - lddqu -121(%rsi), %xmm0 - movdqu %xmm0, -121(%rdi) + lddqu -121(%rsi), %xmm0 + movdqu %xmm0, -121(%rdi) L(fwd_write_105bytes): - lddqu -105(%rsi), %xmm0 - movdqu %xmm0, -105(%rdi) + lddqu -105(%rsi), %xmm0 + movdqu %xmm0, -105(%rdi) L(fwd_write_89bytes): - lddqu -89(%rsi), %xmm0 - movdqu %xmm0, -89(%rdi) + lddqu -89(%rsi), %xmm0 + movdqu %xmm0, -89(%rdi) L(fwd_write_73bytes): - lddqu -73(%rsi), %xmm0 - movdqu %xmm0, -73(%rdi) + lddqu -73(%rsi), %xmm0 + movdqu %xmm0, -73(%rdi) L(fwd_write_57bytes): - lddqu -57(%rsi), %xmm0 - movdqu %xmm0, -57(%rdi) + lddqu -57(%rsi), %xmm0 + movdqu %xmm0, -57(%rdi) L(fwd_write_41bytes): - lddqu -41(%rsi), %xmm0 - movdqu %xmm0, -41(%rdi) + lddqu -41(%rsi), %xmm0 + movdqu %xmm0, -41(%rdi) 
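The L(fwd_write_*bytes) labels above and below form a fall-through chain: each label copies one 16-byte block at a fixed negative offset from the end of the buffer and drops into the next, so a remainder of any size up to 143 bytes is handled without a loop, and sub-16-byte tails are finished with deliberately overlapping loads and stores. A minimal C++ sketch of the same idea (my own names, not the glibc code; the pointers are one past the end, as %rsi/%rdi are at dispatch time):

    #include <emmintrin.h>
    #include <cstring>
    #include <cstddef>
    #include <cstdint>

    /// Copy the trailing n bytes (0 <= n <= 143), mirroring the
    /// L(fwd_write_*bytes) jump-table targets.
    static void copy_tail_fwd(uint8_t * dst_end, const uint8_t * src_end, size_t n)
    {
        if (n >= 16)
        {
            /// One 16-byte block per fall-through label: -143, -127, ..., -31.
            while (n > 16)
            {
                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst_end - n),
                                 _mm_loadu_si128(reinterpret_cast<const __m128i *>(src_end - n)));
                n -= 16;
            }
            /// The final block may overlap the previous one, exactly like the
            /// lddqu -31(%rsi) / lddqu -16(%rsi) pair in L(fwd_write_31bytes).
            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst_end - 16),
                             _mm_loadu_si128(reinterpret_cast<const __m128i *>(src_end - 16)));
        }
        else if (n >= 8)
        {
            /// Overlapping 8-byte pair, cf. L(fwd_write_15bytes).
            memcpy(dst_end - n, src_end - n, 8);
            memcpy(dst_end - 8, src_end - 8, 8);
        }
        else if (n >= 4)
        {
            memcpy(dst_end - n, src_end - n, 4);
            memcpy(dst_end - 4, src_end - 4, 4);
        }
        else if (n >= 2)
        {
            memcpy(dst_end - n, src_end - n, 2);
            memcpy(dst_end - 2, src_end - 2, 2);
        }
        else if (n == 1)
            dst_end[-1] = src_end[-1];
    }

The overlap trick is why no target ever needs a byte loop: two unaligned blocks whose ranges intersect cover any in-between length, and the redundant writes are harmless because source and destination ranges were already checked for safe direction.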
L(fwd_write_25bytes): - lddqu -25(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -25(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -25(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -25(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_9bytes): - mov -9(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -9(%rdi) - mov %ecx, -4(%rdi) - ret + mov -9(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -9(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_136bytes): - lddqu -136(%rsi), %xmm0 - movdqu %xmm0, -136(%rdi) + lddqu -136(%rsi), %xmm0 + movdqu %xmm0, -136(%rdi) L(fwd_write_120bytes): - lddqu -120(%rsi), %xmm0 - movdqu %xmm0, -120(%rdi) + lddqu -120(%rsi), %xmm0 + movdqu %xmm0, -120(%rdi) L(fwd_write_104bytes): - lddqu -104(%rsi), %xmm0 - movdqu %xmm0, -104(%rdi) + lddqu -104(%rsi), %xmm0 + movdqu %xmm0, -104(%rdi) L(fwd_write_88bytes): - lddqu -88(%rsi), %xmm0 - movdqu %xmm0, -88(%rdi) + lddqu -88(%rsi), %xmm0 + movdqu %xmm0, -88(%rdi) L(fwd_write_72bytes): - lddqu -72(%rsi), %xmm0 - movdqu %xmm0, -72(%rdi) + lddqu -72(%rsi), %xmm0 + movdqu %xmm0, -72(%rdi) L(fwd_write_56bytes): - lddqu -56(%rsi), %xmm0 - movdqu %xmm0, -56(%rdi) + lddqu -56(%rsi), %xmm0 + movdqu %xmm0, -56(%rdi) L(fwd_write_40bytes): - lddqu -40(%rsi), %xmm0 - movdqu %xmm0, -40(%rdi) + lddqu -40(%rsi), %xmm0 + movdqu %xmm0, -40(%rdi) L(fwd_write_24bytes): - lddqu -24(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -24(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -24(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -24(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) - ret + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_135bytes): - lddqu -135(%rsi), %xmm0 - movdqu %xmm0, -135(%rdi) + lddqu -135(%rsi), %xmm0 + movdqu %xmm0, -135(%rdi) L(fwd_write_119bytes): - lddqu -119(%rsi), %xmm0 - movdqu %xmm0, -119(%rdi) + lddqu -119(%rsi), %xmm0 + movdqu %xmm0, -119(%rdi) L(fwd_write_103bytes): - lddqu -103(%rsi), %xmm0 - movdqu %xmm0, -103(%rdi) + lddqu -103(%rsi), %xmm0 + movdqu %xmm0, -103(%rdi) L(fwd_write_87bytes): - lddqu -87(%rsi), %xmm0 - movdqu %xmm0, -87(%rdi) + lddqu -87(%rsi), %xmm0 + movdqu %xmm0, -87(%rdi) L(fwd_write_71bytes): - lddqu -71(%rsi), %xmm0 - movdqu %xmm0, -71(%rdi) + lddqu -71(%rsi), %xmm0 + movdqu %xmm0, -71(%rdi) L(fwd_write_55bytes): - lddqu -55(%rsi), %xmm0 - movdqu %xmm0, -55(%rdi) + lddqu -55(%rsi), %xmm0 + movdqu %xmm0, -55(%rdi) L(fwd_write_39bytes): - lddqu -39(%rsi), %xmm0 - movdqu %xmm0, -39(%rdi) + lddqu -39(%rsi), %xmm0 + movdqu %xmm0, -39(%rdi) L(fwd_write_23bytes): - lddqu -23(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -23(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -23(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -23(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_134bytes): - lddqu -134(%rsi), %xmm0 - movdqu %xmm0, -134(%rdi) + lddqu -134(%rsi), %xmm0 + movdqu %xmm0, -134(%rdi) L(fwd_write_118bytes): - lddqu -118(%rsi), %xmm0 - movdqu %xmm0, -118(%rdi) + lddqu -118(%rsi), %xmm0 + movdqu %xmm0, -118(%rdi) L(fwd_write_102bytes): - lddqu -102(%rsi), %xmm0 - movdqu %xmm0, -102(%rdi) + lddqu -102(%rsi), %xmm0 + movdqu %xmm0, -102(%rdi) 
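For contrast with these small-size targets, the L(gobble_mem_fwd_loop) body earlier in this hunk is the large-copy path: once the remaining size exceeds half the shared cache (SHARED_CACHE_SIZE_HALF, or __x86_shared_cache_size_half at run time), it prefetches ahead, reads 128 bytes per iteration with unaligned loads, and writes them with movntdq non-temporal stores that bypass the cache, fencing once with sfence at the end. A minimal intrinsics sketch of that technique, assuming dst is already 16-byte aligned (which the asm establishes before entering the loop; the asm's per-iteration lfence between loads and stores is omitted here):

    #include <emmintrin.h>
    #include <cstring>
    #include <cstddef>
    #include <cstdint>

    /// Cache-bypassing forward copy for very large n; the sub-128-byte
    /// remainder is left to the small-size path (the jump table, in the asm).
    static void stream_copy_fwd(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t n)
    {
        while (n >= 128)
        {
            /// Prefetch well ahead of the loads, like prefetcht0 0x200/0x300(%rsi).
            _mm_prefetch(reinterpret_cast<const char *>(src + 0x200), _MM_HINT_T0);
            _mm_prefetch(reinterpret_cast<const char *>(src + 0x300), _MM_HINT_T0);
            for (size_t i = 0; i < 128; i += 16)
                _mm_stream_si128(reinterpret_cast<__m128i *>(dst + i),
                                 _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i)));
            src += 128;
            dst += 128;
            n -= 128;
        }
        _mm_sfence();   /// order the streaming stores, like the asm's sfence
        if (n)
            memcpy(dst, src, n);
    }

Skipping the cache on stores pays off only when the destination will not be re-read soon, which is why the asm gates this path on the copy size relative to the last-level cache and falls back to the prefetchnta/movdqa L(gobble_ll_loop_fwd) variant when source and destination are close enough to overlap in cache.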
L(fwd_write_86bytes): - lddqu -86(%rsi), %xmm0 - movdqu %xmm0, -86(%rdi) + lddqu -86(%rsi), %xmm0 + movdqu %xmm0, -86(%rdi) L(fwd_write_70bytes): - lddqu -70(%rsi), %xmm0 - movdqu %xmm0, -70(%rdi) + lddqu -70(%rsi), %xmm0 + movdqu %xmm0, -70(%rdi) L(fwd_write_54bytes): - lddqu -54(%rsi), %xmm0 - movdqu %xmm0, -54(%rdi) + lddqu -54(%rsi), %xmm0 + movdqu %xmm0, -54(%rdi) L(fwd_write_38bytes): - lddqu -38(%rsi), %xmm0 - movdqu %xmm0, -38(%rdi) + lddqu -38(%rsi), %xmm0 + movdqu %xmm0, -38(%rdi) L(fwd_write_22bytes): - lddqu -22(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -22(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -22(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -22(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_133bytes): - lddqu -133(%rsi), %xmm0 - movdqu %xmm0, -133(%rdi) + lddqu -133(%rsi), %xmm0 + movdqu %xmm0, -133(%rdi) L(fwd_write_117bytes): - lddqu -117(%rsi), %xmm0 - movdqu %xmm0, -117(%rdi) + lddqu -117(%rsi), %xmm0 + movdqu %xmm0, -117(%rdi) L(fwd_write_101bytes): - lddqu -101(%rsi), %xmm0 - movdqu %xmm0, -101(%rdi) + lddqu -101(%rsi), %xmm0 + movdqu %xmm0, -101(%rdi) L(fwd_write_85bytes): - lddqu -85(%rsi), %xmm0 - movdqu %xmm0, -85(%rdi) + lddqu -85(%rsi), %xmm0 + movdqu %xmm0, -85(%rdi) L(fwd_write_69bytes): - lddqu -69(%rsi), %xmm0 - movdqu %xmm0, -69(%rdi) + lddqu -69(%rsi), %xmm0 + movdqu %xmm0, -69(%rdi) L(fwd_write_53bytes): - lddqu -53(%rsi), %xmm0 - movdqu %xmm0, -53(%rdi) + lddqu -53(%rsi), %xmm0 + movdqu %xmm0, -53(%rdi) L(fwd_write_37bytes): - lddqu -37(%rsi), %xmm0 - movdqu %xmm0, -37(%rdi) + lddqu -37(%rsi), %xmm0 + movdqu %xmm0, -37(%rdi) L(fwd_write_21bytes): - lddqu -21(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -21(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -21(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -21(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_132bytes): - lddqu -132(%rsi), %xmm0 - movdqu %xmm0, -132(%rdi) + lddqu -132(%rsi), %xmm0 + movdqu %xmm0, -132(%rdi) L(fwd_write_116bytes): - lddqu -116(%rsi), %xmm0 - movdqu %xmm0, -116(%rdi) + lddqu -116(%rsi), %xmm0 + movdqu %xmm0, -116(%rdi) L(fwd_write_100bytes): - lddqu -100(%rsi), %xmm0 - movdqu %xmm0, -100(%rdi) + lddqu -100(%rsi), %xmm0 + movdqu %xmm0, -100(%rdi) L(fwd_write_84bytes): - lddqu -84(%rsi), %xmm0 - movdqu %xmm0, -84(%rdi) + lddqu -84(%rsi), %xmm0 + movdqu %xmm0, -84(%rdi) L(fwd_write_68bytes): - lddqu -68(%rsi), %xmm0 - movdqu %xmm0, -68(%rdi) + lddqu -68(%rsi), %xmm0 + movdqu %xmm0, -68(%rdi) L(fwd_write_52bytes): - lddqu -52(%rsi), %xmm0 - movdqu %xmm0, -52(%rdi) + lddqu -52(%rsi), %xmm0 + movdqu %xmm0, -52(%rdi) L(fwd_write_36bytes): - lddqu -36(%rsi), %xmm0 - movdqu %xmm0, -36(%rdi) + lddqu -36(%rsi), %xmm0 + movdqu %xmm0, -36(%rdi) L(fwd_write_20bytes): - lddqu -20(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -20(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -20(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -20(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_4bytes): - mov -4(%rsi), %edx - 
mov %edx, -4(%rdi) - ret + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_131bytes): - lddqu -131(%rsi), %xmm0 - movdqu %xmm0, -131(%rdi) + lddqu -131(%rsi), %xmm0 + movdqu %xmm0, -131(%rdi) L(fwd_write_115bytes): - lddqu -115(%rsi), %xmm0 - movdqu %xmm0, -115(%rdi) + lddqu -115(%rsi), %xmm0 + movdqu %xmm0, -115(%rdi) L(fwd_write_99bytes): - lddqu -99(%rsi), %xmm0 - movdqu %xmm0, -99(%rdi) + lddqu -99(%rsi), %xmm0 + movdqu %xmm0, -99(%rdi) L(fwd_write_83bytes): - lddqu -83(%rsi), %xmm0 - movdqu %xmm0, -83(%rdi) + lddqu -83(%rsi), %xmm0 + movdqu %xmm0, -83(%rdi) L(fwd_write_67bytes): - lddqu -67(%rsi), %xmm0 - movdqu %xmm0, -67(%rdi) + lddqu -67(%rsi), %xmm0 + movdqu %xmm0, -67(%rdi) L(fwd_write_51bytes): - lddqu -51(%rsi), %xmm0 - movdqu %xmm0, -51(%rdi) + lddqu -51(%rsi), %xmm0 + movdqu %xmm0, -51(%rdi) L(fwd_write_35bytes): - lddqu -35(%rsi), %xmm0 - movdqu %xmm0, -35(%rdi) + lddqu -35(%rsi), %xmm0 + movdqu %xmm0, -35(%rdi) L(fwd_write_19bytes): - lddqu -19(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -19(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -19(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -19(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_130bytes): - lddqu -130(%rsi), %xmm0 - movdqu %xmm0, -130(%rdi) + lddqu -130(%rsi), %xmm0 + movdqu %xmm0, -130(%rdi) L(fwd_write_114bytes): - lddqu -114(%rsi), %xmm0 - movdqu %xmm0, -114(%rdi) + lddqu -114(%rsi), %xmm0 + movdqu %xmm0, -114(%rdi) L(fwd_write_98bytes): - lddqu -98(%rsi), %xmm0 - movdqu %xmm0, -98(%rdi) + lddqu -98(%rsi), %xmm0 + movdqu %xmm0, -98(%rdi) L(fwd_write_82bytes): - lddqu -82(%rsi), %xmm0 - movdqu %xmm0, -82(%rdi) + lddqu -82(%rsi), %xmm0 + movdqu %xmm0, -82(%rdi) L(fwd_write_66bytes): - lddqu -66(%rsi), %xmm0 - movdqu %xmm0, -66(%rdi) + lddqu -66(%rsi), %xmm0 + movdqu %xmm0, -66(%rdi) L(fwd_write_50bytes): - lddqu -50(%rsi), %xmm0 - movdqu %xmm0, -50(%rdi) + lddqu -50(%rsi), %xmm0 + movdqu %xmm0, -50(%rdi) L(fwd_write_34bytes): - lddqu -34(%rsi), %xmm0 - movdqu %xmm0, -34(%rdi) + lddqu -34(%rsi), %xmm0 + movdqu %xmm0, -34(%rdi) L(fwd_write_18bytes): - lddqu -18(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -18(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -18(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -18(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_2bytes): - movzwl -2(%rsi), %edx - mov %dx, -2(%rdi) - ret + movzwl -2(%rsi), %edx + mov %dx, -2(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_129bytes): - lddqu -129(%rsi), %xmm0 - movdqu %xmm0, -129(%rdi) + lddqu -129(%rsi), %xmm0 + movdqu %xmm0, -129(%rdi) L(fwd_write_113bytes): - lddqu -113(%rsi), %xmm0 - movdqu %xmm0, -113(%rdi) + lddqu -113(%rsi), %xmm0 + movdqu %xmm0, -113(%rdi) L(fwd_write_97bytes): - lddqu -97(%rsi), %xmm0 - movdqu %xmm0, -97(%rdi) + lddqu -97(%rsi), %xmm0 + movdqu %xmm0, -97(%rdi) L(fwd_write_81bytes): - lddqu -81(%rsi), %xmm0 - movdqu %xmm0, -81(%rdi) + lddqu -81(%rsi), %xmm0 + movdqu %xmm0, -81(%rdi) L(fwd_write_65bytes): - lddqu -65(%rsi), %xmm0 - movdqu %xmm0, -65(%rdi) + lddqu -65(%rsi), %xmm0 + movdqu %xmm0, -65(%rdi) L(fwd_write_49bytes): - lddqu -49(%rsi), %xmm0 - movdqu %xmm0, -49(%rdi) + lddqu -49(%rsi), %xmm0 + movdqu %xmm0, -49(%rdi) L(fwd_write_33bytes): - lddqu -33(%rsi), %xmm0 - 
movdqu %xmm0, -33(%rdi) + lddqu -33(%rsi), %xmm0 + movdqu %xmm0, -33(%rdi) L(fwd_write_17bytes): - lddqu -17(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -17(%rdi) - movdqu %xmm1, -16(%rdi) - ret + lddqu -17(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -17(%rdi) + movdqu %xmm1, -16(%rdi) + ret - .p2align 4 + .p2align 4 L(fwd_write_1bytes): - movzbl -1(%rsi), %edx - mov %dl, -1(%rdi) - ret + movzbl -1(%rsi), %edx + mov %dl, -1(%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_128bytes): - lddqu 112(%rsi), %xmm0 - movdqu %xmm0, 112(%rdi) + lddqu 112(%rsi), %xmm0 + movdqu %xmm0, 112(%rdi) L(bwd_write_112bytes): - lddqu 96(%rsi), %xmm0 - movdqu %xmm0, 96(%rdi) + lddqu 96(%rsi), %xmm0 + movdqu %xmm0, 96(%rdi) L(bwd_write_96bytes): - lddqu 80(%rsi), %xmm0 - movdqu %xmm0, 80(%rdi) + lddqu 80(%rsi), %xmm0 + movdqu %xmm0, 80(%rdi) L(bwd_write_80bytes): - lddqu 64(%rsi), %xmm0 - movdqu %xmm0, 64(%rdi) + lddqu 64(%rsi), %xmm0 + movdqu %xmm0, 64(%rdi) L(bwd_write_64bytes): - lddqu 48(%rsi), %xmm0 - movdqu %xmm0, 48(%rdi) + lddqu 48(%rsi), %xmm0 + movdqu %xmm0, 48(%rdi) L(bwd_write_48bytes): - lddqu 32(%rsi), %xmm0 - movdqu %xmm0, 32(%rdi) + lddqu 32(%rsi), %xmm0 + movdqu %xmm0, 32(%rdi) L(bwd_write_32bytes): - lddqu 16(%rsi), %xmm0 - movdqu %xmm0, 16(%rdi) + lddqu 16(%rsi), %xmm0 + movdqu %xmm0, 16(%rdi) L(bwd_write_16bytes): - lddqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) + lddqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) L(bwd_write_0bytes): - ret + ret - .p2align 4 + .p2align 4 L(bwd_write_143bytes): - lddqu 127(%rsi), %xmm0 - movdqu %xmm0, 127(%rdi) + lddqu 127(%rsi), %xmm0 + movdqu %xmm0, 127(%rdi) L(bwd_write_127bytes): - lddqu 111(%rsi), %xmm0 - movdqu %xmm0, 111(%rdi) + lddqu 111(%rsi), %xmm0 + movdqu %xmm0, 111(%rdi) L(bwd_write_111bytes): - lddqu 95(%rsi), %xmm0 - movdqu %xmm0, 95(%rdi) + lddqu 95(%rsi), %xmm0 + movdqu %xmm0, 95(%rdi) L(bwd_write_95bytes): - lddqu 79(%rsi), %xmm0 - movdqu %xmm0, 79(%rdi) + lddqu 79(%rsi), %xmm0 + movdqu %xmm0, 79(%rdi) L(bwd_write_79bytes): - lddqu 63(%rsi), %xmm0 - movdqu %xmm0, 63(%rdi) + lddqu 63(%rsi), %xmm0 + movdqu %xmm0, 63(%rdi) L(bwd_write_63bytes): - lddqu 47(%rsi), %xmm0 - movdqu %xmm0, 47(%rdi) + lddqu 47(%rsi), %xmm0 + movdqu %xmm0, 47(%rdi) L(bwd_write_47bytes): - lddqu 31(%rsi), %xmm0 - movdqu %xmm0, 31(%rdi) + lddqu 31(%rsi), %xmm0 + movdqu %xmm0, 31(%rdi) L(bwd_write_31bytes): - lddqu 15(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 15(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 15(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 15(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_15bytes): - mov 7(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 7(%rdi) - mov %rcx, (%rdi) - ret + mov 7(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 7(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_142bytes): - lddqu 126(%rsi), %xmm0 - movdqu %xmm0, 126(%rdi) + lddqu 126(%rsi), %xmm0 + movdqu %xmm0, 126(%rdi) L(bwd_write_126bytes): - lddqu 110(%rsi), %xmm0 - movdqu %xmm0, 110(%rdi) + lddqu 110(%rsi), %xmm0 + movdqu %xmm0, 110(%rdi) L(bwd_write_110bytes): - lddqu 94(%rsi), %xmm0 - movdqu %xmm0, 94(%rdi) + lddqu 94(%rsi), %xmm0 + movdqu %xmm0, 94(%rdi) L(bwd_write_94bytes): - lddqu 78(%rsi), %xmm0 - movdqu %xmm0, 78(%rdi) + lddqu 78(%rsi), %xmm0 + movdqu %xmm0, 78(%rdi) L(bwd_write_78bytes): - lddqu 62(%rsi), %xmm0 - movdqu %xmm0, 62(%rdi) + lddqu 62(%rsi), %xmm0 + movdqu %xmm0, 62(%rdi) L(bwd_write_62bytes): - lddqu 46(%rsi), %xmm0 - movdqu %xmm0, 46(%rdi) + lddqu 46(%rsi), %xmm0 + movdqu %xmm0, 46(%rdi) 
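The L(bwd_write_*bytes) family beginning above is the mirror image for backward copies: after walking down from the end, the leftover bytes sit at the front of the buffer, so the same overlapping 16-byte blocks are addressed at positive offsets (lddqu 112(%rsi) down to lddqu (%rsi)) instead of negative ones. Dispatch into either family is position-independent: each table slot assembled by JMPTBL holds a 32-bit distance from the table to the target rather than an absolute address (an 8-byte address would not fit in .int), and BRANCH_TO_JMPTBL_ENTRY rebuilds the absolute target at run time. Roughly, as a hypothetical GNU C rendering rather than the real macro expansion:

    /// table[n] == (int)((char *) &&target_n - (char *) table)
    void * target = (char *) table + table[n];   /// n = remaining byte count
    goto *target;   /// GNU C "labels as values" computed goto

Keeping the entries relative lets the .rodata.ssse3 tables below work unchanged under ASLR and in shared objects, at the cost of one extra add per dispatch.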
L(bwd_write_46bytes): - lddqu 30(%rsi), %xmm0 - movdqu %xmm0, 30(%rdi) + lddqu 30(%rsi), %xmm0 + movdqu %xmm0, 30(%rdi) L(bwd_write_30bytes): - lddqu 14(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 14(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 14(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 14(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_14bytes): - mov 6(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 6(%rdi) - mov %rcx, (%rdi) - ret + mov 6(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 6(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_141bytes): - lddqu 125(%rsi), %xmm0 - movdqu %xmm0, 125(%rdi) + lddqu 125(%rsi), %xmm0 + movdqu %xmm0, 125(%rdi) L(bwd_write_125bytes): - lddqu 109(%rsi), %xmm0 - movdqu %xmm0, 109(%rdi) + lddqu 109(%rsi), %xmm0 + movdqu %xmm0, 109(%rdi) L(bwd_write_109bytes): - lddqu 93(%rsi), %xmm0 - movdqu %xmm0, 93(%rdi) + lddqu 93(%rsi), %xmm0 + movdqu %xmm0, 93(%rdi) L(bwd_write_93bytes): - lddqu 77(%rsi), %xmm0 - movdqu %xmm0, 77(%rdi) + lddqu 77(%rsi), %xmm0 + movdqu %xmm0, 77(%rdi) L(bwd_write_77bytes): - lddqu 61(%rsi), %xmm0 - movdqu %xmm0, 61(%rdi) + lddqu 61(%rsi), %xmm0 + movdqu %xmm0, 61(%rdi) L(bwd_write_61bytes): - lddqu 45(%rsi), %xmm0 - movdqu %xmm0, 45(%rdi) + lddqu 45(%rsi), %xmm0 + movdqu %xmm0, 45(%rdi) L(bwd_write_45bytes): - lddqu 29(%rsi), %xmm0 - movdqu %xmm0, 29(%rdi) + lddqu 29(%rsi), %xmm0 + movdqu %xmm0, 29(%rdi) L(bwd_write_29bytes): - lddqu 13(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 13(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 13(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 13(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_13bytes): - mov 5(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 5(%rdi) - mov %rcx, (%rdi) - ret + mov 5(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 5(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_140bytes): - lddqu 124(%rsi), %xmm0 - movdqu %xmm0, 124(%rdi) + lddqu 124(%rsi), %xmm0 + movdqu %xmm0, 124(%rdi) L(bwd_write_124bytes): - lddqu 108(%rsi), %xmm0 - movdqu %xmm0, 108(%rdi) + lddqu 108(%rsi), %xmm0 + movdqu %xmm0, 108(%rdi) L(bwd_write_108bytes): - lddqu 92(%rsi), %xmm0 - movdqu %xmm0, 92(%rdi) + lddqu 92(%rsi), %xmm0 + movdqu %xmm0, 92(%rdi) L(bwd_write_92bytes): - lddqu 76(%rsi), %xmm0 - movdqu %xmm0, 76(%rdi) + lddqu 76(%rsi), %xmm0 + movdqu %xmm0, 76(%rdi) L(bwd_write_76bytes): - lddqu 60(%rsi), %xmm0 - movdqu %xmm0, 60(%rdi) + lddqu 60(%rsi), %xmm0 + movdqu %xmm0, 60(%rdi) L(bwd_write_60bytes): - lddqu 44(%rsi), %xmm0 - movdqu %xmm0, 44(%rdi) + lddqu 44(%rsi), %xmm0 + movdqu %xmm0, 44(%rdi) L(bwd_write_44bytes): - lddqu 28(%rsi), %xmm0 - movdqu %xmm0, 28(%rdi) + lddqu 28(%rsi), %xmm0 + movdqu %xmm0, 28(%rdi) L(bwd_write_28bytes): - lddqu 12(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 12(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 12(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 12(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_12bytes): - mov 4(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 4(%rdi) - mov %rcx, (%rdi) - ret + mov 4(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 4(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_139bytes): - lddqu 123(%rsi), %xmm0 - movdqu %xmm0, 123(%rdi) + lddqu 123(%rsi), %xmm0 + movdqu %xmm0, 123(%rdi) L(bwd_write_123bytes): - lddqu 107(%rsi), %xmm0 - movdqu %xmm0, 107(%rdi) + lddqu 107(%rsi), %xmm0 + movdqu %xmm0, 107(%rdi) L(bwd_write_107bytes): - lddqu 91(%rsi), %xmm0 - movdqu %xmm0, 91(%rdi) + lddqu 91(%rsi), 
%xmm0 + movdqu %xmm0, 91(%rdi) L(bwd_write_91bytes): - lddqu 75(%rsi), %xmm0 - movdqu %xmm0, 75(%rdi) + lddqu 75(%rsi), %xmm0 + movdqu %xmm0, 75(%rdi) L(bwd_write_75bytes): - lddqu 59(%rsi), %xmm0 - movdqu %xmm0, 59(%rdi) + lddqu 59(%rsi), %xmm0 + movdqu %xmm0, 59(%rdi) L(bwd_write_59bytes): - lddqu 43(%rsi), %xmm0 - movdqu %xmm0, 43(%rdi) + lddqu 43(%rsi), %xmm0 + movdqu %xmm0, 43(%rdi) L(bwd_write_43bytes): - lddqu 27(%rsi), %xmm0 - movdqu %xmm0, 27(%rdi) + lddqu 27(%rsi), %xmm0 + movdqu %xmm0, 27(%rdi) L(bwd_write_27bytes): - lddqu 11(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 11(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 11(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 11(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_11bytes): - mov 3(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 3(%rdi) - mov %rcx, (%rdi) - ret + mov 3(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 3(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_138bytes): - lddqu 122(%rsi), %xmm0 - movdqu %xmm0, 122(%rdi) + lddqu 122(%rsi), %xmm0 + movdqu %xmm0, 122(%rdi) L(bwd_write_122bytes): - lddqu 106(%rsi), %xmm0 - movdqu %xmm0, 106(%rdi) + lddqu 106(%rsi), %xmm0 + movdqu %xmm0, 106(%rdi) L(bwd_write_106bytes): - lddqu 90(%rsi), %xmm0 - movdqu %xmm0, 90(%rdi) + lddqu 90(%rsi), %xmm0 + movdqu %xmm0, 90(%rdi) L(bwd_write_90bytes): - lddqu 74(%rsi), %xmm0 - movdqu %xmm0, 74(%rdi) + lddqu 74(%rsi), %xmm0 + movdqu %xmm0, 74(%rdi) L(bwd_write_74bytes): - lddqu 58(%rsi), %xmm0 - movdqu %xmm0, 58(%rdi) + lddqu 58(%rsi), %xmm0 + movdqu %xmm0, 58(%rdi) L(bwd_write_58bytes): - lddqu 42(%rsi), %xmm0 - movdqu %xmm0, 42(%rdi) + lddqu 42(%rsi), %xmm0 + movdqu %xmm0, 42(%rdi) L(bwd_write_42bytes): - lddqu 26(%rsi), %xmm0 - movdqu %xmm0, 26(%rdi) + lddqu 26(%rsi), %xmm0 + movdqu %xmm0, 26(%rdi) L(bwd_write_26bytes): - lddqu 10(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 10(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 10(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 10(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_10bytes): - mov 2(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 2(%rdi) - mov %rcx, (%rdi) - ret + mov 2(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 2(%rdi) + mov %rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_137bytes): - lddqu 121(%rsi), %xmm0 - movdqu %xmm0, 121(%rdi) + lddqu 121(%rsi), %xmm0 + movdqu %xmm0, 121(%rdi) L(bwd_write_121bytes): - lddqu 105(%rsi), %xmm0 - movdqu %xmm0, 105(%rdi) + lddqu 105(%rsi), %xmm0 + movdqu %xmm0, 105(%rdi) L(bwd_write_105bytes): - lddqu 89(%rsi), %xmm0 - movdqu %xmm0, 89(%rdi) + lddqu 89(%rsi), %xmm0 + movdqu %xmm0, 89(%rdi) L(bwd_write_89bytes): - lddqu 73(%rsi), %xmm0 - movdqu %xmm0, 73(%rdi) + lddqu 73(%rsi), %xmm0 + movdqu %xmm0, 73(%rdi) L(bwd_write_73bytes): - lddqu 57(%rsi), %xmm0 - movdqu %xmm0, 57(%rdi) + lddqu 57(%rsi), %xmm0 + movdqu %xmm0, 57(%rdi) L(bwd_write_57bytes): - lddqu 41(%rsi), %xmm0 - movdqu %xmm0, 41(%rdi) + lddqu 41(%rsi), %xmm0 + movdqu %xmm0, 41(%rdi) L(bwd_write_41bytes): - lddqu 25(%rsi), %xmm0 - movdqu %xmm0, 25(%rdi) + lddqu 25(%rsi), %xmm0 + movdqu %xmm0, 25(%rdi) L(bwd_write_25bytes): - lddqu 9(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 9(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 9(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 9(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_9bytes): - mov 1(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 1(%rdi) - mov %rcx, (%rdi) - ret + mov 1(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 1(%rdi) + mov 
%rcx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_136bytes): - lddqu 120(%rsi), %xmm0 - movdqu %xmm0, 120(%rdi) + lddqu 120(%rsi), %xmm0 + movdqu %xmm0, 120(%rdi) L(bwd_write_120bytes): - lddqu 104(%rsi), %xmm0 - movdqu %xmm0, 104(%rdi) + lddqu 104(%rsi), %xmm0 + movdqu %xmm0, 104(%rdi) L(bwd_write_104bytes): - lddqu 88(%rsi), %xmm0 - movdqu %xmm0, 88(%rdi) + lddqu 88(%rsi), %xmm0 + movdqu %xmm0, 88(%rdi) L(bwd_write_88bytes): - lddqu 72(%rsi), %xmm0 - movdqu %xmm0, 72(%rdi) + lddqu 72(%rsi), %xmm0 + movdqu %xmm0, 72(%rdi) L(bwd_write_72bytes): - lddqu 56(%rsi), %xmm0 - movdqu %xmm0, 56(%rdi) + lddqu 56(%rsi), %xmm0 + movdqu %xmm0, 56(%rdi) L(bwd_write_56bytes): - lddqu 40(%rsi), %xmm0 - movdqu %xmm0, 40(%rdi) + lddqu 40(%rsi), %xmm0 + movdqu %xmm0, 40(%rdi) L(bwd_write_40bytes): - lddqu 24(%rsi), %xmm0 - movdqu %xmm0, 24(%rdi) + lddqu 24(%rsi), %xmm0 + movdqu %xmm0, 24(%rdi) L(bwd_write_24bytes): - lddqu 8(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 8(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 8(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 8(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_8bytes): - mov (%rsi), %rdx - mov %rdx, (%rdi) - ret + mov (%rsi), %rdx + mov %rdx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_135bytes): - lddqu 119(%rsi), %xmm0 - movdqu %xmm0, 119(%rdi) + lddqu 119(%rsi), %xmm0 + movdqu %xmm0, 119(%rdi) L(bwd_write_119bytes): - lddqu 103(%rsi), %xmm0 - movdqu %xmm0, 103(%rdi) + lddqu 103(%rsi), %xmm0 + movdqu %xmm0, 103(%rdi) L(bwd_write_103bytes): - lddqu 87(%rsi), %xmm0 - movdqu %xmm0, 87(%rdi) + lddqu 87(%rsi), %xmm0 + movdqu %xmm0, 87(%rdi) L(bwd_write_87bytes): - lddqu 71(%rsi), %xmm0 - movdqu %xmm0, 71(%rdi) + lddqu 71(%rsi), %xmm0 + movdqu %xmm0, 71(%rdi) L(bwd_write_71bytes): - lddqu 55(%rsi), %xmm0 - movdqu %xmm0, 55(%rdi) + lddqu 55(%rsi), %xmm0 + movdqu %xmm0, 55(%rdi) L(bwd_write_55bytes): - lddqu 39(%rsi), %xmm0 - movdqu %xmm0, 39(%rdi) + lddqu 39(%rsi), %xmm0 + movdqu %xmm0, 39(%rdi) L(bwd_write_39bytes): - lddqu 23(%rsi), %xmm0 - movdqu %xmm0, 23(%rdi) + lddqu 23(%rsi), %xmm0 + movdqu %xmm0, 23(%rdi) L(bwd_write_23bytes): - lddqu 7(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 7(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 7(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 7(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_7bytes): - mov 3(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 3(%rdi) - mov %ecx, (%rdi) - ret + mov 3(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 3(%rdi) + mov %ecx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_134bytes): - lddqu 118(%rsi), %xmm0 - movdqu %xmm0, 118(%rdi) + lddqu 118(%rsi), %xmm0 + movdqu %xmm0, 118(%rdi) L(bwd_write_118bytes): - lddqu 102(%rsi), %xmm0 - movdqu %xmm0, 102(%rdi) + lddqu 102(%rsi), %xmm0 + movdqu %xmm0, 102(%rdi) L(bwd_write_102bytes): - lddqu 86(%rsi), %xmm0 - movdqu %xmm0, 86(%rdi) + lddqu 86(%rsi), %xmm0 + movdqu %xmm0, 86(%rdi) L(bwd_write_86bytes): - lddqu 70(%rsi), %xmm0 - movdqu %xmm0, 70(%rdi) + lddqu 70(%rsi), %xmm0 + movdqu %xmm0, 70(%rdi) L(bwd_write_70bytes): - lddqu 54(%rsi), %xmm0 - movdqu %xmm0, 54(%rdi) + lddqu 54(%rsi), %xmm0 + movdqu %xmm0, 54(%rdi) L(bwd_write_54bytes): - lddqu 38(%rsi), %xmm0 - movdqu %xmm0, 38(%rdi) + lddqu 38(%rsi), %xmm0 + movdqu %xmm0, 38(%rdi) L(bwd_write_38bytes): - lddqu 22(%rsi), %xmm0 - movdqu %xmm0, 22(%rdi) + lddqu 22(%rsi), %xmm0 + movdqu %xmm0, 22(%rdi) L(bwd_write_22bytes): - lddqu 6(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 6(%rdi) - movdqu %xmm1, (%rdi) - 
ret + lddqu 6(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 6(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_6bytes): - mov 2(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 2(%rdi) - mov %ecx, (%rdi) - ret + mov 2(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 2(%rdi) + mov %ecx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_133bytes): - lddqu 117(%rsi), %xmm0 - movdqu %xmm0, 117(%rdi) + lddqu 117(%rsi), %xmm0 + movdqu %xmm0, 117(%rdi) L(bwd_write_117bytes): - lddqu 101(%rsi), %xmm0 - movdqu %xmm0, 101(%rdi) + lddqu 101(%rsi), %xmm0 + movdqu %xmm0, 101(%rdi) L(bwd_write_101bytes): - lddqu 85(%rsi), %xmm0 - movdqu %xmm0, 85(%rdi) + lddqu 85(%rsi), %xmm0 + movdqu %xmm0, 85(%rdi) L(bwd_write_85bytes): - lddqu 69(%rsi), %xmm0 - movdqu %xmm0, 69(%rdi) + lddqu 69(%rsi), %xmm0 + movdqu %xmm0, 69(%rdi) L(bwd_write_69bytes): - lddqu 53(%rsi), %xmm0 - movdqu %xmm0, 53(%rdi) + lddqu 53(%rsi), %xmm0 + movdqu %xmm0, 53(%rdi) L(bwd_write_53bytes): - lddqu 37(%rsi), %xmm0 - movdqu %xmm0, 37(%rdi) + lddqu 37(%rsi), %xmm0 + movdqu %xmm0, 37(%rdi) L(bwd_write_37bytes): - lddqu 21(%rsi), %xmm0 - movdqu %xmm0, 21(%rdi) + lddqu 21(%rsi), %xmm0 + movdqu %xmm0, 21(%rdi) L(bwd_write_21bytes): - lddqu 5(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 5(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 5(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 5(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_5bytes): - mov 1(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 1(%rdi) - mov %ecx, (%rdi) - ret + mov 1(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 1(%rdi) + mov %ecx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_132bytes): - lddqu 116(%rsi), %xmm0 - movdqu %xmm0, 116(%rdi) + lddqu 116(%rsi), %xmm0 + movdqu %xmm0, 116(%rdi) L(bwd_write_116bytes): - lddqu 100(%rsi), %xmm0 - movdqu %xmm0, 100(%rdi) + lddqu 100(%rsi), %xmm0 + movdqu %xmm0, 100(%rdi) L(bwd_write_100bytes): - lddqu 84(%rsi), %xmm0 - movdqu %xmm0, 84(%rdi) + lddqu 84(%rsi), %xmm0 + movdqu %xmm0, 84(%rdi) L(bwd_write_84bytes): - lddqu 68(%rsi), %xmm0 - movdqu %xmm0, 68(%rdi) + lddqu 68(%rsi), %xmm0 + movdqu %xmm0, 68(%rdi) L(bwd_write_68bytes): - lddqu 52(%rsi), %xmm0 - movdqu %xmm0, 52(%rdi) + lddqu 52(%rsi), %xmm0 + movdqu %xmm0, 52(%rdi) L(bwd_write_52bytes): - lddqu 36(%rsi), %xmm0 - movdqu %xmm0, 36(%rdi) + lddqu 36(%rsi), %xmm0 + movdqu %xmm0, 36(%rdi) L(bwd_write_36bytes): - lddqu 20(%rsi), %xmm0 - movdqu %xmm0, 20(%rdi) + lddqu 20(%rsi), %xmm0 + movdqu %xmm0, 20(%rdi) L(bwd_write_20bytes): - lddqu 4(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 4(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 4(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 4(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_4bytes): - mov (%rsi), %edx - mov %edx, (%rdi) - ret + mov (%rsi), %edx + mov %edx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_131bytes): - lddqu 115(%rsi), %xmm0 - movdqu %xmm0, 115(%rdi) + lddqu 115(%rsi), %xmm0 + movdqu %xmm0, 115(%rdi) L(bwd_write_115bytes): - lddqu 99(%rsi), %xmm0 - movdqu %xmm0, 99(%rdi) + lddqu 99(%rsi), %xmm0 + movdqu %xmm0, 99(%rdi) L(bwd_write_99bytes): - lddqu 83(%rsi), %xmm0 - movdqu %xmm0, 83(%rdi) + lddqu 83(%rsi), %xmm0 + movdqu %xmm0, 83(%rdi) L(bwd_write_83bytes): - lddqu 67(%rsi), %xmm0 - movdqu %xmm0, 67(%rdi) + lddqu 67(%rsi), %xmm0 + movdqu %xmm0, 67(%rdi) L(bwd_write_67bytes): - lddqu 51(%rsi), %xmm0 - movdqu %xmm0, 51(%rdi) + lddqu 51(%rsi), %xmm0 + movdqu %xmm0, 51(%rdi) L(bwd_write_51bytes): - lddqu 35(%rsi), %xmm0 - movdqu %xmm0, 
35(%rdi) + lddqu 35(%rsi), %xmm0 + movdqu %xmm0, 35(%rdi) L(bwd_write_35bytes): - lddqu 19(%rsi), %xmm0 - movdqu %xmm0, 19(%rdi) + lddqu 19(%rsi), %xmm0 + movdqu %xmm0, 19(%rdi) L(bwd_write_19bytes): - lddqu 3(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 3(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 3(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 3(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_3bytes): - mov 1(%rsi), %dx - mov (%rsi), %cx - mov %dx, 1(%rdi) - mov %cx, (%rdi) - ret + mov 1(%rsi), %dx + mov (%rsi), %cx + mov %dx, 1(%rdi) + mov %cx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_130bytes): - lddqu 114(%rsi), %xmm0 - movdqu %xmm0, 114(%rdi) + lddqu 114(%rsi), %xmm0 + movdqu %xmm0, 114(%rdi) L(bwd_write_114bytes): - lddqu 98(%rsi), %xmm0 - movdqu %xmm0, 98(%rdi) + lddqu 98(%rsi), %xmm0 + movdqu %xmm0, 98(%rdi) L(bwd_write_98bytes): - lddqu 82(%rsi), %xmm0 - movdqu %xmm0, 82(%rdi) + lddqu 82(%rsi), %xmm0 + movdqu %xmm0, 82(%rdi) L(bwd_write_82bytes): - lddqu 66(%rsi), %xmm0 - movdqu %xmm0, 66(%rdi) + lddqu 66(%rsi), %xmm0 + movdqu %xmm0, 66(%rdi) L(bwd_write_66bytes): - lddqu 50(%rsi), %xmm0 - movdqu %xmm0, 50(%rdi) + lddqu 50(%rsi), %xmm0 + movdqu %xmm0, 50(%rdi) L(bwd_write_50bytes): - lddqu 34(%rsi), %xmm0 - movdqu %xmm0, 34(%rdi) + lddqu 34(%rsi), %xmm0 + movdqu %xmm0, 34(%rdi) L(bwd_write_34bytes): - lddqu 18(%rsi), %xmm0 - movdqu %xmm0, 18(%rdi) + lddqu 18(%rsi), %xmm0 + movdqu %xmm0, 18(%rdi) L(bwd_write_18bytes): - lddqu 2(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 2(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 2(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 2(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_2bytes): - movzwl (%rsi), %edx - mov %dx, (%rdi) - ret + movzwl (%rsi), %edx + mov %dx, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_129bytes): - lddqu 113(%rsi), %xmm0 - movdqu %xmm0, 113(%rdi) + lddqu 113(%rsi), %xmm0 + movdqu %xmm0, 113(%rdi) L(bwd_write_113bytes): - lddqu 97(%rsi), %xmm0 - movdqu %xmm0, 97(%rdi) + lddqu 97(%rsi), %xmm0 + movdqu %xmm0, 97(%rdi) L(bwd_write_97bytes): - lddqu 81(%rsi), %xmm0 - movdqu %xmm0, 81(%rdi) + lddqu 81(%rsi), %xmm0 + movdqu %xmm0, 81(%rdi) L(bwd_write_81bytes): - lddqu 65(%rsi), %xmm0 - movdqu %xmm0, 65(%rdi) + lddqu 65(%rsi), %xmm0 + movdqu %xmm0, 65(%rdi) L(bwd_write_65bytes): - lddqu 49(%rsi), %xmm0 - movdqu %xmm0, 49(%rdi) + lddqu 49(%rsi), %xmm0 + movdqu %xmm0, 49(%rdi) L(bwd_write_49bytes): - lddqu 33(%rsi), %xmm0 - movdqu %xmm0, 33(%rdi) + lddqu 33(%rsi), %xmm0 + movdqu %xmm0, 33(%rdi) L(bwd_write_33bytes): - lddqu 17(%rsi), %xmm0 - movdqu %xmm0, 17(%rdi) + lddqu 17(%rsi), %xmm0 + movdqu %xmm0, 17(%rdi) L(bwd_write_17bytes): - lddqu 1(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 1(%rdi) - movdqu %xmm1, (%rdi) - ret + lddqu 1(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 1(%rdi) + movdqu %xmm1, (%rdi) + ret - .p2align 4 + .p2align 4 L(bwd_write_1bytes): - movzbl (%rsi), %edx - mov %dl, (%rdi) - ret + movzbl (%rsi), %edx + mov %dl, (%rdi) + ret END (MEMCPY) - .section .rodata.ssse3,"a",@progbits - .p2align 3 + .section .rodata.ssse3,"a",@progbits + .p2align 3 L(table_144_bytes_bwd): - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_5bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_63bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) - .int JMPTBL 
(L(bwd_write_121bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) + .int JMPTBL 
(L(bwd_write_35bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) + .int JMPTBL 
+    .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
+    .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-	.p2align 3
+    .p2align 3
 L(table_144_bytes_fwd):
-	.int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
-	.int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
+    .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-	.p2align 3
+    .p2align 3
 L(shl_table_fwd):
-	.int JMPTBL (L(shl_0), L(shl_table_fwd))
-	.int JMPTBL (L(shl_1), L(shl_table_fwd))
-	.int JMPTBL (L(shl_2), L(shl_table_fwd))
-	.int JMPTBL (L(shl_3), L(shl_table_fwd))
-	.int JMPTBL (L(shl_4), L(shl_table_fwd))
-	.int JMPTBL (L(shl_5), L(shl_table_fwd))
-	.int JMPTBL (L(shl_6), L(shl_table_fwd))
-	.int JMPTBL (L(shl_7), L(shl_table_fwd))
-	.int JMPTBL (L(shl_8), L(shl_table_fwd))
-	.int JMPTBL (L(shl_9), L(shl_table_fwd))
-	.int JMPTBL (L(shl_10), L(shl_table_fwd))
-	.int JMPTBL (L(shl_11), L(shl_table_fwd))
-	.int JMPTBL (L(shl_12), L(shl_table_fwd))
-	.int JMPTBL (L(shl_13), L(shl_table_fwd))
-	.int JMPTBL (L(shl_14), L(shl_table_fwd))
-	.int JMPTBL (L(shl_15), L(shl_table_fwd))
+    .int JMPTBL (L(shl_0), L(shl_table_fwd))
+    .int JMPTBL (L(shl_1), L(shl_table_fwd))
+    .int JMPTBL (L(shl_2), L(shl_table_fwd))
+    .int JMPTBL (L(shl_3), L(shl_table_fwd))
+    .int JMPTBL (L(shl_4), L(shl_table_fwd))
+    .int JMPTBL (L(shl_5), L(shl_table_fwd))
+    .int JMPTBL (L(shl_6), L(shl_table_fwd))
+    .int JMPTBL (L(shl_7), L(shl_table_fwd))
+    .int JMPTBL (L(shl_8), L(shl_table_fwd))
+    .int JMPTBL (L(shl_9), L(shl_table_fwd))
+    .int JMPTBL (L(shl_10), L(shl_table_fwd))
+    .int JMPTBL (L(shl_11), L(shl_table_fwd))
+    .int JMPTBL (L(shl_12), L(shl_table_fwd))
+    .int JMPTBL (L(shl_13), L(shl_table_fwd))
+    .int JMPTBL (L(shl_14), L(shl_table_fwd))
+    .int JMPTBL (L(shl_15), L(shl_table_fwd))
-	.p2align 3
+    .p2align 3
 L(shl_table_bwd):
-	.int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
+    .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
 #endif
diff --git a/utils/memcpy-bench/glibc/memcpy-ssse3.S b/utils/memcpy-bench/glibc/memcpy-ssse3.S
index 2fd26651645..11cb6559a8b 100644
--- a/utils/memcpy-bench/glibc/memcpy-ssse3.S
+++ b/utils/memcpy-bench/glibc/memcpy-ssse3.S
@@ -24,3129 +24,3129 @@
 #include "asm-syntax.h"
 #ifndef MEMCPY
-# define MEMCPY	__memcpy_ssse3
-# define MEMCPY_CHK	__memcpy_chk_ssse3
-# define MEMPCPY	__mempcpy_ssse3
-# define MEMPCPY_CHK	__mempcpy_chk_ssse3
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
 #endif
-#define JMPTBL(I, B)	I - B
+#define JMPTBL(I, B) I - B
 /* Branch to an entry in a jump table.  TABLE is a jump table with
    relative offsets.  INDEX is a register contains the index into the
    jump table.  SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
-	lea TABLE(%rip), %r11; \
-	movslq (%r11, INDEX, SCALE), INDEX; \
-	lea (%r11, INDEX), INDEX; \
-	_CET_NOTRACK jmp *INDEX; \
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+    lea TABLE(%rip), %r11; \
+    movslq (%r11, INDEX, SCALE), INDEX; \
+    lea (%r11, INDEX), INDEX; \
+    _CET_NOTRACK jmp *INDEX; \
 ud2
-	.section .text.ssse3,"ax",@progbits
+    .section .text.ssse3,"ax",@progbits
 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
 ENTRY (MEMPCPY_CHK)
-	cmp %RDX_LP, %RCX_LP
-	jb HIDDEN_JUMPTARGET (__chk_fail)
+    cmp %RDX_LP, %RCX_LP
+    jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMPCPY_CHK)
 ENTRY (MEMPCPY)
-	mov %RDI_LP, %RAX_LP
-	add %RDX_LP, %RAX_LP
-	jmp L(start)
+    mov %RDI_LP, %RAX_LP
+    add %RDX_LP, %RAX_LP
+    jmp L(start)
 END (MEMPCPY)
 #endif
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
-	cmp %RDX_LP, %RCX_LP
-	jb HIDDEN_JUMPTARGET (__chk_fail)
+    cmp %RDX_LP, %RCX_LP
+    jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
 #endif
 ENTRY (MEMCPY)
-	mov %RDI_LP, %RAX_LP
+    mov %RDI_LP, %RAX_LP
 #ifdef USE_AS_MEMPCPY
-	add %RDX_LP, %RAX_LP
+    add %RDX_LP, %RAX_LP
 #endif
 #ifdef __ILP32__
-	/* Clear the upper 32 bits. */
-	mov %edx, %edx
+    /* Clear the upper 32 bits. */
+    mov %edx, %edx
 #endif
 #ifdef USE_AS_MEMMOVE
-	cmp %rsi, %rdi
-	jb L(copy_forward)
-	je L(write_0bytes)
-	cmp $79, %rdx
-	jbe L(copy_forward)
-	jmp L(copy_backward)
+    cmp %rsi, %rdi
+    jb L(copy_forward)
+    je L(write_0bytes)
+    cmp $79, %rdx
+    jbe L(copy_forward)
+    jmp L(copy_backward)
 L(copy_forward):
 #endif
 L(start):
-	cmp $79, %rdx
-	lea L(table_less_80bytes)(%rip), %r11
-	ja L(80bytesormore)
-	movslq (%r11, %rdx, 4), %r9
-	add %rdx, %rsi
-	add %rdx, %rdi
-	add %r11, %r9
-	_CET_NOTRACK jmp *%r9
-	ud2
+    cmp $79, %rdx
+    lea L(table_less_80bytes)(%rip), %r11
+    ja L(80bytesormore)
+    movslq (%r11, %rdx, 4), %r9
+    add %rdx, %rsi
+    add %rdx, %rdi
+    add %r11, %r9
+    _CET_NOTRACK jmp *%r9
+    ud2
-	.p2align 4
+    .p2align 4
 L(80bytesormore):
 #ifndef USE_AS_MEMMOVE
-	cmp %dil, %sil
-	jle L(copy_backward)
+    cmp %dil, %sil
+    jle L(copy_backward)
 #endif
-	movdqu (%rsi), %xmm0
-	mov %rdi, %rcx
-	and $-16, %rdi
-	add $16, %rdi
-	mov %rcx, %r8
-	sub %rdi, %rcx
-	add %rcx, %rdx
-	sub %rcx, %rsi
+    movdqu (%rsi), %xmm0
+    mov %rdi, %rcx
+    and $-16, %rdi
+    add $16, %rdi
+    mov %rcx, %r8
+    sub %rdi, %rcx
+    add %rcx, %rdx
+    sub %rcx, %rsi
 #ifdef SHARED_CACHE_SIZE_HALF
-	mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
+    mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
#else
-	mov __x86_shared_cache_size_half(%rip), %RCX_LP
+    mov __x86_shared_cache_size_half(%rip), %RCX_LP
 #endif
-	cmp %rcx, %rdx
-	mov %rsi, %r9
-	ja L(large_page_fwd)
-	and $0xf, %r9
-	jz L(shl_0)
+    cmp %rcx, %rdx
+    mov %rsi, %r9
+    ja L(large_page_fwd)
+    and $0xf, %r9
+    jz L(shl_0)
 #ifdef DATA_CACHE_SIZE_HALF
-	mov $DATA_CACHE_SIZE_HALF, %RCX_LP
+    mov $DATA_CACHE_SIZE_HALF, %RCX_LP
 #else
-	mov __x86_data_cache_size_half(%rip), %RCX_LP
+    mov __x86_data_cache_size_half(%rip), %RCX_LP
 #endif
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-	.p2align 4
+    .p2align 4
 L(copy_backward):
-	movdqu -16(%rsi, %rdx), %xmm0
-	add %rdx, %rsi
-	lea -16(%rdi, %rdx), %r8
-	add %rdx, %rdi
+    movdqu -16(%rsi, %rdx), %xmm0
+    add %rdx, %rsi
+    lea -16(%rdi, %rdx), %r8
+    add %rdx, %rdi
-	mov %rdi, %rcx
-	and $0xf, %rcx
-	xor %rcx, %rdi
-	sub %rcx, %rdx
-	sub %rcx, %rsi
+    mov %rdi, %rcx
+    and $0xf, %rcx
+    xor %rcx, %rdi
+    sub %rcx, %rdx
+    sub %rcx, %rsi
 #ifdef SHARED_CACHE_SIZE_HALF
-	mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
+    mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
 #else
-	mov __x86_shared_cache_size_half(%rip), %RCX_LP
+    mov __x86_shared_cache_size_half(%rip), %RCX_LP
 #endif
-	cmp %rcx, %rdx
-	mov %rsi, %r9
-	ja L(large_page_bwd)
-	and $0xf, %r9
-	jz L(shl_0_bwd)
+    cmp %rcx, %rdx
+    mov %rsi, %r9
+    ja L(large_page_bwd)
+    and $0xf, %r9
+    jz L(shl_0_bwd)
 #ifdef DATA_CACHE_SIZE_HALF
-	mov $DATA_CACHE_SIZE_HALF, %RCX_LP
+    mov $DATA_CACHE_SIZE_HALF, %RCX_LP
 #else
-	mov __x86_data_cache_size_half(%rip), %RCX_LP
+    mov __x86_data_cache_size_half(%rip), %RCX_LP
 #endif
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0):
-	sub $16, %rdx
-	movdqa (%rsi), %xmm1
-	add $16, %rsi
-	movdqa %xmm1, (%rdi)
-	add $16, %rdi
-	cmp $128, %rdx
-	movdqu %xmm0, (%r8)
-	ja L(shl_0_gobble)
-	cmp $64, %rdx
-	jb L(shl_0_less_64bytes)
-	movaps (%rsi), %xmm4
-	movaps 16(%rsi), %xmm1
-	movaps 32(%rsi), %xmm2
-	movaps 48(%rsi), %xmm3
-	movaps %xmm4, (%rdi)
-	movaps %xmm1, 16(%rdi)
-	movaps %xmm2, 32(%rdi)
-	movaps %xmm3, 48(%rdi)
-	sub $64, %rdx
-	add $64, %rsi
-	add $64, %rdi
+    sub $16, %rdx
+    movdqa (%rsi), %xmm1
+    add $16, %rsi
+    movdqa %xmm1, (%rdi)
+    add $16, %rdi
+    cmp $128, %rdx
+    movdqu %xmm0, (%r8)
+    ja L(shl_0_gobble)
+    cmp $64, %rdx
+    jb L(shl_0_less_64bytes)
+    movaps (%rsi), %xmm4
+    movaps 16(%rsi), %xmm1
+    movaps 32(%rsi), %xmm2
+    movaps 48(%rsi), %xmm3
+    movaps %xmm4, (%rdi)
+    movaps %xmm1, 16(%rdi)
+    movaps %xmm2, 32(%rdi)
+    movaps %xmm3, 48(%rdi)
+    sub $64, %rdx
+    add $64, %rsi
+    add $64, %rdi
 L(shl_0_less_64bytes):
-	add %rdx, %rsi
-	add %rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    add %rdx, %rsi
+    add %rdx, %rdi
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_gobble):
 #ifdef DATA_CACHE_SIZE_HALF
-	cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
+    cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
 #else
-	cmp __x86_data_cache_size_half(%rip), %RDX_LP
+    cmp __x86_data_cache_size_half(%rip), %RDX_LP
 #endif
-	lea -128(%rdx), %rdx
-	jae L(shl_0_gobble_mem_loop)
+    lea -128(%rdx), %rdx
+    jae L(shl_0_gobble_mem_loop)
 L(shl_0_gobble_cache_loop):
-	movdqa (%rsi), %xmm4
-	movaps 0x10(%rsi), %xmm1
-	movaps 0x20(%rsi), %xmm2
-	movaps 0x30(%rsi), %xmm3
+    movdqa (%rsi), %xmm4
+    movaps 0x10(%rsi), %xmm1
+    movaps 0x20(%rsi), %xmm2
+    movaps 0x30(%rsi), %xmm3
-	movdqa %xmm4, (%rdi)
-	movaps %xmm1, 0x10(%rdi)
-	movaps %xmm2, 0x20(%rdi)
-	movaps %xmm3, 0x30(%rdi)
+    movdqa %xmm4, (%rdi)
+    movaps %xmm1, 0x10(%rdi)
+    movaps %xmm2, 0x20(%rdi)
+    movaps %xmm3, 0x30(%rdi)
-	sub $128, %rdx
-	movaps 0x40(%rsi), %xmm4
-	movaps 0x50(%rsi), %xmm5
-	movaps 0x60(%rsi), %xmm6
-	movaps 0x70(%rsi), %xmm7
-	lea 0x80(%rsi), %rsi
-	movaps %xmm4, 0x40(%rdi)
-	movaps %xmm5, 0x50(%rdi)
-	movaps %xmm6, 0x60(%rdi)
-	movaps %xmm7, 0x70(%rdi)
-	lea 0x80(%rdi), %rdi
+    sub $128, %rdx
+    movaps 0x40(%rsi), %xmm4
+    movaps 0x50(%rsi), %xmm5
+    movaps 0x60(%rsi), %xmm6
+    movaps 0x70(%rsi), %xmm7
+    lea 0x80(%rsi), %rsi
+    movaps %xmm4, 0x40(%rdi)
+    movaps %xmm5, 0x50(%rdi)
+    movaps %xmm6, 0x60(%rdi)
+    movaps %xmm7, 0x70(%rdi)
+    lea 0x80(%rdi), %rdi
-	jae L(shl_0_gobble_cache_loop)
-	cmp $-0x40, %rdx
-	lea 0x80(%rdx), %rdx
-	jl L(shl_0_cache_less_64bytes)
+    jae L(shl_0_gobble_cache_loop)
+    cmp $-0x40, %rdx
+    lea 0x80(%rdx), %rdx
+    jl L(shl_0_cache_less_64bytes)
-	movdqa (%rsi), %xmm4
-	sub $0x40, %rdx
-	movdqa 0x10(%rsi), %xmm1
+    movdqa (%rsi), %xmm4
+    sub $0x40, %rdx
+    movdqa 0x10(%rsi), %xmm1
-	movdqa %xmm4, (%rdi)
-	movdqa %xmm1, 0x10(%rdi)
+    movdqa %xmm4, (%rdi)
+    movdqa %xmm1, 0x10(%rdi)
-	movdqa 0x20(%rsi), %xmm4
-	movdqa 0x30(%rsi), %xmm1
-	add $0x40, %rsi
+    movdqa 0x20(%rsi), %xmm4
+    movdqa 0x30(%rsi), %xmm1
+    add $0x40, %rsi
-	movdqa %xmm4, 0x20(%rdi)
-	movdqa %xmm1, 0x30(%rdi)
-	add $0x40, %rdi
+    movdqa %xmm4, 0x20(%rdi)
+    movdqa %xmm1, 0x30(%rdi)
+    add $0x40, %rdi
 L(shl_0_cache_less_64bytes):
-	add %rdx, %rsi
-	add %rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    add %rdx, %rsi
+    add %rdx, %rdi
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_gobble_mem_loop):
-	prefetcht0 0x1c0(%rsi)
-	prefetcht0 0x280(%rsi)
+    prefetcht0 0x1c0(%rsi)
+    prefetcht0 0x280(%rsi)
-	movdqa (%rsi), %xmm0
-	movdqa 0x10(%rsi), %xmm1
-	movdqa 0x20(%rsi), %xmm2
-	movdqa 0x30(%rsi), %xmm3
-	movdqa 0x40(%rsi), %xmm4
-	movdqa 0x50(%rsi), %xmm5
-	movdqa 0x60(%rsi), %xmm6
-	movdqa 0x70(%rsi), %xmm7
-	lea 0x80(%rsi), %rsi
-	sub $0x80, %rdx
-	movdqa %xmm0, (%rdi)
-	movdqa %xmm1, 0x10(%rdi)
-	movdqa %xmm2, 0x20(%rdi)
-	movdqa %xmm3, 0x30(%rdi)
-	movdqa %xmm4, 0x40(%rdi)
-	movdqa %xmm5, 0x50(%rdi)
-	movdqa %xmm6, 0x60(%rdi)
-	movdqa %xmm7, 0x70(%rdi)
-	lea 0x80(%rdi), %rdi
+    movdqa (%rsi), %xmm0
+    movdqa 0x10(%rsi), %xmm1
+    movdqa 0x20(%rsi), %xmm2
+    movdqa 0x30(%rsi), %xmm3
+    movdqa 0x40(%rsi), %xmm4
+    movdqa 0x50(%rsi), %xmm5
+    movdqa 0x60(%rsi), %xmm6
+    movdqa 0x70(%rsi), %xmm7
+    lea 0x80(%rsi), %rsi
+    sub $0x80, %rdx
+    movdqa %xmm0, (%rdi)
+    movdqa %xmm1, 0x10(%rdi)
+    movdqa %xmm2, 0x20(%rdi)
+    movdqa %xmm3, 0x30(%rdi)
+    movdqa %xmm4, 0x40(%rdi)
+    movdqa %xmm5, 0x50(%rdi)
+    movdqa %xmm6, 0x60(%rdi)
+    movdqa %xmm7, 0x70(%rdi)
+    lea 0x80(%rdi), %rdi
-	jae L(shl_0_gobble_mem_loop)
-	cmp $-0x40, %rdx
-	lea 0x80(%rdx), %rdx
-	jl L(shl_0_mem_less_64bytes)
+    jae L(shl_0_gobble_mem_loop)
+    cmp $-0x40, %rdx
+    lea 0x80(%rdx), %rdx
+    jl L(shl_0_mem_less_64bytes)
-	movdqa (%rsi), %xmm0
-	sub $0x40, %rdx
-	movdqa 0x10(%rsi), %xmm1
+    movdqa (%rsi), %xmm0
+    sub $0x40, %rdx
+    movdqa 0x10(%rsi), %xmm1
-	movdqa %xmm0, (%rdi)
-	movdqa %xmm1, 0x10(%rdi)
+    movdqa %xmm0, (%rdi)
+    movdqa %xmm1, 0x10(%rdi)
-	movdqa 0x20(%rsi), %xmm0
-	movdqa 0x30(%rsi), %xmm1
-	add $0x40, %rsi
+    movdqa 0x20(%rsi), %xmm0
+    movdqa 0x30(%rsi), %xmm1
+    add $0x40, %rsi
-	movdqa %xmm0, 0x20(%rdi)
-	movdqa %xmm1, 0x30(%rdi)
-	add $0x40, %rdi
+    movdqa %xmm0, 0x20(%rdi)
+    movdqa %xmm1, 0x30(%rdi)
+    add $0x40, %rdi
 L(shl_0_mem_less_64bytes):
-	cmp $0x20, %rdx
-	jb L(shl_0_mem_less_32bytes)
-	movdqa (%rsi), %xmm0
-	sub $0x20, %rdx
-	movdqa 0x10(%rsi), %xmm1
-	add $0x20, %rsi
-	movdqa %xmm0, (%rdi)
-	movdqa %xmm1, 0x10(%rdi)
-	add $0x20, %rdi
+    cmp $0x20, %rdx
+    jb L(shl_0_mem_less_32bytes)
+    movdqa (%rsi), %xmm0
+    sub $0x20, %rdx
+    movdqa 0x10(%rsi), %xmm1
+    add $0x20, %rsi
+    movdqa %xmm0, (%rdi)
+    movdqa %xmm1, 0x10(%rdi)
+    add $0x20, %rdi
 L(shl_0_mem_less_32bytes):
-	add %rdx, %rdi
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    add %rdx, %rdi
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_bwd):
-	sub $16, %rdx
-	movdqa -0x10(%rsi), %xmm1
-	sub $16, %rsi
-	movdqa %xmm1, -0x10(%rdi)
-	sub $16, %rdi
-	cmp $0x80, %rdx
-	movdqu %xmm0, (%r8)
-	ja L(shl_0_gobble_bwd)
-	cmp $64, %rdx
-	jb L(shl_0_less_64bytes_bwd)
-	movaps -0x10(%rsi), %xmm0
-	movaps -0x20(%rsi), %xmm1
-	movaps -0x30(%rsi), %xmm2
-	movaps -0x40(%rsi), %xmm3
-	movaps %xmm0, -0x10(%rdi)
-	movaps %xmm1, -0x20(%rdi)
-	movaps %xmm2, -0x30(%rdi)
-	movaps %xmm3, -0x40(%rdi)
-	sub $64, %rdx
-	sub $0x40, %rsi
-	sub $0x40, %rdi
+    sub $16, %rdx
+    movdqa -0x10(%rsi), %xmm1
+    sub $16, %rsi
+    movdqa %xmm1, -0x10(%rdi)
+    sub $16, %rdi
+    cmp $0x80, %rdx
+    movdqu %xmm0, (%r8)
+    ja L(shl_0_gobble_bwd)
+    cmp $64, %rdx
+    jb L(shl_0_less_64bytes_bwd)
+    movaps -0x10(%rsi), %xmm0
+    movaps -0x20(%rsi), %xmm1
+    movaps -0x30(%rsi), %xmm2
+    movaps -0x40(%rsi), %xmm3
+    movaps %xmm0, -0x10(%rdi)
+    movaps %xmm1, -0x20(%rdi)
+    movaps %xmm2, -0x30(%rdi)
+    movaps %xmm3, -0x40(%rdi)
+    sub $64, %rdx
+    sub $0x40, %rsi
+    sub $0x40, %rdi
 L(shl_0_less_64bytes_bwd):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_gobble_bwd):
 #ifdef DATA_CACHE_SIZE_HALF
-	cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
+    cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
 #else
-	cmp __x86_data_cache_size_half(%rip), %RDX_LP
+    cmp __x86_data_cache_size_half(%rip), %RDX_LP
 #endif
-	lea -128(%rdx), %rdx
-	jae L(shl_0_gobble_mem_bwd_loop)
+    lea -128(%rdx), %rdx
+    jae L(shl_0_gobble_mem_bwd_loop)
 L(shl_0_gobble_bwd_loop):
-	movdqa -0x10(%rsi), %xmm0
-	movaps -0x20(%rsi), %xmm1
-	movaps -0x30(%rsi), %xmm2
-	movaps -0x40(%rsi), %xmm3
+    movdqa -0x10(%rsi), %xmm0
+    movaps -0x20(%rsi), %xmm1
+    movaps -0x30(%rsi), %xmm2
+    movaps -0x40(%rsi), %xmm3
-	movdqa %xmm0, -0x10(%rdi)
-	movaps %xmm1, -0x20(%rdi)
-	movaps %xmm2, -0x30(%rdi)
-	movaps %xmm3, -0x40(%rdi)
+    movdqa %xmm0, -0x10(%rdi)
+    movaps %xmm1, -0x20(%rdi)
+    movaps %xmm2, -0x30(%rdi)
+    movaps %xmm3, -0x40(%rdi)
-	sub $0x80, %rdx
-	movaps -0x50(%rsi), %xmm4
-	movaps -0x60(%rsi), %xmm5
-	movaps -0x70(%rsi), %xmm6
-	movaps -0x80(%rsi), %xmm7
-	lea -0x80(%rsi), %rsi
-	movaps %xmm4, -0x50(%rdi)
-	movaps %xmm5, -0x60(%rdi)
-	movaps %xmm6, -0x70(%rdi)
-	movaps %xmm7, -0x80(%rdi)
-	lea -0x80(%rdi), %rdi
+    sub $0x80, %rdx
+    movaps -0x50(%rsi), %xmm4
+    movaps -0x60(%rsi), %xmm5
+    movaps -0x70(%rsi), %xmm6
+    movaps -0x80(%rsi), %xmm7
+    lea -0x80(%rsi), %rsi
+    movaps %xmm4, -0x50(%rdi)
+    movaps %xmm5, -0x60(%rdi)
+    movaps %xmm6, -0x70(%rdi)
+    movaps %xmm7, -0x80(%rdi)
+    lea -0x80(%rdi), %rdi
-	jae L(shl_0_gobble_bwd_loop)
-	cmp $-0x40, %rdx
-	lea 0x80(%rdx), %rdx
-	jl L(shl_0_gobble_bwd_less_64bytes)
+    jae L(shl_0_gobble_bwd_loop)
+    cmp $-0x40, %rdx
+    lea 0x80(%rdx), %rdx
+    jl L(shl_0_gobble_bwd_less_64bytes)
-	movdqa -0x10(%rsi), %xmm0
-	sub $0x40, %rdx
-	movdqa -0x20(%rsi), %xmm1
+    movdqa -0x10(%rsi), %xmm0
+    sub $0x40, %rdx
+    movdqa -0x20(%rsi), %xmm1
-	movdqa %xmm0, -0x10(%rdi)
-	movdqa %xmm1, -0x20(%rdi)
+    movdqa %xmm0, -0x10(%rdi)
+    movdqa %xmm1, -0x20(%rdi)
-	movdqa -0x30(%rsi), %xmm0
-	movdqa -0x40(%rsi), %xmm1
-	sub $0x40, %rsi
+    movdqa -0x30(%rsi), %xmm0
+    movdqa -0x40(%rsi), %xmm1
+    sub $0x40, %rsi
-	movdqa %xmm0, -0x30(%rdi)
-	movdqa %xmm1, -0x40(%rdi)
-	sub $0x40, %rdi
+    movdqa %xmm0, -0x30(%rdi)
+    movdqa %xmm1, -0x40(%rdi)
+    sub $0x40, %rdi
 L(shl_0_gobble_bwd_less_64bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_0_gobble_mem_bwd_loop):
-	prefetcht0 -0x1c0(%rsi)
-	prefetcht0 -0x280(%rsi)
-	movdqa -0x10(%rsi), %xmm0
-	movdqa -0x20(%rsi), %xmm1
-	movdqa -0x30(%rsi), %xmm2
-	movdqa -0x40(%rsi), %xmm3
-	movdqa -0x50(%rsi), %xmm4
-	movdqa -0x60(%rsi), %xmm5
-	movdqa -0x70(%rsi), %xmm6
-	movdqa -0x80(%rsi), %xmm7
-	lea -0x80(%rsi), %rsi
-	sub $0x80, %rdx
-	movdqa %xmm0, -0x10(%rdi)
-	movdqa %xmm1, -0x20(%rdi)
-	movdqa %xmm2, -0x30(%rdi)
-	movdqa %xmm3, -0x40(%rdi)
-	movdqa %xmm4, -0x50(%rdi)
-	movdqa %xmm5, -0x60(%rdi)
-	movdqa %xmm6, -0x70(%rdi)
-	movdqa %xmm7, -0x80(%rdi)
-	lea -0x80(%rdi), %rdi
+    prefetcht0 -0x1c0(%rsi)
+    prefetcht0 -0x280(%rsi)
+    movdqa -0x10(%rsi), %xmm0
+    movdqa -0x20(%rsi), %xmm1
+    movdqa -0x30(%rsi), %xmm2
+    movdqa -0x40(%rsi), %xmm3
+    movdqa -0x50(%rsi), %xmm4
+    movdqa -0x60(%rsi), %xmm5
+    movdqa -0x70(%rsi), %xmm6
+    movdqa -0x80(%rsi), %xmm7
+    lea -0x80(%rsi), %rsi
+    sub $0x80, %rdx
+    movdqa %xmm0, -0x10(%rdi)
+    movdqa %xmm1, -0x20(%rdi)
+    movdqa %xmm2, -0x30(%rdi)
+    movdqa %xmm3, -0x40(%rdi)
+    movdqa %xmm4, -0x50(%rdi)
+    movdqa %xmm5, -0x60(%rdi)
+    movdqa %xmm6, -0x70(%rdi)
+    movdqa %xmm7, -0x80(%rdi)
+    lea -0x80(%rdi), %rdi
-	jae L(shl_0_gobble_mem_bwd_loop)
-	cmp $-0x40, %rdx
-	lea 0x80(%rdx), %rdx
-	jl L(shl_0_mem_bwd_less_64bytes)
+    jae L(shl_0_gobble_mem_bwd_loop)
+    cmp $-0x40, %rdx
+    lea 0x80(%rdx), %rdx
+    jl L(shl_0_mem_bwd_less_64bytes)
-	movdqa -0x10(%rsi), %xmm0
-	sub $0x40, %rdx
-	movdqa -0x20(%rsi), %xmm1
+    movdqa -0x10(%rsi), %xmm0
+    sub $0x40, %rdx
+    movdqa -0x20(%rsi), %xmm1
-	movdqa %xmm0, -0x10(%rdi)
-	movdqa %xmm1, -0x20(%rdi)
+    movdqa %xmm0, -0x10(%rdi)
+    movdqa %xmm1, -0x20(%rdi)
-	movdqa -0x30(%rsi), %xmm0
-	movdqa -0x40(%rsi), %xmm1
-	sub $0x40, %rsi
+    movdqa -0x30(%rsi), %xmm0
+    movdqa -0x40(%rsi), %xmm1
+    sub $0x40, %rsi
-	movdqa %xmm0, -0x30(%rdi)
-	movdqa %xmm1, -0x40(%rdi)
-	sub $0x40, %rdi
+    movdqa %xmm0, -0x30(%rdi)
+    movdqa %xmm1, -0x40(%rdi)
+    sub $0x40, %rdi
 L(shl_0_mem_bwd_less_64bytes):
-	cmp $0x20, %rdx
-	jb L(shl_0_mem_bwd_less_32bytes)
-	movdqa -0x10(%rsi), %xmm0
-	sub $0x20, %rdx
-	movdqa -0x20(%rsi), %xmm1
-	sub $0x20, %rsi
-	movdqa %xmm0, -0x10(%rdi)
-	movdqa %xmm1, -0x20(%rdi)
-	sub $0x20, %rdi
+    cmp $0x20, %rdx
+    jb L(shl_0_mem_bwd_less_32bytes)
+    movdqa -0x10(%rsi), %xmm0
+    sub $0x20, %rdx
+    movdqa -0x20(%rsi), %xmm1
+    sub $0x20, %rsi
+    movdqa %xmm0, -0x10(%rdi)
+    movdqa %xmm1, -0x20(%rdi)
+    sub $0x20, %rdi
 L(shl_0_mem_bwd_less_32bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+    BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_1):
-	lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x01(%rsi), %xmm1
-	jb L(L1_fwd)
-	lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
+    lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x01(%rsi), %xmm1
+    jb L(L1_fwd)
+    lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
 L(L1_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_1_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_1_loop_L1):
-	sub $64, %rdx
-	movaps 0x0f(%rsi), %xmm2
-	movaps 0x1f(%rsi), %xmm3
-	movaps 0x2f(%rsi), %xmm4
-	movaps 0x3f(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $1, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $1, %xmm3, %xmm4
-	palignr $1, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $1, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_1_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0f(%rsi), %xmm2
+    movaps 0x1f(%rsi), %xmm3
+    movaps 0x2f(%rsi), %xmm4
+    movaps 0x3f(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $1, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $1, %xmm3, %xmm4
+    palignr $1, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $1, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_1_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_1_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_1_bwd):
-	lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x01(%rsi), %xmm1
-	jb L(L1_bwd)
-	lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x01(%rsi), %xmm1
+    jb L(L1_bwd)
+    lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
 L(L1_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_1_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_1_bwd_loop_L1):
-	movaps -0x11(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x21(%rsi), %xmm3
-	movaps -0x31(%rsi), %xmm4
-	movaps -0x41(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $1, %xmm2, %xmm1
-	palignr $1, %xmm3, %xmm2
-	palignr $1, %xmm4, %xmm3
-	palignr $1, %xmm5, %xmm4
+    movaps -0x11(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x21(%rsi), %xmm3
+    movaps -0x31(%rsi), %xmm4
+    movaps -0x41(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $1, %xmm2, %xmm1
+    palignr $1, %xmm3, %xmm2
+    palignr $1, %xmm4, %xmm3
+    palignr $1, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_1_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_1_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_1_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_2):
-	lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x02(%rsi), %xmm1
-	jb L(L2_fwd)
-	lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
+    lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x02(%rsi), %xmm1
+    jb L(L2_fwd)
+    lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
 L(L2_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_2_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_2_loop_L1):
-	sub $64, %rdx
-	movaps 0x0e(%rsi), %xmm2
-	movaps 0x1e(%rsi), %xmm3
-	movaps 0x2e(%rsi), %xmm4
-	movaps 0x3e(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $2, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $2, %xmm3, %xmm4
-	palignr $2, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $2, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_2_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0e(%rsi), %xmm2
+    movaps 0x1e(%rsi), %xmm3
+    movaps 0x2e(%rsi), %xmm4
+    movaps 0x3e(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $2, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $2, %xmm3, %xmm4
+    palignr $2, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $2, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_2_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_2_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_2_bwd):
-	lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x02(%rsi), %xmm1
-	jb L(L2_bwd)
-	lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x02(%rsi), %xmm1
+    jb L(L2_bwd)
+    lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
 L(L2_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_2_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_2_bwd_loop_L1):
-	movaps -0x12(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x22(%rsi), %xmm3
-	movaps -0x32(%rsi), %xmm4
-	movaps -0x42(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $2, %xmm2, %xmm1
-	palignr $2, %xmm3, %xmm2
-	palignr $2, %xmm4, %xmm3
-	palignr $2, %xmm5, %xmm4
+    movaps -0x12(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x22(%rsi), %xmm3
+    movaps -0x32(%rsi), %xmm4
+    movaps -0x42(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $2, %xmm2, %xmm1
+    palignr $2, %xmm3, %xmm2
+    palignr $2, %xmm4, %xmm3
+    palignr $2, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_2_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_2_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_2_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_3):
-	lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x03(%rsi), %xmm1
-	jb L(L3_fwd)
-	lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
+    lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x03(%rsi), %xmm1
+    jb L(L3_fwd)
+    lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
 L(L3_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_3_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_3_loop_L1):
-	sub $64, %rdx
-	movaps 0x0d(%rsi), %xmm2
-	movaps 0x1d(%rsi), %xmm3
-	movaps 0x2d(%rsi), %xmm4
-	movaps 0x3d(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $3, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $3, %xmm3, %xmm4
-	palignr $3, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $3, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_3_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0d(%rsi), %xmm2
+    movaps 0x1d(%rsi), %xmm3
+    movaps 0x2d(%rsi), %xmm4
+    movaps 0x3d(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $3, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $3, %xmm3, %xmm4
+    palignr $3, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $3, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_3_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_3_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_3_bwd):
-	lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x03(%rsi), %xmm1
-	jb L(L3_bwd)
-	lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x03(%rsi), %xmm1
+    jb L(L3_bwd)
+    lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
 L(L3_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_3_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_3_bwd_loop_L1):
-	movaps -0x13(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x23(%rsi), %xmm3
-	movaps -0x33(%rsi), %xmm4
-	movaps -0x43(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $3, %xmm2, %xmm1
-	palignr $3, %xmm3, %xmm2
-	palignr $3, %xmm4, %xmm3
-	palignr $3, %xmm5, %xmm4
+    movaps -0x13(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x23(%rsi), %xmm3
+    movaps -0x33(%rsi), %xmm4
+    movaps -0x43(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $3, %xmm2, %xmm1
+    palignr $3, %xmm3, %xmm2
+    palignr $3, %xmm4, %xmm3
+    palignr $3, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_3_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_3_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_3_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_4):
-	lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x04(%rsi), %xmm1
-	jb L(L4_fwd)
-	lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
+    lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x04(%rsi), %xmm1
+    jb L(L4_fwd)
+    lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
 L(L4_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_4_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_4_loop_L1):
-	sub $64, %rdx
-	movaps 0x0c(%rsi), %xmm2
-	movaps 0x1c(%rsi), %xmm3
-	movaps 0x2c(%rsi), %xmm4
-	movaps 0x3c(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $4, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $4, %xmm3, %xmm4
-	palignr $4, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $4, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_4_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0c(%rsi), %xmm2
+    movaps 0x1c(%rsi), %xmm3
+    movaps 0x2c(%rsi), %xmm4
+    movaps 0x3c(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $4, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $4, %xmm3, %xmm4
+    palignr $4, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $4, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_4_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_4_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_4_bwd):
-	lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x04(%rsi), %xmm1
-	jb L(L4_bwd)
-	lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x04(%rsi), %xmm1
+    jb L(L4_bwd)
+    lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
 L(L4_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_4_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_4_bwd_loop_L1):
-	movaps -0x14(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x24(%rsi), %xmm3
-	movaps -0x34(%rsi), %xmm4
-	movaps -0x44(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $4, %xmm2, %xmm1
-	palignr $4, %xmm3, %xmm2
-	palignr $4, %xmm4, %xmm3
-	palignr $4, %xmm5, %xmm4
+    movaps -0x14(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x24(%rsi), %xmm3
+    movaps -0x34(%rsi), %xmm4
+    movaps -0x44(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $4, %xmm2, %xmm1
+    palignr $4, %xmm3, %xmm2
+    palignr $4, %xmm4, %xmm3
+    palignr $4, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_4_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_4_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_4_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_5):
-	lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x05(%rsi), %xmm1
-	jb L(L5_fwd)
-	lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
+    lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x05(%rsi), %xmm1
+    jb L(L5_fwd)
+    lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
 L(L5_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_5_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_5_loop_L1):
-	sub $64, %rdx
-	movaps 0x0b(%rsi), %xmm2
-	movaps 0x1b(%rsi), %xmm3
-	movaps 0x2b(%rsi), %xmm4
-	movaps 0x3b(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $5, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $5, %xmm3, %xmm4
-	palignr $5, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $5, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_5_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x0b(%rsi), %xmm2
+    movaps 0x1b(%rsi), %xmm3
+    movaps 0x2b(%rsi), %xmm4
+    movaps 0x3b(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $5, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $5, %xmm3, %xmm4
+    palignr $5, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $5, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_5_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_5_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_5_bwd):
-	lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x05(%rsi), %xmm1
-	jb L(L5_bwd)
-	lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x05(%rsi), %xmm1
+    jb L(L5_bwd)
+    lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
 L(L5_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_5_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_5_bwd_loop_L1):
-	movaps -0x15(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x25(%rsi), %xmm3
-	movaps -0x35(%rsi), %xmm4
-	movaps -0x45(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $5, %xmm2, %xmm1
-	palignr $5, %xmm3, %xmm2
-	palignr $5, %xmm4, %xmm3
-	palignr $5, %xmm5, %xmm4
+    movaps -0x15(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x25(%rsi), %xmm3
+    movaps -0x35(%rsi), %xmm4
+    movaps -0x45(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $5, %xmm2, %xmm1
+    palignr $5, %xmm3, %xmm2
+    palignr $5, %xmm4, %xmm3
+    palignr $5, %xmm5, %xmm4
-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1
-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi
-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_5_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_5_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_5_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-	.p2align 4
+    .p2align 4
 L(shl_6):
-	lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x06(%rsi), %xmm1
-	jb L(L6_fwd)
-	lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
+    lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x06(%rsi), %xmm1
+    jb L(L6_fwd)
+    lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
 L(L6_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_6_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_6_loop_L1):
-	sub $64, %rdx
-	movaps 0x0a(%rsi), %xmm2
-	movaps 0x1a(%rsi), %xmm3
-	movaps 0x2a(%rsi), %xmm4
-	movaps 0x3a(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $6, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $6, %xmm3, %xmm4
-	palignr $6, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
64(%rdi), %rdi - palignr $6, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_6_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + sub $64, %rdx + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $6, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $6, %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $6, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_6_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_6_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_6_bwd): - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_bwd) - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 + lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_bwd) + lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 L(L6_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 L(shl_6_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) + prefetchnta -0x1c0(%rsi) L(shl_6_bwd_loop_L1): - movaps -0x16(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x26(%rsi), %xmm3 - movaps -0x36(%rsi), %xmm4 - movaps -0x46(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $6, %xmm2, %xmm1 - palignr $6, %xmm3, %xmm2 - palignr $6, %xmm4, %xmm3 - palignr $6, %xmm5, %xmm4 + movaps -0x16(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x26(%rsi), %xmm3 + movaps -0x36(%rsi), %xmm4 + movaps -0x46(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $6, %xmm2, %xmm1 + palignr $6, %xmm3, %xmm2 + palignr $6, %xmm4, %xmm3 + palignr $6, %xmm5, %xmm4 - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi - movaps %xmm3, 0x10(%rdi) - jb L(shl_6_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + movaps %xmm3, 0x10(%rdi) + jb L(shl_6_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_6_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_7): - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_fwd) - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 + lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_fwd) + lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 L(L7_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 L(shl_7_loop_L2): - prefetchnta 0x1c0(%rsi) + prefetchnta 0x1c0(%rsi) L(shl_7_loop_L1): - sub $64, %rdx - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 
0x39(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $7, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $7, %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $7, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_7_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + sub $64, %rdx + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $7, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $7, %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $7, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_7_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_7_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_7_bwd): - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_bwd) - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 + lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_bwd) + lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 L(L7_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 L(shl_7_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) + prefetchnta -0x1c0(%rsi) L(shl_7_bwd_loop_L1): - movaps -0x17(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x27(%rsi), %xmm3 - movaps -0x37(%rsi), %xmm4 - movaps -0x47(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $7, %xmm2, %xmm1 - palignr $7, %xmm3, %xmm2 - palignr $7, %xmm4, %xmm3 - palignr $7, %xmm5, %xmm4 + movaps -0x17(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x27(%rsi), %xmm3 + movaps -0x37(%rsi), %xmm4 + movaps -0x47(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $7, %xmm2, %xmm1 + palignr $7, %xmm3, %xmm2 + palignr $7, %xmm4, %xmm3 + palignr $7, %xmm5, %xmm4 - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi - movaps %xmm3, 0x10(%rdi) - jb L(shl_7_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + movaps %xmm3, 0x10(%rdi) + jb L(shl_7_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_7_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_8): - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_fwd) - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 + lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_fwd) + lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 L(L8_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 L(shl_8_loop_L2): - prefetchnta 0x1c0(%rsi) + prefetchnta 
0x1c0(%rsi) L(shl_8_loop_L1): - sub $64, %rdx - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $8, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $8, %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $8, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_8_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 - .p2align 4 + sub $64, %rdx + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $8, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $8, %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $8, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_8_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + _CET_NOTRACK jmp *%r9 + ud2 + .p2align 4 L(shl_8_end): - lea 64(%rdx), %rdx - movaps %xmm4, -0x20(%rdi) - add %rdx, %rsi - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + lea 64(%rdx), %rdx + movaps %xmm4, -0x20(%rdi) + add %rdx, %rsi + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_8_bwd): - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_bwd) - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 + lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_bwd) + lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 L(L8_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 + lea -64(%rdx), %rdx + _CET_NOTRACK jmp *%r9 + ud2 L(shl_8_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) + prefetchnta -0x1c0(%rsi) L(shl_8_bwd_loop_L1): - movaps -0x18(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x28(%rsi), %xmm3 - movaps -0x38(%rsi), %xmm4 - movaps -0x48(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $8, %xmm2, %xmm1 - palignr $8, %xmm3, %xmm2 - palignr $8, %xmm4, %xmm3 - palignr $8, %xmm5, %xmm4 + movaps -0x18(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x28(%rsi), %xmm3 + movaps -0x38(%rsi), %xmm4 + movaps -0x48(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $8, %xmm2, %xmm1 + palignr $8, %xmm3, %xmm2 + palignr $8, %xmm4, %xmm3 + palignr $8, %xmm5, %xmm4 - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi - movaps %xmm3, 0x10(%rdi) - jb L(shl_8_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 + movaps %xmm3, 0x10(%rdi) + jb L(shl_8_bwd_end) + movaps %xmm4, (%rdi) + _CET_NOTRACK jmp *%r9 + ud2 L(shl_8_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - .p2align 4 + .p2align 4 L(shl_9): - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_fwd) - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 + lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_fwd) + lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 
 L(L9_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_9_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_9_loop_L1):
-	sub $64, %rdx
-	movaps 0x07(%rsi), %xmm2
-	movaps 0x17(%rsi), %xmm3
-	movaps 0x27(%rsi), %xmm4
-	movaps 0x37(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $9, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $9, %xmm3, %xmm4
-	palignr $9, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $9, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_9_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x07(%rsi), %xmm2
+    movaps 0x17(%rsi), %xmm3
+    movaps 0x27(%rsi), %xmm4
+    movaps 0x37(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $9, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $9, %xmm3, %xmm4
+    palignr $9, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $9, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_9_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_9_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_9_bwd):
-	lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x09(%rsi), %xmm1
-	jb L(L9_bwd)
-	lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x09(%rsi), %xmm1
+    jb L(L9_bwd)
+    lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
 L(L9_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_9_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_9_bwd_loop_L1):
-	movaps -0x19(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x29(%rsi), %xmm3
-	movaps -0x39(%rsi), %xmm4
-	movaps -0x49(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $9, %xmm2, %xmm1
-	palignr $9, %xmm3, %xmm2
-	palignr $9, %xmm4, %xmm3
-	palignr $9, %xmm5, %xmm4
+    movaps -0x19(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x29(%rsi), %xmm3
+    movaps -0x39(%rsi), %xmm4
+    movaps -0x49(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $9, %xmm2, %xmm1
+    palignr $9, %xmm3, %xmm2
+    palignr $9, %xmm4, %xmm3
+    palignr $9, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_9_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_9_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_9_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_10):
-	lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0a(%rsi), %xmm1
-	jb L(L10_fwd)
-	lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
+    lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0a(%rsi), %xmm1
+    jb L(L10_fwd)
+    lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
 L(L10_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_10_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_10_loop_L1):
-	sub $64, %rdx
-	movaps 0x06(%rsi), %xmm2
-	movaps 0x16(%rsi), %xmm3
-	movaps 0x26(%rsi), %xmm4
-	movaps 0x36(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $10, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $10, %xmm3, %xmm4
-	palignr $10, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $10, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_10_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x06(%rsi), %xmm2
+    movaps 0x16(%rsi), %xmm3
+    movaps 0x26(%rsi), %xmm4
+    movaps 0x36(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $10, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $10, %xmm3, %xmm4
+    palignr $10, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $10, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_10_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_10_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_10_bwd):
-	lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0a(%rsi), %xmm1
-	jb L(L10_bwd)
-	lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0a(%rsi), %xmm1
+    jb L(L10_bwd)
+    lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
 L(L10_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_10_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_10_bwd_loop_L1):
-	movaps -0x1a(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2a(%rsi), %xmm3
-	movaps -0x3a(%rsi), %xmm4
-	movaps -0x4a(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $10, %xmm2, %xmm1
-	palignr $10, %xmm3, %xmm2
-	palignr $10, %xmm4, %xmm3
-	palignr $10, %xmm5, %xmm4
+    movaps -0x1a(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2a(%rsi), %xmm3
+    movaps -0x3a(%rsi), %xmm4
+    movaps -0x4a(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $10, %xmm2, %xmm1
+    palignr $10, %xmm3, %xmm2
+    palignr $10, %xmm4, %xmm3
+    palignr $10, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_10_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_10_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_10_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_11):
-	lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0b(%rsi), %xmm1
-	jb L(L11_fwd)
-	lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
+    lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0b(%rsi), %xmm1
+    jb L(L11_fwd)
+    lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
 L(L11_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_11_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_11_loop_L1):
-	sub $64, %rdx
-	movaps 0x05(%rsi), %xmm2
-	movaps 0x15(%rsi), %xmm3
-	movaps 0x25(%rsi), %xmm4
-	movaps 0x35(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $11, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $11, %xmm3, %xmm4
-	palignr $11, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $11, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_11_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x05(%rsi), %xmm2
+    movaps 0x15(%rsi), %xmm3
+    movaps 0x25(%rsi), %xmm4
+    movaps 0x35(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $11, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $11, %xmm3, %xmm4
+    palignr $11, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $11, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_11_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_11_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_11_bwd):
-	lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0b(%rsi), %xmm1
-	jb L(L11_bwd)
-	lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0b(%rsi), %xmm1
+    jb L(L11_bwd)
+    lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
 L(L11_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_11_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_11_bwd_loop_L1):
-	movaps -0x1b(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2b(%rsi), %xmm3
-	movaps -0x3b(%rsi), %xmm4
-	movaps -0x4b(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $11, %xmm2, %xmm1
-	palignr $11, %xmm3, %xmm2
-	palignr $11, %xmm4, %xmm3
-	palignr $11, %xmm5, %xmm4
+    movaps -0x1b(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2b(%rsi), %xmm3
+    movaps -0x3b(%rsi), %xmm4
+    movaps -0x4b(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $11, %xmm2, %xmm1
+    palignr $11, %xmm3, %xmm2
+    palignr $11, %xmm4, %xmm3
+    palignr $11, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_11_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_11_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_11_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_12):
-	lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0c(%rsi), %xmm1
-	jb L(L12_fwd)
-	lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
+    lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0c(%rsi), %xmm1
+    jb L(L12_fwd)
+    lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
 L(L12_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_12_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_12_loop_L1):
-	sub $64, %rdx
-	movaps 0x04(%rsi), %xmm2
-	movaps 0x14(%rsi), %xmm3
-	movaps 0x24(%rsi), %xmm4
-	movaps 0x34(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $12, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $12, %xmm3, %xmm4
-	palignr $12, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $12, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_12_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x04(%rsi), %xmm2
+    movaps 0x14(%rsi), %xmm3
+    movaps 0x24(%rsi), %xmm4
+    movaps 0x34(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $12, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $12, %xmm3, %xmm4
+    palignr $12, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $12, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_12_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_12_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_12_bwd):
-	lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0c(%rsi), %xmm1
-	jb L(L12_bwd)
-	lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0c(%rsi), %xmm1
+    jb L(L12_bwd)
+    lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
 L(L12_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_12_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_12_bwd_loop_L1):
-	movaps -0x1c(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2c(%rsi), %xmm3
-	movaps -0x3c(%rsi), %xmm4
-	movaps -0x4c(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $12, %xmm2, %xmm1
-	palignr $12, %xmm3, %xmm2
-	palignr $12, %xmm4, %xmm3
-	palignr $12, %xmm5, %xmm4
+    movaps -0x1c(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2c(%rsi), %xmm3
+    movaps -0x3c(%rsi), %xmm4
+    movaps -0x4c(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $12, %xmm2, %xmm1
+    palignr $12, %xmm3, %xmm2
+    palignr $12, %xmm4, %xmm3
+    palignr $12, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_12_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_12_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_12_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_13):
-	lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0d(%rsi), %xmm1
-	jb L(L13_fwd)
-	lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
+    lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0d(%rsi), %xmm1
+    jb L(L13_fwd)
+    lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
 L(L13_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_13_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_13_loop_L1):
-	sub $64, %rdx
-	movaps 0x03(%rsi), %xmm2
-	movaps 0x13(%rsi), %xmm3
-	movaps 0x23(%rsi), %xmm4
-	movaps 0x33(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $13, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $13, %xmm3, %xmm4
-	palignr $13, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $13, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_13_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x03(%rsi), %xmm2
+    movaps 0x13(%rsi), %xmm3
+    movaps 0x23(%rsi), %xmm4
+    movaps 0x33(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $13, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $13, %xmm3, %xmm4
+    palignr $13, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $13, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_13_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_13_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_13_bwd):
-	lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0d(%rsi), %xmm1
-	jb L(L13_bwd)
-	lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0d(%rsi), %xmm1
+    jb L(L13_bwd)
+    lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
 L(L13_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_13_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_13_bwd_loop_L1):
-	movaps -0x1d(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2d(%rsi), %xmm3
-	movaps -0x3d(%rsi), %xmm4
-	movaps -0x4d(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $13, %xmm2, %xmm1
-	palignr $13, %xmm3, %xmm2
-	palignr $13, %xmm4, %xmm3
-	palignr $13, %xmm5, %xmm4
+    movaps -0x1d(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2d(%rsi), %xmm3
+    movaps -0x3d(%rsi), %xmm4
+    movaps -0x4d(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $13, %xmm2, %xmm1
+    palignr $13, %xmm3, %xmm2
+    palignr $13, %xmm4, %xmm3
+    palignr $13, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_13_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_13_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_13_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_14):
-	lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0e(%rsi), %xmm1
-	jb L(L14_fwd)
-	lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
+    lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0e(%rsi), %xmm1
+    jb L(L14_fwd)
+    lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
 L(L14_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_14_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_14_loop_L1):
-	sub $64, %rdx
-	movaps 0x02(%rsi), %xmm2
-	movaps 0x12(%rsi), %xmm3
-	movaps 0x22(%rsi), %xmm4
-	movaps 0x32(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $14, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $14, %xmm3, %xmm4
-	palignr $14, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $14, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_14_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x02(%rsi), %xmm2
+    movaps 0x12(%rsi), %xmm3
+    movaps 0x22(%rsi), %xmm4
+    movaps 0x32(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $14, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $14, %xmm3, %xmm4
+    palignr $14, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $14, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_14_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_14_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_14_bwd):
-	lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0e(%rsi), %xmm1
-	jb L(L14_bwd)
-	lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0e(%rsi), %xmm1
+    jb L(L14_bwd)
+    lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
 L(L14_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_14_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_14_bwd_loop_L1):
-	movaps -0x1e(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2e(%rsi), %xmm3
-	movaps -0x3e(%rsi), %xmm4
-	movaps -0x4e(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $14, %xmm2, %xmm1
-	palignr $14, %xmm3, %xmm2
-	palignr $14, %xmm4, %xmm3
-	palignr $14, %xmm5, %xmm4
+    movaps -0x1e(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2e(%rsi), %xmm3
+    movaps -0x3e(%rsi), %xmm4
+    movaps -0x4e(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $14, %xmm2, %xmm1
+    palignr $14, %xmm3, %xmm2
+    palignr $14, %xmm4, %xmm3
+    palignr $14, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_14_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_14_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_14_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_15):
-	lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0f(%rsi), %xmm1
-	jb L(L15_fwd)
-	lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
+    lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0f(%rsi), %xmm1
+    jb L(L15_fwd)
+    lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
 L(L15_fwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_15_loop_L2):
-	prefetchnta 0x1c0(%rsi)
+    prefetchnta 0x1c0(%rsi)
 L(shl_15_loop_L1):
-	sub $64, %rdx
-	movaps 0x01(%rsi), %xmm2
-	movaps 0x11(%rsi), %xmm3
-	movaps 0x21(%rsi), %xmm4
-	movaps 0x31(%rsi), %xmm5
-	movdqa %xmm5, %xmm6
-	palignr $15, %xmm4, %xmm5
-	lea 64(%rsi), %rsi
-	palignr $15, %xmm3, %xmm4
-	palignr $15, %xmm2, %xmm3
-	lea 64(%rdi), %rdi
-	palignr $15, %xmm1, %xmm2
-	movdqa %xmm6, %xmm1
-	movdqa %xmm2, -0x40(%rdi)
-	movaps %xmm3, -0x30(%rdi)
-	jb L(shl_15_end)
-	movaps %xmm4, -0x20(%rdi)
-	movaps %xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    sub $64, %rdx
+    movaps 0x01(%rsi), %xmm2
+    movaps 0x11(%rsi), %xmm3
+    movaps 0x21(%rsi), %xmm4
+    movaps 0x31(%rsi), %xmm5
+    movdqa %xmm5, %xmm6
+    palignr $15, %xmm4, %xmm5
+    lea 64(%rsi), %rsi
+    palignr $15, %xmm3, %xmm4
+    palignr $15, %xmm2, %xmm3
+    lea 64(%rdi), %rdi
+    palignr $15, %xmm1, %xmm2
+    movdqa %xmm6, %xmm1
+    movdqa %xmm2, -0x40(%rdi)
+    movaps %xmm3, -0x30(%rdi)
+    jb L(shl_15_end)
+    movaps %xmm4, -0x20(%rdi)
+    movaps %xmm5, -0x10(%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_15_end):
-	movaps %xmm4, -0x20(%rdi)
-	lea 64(%rdx), %rdx
-	movaps %xmm5, -0x10(%rdi)
-	add %rdx, %rdi
-	movdqu %xmm0, (%r8)
-	add %rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, -0x20(%rdi)
+    lea 64(%rdx), %rdx
+    movaps %xmm5, -0x10(%rdi)
+    add %rdx, %rdi
+    movdqu %xmm0, (%r8)
+    add %rdx, %rsi
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(shl_15_bwd):
-	lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
-	cmp %rcx, %rdx
-	movaps -0x0f(%rsi), %xmm1
-	jb L(L15_bwd)
-	lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
+    lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
+    cmp %rcx, %rdx
+    movaps -0x0f(%rsi), %xmm1
+    jb L(L15_bwd)
+    lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
 L(L15_bwd):
-	lea -64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
+    lea -64(%rdx), %rdx
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_15_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
+    prefetchnta -0x1c0(%rsi)
 L(shl_15_bwd_loop_L1):
-	movaps -0x1f(%rsi), %xmm2
-	sub $0x40, %rdx
-	movaps -0x2f(%rsi), %xmm3
-	movaps -0x3f(%rsi), %xmm4
-	movaps -0x4f(%rsi), %xmm5
-	lea -0x40(%rsi), %rsi
-	palignr $15, %xmm2, %xmm1
-	palignr $15, %xmm3, %xmm2
-	palignr $15, %xmm4, %xmm3
-	palignr $15, %xmm5, %xmm4
+    movaps -0x1f(%rsi), %xmm2
+    sub $0x40, %rdx
+    movaps -0x2f(%rsi), %xmm3
+    movaps -0x3f(%rsi), %xmm4
+    movaps -0x4f(%rsi), %xmm5
+    lea -0x40(%rsi), %rsi
+    palignr $15, %xmm2, %xmm1
+    palignr $15, %xmm3, %xmm2
+    palignr $15, %xmm4, %xmm3
+    palignr $15, %xmm5, %xmm4

-	movaps %xmm1, -0x10(%rdi)
-	movaps %xmm5, %xmm1
+    movaps %xmm1, -0x10(%rdi)
+    movaps %xmm5, %xmm1

-	movaps %xmm2, -0x20(%rdi)
-	lea -0x40(%rdi), %rdi
+    movaps %xmm2, -0x20(%rdi)
+    lea -0x40(%rdi), %rdi

-	movaps %xmm3, 0x10(%rdi)
-	jb L(shl_15_bwd_end)
-	movaps %xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
+    movaps %xmm3, 0x10(%rdi)
+    jb L(shl_15_bwd_end)
+    movaps %xmm4, (%rdi)
+    _CET_NOTRACK jmp *%r9
+    ud2
 L(shl_15_bwd_end):
-	movaps %xmm4, (%rdi)
-	lea 64(%rdx), %rdx
-	movdqu %xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+    movaps %xmm4, (%rdi)
+    lea 64(%rdx), %rdx
+    movdqu %xmm0, (%r8)
+    BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)

-	.p2align 4
+    .p2align 4
 L(write_72bytes):
-	movdqu -72(%rsi), %xmm0
-	movdqu -56(%rsi), %xmm1
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rcx
-	movdqu %xmm0, -72(%rdi)
-	movdqu %xmm1, -56(%rdi)
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rcx, -8(%rdi)
-	ret
+    movdqu -72(%rsi), %xmm0
+    movdqu -56(%rsi), %xmm1
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rcx
+    movdqu %xmm0, -72(%rdi)
+    movdqu %xmm1, -56(%rdi)
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rcx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_64bytes):
-	movdqu -64(%rsi), %xmm0
-	mov -48(%rsi), %rcx
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -64(%rdi)
-	mov %rcx, -48(%rdi)
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -64(%rsi), %xmm0
+    mov -48(%rsi), %rcx
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -64(%rdi)
+    mov %rcx, -48(%rdi)
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_56bytes):
-	movdqu -56(%rsi), %xmm0
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rcx
-	movdqu %xmm0, -56(%rdi)
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rcx, -8(%rdi)
-	ret
+    movdqu -56(%rsi), %xmm0
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rcx
+    movdqu %xmm0, -56(%rdi)
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rcx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_48bytes):
-	mov -48(%rsi), %rcx
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %rcx, -48(%rdi)
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -48(%rsi), %rcx
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %rcx, -48(%rdi)
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_40bytes):
-	mov -40(%rsi), %r8
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %r8, -40(%rdi)
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -40(%rsi), %r8
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %r8, -40(%rdi)
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_32bytes):
-	mov -32(%rsi), %r9
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %r9, -32(%rdi)
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -32(%rsi), %r9
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %r9, -32(%rdi)
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_24bytes):
-	mov -24(%rsi), %r10
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %r10, -24(%rdi)
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -24(%rsi), %r10
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %r10, -24(%rdi)
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_16bytes):
-	mov -16(%rsi), %r11
-	mov -8(%rsi), %rdx
-	mov %r11, -16(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -16(%rsi), %r11
+    mov -8(%rsi), %rdx
+    mov %r11, -16(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_8bytes):
-	mov -8(%rsi), %rdx
-	mov %rdx, -8(%rdi)
+    mov -8(%rsi), %rdx
+    mov %rdx, -8(%rdi)
 L(write_0bytes):
-	ret
+    ret

-	.p2align 4
+    .p2align 4
 L(write_73bytes):
-	movdqu -73(%rsi), %xmm0
-	movdqu -57(%rsi), %xmm1
-	mov -41(%rsi), %rcx
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %r8
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -73(%rdi)
-	movdqu %xmm1, -57(%rdi)
-	mov %rcx, -41(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %r8, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -73(%rsi), %xmm0
+    movdqu -57(%rsi), %xmm1
+    mov -41(%rsi), %rcx
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %r8
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -73(%rdi)
+    movdqu %xmm1, -57(%rdi)
+    mov %rcx, -41(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %r8, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_65bytes):
-	movdqu -65(%rsi), %xmm0
-	movdqu -49(%rsi), %xmm1
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -65(%rdi)
-	movdqu %xmm1, -49(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -65(%rsi), %xmm0
+    movdqu -49(%rsi), %xmm1
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -65(%rdi)
+    movdqu %xmm1, -49(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_57bytes):
-	movdqu -57(%rsi), %xmm0
-	mov -41(%rsi), %r8
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -57(%rdi)
-	mov %r8, -41(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -57(%rsi), %xmm0
+    mov -41(%rsi), %r8
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -57(%rdi)
+    mov %r8, -41(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_49bytes):
-	movdqu -49(%rsi), %xmm0
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -49(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -49(%rsi), %xmm0
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -49(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_41bytes):
-	mov -41(%rsi), %r8
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -1(%rsi), %dl
-	mov %r8, -41(%rdi)
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %dl, -1(%rdi)
-	ret
+    mov -41(%rsi), %r8
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -1(%rsi), %dl
+    mov %r8, -41(%rdi)
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %dl, -1(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_33bytes):
-	mov -33(%rsi), %r9
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -1(%rsi), %dl
-	mov %r9, -33(%rdi)
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %dl, -1(%rdi)
-	ret
+    mov -33(%rsi), %r9
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -1(%rsi), %dl
+    mov %r9, -33(%rdi)
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %dl, -1(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_25bytes):
-	mov -25(%rsi), %r10
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -1(%rsi), %dl
-	mov %r10, -25(%rdi)
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %dl, -1(%rdi)
-	ret
+    mov -25(%rsi), %r10
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -1(%rsi), %dl
+    mov %r10, -25(%rdi)
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %dl, -1(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_17bytes):
-	mov -17(%rsi), %r11
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r11, -17(%rdi)
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -17(%rsi), %r11
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r11, -17(%rdi)
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_9bytes):
-	mov -9(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %rcx, -9(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -9(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %rcx, -9(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_1bytes):
-	mov -1(%rsi), %dl
-	mov %dl, -1(%rdi)
-	ret
+    mov -1(%rsi), %dl
+    mov %dl, -1(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_74bytes):
-	movdqu -74(%rsi), %xmm0
-	movdqu -58(%rsi), %xmm1
-	mov -42(%rsi), %r8
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -74(%rdi)
-	movdqu %xmm1, -58(%rdi)
-	mov %r8, -42(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -74(%rsi), %xmm0
+    movdqu -58(%rsi), %xmm1
+    mov -42(%rsi), %r8
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -74(%rdi)
+    movdqu %xmm1, -58(%rdi)
+    mov %r8, -42(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_66bytes):
-	movdqu -66(%rsi), %xmm0
-	movdqu -50(%rsi), %xmm1
-	mov -42(%rsi), %r8
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -66(%rdi)
-	movdqu %xmm1, -50(%rdi)
-	mov %r8, -42(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -66(%rsi), %xmm0
+    movdqu -50(%rsi), %xmm1
+    mov -42(%rsi), %r8
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -66(%rdi)
+    movdqu %xmm1, -50(%rdi)
+    mov %r8, -42(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_58bytes):
-	movdqu -58(%rsi), %xmm1
-	mov -42(%rsi), %r8
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm1, -58(%rdi)
-	mov %r8, -42(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -58(%rsi), %xmm1
+    mov -42(%rsi), %r8
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm1, -58(%rdi)
+    mov %r8, -42(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_50bytes):
-	movdqu -50(%rsi), %xmm0
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -50(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -50(%rsi), %xmm0
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -50(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_42bytes):
-	mov -42(%rsi), %r8
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r8, -42(%rdi)
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -42(%rsi), %r8
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r8, -42(%rdi)
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_34bytes):
-	mov -34(%rsi), %r9
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r9, -34(%rdi)
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -34(%rsi), %r9
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r9, -34(%rdi)
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_26bytes):
-	mov -26(%rsi), %r10
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r10, -26(%rdi)
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -26(%rsi), %r10
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r10, -26(%rdi)
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_18bytes):
-	mov -18(%rsi), %r11
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r11, -18(%rdi)
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -18(%rsi), %r11
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r11, -18(%rdi)
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_10bytes):
-	mov -10(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %rcx, -10(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -10(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %rcx, -10(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_2bytes):
-	mov -2(%rsi), %dx
-	mov %dx, -2(%rdi)
-	ret
+    mov -2(%rsi), %dx
+    mov %dx, -2(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_75bytes):
-	movdqu -75(%rsi), %xmm0
-	movdqu -59(%rsi), %xmm1
-	mov -43(%rsi), %r8
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -75(%rdi)
-	movdqu %xmm1, -59(%rdi)
-	mov %r8, -43(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -75(%rsi), %xmm0
+    movdqu -59(%rsi), %xmm1
+    mov -43(%rsi), %r8
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -75(%rdi)
+    movdqu %xmm1, -59(%rdi)
+    mov %r8, -43(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_67bytes):
-	movdqu -67(%rsi), %xmm0
-	movdqu -59(%rsi), %xmm1
-	mov -43(%rsi), %r8
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -67(%rdi)
-	movdqu %xmm1, -59(%rdi)
-	mov %r8, -43(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -67(%rsi), %xmm0
+    movdqu -59(%rsi), %xmm1
+    mov -43(%rsi), %r8
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -67(%rdi)
+    movdqu %xmm1, -59(%rdi)
+    mov %r8, -43(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_59bytes):
-	movdqu -59(%rsi), %xmm0
-	mov -43(%rsi), %r8
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -59(%rdi)
-	mov %r8, -43(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -59(%rsi), %xmm0
+    mov -43(%rsi), %r8
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -59(%rdi)
+    mov %r8, -43(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_51bytes):
-	movdqu -51(%rsi), %xmm0
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -51(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -51(%rsi), %xmm0
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -51(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_43bytes):
-	mov -43(%rsi), %r8
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r8, -43(%rdi)
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -43(%rsi), %r8
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r8, -43(%rdi)
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_35bytes):
-	mov -35(%rsi), %r9
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r9, -35(%rdi)
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -35(%rsi), %r9
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r9, -35(%rdi)
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_27bytes):
-	mov -27(%rsi), %r10
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r10, -27(%rdi)
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -27(%rsi), %r10
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r10, -27(%rdi)
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_19bytes):
-	mov -19(%rsi), %r11
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r11, -19(%rdi)
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -19(%rsi), %r11
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r11, -19(%rdi)
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_11bytes):
-	mov -11(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %rcx, -11(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -11(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %rcx, -11(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_3bytes):
-	mov -3(%rsi), %dx
-	mov -2(%rsi), %cx
-	mov %dx, -3(%rdi)
-	mov %cx, -2(%rdi)
-	ret
+    mov -3(%rsi), %dx
+    mov -2(%rsi), %cx
+    mov %dx, -3(%rdi)
+    mov %cx, -2(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_76bytes):
-	movdqu -76(%rsi), %xmm0
-	movdqu -60(%rsi), %xmm1
-	mov -44(%rsi), %r8
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -76(%rdi)
-	movdqu %xmm1, -60(%rdi)
-	mov %r8, -44(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -76(%rsi), %xmm0
+    movdqu -60(%rsi), %xmm1
+    mov -44(%rsi), %r8
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -76(%rdi)
+    movdqu %xmm1, -60(%rdi)
+    mov %r8, -44(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_68bytes):
-	movdqu -68(%rsi), %xmm0
-	movdqu -52(%rsi), %xmm1
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -68(%rdi)
-	movdqu %xmm1, -52(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -68(%rsi), %xmm0
+    movdqu -52(%rsi), %xmm1
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -68(%rdi)
+    movdqu %xmm1, -52(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_60bytes):
-	movdqu -60(%rsi), %xmm0
-	mov -44(%rsi), %r8
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -60(%rdi)
-	mov %r8, -44(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -60(%rsi), %xmm0
+    mov -44(%rsi), %r8
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -60(%rdi)
+    mov %r8, -44(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_52bytes):
-	movdqu -52(%rsi), %xmm0
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	movdqu %xmm0, -52(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    movdqu -52(%rsi), %xmm0
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    movdqu %xmm0, -52(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_44bytes):
-	mov -44(%rsi), %r8
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r8, -44(%rdi)
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -44(%rsi), %r8
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r8, -44(%rdi)
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_36bytes):
-	mov -36(%rsi), %r9
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r9, -36(%rdi)
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -36(%rsi), %r9
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r9, -36(%rdi)
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_28bytes):
-	mov -28(%rsi), %r10
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r10, -28(%rdi)
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -28(%rsi), %r10
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r10, -28(%rdi)
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_20bytes):
-	mov -20(%rsi), %r11
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %r11, -20(%rdi)
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -20(%rsi), %r11
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %r11, -20(%rdi)
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_12bytes):
-	mov -12(%rsi), %rcx
-	mov -4(%rsi), %edx
-	mov %rcx, -12(%rdi)
-	mov %edx, -4(%rdi)
-	ret
+    mov -12(%rsi), %rcx
+    mov -4(%rsi), %edx
+    mov %rcx, -12(%rdi)
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_4bytes):
-	mov -4(%rsi), %edx
-	mov %edx, -4(%rdi)
-	ret
+    mov -4(%rsi), %edx
+    mov %edx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_77bytes):
-	movdqu -77(%rsi), %xmm0
-	movdqu -61(%rsi), %xmm1
-	mov -45(%rsi), %r8
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -77(%rdi)
-	movdqu %xmm1, -61(%rdi)
-	mov %r8, -45(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -77(%rsi), %xmm0
+    movdqu -61(%rsi), %xmm1
+    mov -45(%rsi), %r8
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -77(%rdi)
+    movdqu %xmm1, -61(%rdi)
+    mov %r8, -45(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_69bytes):
-	movdqu -69(%rsi), %xmm0
-	movdqu -53(%rsi), %xmm1
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -69(%rdi)
-	movdqu %xmm1, -53(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -69(%rsi), %xmm0
+    movdqu -53(%rsi), %xmm1
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -69(%rdi)
+    movdqu %xmm1, -53(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_61bytes):
-	movdqu -61(%rsi), %xmm0
-	mov -45(%rsi), %r8
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -61(%rdi)
-	mov %r8, -45(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -61(%rsi), %xmm0
+    mov -45(%rsi), %r8
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -61(%rdi)
+    mov %r8, -45(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_53bytes):
-	movdqu -53(%rsi), %xmm0
-	mov -45(%rsi), %r8
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -53(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -53(%rsi), %xmm0
+    mov -45(%rsi), %r8
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -53(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_45bytes):
-	mov -45(%rsi), %r8
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %r8, -45(%rdi)
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -45(%rsi), %r8
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %r8, -45(%rdi)
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_37bytes):
-	mov -37(%rsi), %r9
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %r9, -37(%rdi)
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -37(%rsi), %r9
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %r9, -37(%rdi)
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_29bytes):
-	mov -29(%rsi), %r10
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %r10, -29(%rdi)
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -29(%rsi), %r10
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %r10, -29(%rdi)
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_21bytes):
-	mov -21(%rsi), %r11
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %r11, -21(%rdi)
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -21(%rsi), %r11
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %r11, -21(%rdi)
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_13bytes):
-	mov -13(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	mov %rcx, -13(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    mov -13(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    mov %rcx, -13(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_5bytes):
-	mov -5(%rsi), %edx
-	mov -4(%rsi), %ecx
-	mov %edx, -5(%rdi)
-	mov %ecx, -4(%rdi)
-	ret
+    mov -5(%rsi), %edx
+    mov -4(%rsi), %ecx
+    mov %edx, -5(%rdi)
+    mov %ecx, -4(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_78bytes):
-	movdqu -78(%rsi), %xmm0
-	movdqu -62(%rsi), %xmm1
-	mov -46(%rsi), %r8
-	mov -38(%rsi), %r9
-	mov -30(%rsi), %r10
-	mov -22(%rsi), %r11
-	mov -14(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -78(%rdi)
-	movdqu %xmm1, -62(%rdi)
-	mov %r8, -46(%rdi)
-	mov %r9, -38(%rdi)
-	mov %r10, -30(%rdi)
-	mov %r11, -22(%rdi)
-	mov %rcx, -14(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -78(%rsi), %xmm0
+    movdqu -62(%rsi), %xmm1
+    mov -46(%rsi), %r8
+    mov -38(%rsi), %r9
+    mov -30(%rsi), %r10
+    mov -22(%rsi), %r11
+    mov -14(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -78(%rdi)
+    movdqu %xmm1, -62(%rdi)
+    mov %r8, -46(%rdi)
+    mov %r9, -38(%rdi)
+    mov %r10, -30(%rdi)
+    mov %r11, -22(%rdi)
+    mov %rcx, -14(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_70bytes):
-	movdqu -70(%rsi), %xmm0
-	movdqu -54(%rsi), %xmm1
-	mov -38(%rsi), %r9
-	mov -30(%rsi), %r10
-	mov -22(%rsi), %r11
-	mov -14(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -70(%rdi)
-	movdqu %xmm1, -54(%rdi)
-	mov %r9, -38(%rdi)
-	mov %r10, -30(%rdi)
-	mov %r11, -22(%rdi)
-	mov %rcx, -14(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -70(%rsi), %xmm0
+    movdqu -54(%rsi), %xmm1
+    mov -38(%rsi), %r9
+    mov -30(%rsi), %r10
+    mov -22(%rsi), %r11
+    mov -14(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -70(%rdi)
+    movdqu %xmm1, -54(%rdi)
+    mov %r9, -38(%rdi)
+    mov %r10, -30(%rdi)
+    mov %r11, -22(%rdi)
+    mov %rcx, -14(%rdi)
+    mov %rdx, -8(%rdi)
+    ret

-	.p2align 4
+    .p2align 4
 L(write_62bytes):
-	movdqu -62(%rsi), %xmm0
-	mov -46(%rsi), %r8
-	mov -38(%rsi), %r9
-	mov -30(%rsi), %r10
-	mov -22(%rsi), %r11
-	mov -14(%rsi), %rcx
-	mov -8(%rsi), %rdx
-	movdqu %xmm0, -62(%rdi)
-	mov %r8, -46(%rdi)
-	mov %r9, -38(%rdi)
-	mov %r10, -30(%rdi)
-	mov %r11, -22(%rdi)
-	mov %rcx, -14(%rdi)
-	mov %rdx, -8(%rdi)
-	ret
+    movdqu -62(%rsi), %xmm0
+    mov -46(%rsi), %r8
+    mov -38(%rsi), %r9
+    mov -30(%rsi), %r10
+    mov -22(%rsi), %r11
+    mov -14(%rsi), %rcx
+    mov -8(%rsi), %rdx
+    movdqu %xmm0, -62(%rdi)
+    mov %r8, -46(%rdi)
+    mov %r9, -38(%rdi)
+    mov %r10, -30(%rdi)
+    mov %r11, -22(%rdi)
+    mov %rcx, -14(%rdi)
+    mov %rdx,
-8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_54bytes): - movdqu -54(%rsi), %xmm0 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -54(%rsi), %xmm0 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_46bytes): - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_38bytes): - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_30bytes): - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_22bytes): - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_14bytes): - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(write_79bytes): - movdqu -79(%rsi), %xmm0 - movdqu -63(%rsi), %xmm1 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -79(%rdi) - movdqu %xmm1, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -79(%rsi), %xmm0 + movdqu -63(%rsi), %xmm1 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -79(%rdi) + movdqu %xmm1, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov 
%r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_71bytes): - movdqu -71(%rsi), %xmm0 - movdqu -55(%rsi), %xmm1 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -71(%rdi) - movdqu %xmm1, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -71(%rsi), %xmm0 + movdqu -55(%rsi), %xmm1 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -71(%rdi) + movdqu %xmm1, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_63bytes): - movdqu -63(%rsi), %xmm0 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -63(%rsi), %xmm0 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_55bytes): - movdqu -55(%rsi), %xmm0 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + movdqu -55(%rsi), %xmm0 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_47bytes): - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_39bytes): - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_31bytes): - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_23bytes): - mov -23(%rsi), 
%r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_15bytes): - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret - .p2align 4 + .p2align 4 L(write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret - .p2align 4 + .p2align 4 L(large_page_fwd): - movdqu (%rsi), %xmm1 - lea 16(%rsi), %rsi - movdqu %xmm0, (%r8) - movntdq %xmm1, (%rdi) - lea 16(%rdi), %rdi - lea -0x90(%rdx), %rdx + movdqu (%rsi), %xmm1 + lea 16(%rsi), %rsi + movdqu %xmm0, (%r8) + movntdq %xmm1, (%rdi) + lea 16(%rdi), %rdi + lea -0x90(%rdx), %rdx #ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - shl $2, %rcx - cmp %rcx, %rdx - jb L(ll_cache_copy_fwd_start) + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + shl $2, %rcx + cmp %rcx, %rdx + jb L(ll_cache_copy_fwd_start) L(memmove_is_memcpy_fwd): #endif L(large_page_loop): - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(large_page_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_64bytes) + sub $0x80, %rdx + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(large_page_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_64bytes) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx L(large_page_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + add %rdx, %rsi + add %rdx, %rdi + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #ifdef USE_AS_MEMMOVE - .p2align 4 + .p2align 4 L(ll_cache_copy_fwd_start): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x200(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), 
%xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi + prefetcht0 0x1c0(%rsi) + prefetcht0 0x200(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(ll_cache_copy_fwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_fwd_64bytes) + sub $0x80, %rdx + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(ll_cache_copy_fwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_fwd_64bytes) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx L(large_page_ll_less_fwd_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #endif - .p2align 4 + .p2align 4 L(large_page_bwd): - movdqu -0x10(%rsi), %xmm1 - lea -16(%rsi), %rsi - movdqu %xmm0, (%r8) - movdqa %xmm1, -0x10(%rdi) - lea -16(%rdi), %rdi - lea -0x90(%rdx), %rdx + movdqu -0x10(%rsi), %xmm1 + lea -16(%rsi), %rsi + movdqu %xmm0, (%r8) + movdqa %xmm1, -0x10(%rdi) + lea -16(%rdi), %rdi + lea -0x90(%rdx), %rdx #ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jb L(ll_cache_copy_bwd_start) + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jb L(ll_cache_copy_bwd_start) L(memmove_is_memcpy_bwd): #endif L(large_page_bwd_loop): - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - movntdq %xmm4, -0x50(%rdi) - movntdq %xmm5, -0x60(%rdi) - movntdq %xmm6, -0x70(%rdi) - movntdq %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(large_page_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_bwd_64bytes) + sub $0x80, %rdx + movntdq %xmm0, -0x10(%rdi) + 
movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + movntdq %xmm4, -0x50(%rdi) + movntdq %xmm5, -0x60(%rdi) + movntdq %xmm6, -0x70(%rdi) + movntdq %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(large_page_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_bwd_64bytes) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx L(large_page_less_bwd_64bytes): - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #ifdef USE_AS_MEMMOVE - .p2align 4 + .p2align 4 L(ll_cache_copy_bwd_start): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x200(%rsi) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x200(%rsi) + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(ll_cache_copy_bwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_bwd_64bytes) + sub $0x80, %rdx + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(ll_cache_copy_bwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_bwd_64bytes) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx L(large_page_ll_less_bwd_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #endif END (MEMCPY) - .section .rodata.ssse3,"a",@progbits - .p2align 3 + .section .rodata.ssse3,"a",@progbits + .p2align 3 L(table_less_80bytes): - .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_2bytes), 
L(table_less_80bytes)) - .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) - .int 
JMPTBL (L(write_66bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_49bytes), 
L(table_less_80bytes)) + .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) - .p2align 3 + .p2align 3 L(shl_table): - .int JMPTBL (L(shl_0), L(shl_table)) - .int JMPTBL (L(shl_1), L(shl_table)) - .int JMPTBL (L(shl_2), L(shl_table)) - .int JMPTBL (L(shl_3), L(shl_table)) - .int JMPTBL (L(shl_4), L(shl_table)) - .int JMPTBL (L(shl_5), L(shl_table)) - .int JMPTBL (L(shl_6), L(shl_table)) - .int JMPTBL (L(shl_7), L(shl_table)) - .int JMPTBL (L(shl_8), L(shl_table)) - .int JMPTBL (L(shl_9), L(shl_table)) - .int JMPTBL (L(shl_10), L(shl_table)) - .int JMPTBL (L(shl_11), L(shl_table)) - .int JMPTBL (L(shl_12), L(shl_table)) - .int JMPTBL (L(shl_13), L(shl_table)) - .int JMPTBL (L(shl_14), L(shl_table)) - .int JMPTBL (L(shl_15), L(shl_table)) + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) - .p2align 3 + .p2align 3 L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), 
L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) #endif diff --git a/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S index 9ee6f0a71c3..2de73b29a85 100644 --- a/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S +++ b/utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S @@ -1,12 +1,12 @@ #if 1 -# define VEC_SIZE 32 -# define VEC(i) ymm##i -# define VMOVNT vmovntdq -# define VMOVU vmovdqu -# define VMOVA vmovdqa +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa -# define SECTION(p) p##.avx -# define MEMMOVE_SYMBOL(p,s) p##_avx_##s +# define SECTION(p) p##.avx +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s # include "memmove-vec-unaligned-erms.S" #endif diff --git a/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S b/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S index b14d92fd6a8..3effa845274 100644 --- a/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S +++ b/utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S @@ -22,396 +22,396 @@ # include "asm-syntax.h" - .section .text.avx512,"ax",@progbits + .section .text.avx512,"ax",@progbits ENTRY (__mempcpy_chk_avx512_no_vzeroupper) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (__mempcpy_chk_avx512_no_vzeroupper) ENTRY (__mempcpy_avx512_no_vzeroupper) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) END (__mempcpy_avx512_no_vzeroupper) ENTRY (__memmove_chk_avx512_no_vzeroupper) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (__memmove_chk_avx512_no_vzeroupper) ENTRY (__memmove_avx512_no_vzeroupper) - mov %RDI_LP, %RAX_LP + mov %RDI_LP, %RAX_LP # ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP + add %RDX_LP, %RAX_LP # endif L(start): # ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx # endif - lea (%rsi, %rdx), %rcx - lea (%rdi, %rdx), %r9 - cmp $512, %rdx - ja L(512bytesormore) + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx + ja L(512bytesormore) L(check): - cmp $16, %rdx - jbe L(less_16bytes) - cmp $256, %rdx - jb L(less_256bytes) - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups 0x80(%rsi), %zmm2 - vmovups 0xC0(%rsi), %zmm3 - vmovups -0x100(%rcx), %zmm4 - vmovups -0xC0(%rcx), %zmm5 - vmovups -0x80(%rcx), %zmm6 - vmovups -0x40(%rcx), %zmm7 - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, 0x80(%rdi) - vmovups %zmm3, 0xC0(%rdi) - vmovups %zmm4, -0x100(%r9) - vmovups %zmm5, -0xC0(%r9) - vmovups %zmm6, -0x80(%r9) - vmovups %zmm7, -0x40(%r9) - ret + cmp $16, %rdx + jbe L(less_16bytes) + cmp $256, %rdx + jb L(less_256bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups -0x100(%rcx), %zmm4 + vmovups -0xC0(%rcx), %zmm5 + vmovups -0x80(%rcx), %zmm6 + vmovups -0x40(%rcx), %zmm7 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, -0x100(%r9) + vmovups %zmm5, -0xC0(%r9) + vmovups %zmm6, -0x80(%r9) + vmovups %zmm7, -0x40(%r9) + ret L(less_256bytes): - cmp $128, %dl - jb L(less_128bytes) - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups -0x80(%rcx), %zmm2 - vmovups -0x40(%rcx), %zmm3 - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, -0x80(%r9) - vmovups %zmm3, -0x40(%r9) - ret + cmp $128, %dl + jb L(less_128bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups -0x80(%rcx), %zmm2 + vmovups -0x40(%rcx), %zmm3 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, -0x80(%r9) + vmovups %zmm3, -0x40(%r9) + ret L(less_128bytes): - cmp $64, %dl - jb L(less_64bytes) - vmovdqu (%rsi), %ymm0 - vmovdqu 0x20(%rsi), %ymm1 - vmovdqu -0x40(%rcx), %ymm2 - vmovdqu -0x20(%rcx), %ymm3 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, 0x20(%rdi) - vmovdqu %ymm2, -0x40(%r9) - vmovdqu %ymm3, -0x20(%r9) - ret + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu -0x40(%rcx), %ymm2 + vmovdqu -0x20(%rcx), %ymm3 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 0x20(%rdi) + vmovdqu %ymm2, -0x40(%r9) + vmovdqu %ymm3, -0x20(%r9) + ret L(less_64bytes): - cmp $32, %dl - jb L(less_32bytes) - vmovdqu (%rsi), %ymm0 - vmovdqu -0x20(%rcx), %ymm1 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, -0x20(%r9) - ret + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu -0x20(%rcx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -0x20(%r9) + ret L(less_32bytes): - vmovdqu (%rsi), %xmm0 - vmovdqu -0x10(%rcx), %xmm1 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, -0x10(%r9) - ret + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rcx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -0x10(%r9) + ret L(less_16bytes): - cmp $8, %dl - jb L(less_8bytes) - movq (%rsi), %rsi - movq -0x8(%rcx), %rcx - movq %rsi, (%rdi) - movq %rcx, -0x8(%r9) - ret + cmp $8, %dl + jb L(less_8bytes) + movq (%rsi), %rsi + movq -0x8(%rcx), %rcx + movq %rsi, (%rdi) + movq %rcx, -0x8(%r9) + ret L(less_8bytes): - cmp $4, %dl - jb L(less_4bytes) - mov (%rsi), %esi - mov -0x4(%rcx), %ecx - mov %esi, (%rdi) - mov %ecx, -0x4(%r9) - ret + cmp $4, %dl + jb L(less_4bytes) + mov (%rsi), %esi + mov -0x4(%rcx), %ecx + mov %esi, (%rdi) + mov %ecx, -0x4(%r9) + ret L(less_4bytes): - cmp $2, %dl - jb L(less_2bytes) - mov (%rsi), %si - mov -0x2(%rcx), %cx - mov %si, (%rdi) - mov 
%cx, -0x2(%r9) - ret + cmp $2, %dl + jb L(less_2bytes) + mov (%rsi), %si + mov -0x2(%rcx), %cx + mov %si, (%rdi) + mov %cx, -0x2(%r9) + ret L(less_2bytes): - cmp $1, %dl - jb L(less_1bytes) - mov (%rsi), %cl - mov %cl, (%rdi) + cmp $1, %dl + jb L(less_1bytes) + mov (%rsi), %cl + mov %cl, (%rdi) L(less_1bytes): - ret + ret L(512bytesormore): # ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %r8 + mov $SHARED_CACHE_SIZE_HALF, %r8 # else - mov __x86_shared_cache_size_half(%rip), %r8 + mov __x86_shared_cache_size_half(%rip), %r8 # endif - cmp %r8, %rdx - jae L(preloop_large) - cmp $1024, %rdx - ja L(1024bytesormore) - prefetcht1 (%rsi) - prefetcht1 0x40(%rsi) - prefetcht1 0x80(%rsi) - prefetcht1 0xC0(%rsi) - prefetcht1 0x100(%rsi) - prefetcht1 0x140(%rsi) - prefetcht1 0x180(%rsi) - prefetcht1 0x1C0(%rsi) - prefetcht1 -0x200(%rcx) - prefetcht1 -0x1C0(%rcx) - prefetcht1 -0x180(%rcx) - prefetcht1 -0x140(%rcx) - prefetcht1 -0x100(%rcx) - prefetcht1 -0xC0(%rcx) - prefetcht1 -0x80(%rcx) - prefetcht1 -0x40(%rcx) - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups 0x80(%rsi), %zmm2 - vmovups 0xC0(%rsi), %zmm3 - vmovups 0x100(%rsi), %zmm4 - vmovups 0x140(%rsi), %zmm5 - vmovups 0x180(%rsi), %zmm6 - vmovups 0x1C0(%rsi), %zmm7 - vmovups -0x200(%rcx), %zmm8 - vmovups -0x1C0(%rcx), %zmm9 - vmovups -0x180(%rcx), %zmm10 - vmovups -0x140(%rcx), %zmm11 - vmovups -0x100(%rcx), %zmm12 - vmovups -0xC0(%rcx), %zmm13 - vmovups -0x80(%rcx), %zmm14 - vmovups -0x40(%rcx), %zmm15 - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, 0x80(%rdi) - vmovups %zmm3, 0xC0(%rdi) - vmovups %zmm4, 0x100(%rdi) - vmovups %zmm5, 0x140(%rdi) - vmovups %zmm6, 0x180(%rdi) - vmovups %zmm7, 0x1C0(%rdi) - vmovups %zmm8, -0x200(%r9) - vmovups %zmm9, -0x1C0(%r9) - vmovups %zmm10, -0x180(%r9) - vmovups %zmm11, -0x140(%r9) - vmovups %zmm12, -0x100(%r9) - vmovups %zmm13, -0xC0(%r9) - vmovups %zmm14, -0x80(%r9) - vmovups %zmm15, -0x40(%r9) - ret + cmp %r8, %rdx + jae L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + prefetcht1 -0x200(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0x40(%rcx) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + vmovups %zmm8, -0x200(%r9) + vmovups %zmm9, -0x1C0(%r9) + vmovups %zmm10, -0x180(%r9) + vmovups %zmm11, -0x140(%r9) + vmovups %zmm12, -0x100(%r9) + vmovups %zmm13, -0xC0(%r9) + vmovups %zmm14, -0x80(%r9) + vmovups %zmm15, -0x40(%r9) + ret L(1024bytesormore): - cmp %rsi, %rdi - ja L(1024bytesormore_bkw) - sub $512, %r9 - vmovups -0x200(%rcx), %zmm8 - vmovups -0x1C0(%rcx), %zmm9 - vmovups -0x180(%rcx), %zmm10 - vmovups 
-0x140(%rcx), %zmm11 - vmovups -0x100(%rcx), %zmm12 - vmovups -0xC0(%rcx), %zmm13 - vmovups -0x80(%rcx), %zmm14 - vmovups -0x40(%rcx), %zmm15 - prefetcht1 (%rsi) - prefetcht1 0x40(%rsi) - prefetcht1 0x80(%rsi) - prefetcht1 0xC0(%rsi) - prefetcht1 0x100(%rsi) - prefetcht1 0x140(%rsi) - prefetcht1 0x180(%rsi) - prefetcht1 0x1C0(%rsi) + cmp %rsi, %rdi + ja L(1024bytesormore_bkw) + sub $512, %r9 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) /* Loop with unaligned memory access. */ L(gobble_512bytes_loop): - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups 0x80(%rsi), %zmm2 - vmovups 0xC0(%rsi), %zmm3 - vmovups 0x100(%rsi), %zmm4 - vmovups 0x140(%rsi), %zmm5 - vmovups 0x180(%rsi), %zmm6 - vmovups 0x1C0(%rsi), %zmm7 - add $512, %rsi - prefetcht1 (%rsi) - prefetcht1 0x40(%rsi) - prefetcht1 0x80(%rsi) - prefetcht1 0xC0(%rsi) - prefetcht1 0x100(%rsi) - prefetcht1 0x140(%rsi) - prefetcht1 0x180(%rsi) - prefetcht1 0x1C0(%rsi) - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, 0x80(%rdi) - vmovups %zmm3, 0xC0(%rdi) - vmovups %zmm4, 0x100(%rdi) - vmovups %zmm5, 0x140(%rdi) - vmovups %zmm6, 0x180(%rdi) - vmovups %zmm7, 0x1C0(%rdi) - add $512, %rdi - cmp %r9, %rdi - jb L(gobble_512bytes_loop) - vmovups %zmm8, (%r9) - vmovups %zmm9, 0x40(%r9) - vmovups %zmm10, 0x80(%r9) - vmovups %zmm11, 0xC0(%r9) - vmovups %zmm12, 0x100(%r9) - vmovups %zmm13, 0x140(%r9) - vmovups %zmm14, 0x180(%r9) - vmovups %zmm15, 0x1C0(%r9) - ret + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + add $512, %rsi + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + add $512, %rdi + cmp %r9, %rdi + jb L(gobble_512bytes_loop) + vmovups %zmm8, (%r9) + vmovups %zmm9, 0x40(%r9) + vmovups %zmm10, 0x80(%r9) + vmovups %zmm11, 0xC0(%r9) + vmovups %zmm12, 0x100(%r9) + vmovups %zmm13, 0x140(%r9) + vmovups %zmm14, 0x180(%r9) + vmovups %zmm15, 0x1C0(%r9) + ret L(1024bytesormore_bkw): - add $512, %rdi - vmovups 0x1C0(%rsi), %zmm8 - vmovups 0x180(%rsi), %zmm9 - vmovups 0x140(%rsi), %zmm10 - vmovups 0x100(%rsi), %zmm11 - vmovups 0xC0(%rsi), %zmm12 - vmovups 0x80(%rsi), %zmm13 - vmovups 0x40(%rsi), %zmm14 - vmovups (%rsi), %zmm15 - prefetcht1 -0x40(%rcx) - prefetcht1 -0x80(%rcx) - prefetcht1 -0xC0(%rcx) - prefetcht1 -0x100(%rcx) - prefetcht1 -0x140(%rcx) - prefetcht1 -0x180(%rcx) - prefetcht1 -0x1C0(%rcx) - prefetcht1 -0x200(%rcx) + add $512, %rdi + vmovups 0x1C0(%rsi), %zmm8 + vmovups 0x180(%rsi), %zmm9 + vmovups 0x140(%rsi), %zmm10 + vmovups 0x100(%rsi), %zmm11 + vmovups 0xC0(%rsi), %zmm12 + vmovups 0x80(%rsi), %zmm13 + vmovups 0x40(%rsi), %zmm14 + vmovups (%rsi), %zmm15 + prefetcht1 -0x40(%rcx) + prefetcht1 
-0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) /* Backward loop with unaligned memory access. */ L(gobble_512bytes_loop_bkw): - vmovups -0x40(%rcx), %zmm0 - vmovups -0x80(%rcx), %zmm1 - vmovups -0xC0(%rcx), %zmm2 - vmovups -0x100(%rcx), %zmm3 - vmovups -0x140(%rcx), %zmm4 - vmovups -0x180(%rcx), %zmm5 - vmovups -0x1C0(%rcx), %zmm6 - vmovups -0x200(%rcx), %zmm7 - sub $512, %rcx - prefetcht1 -0x40(%rcx) - prefetcht1 -0x80(%rcx) - prefetcht1 -0xC0(%rcx) - prefetcht1 -0x100(%rcx) - prefetcht1 -0x140(%rcx) - prefetcht1 -0x180(%rcx) - prefetcht1 -0x1C0(%rcx) - prefetcht1 -0x200(%rcx) - vmovups %zmm0, -0x40(%r9) - vmovups %zmm1, -0x80(%r9) - vmovups %zmm2, -0xC0(%r9) - vmovups %zmm3, -0x100(%r9) - vmovups %zmm4, -0x140(%r9) - vmovups %zmm5, -0x180(%r9) - vmovups %zmm6, -0x1C0(%r9) - vmovups %zmm7, -0x200(%r9) - sub $512, %r9 - cmp %rdi, %r9 - ja L(gobble_512bytes_loop_bkw) - vmovups %zmm8, -0x40(%rdi) - vmovups %zmm9, -0x80(%rdi) - vmovups %zmm10, -0xC0(%rdi) - vmovups %zmm11, -0x100(%rdi) - vmovups %zmm12, -0x140(%rdi) - vmovups %zmm13, -0x180(%rdi) - vmovups %zmm14, -0x1C0(%rdi) - vmovups %zmm15, -0x200(%rdi) - ret + vmovups -0x40(%rcx), %zmm0 + vmovups -0x80(%rcx), %zmm1 + vmovups -0xC0(%rcx), %zmm2 + vmovups -0x100(%rcx), %zmm3 + vmovups -0x140(%rcx), %zmm4 + vmovups -0x180(%rcx), %zmm5 + vmovups -0x1C0(%rcx), %zmm6 + vmovups -0x200(%rcx), %zmm7 + sub $512, %rcx + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + vmovups %zmm0, -0x40(%r9) + vmovups %zmm1, -0x80(%r9) + vmovups %zmm2, -0xC0(%r9) + vmovups %zmm3, -0x100(%r9) + vmovups %zmm4, -0x140(%r9) + vmovups %zmm5, -0x180(%r9) + vmovups %zmm6, -0x1C0(%r9) + vmovups %zmm7, -0x200(%r9) + sub $512, %r9 + cmp %rdi, %r9 + ja L(gobble_512bytes_loop_bkw) + vmovups %zmm8, -0x40(%rdi) + vmovups %zmm9, -0x80(%rdi) + vmovups %zmm10, -0xC0(%rdi) + vmovups %zmm11, -0x100(%rdi) + vmovups %zmm12, -0x140(%rdi) + vmovups %zmm13, -0x180(%rdi) + vmovups %zmm14, -0x1C0(%rdi) + vmovups %zmm15, -0x200(%rdi) + ret L(preloop_large): - cmp %rsi, %rdi - ja L(preloop_large_bkw) - vmovups (%rsi), %zmm4 - vmovups 0x40(%rsi), %zmm5 + cmp %rsi, %rdi + ja L(preloop_large_bkw) + vmovups (%rsi), %zmm4 + vmovups 0x40(%rsi), %zmm5 - mov %rdi, %r11 + mov %rdi, %r11 /* Align destination for access with non-temporal stores in the loop. 
*/
- mov %rdi, %r8
- and $-0x80, %rdi
- add $0x80, %rdi
- sub %rdi, %r8
- sub %r8, %rsi
- add %r8, %rdx
+ mov %rdi, %r8
+ and $-0x80, %rdi
+ add $0x80, %rdi
+ sub %rdi, %r8
+ sub %r8, %rsi
+ add %r8, %rdx
L(gobble_256bytes_nt_loop):
- prefetcht1 0x200(%rsi)
- prefetcht1 0x240(%rsi)
- prefetcht1 0x280(%rsi)
- prefetcht1 0x2C0(%rsi)
- prefetcht1 0x300(%rsi)
- prefetcht1 0x340(%rsi)
- prefetcht1 0x380(%rsi)
- prefetcht1 0x3C0(%rsi)
- vmovdqu64 (%rsi), %zmm0
- vmovdqu64 0x40(%rsi), %zmm1
- vmovdqu64 0x80(%rsi), %zmm2
- vmovdqu64 0xC0(%rsi), %zmm3
- vmovntdq %zmm0, (%rdi)
- vmovntdq %zmm1, 0x40(%rdi)
- vmovntdq %zmm2, 0x80(%rdi)
- vmovntdq %zmm3, 0xC0(%rdi)
- sub $256, %rdx
- add $256, %rsi
- add $256, %rdi
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop)
- sfence
- vmovups %zmm4, (%r11)
- vmovups %zmm5, 0x40(%r11)
- jmp L(check)
+ prefetcht1 0x200(%rsi)
+ prefetcht1 0x240(%rsi)
+ prefetcht1 0x280(%rsi)
+ prefetcht1 0x2C0(%rsi)
+ prefetcht1 0x300(%rsi)
+ prefetcht1 0x340(%rsi)
+ prefetcht1 0x380(%rsi)
+ prefetcht1 0x3C0(%rsi)
+ vmovdqu64 (%rsi), %zmm0
+ vmovdqu64 0x40(%rsi), %zmm1
+ vmovdqu64 0x80(%rsi), %zmm2
+ vmovdqu64 0xC0(%rsi), %zmm3
+ vmovntdq %zmm0, (%rdi)
+ vmovntdq %zmm1, 0x40(%rdi)
+ vmovntdq %zmm2, 0x80(%rdi)
+ vmovntdq %zmm3, 0xC0(%rdi)
+ sub $256, %rdx
+ add $256, %rsi
+ add $256, %rdi
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop)
+ sfence
+ vmovups %zmm4, (%r11)
+ vmovups %zmm5, 0x40(%r11)
+ jmp L(check)
L(preloop_large_bkw):
- vmovups -0x80(%rcx), %zmm4
- vmovups -0x40(%rcx), %zmm5
+ vmovups -0x80(%rcx), %zmm4
+ vmovups -0x40(%rcx), %zmm5
/* Align end of destination for access with non-temporal stores. */
- mov %r9, %r8
- and $-0x80, %r9
- sub %r9, %r8
- sub %r8, %rcx
- sub %r8, %rdx
- add %r9, %r8
+ mov %r9, %r8
+ and $-0x80, %r9
+ sub %r9, %r8
+ sub %r8, %rcx
+ sub %r8, %rdx
+ add %r9, %r8
L(gobble_256bytes_nt_loop_bkw):
- prefetcht1 -0x400(%rcx)
- prefetcht1 -0x3C0(%rcx)
- prefetcht1 -0x380(%rcx)
- prefetcht1 -0x340(%rcx)
- prefetcht1 -0x300(%rcx)
- prefetcht1 -0x2C0(%rcx)
- prefetcht1 -0x280(%rcx)
- prefetcht1 -0x240(%rcx)
- vmovdqu64 -0x100(%rcx), %zmm0
- vmovdqu64 -0xC0(%rcx), %zmm1
- vmovdqu64 -0x80(%rcx), %zmm2
- vmovdqu64 -0x40(%rcx), %zmm3
- vmovntdq %zmm0, -0x100(%r9)
- vmovntdq %zmm1, -0xC0(%r9)
- vmovntdq %zmm2, -0x80(%r9)
- vmovntdq %zmm3, -0x40(%r9)
- sub $256, %rdx
- sub $256, %rcx
- sub $256, %r9
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop_bkw)
- sfence
- vmovups %zmm4, -0x80(%r8)
- vmovups %zmm5, -0x40(%r8)
- jmp L(check)
+ prefetcht1 -0x400(%rcx)
+ prefetcht1 -0x3C0(%rcx)
+ prefetcht1 -0x380(%rcx)
+ prefetcht1 -0x340(%rcx)
+ prefetcht1 -0x300(%rcx)
+ prefetcht1 -0x2C0(%rcx)
+ prefetcht1 -0x280(%rcx)
+ prefetcht1 -0x240(%rcx)
+ vmovdqu64 -0x100(%rcx), %zmm0
+ vmovdqu64 -0xC0(%rcx), %zmm1
+ vmovdqu64 -0x80(%rcx), %zmm2
+ vmovdqu64 -0x40(%rcx), %zmm3
+ vmovntdq %zmm0, -0x100(%r9)
+ vmovntdq %zmm1, -0xC0(%r9)
+ vmovntdq %zmm2, -0x80(%r9)
+ vmovntdq %zmm3, -0x40(%r9)
+ sub $256, %rdx
+ sub $256, %rcx
+ sub $256, %r9
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop_bkw)
+ sfence
+ vmovups %zmm4, -0x80(%r8)
+ vmovups %zmm5, -0x40(%r8)
+ jmp L(check)
END (__memmove_avx512_no_vzeroupper)
strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
diff --git a/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S
index db70fdf1b4e..9666b05f1c5 100644
--- a/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S
+++ b/utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S
@@ -1,12 +1,12 @@
#if 1
-# define VEC_SIZE 64
-# define VEC(i) zmm##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VEC_SIZE 64
+# define VEC(i) zmm##i
+# define VMOVNT vmovntdq
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
-# define SECTION(p) p##.avx512
-# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
+# define SECTION(p) p##.avx512
+# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
# include "memmove-vec-unaligned-erms.S"
#endif
diff --git a/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S
index 17b4f861621..ad405be479e 100644
--- a/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S
+++ b/utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S
@@ -17,7 +17,7 @@
. */
#if 1
-# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
+# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
#else
weak_alias (__mempcpy, mempcpy)
#endif
diff --git a/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S b/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S
index 21be351b4e7..097ff6ca617 100644
--- a/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S
+++ b/utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S
@@ -37,15 +37,15 @@
#include "sysdep.h"
#ifndef MEMCPY_SYMBOL
-# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef MEMPCPY_SYMBOL
-# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef MEMMOVE_CHK_SYMBOL
-# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
+# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef VZEROUPPER
@@ -70,17 +70,17 @@
#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base)
+ PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE)base)
+ PREFETCH ((offset)base); \
+ PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
+ PREFETCH ((offset)base); \
+ PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
+ PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
+ PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
# error Unsupported PREFETCHED_LOAD_SIZE!
# endif
@@ -92,100 +92,100 @@
# error SECTION is not defined!
#endif - .section SECTION(.text),"ax",@progbits + .section SECTION(.text),"ax",@progbits #if defined SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) #endif ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) #if defined SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) #endif ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) - movq %rdi, %rax + movq %rdi, %rax L(start): # ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx + /* Clear the upper 32 bits. */ + movl %edx, %edx # endif - cmp $VEC_SIZE, %RDX_LP - jb L(less_vec) - cmp $(VEC_SIZE * 2), %RDX_LP - ja L(more_2x_vec) + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) #if !defined USE_MULTIARCH L(last_2x_vec): #endif - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ - VMOVU (%rsi), %VEC(0) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) - VZEROUPPER + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VZEROUPPER #if !defined USE_MULTIARCH L(nop): #endif - ret + ret #if defined USE_MULTIARCH END (MEMMOVE_SYMBOL (__memmove, unaligned)) # if VEC_SIZE == 16 ENTRY (__mempcpy_chk_erms) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (__mempcpy_chk_erms) /* Only used to measure performance of REP MOVSB. */ ENTRY (__mempcpy_erms) - mov %RDI_LP, %RAX_LP - /* Skip zero length. */ - test %RDX_LP, %RDX_LP - jz 2f - add %RDX_LP, %RAX_LP - jmp L(start_movsb) + mov %RDI_LP, %RAX_LP + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f + add %RDX_LP, %RAX_LP + jmp L(start_movsb) END (__mempcpy_erms) ENTRY (__memmove_chk_erms) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (__memmove_chk_erms) ENTRY (__memmove_erms) - movq %rdi, %rax - /* Skip zero length. */ - test %RDX_LP, %RDX_LP - jz 2f + movq %rdi, %rax + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f L(start_movsb): - mov %RDX_LP, %RCX_LP - cmp %RSI_LP, %RDI_LP - jb 1f - /* Source == destination is less common. */ - je 2f - lea (%rsi,%rcx), %RDX_LP - cmp %RDX_LP, %RDI_LP - jb L(movsb_backward) + mov %RDX_LP, %RCX_LP + cmp %RSI_LP, %RDI_LP + jb 1f + /* Source == destination is less common. 
*/ + je 2f + lea (%rsi,%rcx), %RDX_LP + cmp %RDX_LP, %RDI_LP + jb L(movsb_backward) 1: - rep movsb + rep movsb 2: - ret + ret L(movsb_backward): - leaq -1(%rdi,%rcx), %rdi - leaq -1(%rsi,%rcx), %rsi - std - rep movsb - cld - ret + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret END (__memmove_erms) strong_alias (__memmove_erms, __memcpy_erms) strong_alias (__memmove_chk_erms, __memcpy_chk_erms) @@ -193,367 +193,367 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms) # ifdef SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) # endif ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start_erms) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start_erms) END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) # ifdef SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) # endif ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) - movq %rdi, %rax + movq %rdi, %rax L(start_erms): # ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx + /* Clear the upper 32 bits. */ + movl %edx, %edx # endif - cmp $VEC_SIZE, %RDX_LP - jb L(less_vec) - cmp $(VEC_SIZE * 2), %RDX_LP - ja L(movsb_more_2x_vec) + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(movsb_more_2x_vec) L(last_2x_vec): - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ - VMOVU (%rsi), %VEC(0) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) L(return): - VZEROUPPER - ret + VZEROUPPER + ret L(movsb): - cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP - jae L(more_8x_vec) - cmpq %rsi, %rdi - jb 1f - /* Source == destination is less common. */ - je L(nop) - leaq (%rsi,%rdx), %r9 - cmpq %r9, %rdi - /* Avoid slow backward REP MOVSB. */ - jb L(more_8x_vec_backward) + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je L(nop) + leaq (%rsi,%rdx), %r9 + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) 1: - mov %RDX_LP, %RCX_LP - rep movsb + mov %RDX_LP, %RCX_LP + rep movsb L(nop): - ret + ret #endif L(less_vec): - /* Less than 1 VEC. */ + /* Less than 1 VEC. */ #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 # error Unsupported VEC_SIZE! #endif #if VEC_SIZE > 32 - cmpb $32, %dl - jae L(between_32_63) + cmpb $32, %dl + jae L(between_32_63) #endif #if VEC_SIZE > 16 - cmpb $16, %dl - jae L(between_16_31) + cmpb $16, %dl + jae L(between_16_31) #endif - cmpb $8, %dl - jae L(between_8_15) - cmpb $4, %dl - jae L(between_4_7) - cmpb $1, %dl - ja L(between_2_3) - jb 1f - movzbl (%rsi), %ecx - movb %cl, (%rdi) + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movzbl (%rsi), %ecx + movb %cl, (%rdi) 1: - ret + ret #if VEC_SIZE > 32 L(between_32_63): - /* From 32 to 63. No branch when size == 32. 
*/ - vmovdqu (%rsi), %ymm0 - vmovdqu -32(%rsi,%rdx), %ymm1 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, -32(%rdi,%rdx) - VZEROUPPER - ret + /* From 32 to 63. No branch when size == 32. */ + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -32(%rdi,%rdx) + VZEROUPPER + ret #endif #if VEC_SIZE > 16 - /* From 16 to 31. No branch when size == 16. */ + /* From 16 to 31. No branch when size == 16. */ L(between_16_31): - vmovdqu (%rsi), %xmm0 - vmovdqu -16(%rsi,%rdx), %xmm1 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, -16(%rdi,%rdx) - ret + vmovdqu (%rsi), %xmm0 + vmovdqu -16(%rsi,%rdx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -16(%rdi,%rdx) + ret #endif L(between_8_15): - /* From 8 to 15. No branch when size == 8. */ - movq -8(%rsi,%rdx), %rcx - movq (%rsi), %rsi - movq %rcx, -8(%rdi,%rdx) - movq %rsi, (%rdi) - ret + /* From 8 to 15. No branch when size == 8. */ + movq -8(%rsi,%rdx), %rcx + movq (%rsi), %rsi + movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) + ret L(between_4_7): - /* From 4 to 7. No branch when size == 4. */ - movl -4(%rsi,%rdx), %ecx - movl (%rsi), %esi - movl %ecx, -4(%rdi,%rdx) - movl %esi, (%rdi) - ret + /* From 4 to 7. No branch when size == 4. */ + movl -4(%rsi,%rdx), %ecx + movl (%rsi), %esi + movl %ecx, -4(%rdi,%rdx) + movl %esi, (%rdi) + ret L(between_2_3): - /* From 2 to 3. No branch when size == 2. */ - movzwl -2(%rsi,%rdx), %ecx - movzwl (%rsi), %esi - movw %cx, -2(%rdi,%rdx) - movw %si, (%rdi) - ret + /* From 2 to 3. No branch when size == 2. */ + movzwl -2(%rsi,%rdx), %ecx + movzwl (%rsi), %esi + movw %cx, -2(%rdi,%rdx) + movw %si, (%rdi) + ret #if defined USE_MULTIARCH L(movsb_more_2x_vec): - cmp $REP_MOSB_THRESHOLD, %RDX_LP - ja L(movsb) + cmp $REP_MOSB_THRESHOLD, %RDX_LP + ja L(movsb) #endif L(more_2x_vec): - /* More than 2 * VEC and there may be overlap between destination - and source. */ - cmpq $(VEC_SIZE * 8), %rdx - ja L(more_8x_vec) - cmpq $(VEC_SIZE * 4), %rdx - jb L(last_4x_vec) - /* Copy from 4 * VEC to 8 * VEC, inclusively. */ - VMOVU (%rsi), %VEC(0) - VMOVU VEC_SIZE(%rsi), %VEC(1) - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) - VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) - VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), VEC_SIZE(%rdi) - VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) - VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) - VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) - VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) - VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) - VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) - VZEROUPPER - ret + /* More than 2 * VEC and there may be overlap between destination + and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + /* Copy from 4 * VEC to 8 * VEC, inclusively. 
*/ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) + VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) + VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER + ret L(last_4x_vec): - /* Copy from 2 * VEC to 4 * VEC. */ - VMOVU (%rsi), %VEC(0) - VMOVU VEC_SIZE(%rsi), %VEC(1) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) - VMOVU %VEC(0), (%rdi) - VMOVU %VEC(1), VEC_SIZE(%rdi) - VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) - VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) - VZEROUPPER - ret + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + VZEROUPPER + ret L(more_8x_vec): - cmpq %rsi, %rdi - ja L(more_8x_vec_backward) - /* Source == destination is less common. */ - je L(nop) - /* Load the first VEC and last 4 * VEC to support overlapping - addresses. */ - VMOVU (%rsi), %VEC(4) - VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) - VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) - VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) - /* Save start and stop of the destination buffer. */ - movq %rdi, %r11 - leaq -VEC_SIZE(%rdi, %rdx), %rcx - /* Align destination for aligned stores in the loop. Compute - how much destination is misaligned. */ - movq %rdi, %r8 - andq $(VEC_SIZE - 1), %r8 - /* Get the negative of offset for alignment. */ - subq $VEC_SIZE, %r8 - /* Adjust source. */ - subq %r8, %rsi - /* Adjust destination which should be aligned now. */ - subq %r8, %rdi - /* Adjust length. */ - addq %r8, %rdx + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ + je L(nop) + /* Load the first VEC and last 4 * VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) + /* Save start and stop of the destination buffer. */ + movq %rdi, %r11 + leaq -VEC_SIZE(%rdi, %rdx), %rcx + /* Align destination for aligned stores in the loop. Compute + how much destination is misaligned. */ + movq %rdi, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. */ + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx #if (defined USE_MULTIARCH || VEC_SIZE == 16) - /* Check non-temporal store threshold. */ - cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP - ja L(large_forward) + /* Check non-temporal store threshold. */ + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + ja L(large_forward) #endif L(loop_4x_vec_forward): - /* Copy 4 * VEC a time forward. 
*/ - VMOVU (%rsi), %VEC(0) - VMOVU VEC_SIZE(%rsi), %VEC(1) - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - addq $(VEC_SIZE * 4), %rsi - subq $(VEC_SIZE * 4), %rdx - VMOVA %VEC(0), (%rdi) - VMOVA %VEC(1), VEC_SIZE(%rdi) - VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) - VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) - addq $(VEC_SIZE * 4), %rdi - cmpq $(VEC_SIZE * 4), %rdx - ja L(loop_4x_vec_forward) - /* Store the last 4 * VEC. */ - VMOVU %VEC(5), (%rcx) - VMOVU %VEC(6), -VEC_SIZE(%rcx) - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) - /* Store the first VEC. */ - VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + /* Copy 4 * VEC a time forward. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $(VEC_SIZE * 4), %rsi + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret L(more_8x_vec_backward): - /* Load the first 4 * VEC and last VEC to support overlapping - addresses. */ - VMOVU (%rsi), %VEC(4) - VMOVU VEC_SIZE(%rsi), %VEC(5) - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) - /* Save stop of the destination buffer. */ - leaq -VEC_SIZE(%rdi, %rdx), %r11 - /* Align destination end for aligned stores in the loop. Compute - how much destination end is misaligned. */ - leaq -VEC_SIZE(%rsi, %rdx), %rcx - movq %r11, %r9 - movq %r11, %r8 - andq $(VEC_SIZE - 1), %r8 - /* Adjust source. */ - subq %r8, %rcx - /* Adjust the end of destination which should be aligned now. */ - subq %r8, %r9 - /* Adjust length. */ - subq %r8, %rdx + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) + /* Save stop of the destination buffer. */ + leaq -VEC_SIZE(%rdi, %rdx), %r11 + /* Align destination end for aligned stores in the loop. Compute + how much destination end is misaligned. */ + leaq -VEC_SIZE(%rsi, %rdx), %rcx + movq %r11, %r9 + movq %r11, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Adjust source. */ + subq %r8, %rcx + /* Adjust the end of destination which should be aligned now. */ + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx #if (defined USE_MULTIARCH || VEC_SIZE == 16) - /* Check non-temporal store threshold. */ - cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP - ja L(large_backward) + /* Check non-temporal store threshold. */ + cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP + ja L(large_backward) #endif L(loop_4x_vec_backward): - /* Copy 4 * VEC a time backward. */ - VMOVU (%rcx), %VEC(0) - VMOVU -VEC_SIZE(%rcx), %VEC(1) - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) - subq $(VEC_SIZE * 4), %rcx - subq $(VEC_SIZE * 4), %rdx - VMOVA %VEC(0), (%r9) - VMOVA %VEC(1), -VEC_SIZE(%r9) - VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) - VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) - subq $(VEC_SIZE * 4), %r9 - cmpq $(VEC_SIZE * 4), %rdx - ja L(loop_4x_vec_backward) - /* Store the first 4 * VEC. 
*/ - VMOVU %VEC(4), (%rdi) - VMOVU %VEC(5), VEC_SIZE(%rdi) - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) - /* Store the last VEC. */ - VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $(VEC_SIZE * 4), %rcx + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret #if (defined USE_MULTIARCH || VEC_SIZE == 16) L(large_forward): - /* Don't use non-temporal store if there is overlap between - destination and source since destination may be in cache - when source is loaded. */ - leaq (%rdi, %rdx), %r10 - cmpq %r10, %rsi - jb L(loop_4x_vec_forward) + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rdi, %rdx), %r10 + cmpq %r10, %rsi + jb L(loop_4x_vec_forward) L(loop_large_forward): - /* Copy 4 * VEC a time forward with non-temporal stores. */ - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) - VMOVU (%rsi), %VEC(0) - VMOVU VEC_SIZE(%rsi), %VEC(1) - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - addq $PREFETCHED_LOAD_SIZE, %rsi - subq $PREFETCHED_LOAD_SIZE, %rdx - VMOVNT %VEC(0), (%rdi) - VMOVNT %VEC(1), VEC_SIZE(%rdi) - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) - addq $PREFETCHED_LOAD_SIZE, %rdi - cmpq $PREFETCHED_LOAD_SIZE, %rdx - ja L(loop_large_forward) - sfence - /* Store the last 4 * VEC. */ - VMOVU %VEC(5), (%rcx) - VMOVU %VEC(6), -VEC_SIZE(%rcx) - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) - /* Store the first VEC. */ - VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + /* Copy 4 * VEC a time forward with non-temporal stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $PREFETCHED_LOAD_SIZE, %rsi + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%rdi) + VMOVNT %VEC(1), VEC_SIZE(%rdi) + VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $PREFETCHED_LOAD_SIZE, %rdi + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_forward) + sfence + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret L(large_backward): - /* Don't use non-temporal store if there is overlap between - destination and source since destination may be in cache - when source is loaded. */ - leaq (%rcx, %rdx), %r10 - cmpq %r10, %r9 - jb L(loop_4x_vec_backward) + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. 
*/ + leaq (%rcx, %rdx), %r10 + cmpq %r10, %r9 + jb L(loop_4x_vec_backward) L(loop_large_backward): - /* Copy 4 * VEC a time backward with non-temporal stores. */ - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) - VMOVU (%rcx), %VEC(0) - VMOVU -VEC_SIZE(%rcx), %VEC(1) - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) - subq $PREFETCHED_LOAD_SIZE, %rcx - subq $PREFETCHED_LOAD_SIZE, %rdx - VMOVNT %VEC(0), (%r9) - VMOVNT %VEC(1), -VEC_SIZE(%r9) - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) - subq $PREFETCHED_LOAD_SIZE, %r9 - cmpq $PREFETCHED_LOAD_SIZE, %rdx - ja L(loop_large_backward) - sfence - /* Store the first 4 * VEC. */ - VMOVU %VEC(4), (%rdi) - VMOVU %VEC(5), VEC_SIZE(%rdi) - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) - /* Store the last VEC. */ - VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + /* Copy 4 * VEC a time backward with non-temporal stores. */ + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $PREFETCHED_LOAD_SIZE, %rcx + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%r9) + VMOVNT %VEC(1), -VEC_SIZE(%r9) + VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $PREFETCHED_LOAD_SIZE, %r9 + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_backward) + sfence + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) #if 1 # ifdef USE_MULTIARCH strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), - MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) + MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) # ifdef SHARED strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), - MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) + MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) # endif # endif # ifdef SHARED strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), - MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) # endif #endif strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), - MEMCPY_SYMBOL (__memcpy, unaligned)) + MEMCPY_SYMBOL (__memcpy, unaligned)) diff --git a/utils/memcpy-bench/glibc/memmove.S b/utils/memcpy-bench/glibc/memmove.S index 97e735facff..7bd47b9a03f 100644 --- a/utils/memcpy-bench/glibc/memmove.S +++ b/utils/memcpy-bench/glibc/memmove.S @@ -18,33 +18,33 @@ #include "sysdep.h" -#define VEC_SIZE 16 -#define VEC(i) xmm##i -#define PREFETCHNT prefetchnta -#define VMOVNT movntdq +#define VEC_SIZE 16 +#define VEC(i) xmm##i +#define PREFETCHNT prefetchnta +#define VMOVNT movntdq /* Use movups and movaps for smaller code sizes. 
*/ -#define VMOVU movups -#define VMOVA movaps +#define VMOVU movups +#define VMOVA movaps -#define SECTION(p) p +#define SECTION(p) p #ifdef USE_MULTIARCH # if 0 -# define MEMCPY_SYMBOL(p,s) memcpy +# define MEMCPY_SYMBOL(p,s) memcpy # endif #else # if defined SHARED -# define MEMCPY_SYMBOL(p,s) __memcpy +# define MEMCPY_SYMBOL(p,s) __memcpy # else -# define MEMCPY_SYMBOL(p,s) memcpy +# define MEMCPY_SYMBOL(p,s) memcpy # endif #endif #if !defined USE_MULTIARCH -# define MEMPCPY_SYMBOL(p,s) __mempcpy +# define MEMPCPY_SYMBOL(p,s) __mempcpy #endif #ifndef MEMMOVE_SYMBOL -# define MEMMOVE_CHK_SYMBOL(p,s) p -# define MEMMOVE_SYMBOL(p,s) memmove +# define MEMMOVE_CHK_SYMBOL(p,s) p +# define MEMMOVE_SYMBOL(p,s) memmove #endif #include "memmove-vec-unaligned-erms.S" diff --git a/utils/memcpy-bench/glibc/sysdep.h b/utils/memcpy-bench/glibc/sysdep.h index 099134b2a2f..e255e7488da 100644 --- a/utils/memcpy-bench/glibc/sysdep.h +++ b/utils/memcpy-bench/glibc/sysdep.h @@ -21,7 +21,7 @@ #include "sysdep_x86.h" -#ifdef __ASSEMBLER__ +#ifdef __ASSEMBLER__ /* Syntactic details of assembler. */ @@ -29,11 +29,11 @@ the register as saved relative to %rsp instead of relative to the CFA. Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset from %rsp. */ -#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ - 0x77, off & 0x7F | 0x80, off >> 7 +#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ + 0x77, off & 0x7F | 0x80, off >> 7 /* If compiled for profiling, call `mcount' at the start of each function. */ -#ifdef PROF +#ifdef PROF /* The mcount code relies on a normal frame pointer being on the stack to locate our caller, so push one just for its benefit. */ #define CALL_MCOUNT \ @@ -45,31 +45,31 @@ popq %rbp; \ cfi_def_cfa(rsp,8); #else -#define CALL_MCOUNT /* Do nothing. */ +#define CALL_MCOUNT /* Do nothing. */ #endif -#define PSEUDO(name, syscall_name, args) \ -lose: \ - jmp JUMPTARGET(syscall_error) \ - .globl syscall_error; \ - ENTRY (name) \ - DO_CALL (syscall_name, args); \ +#define PSEUDO(name, syscall_name, args) \ +lose: \ + jmp JUMPTARGET(syscall_error) \ + .globl syscall_error; \ + ENTRY (name) \ + DO_CALL (syscall_name, args); \ jb lose #undef JUMPTARGET #ifdef SHARED # ifdef BIND_NOW -# define JUMPTARGET(name) *name##@GOTPCREL(%rip) +# define JUMPTARGET(name) *name##@GOTPCREL(%rip) # else -# define JUMPTARGET(name) name##@PLT +# define JUMPTARGET(name) name##@PLT # endif #else /* For static archives, branch to target directly. */ -# define JUMPTARGET(name) name +# define JUMPTARGET(name) name #endif /* Long and pointer size in bytes. */ -#define LP_SIZE 8 +#define LP_SIZE 8 /* Instruction to operate on long and pointer. */ #define LP_OP(insn) insn##q @@ -78,24 +78,24 @@ lose: \ #define ASM_ADDR .quad /* Registers to hold long and pointer. 
*/ -#define RAX_LP rax -#define RBP_LP rbp -#define RBX_LP rbx -#define RCX_LP rcx -#define RDI_LP rdi -#define RDX_LP rdx -#define RSI_LP rsi -#define RSP_LP rsp -#define R8_LP r8 -#define R9_LP r9 -#define R10_LP r10 -#define R11_LP r11 -#define R12_LP r12 -#define R13_LP r13 -#define R14_LP r14 -#define R15_LP r15 +#define RAX_LP rax +#define RBP_LP rbp +#define RBX_LP rbx +#define RCX_LP rcx +#define RDI_LP rdi +#define RDX_LP rdx +#define RSI_LP rsi +#define RSP_LP rsp +#define R8_LP r8 +#define R9_LP r9 +#define R10_LP r10 +#define R11_LP r11 +#define R12_LP r12 +#define R13_LP r13 +#define R14_LP r14 +#define R15_LP r15 -#else /* __ASSEMBLER__ */ +#else /* __ASSEMBLER__ */ /* Long and pointer size in bytes. */ #define LP_SIZE "8" @@ -107,23 +107,23 @@ lose: \ #define ASM_ADDR ".quad" /* Registers to hold long and pointer. */ -#define RAX_LP "rax" -#define RBP_LP "rbp" -#define RBX_LP "rbx" -#define RCX_LP "rcx" -#define RDI_LP "rdi" -#define RDX_LP "rdx" -#define RSI_LP "rsi" -#define RSP_LP "rsp" -#define R8_LP "r8" -#define R9_LP "r9" -#define R10_LP "r10" -#define R11_LP "r11" -#define R12_LP "r12" -#define R13_LP "r13" -#define R14_LP "r14" -#define R15_LP "r15" +#define RAX_LP "rax" +#define RBP_LP "rbp" +#define RBX_LP "rbx" +#define RCX_LP "rcx" +#define RDI_LP "rdi" +#define RDX_LP "rdx" +#define RSI_LP "rsi" +#define RSP_LP "rsp" +#define R8_LP "r8" +#define R9_LP "r9" +#define R10_LP "r10" +#define R11_LP "r11" +#define R12_LP "r12" +#define R13_LP "r13" +#define R14_LP "r14" +#define R15_LP "r15" -#endif /* __ASSEMBLER__ */ +#endif /* __ASSEMBLER__ */ -#endif /* _X86_64_SYSDEP_H */ +#endif /* _X86_64_SYSDEP_H */ diff --git a/utils/memcpy-bench/glibc/sysdep_generic.h b/utils/memcpy-bench/glibc/sysdep_generic.h index 91f78e1b04d..afecea8c356 100644 --- a/utils/memcpy-bench/glibc/sysdep_generic.h +++ b/utils/memcpy-bench/glibc/sysdep_generic.h @@ -28,14 +28,14 @@ #define ASM_LINE_SEP ; -#define strong_alias(original, alias) \ - .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \ +#define strong_alias(original, alias) \ + .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \ C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original) #ifndef C_LABEL /* Define a macro we can use to construct the asm name for a C symbol. */ -# define C_LABEL(name) name##: +# define C_LABEL(name) name##: #endif @@ -47,38 +47,38 @@ # endif # ifndef JUMPTARGET -# define JUMPTARGET(sym) sym +# define JUMPTARGET(sym) sym # endif #endif -/* Makros to generate eh_frame unwind information. */ +/* Macros to generate eh_frame unwind information. 
*/ #ifdef __ASSEMBLER__ -# define cfi_startproc .cfi_startproc -# define cfi_endproc .cfi_endproc -# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off -# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg -# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off -# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -# define cfi_offset(reg, off) .cfi_offset reg, off -# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -# define cfi_register(r1, r2) .cfi_register r1, r2 -# define cfi_return_column(reg) .cfi_return_column reg -# define cfi_restore(reg) .cfi_restore reg -# define cfi_same_value(reg) .cfi_same_value reg -# define cfi_undefined(reg) .cfi_undefined reg -# define cfi_remember_state .cfi_remember_state -# define cfi_restore_state .cfi_restore_state -# define cfi_window_save .cfi_window_save -# define cfi_personality(enc, exp) .cfi_personality enc, exp -# define cfi_lsda(enc, exp) .cfi_lsda enc, exp +# define cfi_startproc .cfi_startproc +# define cfi_endproc .cfi_endproc +# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off +# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg +# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +# define cfi_offset(reg, off) .cfi_offset reg, off +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +# define cfi_register(r1, r2) .cfi_register r1, r2 +# define cfi_return_column(reg) .cfi_return_column reg +# define cfi_restore(reg) .cfi_restore reg +# define cfi_same_value(reg) .cfi_same_value reg +# define cfi_undefined(reg) .cfi_undefined reg +# define cfi_remember_state .cfi_remember_state +# define cfi_restore_state .cfi_restore_state +# define cfi_window_save .cfi_window_save +# define cfi_personality(enc, exp) .cfi_personality enc, exp +# define cfi_lsda(enc, exp) .cfi_lsda enc, exp #else /* ! ASSEMBLER */ # define CFI_STRINGIFY(Name) CFI_STRINGIFY2 (Name) # define CFI_STRINGIFY2(Name) #Name -# define CFI_STARTPROC ".cfi_startproc" -# define CFI_ENDPROC ".cfi_endproc" -# define CFI_DEF_CFA(reg, off) \ +# define CFI_STARTPROC ".cfi_startproc" +# define CFI_ENDPROC ".cfi_endproc" +# define CFI_DEF_CFA(reg, off) \ ".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) # define CFI_DEF_CFA_REGISTER(reg) \ ".cfi_def_cfa_register " CFI_STRINGIFY(reg) diff --git a/utils/memcpy-bench/glibc/sysdep_x86.h b/utils/memcpy-bench/glibc/sysdep_x86.h index a3fecd01268..7abb350242f 100644 --- a/utils/memcpy-bench/glibc/sysdep_x86.h +++ b/utils/memcpy-bench/glibc/sysdep_x86.h @@ -34,18 +34,18 @@ enum cf_protection_level */ /* Set if CF_BRANCH (IBT) is enabled. */ -#define X86_FEATURE_1_IBT (1U << 0) +#define X86_FEATURE_1_IBT (1U << 0) /* Set if CF_RETURN (SHSTK) is enabled. */ -#define X86_FEATURE_1_SHSTK (1U << 1) +#define X86_FEATURE_1_SHSTK (1U << 1) #ifdef __CET__ -# define CET_ENABLED 1 -# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT) -# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK) +# define CET_ENABLED 1 +# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT) +# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK) #else -# define CET_ENABLED 0 -# define IBT_ENABLED 0 -# define SHSTK_ENABLED 0 +# define CET_ENABLED 0 +# define IBT_ENABLED 0 +# define SHSTK_ENABLED 0 #endif /* Offset for fxsave/xsave area used by _dl_runtime_resolve. 
Also need @@ -57,7 +57,7 @@ enum cf_protection_level #define STATE_SAVE_MASK \ ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) -#ifdef __ASSEMBLER__ +#ifdef __ASSEMBLER__ /* Syntactic details of assembler. */ @@ -73,18 +73,18 @@ enum cf_protection_level #define ASM_SIZE_DIRECTIVE(name) .size name,.-name; /* Define an entry point visible from C. */ -#define ENTRY(name) \ - .globl C_SYMBOL_NAME(name); \ - .type C_SYMBOL_NAME(name),@function; \ - .align ALIGNARG(4); \ - C_LABEL(name) \ - cfi_startproc; \ - _CET_ENDBR; \ +#define ENTRY(name) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ + .align ALIGNARG(4); \ + C_LABEL(name) \ + cfi_startproc; \ + _CET_ENDBR; \ CALL_MCOUNT -#undef END -#define END(name) \ - cfi_endproc; \ +#undef END +#define END(name) \ + cfi_endproc; \ ASM_SIZE_DIRECTIVE(name) #define ENTRY_CHK(name) ENTRY (name) @@ -93,21 +93,21 @@ enum cf_protection_level /* Since C identifiers are not normally prefixed with an underscore on this system, the asm identifier `syscall_error' intrudes on the C name space. Make sure we use an innocuous name. */ -#define syscall_error __syscall_error -#define mcount _mcount +#define syscall_error __syscall_error +#define mcount _mcount -#undef PSEUDO_END -#define PSEUDO_END(name) \ +#undef PSEUDO_END +#define PSEUDO_END(name) \ END (name) /* Local label name for asm code. */ #ifndef L /* ELF-like local names start with `.L'. */ -# define L(name) .L##name +# define L(name) .L##name #endif #define atom_text_section .section ".text.atom", "ax" -#endif /* __ASSEMBLER__ */ +#endif /* __ASSEMBLER__ */ -#endif /* _X86_SYSDEP_H */ +#endif /* _X86_SYSDEP_H */ From 8c2d65242a81b68f9ca520cf015e53933a52eaca Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 23:24:22 +0300 Subject: [PATCH 3/6] Fix style --- utils/memcpy-bench/glibc/asm-syntax.h | 2 ++ utils/memcpy-bench/glibc/dwarf2.h | 4 +++- utils/memcpy-bench/glibc/sysdep.h | 2 ++ utils/memcpy-bench/glibc/sysdep_generic.h | 2 ++ utils/memcpy-bench/glibc/sysdep_x86.h | 2 ++ 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/utils/memcpy-bench/glibc/asm-syntax.h b/utils/memcpy-bench/glibc/asm-syntax.h index 6e299c1fec2..9d65213ba30 100644 --- a/utils/memcpy-bench/glibc/asm-syntax.h +++ b/utils/memcpy-bench/glibc/asm-syntax.h @@ -1,3 +1,5 @@ +#pragma once + /* Definitions for x86 syntax variations. Copyright (C) 1992-2020 Free Software Foundation, Inc. This file is part of the GNU C Library. Its master source is NOT part of diff --git a/utils/memcpy-bench/glibc/dwarf2.h b/utils/memcpy-bench/glibc/dwarf2.h index 2be827f00ae..b0536c97e5e 100644 --- a/utils/memcpy-bench/glibc/dwarf2.h +++ b/utils/memcpy-bench/glibc/dwarf2.h @@ -1,3 +1,5 @@ +#pragma once + /* Declarations and definitions of codes relating to the DWARF2 symbolic debugging information format. Copyright (C) 1992-2020 Free Software Foundation, Inc. @@ -563,7 +565,7 @@ enum dwarf_macinfo_record_type }; #endif /* !ASSEMBLER */ - + /* @@@ For use with GNU frame unwind information. */ #define DW_EH_PE_absptr 0x00 diff --git a/utils/memcpy-bench/glibc/sysdep.h b/utils/memcpy-bench/glibc/sysdep.h index e255e7488da..2f43d688df9 100644 --- a/utils/memcpy-bench/glibc/sysdep.h +++ b/utils/memcpy-bench/glibc/sysdep.h @@ -1,3 +1,5 @@ +#pragma once + /* Assembler macros for x86-64. Copyright (C) 2001-2020 Free Software Foundation, Inc. This file is part of the GNU C Library. 
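[Note on this patch: it prepends "#pragma once" to each vendored glibc header while keeping the upstream "#ifndef"-style include guards, so the two idioms coexist. A minimal sketch of the resulting header layout, reusing the _DWARF2_H guard visible above; the sketch is illustrative only and is not a hunk from the patch:

    #pragma once             /* non-standard but widely supported; dedupes by file identity */

    #ifndef _DWARF2_H        /* classic guard kept verbatim from upstream glibc */
    #define _DWARF2_H 1

    /* ... declarations ... */

    #endif /* _DWARF2_H */

Either mechanism alone prevents double inclusion; keeping both minimizes divergence from upstream while presumably satisfying the style check that the "Fix style" subject refers to.]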
diff --git a/utils/memcpy-bench/glibc/sysdep_generic.h b/utils/memcpy-bench/glibc/sysdep_generic.h index afecea8c356..0cb5bca4102 100644 --- a/utils/memcpy-bench/glibc/sysdep_generic.h +++ b/utils/memcpy-bench/glibc/sysdep_generic.h @@ -1,3 +1,5 @@ +#pragma once + /* Generic asm macros used on many machines. Copyright (C) 1991-2020 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/utils/memcpy-bench/glibc/sysdep_x86.h b/utils/memcpy-bench/glibc/sysdep_x86.h index 7abb350242f..4469ed2e885 100644 --- a/utils/memcpy-bench/glibc/sysdep_x86.h +++ b/utils/memcpy-bench/glibc/sysdep_x86.h @@ -1,3 +1,5 @@ +#pragma once + /* Assembler macros for x86. Copyright (C) 2017-2020 Free Software Foundation, Inc. This file is part of the GNU C Library. From 637f6a29a649ee46360848c5a8013fb040050589 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Mar 2021 11:16:15 +0300 Subject: [PATCH 4/6] Add penalty --- utils/memcpy-bench/memcpy-bench.cpp | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/utils/memcpy-bench/memcpy-bench.cpp b/utils/memcpy-bench/memcpy-bench.cpp index 365abe1f01e..dc510af0dbf 100644 --- a/utils/memcpy-bench/memcpy-bench.cpp +++ b/utils/memcpy-bench/memcpy-bench.cpp @@ -33,6 +33,9 @@ void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_d dst += bytes_to_copy; src += bytes_to_copy; size -= bytes_to_copy; + + /// Execute at least one SSE instruction as a penalty after running AVX code. + __asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7"); } } @@ -76,16 +79,9 @@ uint64_t test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size uint64_t elapsed_ns = watch.elapsed(); /// Validation - size_t sum = 0; - size_t reference = 0; for (size_t i = 0; i < size; ++i) - { - sum += dst[i]; - reference += uint8_t(i); - } - - if (sum != reference) - throw std::logic_error("Incorrect result"); + if (dst[i] != uint8_t(i)) + throw std::logic_error("Incorrect result"); std::cout << name; return elapsed_ns; @@ -676,11 +672,9 @@ done | tee result.tsv } else { - iterations = 10000000000ULL * num_threads / size; + iterations = 10000000000ULL / size; if (generator_variant == 1) - iterations /= 100; - if (generator_variant == 2) iterations /= 10; } From 1f6b05cd85d34c2d6f71b057c16d95b83f7d8853 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Mar 2021 11:18:11 +0300 Subject: [PATCH 5/6] Add example --- utils/memcpy-bench/memcpy-bench.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/utils/memcpy-bench/memcpy-bench.cpp b/utils/memcpy-bench/memcpy-bench.cpp index dc510af0dbf..cd769640017 100644 --- a/utils/memcpy-bench/memcpy-bench.cpp +++ b/utils/memcpy-bench/memcpy-bench.cpp @@ -655,6 +655,24 @@ for size in 4096 16384 50000 65536 100000 1000000 10000000 100000000; do done; done | tee result.tsv +clickhouse-local --structure ' + name String, + size UInt64, + iterations UInt64, + threads UInt16, + generator UInt8, + memcpy UInt8, + elapsed UInt64 +' --query " + SELECT + size, name, + avg(1000 * elapsed / size / iterations) AS s, + count() AS c + FROM table + GROUP BY size, name + ORDER BY size ASC, s DESC +" --output-format PrettyCompact < result.tsv + )" << std::endl; std::cout << desc << std::endl; return 1; From 9bea10d9f94206671c89b8faf196725ed47e0d5e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Mar 2021 11:49:56 +0300 Subject: [PATCH 6/6] Fix style --- utils/memcpy-bench/glibc/asm-syntax.h | 28 +++--- utils/memcpy-bench/glibc/dwarf2.h | 114 
+++++++++++----------- utils/memcpy-bench/glibc/sysdep.h | 58 +++++------ utils/memcpy-bench/glibc/sysdep_generic.h | 62 ++++++------ utils/memcpy-bench/glibc/sysdep_x86.h | 66 ++++++------- 5 files changed, 164 insertions(+), 164 deletions(-) diff --git a/utils/memcpy-bench/glibc/asm-syntax.h b/utils/memcpy-bench/glibc/asm-syntax.h index 9d65213ba30..0879f2606c7 100644 --- a/utils/memcpy-bench/glibc/asm-syntax.h +++ b/utils/memcpy-bench/glibc/asm-syntax.h @@ -1,23 +1,23 @@ #pragma once /* Definitions for x86 syntax variations. - Copyright (C) 1992-2020 Free Software Foundation, Inc. - This file is part of the GNU C Library. Its master source is NOT part of - the C library, however. The master source lives in the GNU MP Library. + Copyright (C) 1992-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. Its master source is NOT part of + the C library, however. The master source lives in the GNU MP Library. - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #undef ALIGN #define ALIGN(log) .align 1<. */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #ifndef _DWARF2_H #define _DWARF2_H 1 /* This file is derived from the DWARF specification (a public document) - Revision 2.0.0 (July 27, 1993) developed by the UNIX International - Programming Languages Special Interest Group (UI/PLSIG) and distributed - by UNIX International. Copies of this specification are available from - UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */ + Revision 2.0.0 (July 27, 1993) developed by the UNIX International + Programming Languages Special Interest Group (UI/PLSIG) and distributed + by UNIX International. Copies of this specification are available from + UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */ /* This file is shared between GCC and GDB, and should not contain - prototypes. */ + prototypes. */ #ifndef __ASSEMBLER__ /* Tag names and codes. 
*/ enum dwarf_tag - { + { DW_TAG_padding = 0x00, DW_TAG_array_type = 0x01, DW_TAG_class_type = 0x02, @@ -95,7 +95,7 @@ enum dwarf_tag DW_TAG_class_template = 0x4103, /* for C++ */ DW_TAG_GNU_BINCL = 0x4104, DW_TAG_GNU_EINCL = 0x4105 - }; + }; #define DW_TAG_lo_user 0x4080 #define DW_TAG_hi_user 0xffff @@ -106,7 +106,7 @@ enum dwarf_tag /* Form names and codes. */ enum dwarf_form - { + { DW_FORM_addr = 0x01, DW_FORM_block2 = 0x03, DW_FORM_block4 = 0x04, @@ -128,12 +128,12 @@ enum dwarf_form DW_FORM_ref8 = 0x14, DW_FORM_ref_udata = 0x15, DW_FORM_indirect = 0x16 - }; + }; /* Attribute names and codes. */ enum dwarf_attribute - { + { DW_AT_sibling = 0x01, DW_AT_location = 0x02, DW_AT_name = 0x03, @@ -215,7 +215,7 @@ enum dwarf_attribute DW_AT_src_coords = 0x2104, DW_AT_body_begin = 0x2105, DW_AT_body_end = 0x2106 - }; + }; #define DW_AT_lo_user 0x2000 /* implementation-defined range start */ #define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */ @@ -223,7 +223,7 @@ enum dwarf_attribute /* Location atom names and codes. */ enum dwarf_location_atom - { + { DW_OP_addr = 0x03, DW_OP_deref = 0x06, DW_OP_const1u = 0x08, @@ -369,7 +369,7 @@ enum dwarf_location_atom DW_OP_deref_size = 0x94, DW_OP_xderef_size = 0x95, DW_OP_nop = 0x96 - }; + }; #define DW_OP_lo_user 0x80 /* implementation-defined range start */ #define DW_OP_hi_user 0xff /* implementation-defined range end */ @@ -377,7 +377,7 @@ enum dwarf_location_atom /* Type encodings. */ enum dwarf_type - { + { DW_ATE_void = 0x0, DW_ATE_address = 0x1, DW_ATE_boolean = 0x2, @@ -387,81 +387,81 @@ enum dwarf_type DW_ATE_signed_char = 0x6, DW_ATE_unsigned = 0x7, DW_ATE_unsigned_char = 0x8 - }; + }; #define DW_ATE_lo_user 0x80 #define DW_ATE_hi_user 0xff /* Array ordering names and codes. */ enum dwarf_array_dim_ordering - { + { DW_ORD_row_major = 0, DW_ORD_col_major = 1 - }; + }; /* access attribute */ enum dwarf_access_attribute - { + { DW_ACCESS_public = 1, DW_ACCESS_protected = 2, DW_ACCESS_private = 3 - }; + }; /* visibility */ enum dwarf_visibility_attribute - { + { DW_VIS_local = 1, DW_VIS_exported = 2, DW_VIS_qualified = 3 - }; + }; /* virtuality */ enum dwarf_virtuality_attribute - { + { DW_VIRTUALITY_none = 0, DW_VIRTUALITY_virtual = 1, DW_VIRTUALITY_pure_virtual = 2 - }; + }; /* case sensitivity */ enum dwarf_id_case - { + { DW_ID_case_sensitive = 0, DW_ID_up_case = 1, DW_ID_down_case = 2, DW_ID_case_insensitive = 3 - }; + }; /* calling convention */ enum dwarf_calling_convention - { + { DW_CC_normal = 0x1, DW_CC_program = 0x2, DW_CC_nocall = 0x3 - }; + }; #define DW_CC_lo_user 0x40 #define DW_CC_hi_user 0xff /* inline attribute */ enum dwarf_inline_attribute - { + { DW_INL_not_inlined = 0, DW_INL_inlined = 1, DW_INL_declared_not_inlined = 2, DW_INL_declared_inlined = 3 - }; + }; /* discriminant lists */ enum dwarf_discrim_list - { + { DW_DSC_label = 0, DW_DSC_range = 1 - }; + }; /* line number opcodes */ enum dwarf_line_number_ops - { + { DW_LNS_extended_op = 0, DW_LNS_copy = 1, DW_LNS_advance_pc = 2, @@ -472,19 +472,19 @@ enum dwarf_line_number_ops DW_LNS_set_basic_block = 7, DW_LNS_const_add_pc = 8, DW_LNS_fixed_advance_pc = 9 - }; + }; /* line number extended opcodes */ enum dwarf_line_number_x_ops - { + { DW_LNE_end_sequence = 1, DW_LNE_set_address = 2, DW_LNE_define_file = 3 - }; + }; /* call frame information */ enum dwarf_call_frame_info - { + { DW_CFA_advance_loc = 0x40, DW_CFA_offset = 0x80, DW_CFA_restore = 0xc0, @@ -517,7 +517,7 @@ enum dwarf_call_frame_info DW_CFA_GNU_window_save = 0x2d, DW_CFA_GNU_args_size = 0x2e, 
DW_CFA_GNU_negative_offset_extended = 0x2f - }; + }; #define DW_CIE_ID 0xffffffff #define DW_CIE_VERSION 1 @@ -534,7 +534,7 @@ enum dwarf_call_frame_info /* Source language names and codes. */ enum dwarf_source_language - { + { DW_LANG_C89 = 0x0001, DW_LANG_C = 0x0002, DW_LANG_Ada83 = 0x0003, @@ -547,7 +547,7 @@ enum dwarf_source_language DW_LANG_Modula2 = 0x000a, DW_LANG_Java = 0x000b, DW_LANG_Mips_Assembler = 0x8001 - }; + }; #define DW_LANG_lo_user 0x8000 /* implementation-defined range start */ @@ -556,13 +556,13 @@ enum dwarf_source_language /* Names and codes for macro information. */ enum dwarf_macinfo_record_type - { + { DW_MACINFO_define = 1, DW_MACINFO_undef = 2, DW_MACINFO_start_file = 3, DW_MACINFO_end_file = 4, DW_MACINFO_vendor_ext = 255 - }; + }; #endif /* !ASSEMBLER */ diff --git a/utils/memcpy-bench/glibc/sysdep.h b/utils/memcpy-bench/glibc/sysdep.h index 2f43d688df9..82b1e747fbe 100644 --- a/utils/memcpy-bench/glibc/sysdep.h +++ b/utils/memcpy-bench/glibc/sysdep.h @@ -1,22 +1,22 @@ #pragma once /* Assembler macros for x86-64. - Copyright (C) 2001-2020 Free Software Foundation, Inc. - This file is part of the GNU C Library. + Copyright (C) 2001-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #ifndef _X86_64_SYSDEP_H #define _X86_64_SYSDEP_H 1 @@ -28,35 +28,35 @@ /* Syntactic details of assembler. */ /* This macro is for setting proper CFI with DW_CFA_expression describing - the register as saved relative to %rsp instead of relative to the CFA. - Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset - from %rsp. */ + the register as saved relative to %rsp instead of relative to the CFA. + Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset + from %rsp. */ #define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ 0x77, off & 0x7F | 0x80, off >> 7 /* If compiled for profiling, call `mcount' at the start of each function. */ #ifdef PROF /* The mcount code relies on a normal frame pointer being on the stack - to locate our caller, so push one just for its benefit. */ + to locate our caller, so push one just for its benefit. 
*/ #define CALL_MCOUNT \ - pushq %rbp; \ - cfi_adjust_cfa_offset(8); \ - movq %rsp, %rbp; \ - cfi_def_cfa_register(%rbp); \ - call JUMPTARGET(mcount); \ - popq %rbp; \ - cfi_def_cfa(rsp,8); + pushq %rbp; \ + cfi_adjust_cfa_offset(8); \ + movq %rsp, %rbp; \ + cfi_def_cfa_register(%rbp); \ + call JUMPTARGET(mcount); \ + popq %rbp; \ + cfi_def_cfa(rsp,8); #else #define CALL_MCOUNT /* Do nothing. */ #endif #define PSEUDO(name, syscall_name, args) \ lose: \ - jmp JUMPTARGET(syscall_error) \ - .globl syscall_error; \ - ENTRY (name) \ - DO_CALL (syscall_name, args); \ - jb lose + jmp JUMPTARGET(syscall_error) \ + .globl syscall_error; \ + ENTRY (name) \ + DO_CALL (syscall_name, args); \ + jb lose #undef JUMPTARGET #ifdef SHARED diff --git a/utils/memcpy-bench/glibc/sysdep_generic.h b/utils/memcpy-bench/glibc/sysdep_generic.h index 0cb5bca4102..e6183d72792 100644 --- a/utils/memcpy-bench/glibc/sysdep_generic.h +++ b/utils/memcpy-bench/glibc/sysdep_generic.h @@ -1,22 +1,22 @@ #pragma once /* Generic asm macros used on many machines. - Copyright (C) 1991-2020 Free Software Foundation, Inc. - This file is part of the GNU C Library. + Copyright (C) 1991-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #define C_SYMBOL_NAME(name) name #define HIDDEN_JUMPTARGET(name) 0x0 @@ -31,8 +31,8 @@ #define ASM_LINE_SEP ; #define strong_alias(original, alias) \ - .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \ - C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original) + .globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \ + C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original) #ifndef C_LABEL @@ -43,7 +43,7 @@ #ifdef __ASSEMBLER__ /* Mark the end of function named SYM. This is used on some platforms - to generate correct debugging information. */ + to generate correct debugging information. 
*/ # ifndef END # define END(sym) # endif @@ -81,35 +81,35 @@ # define CFI_STARTPROC ".cfi_startproc" # define CFI_ENDPROC ".cfi_endproc" # define CFI_DEF_CFA(reg, off) \ - ".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) + ".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) # define CFI_DEF_CFA_REGISTER(reg) \ - ".cfi_def_cfa_register " CFI_STRINGIFY(reg) + ".cfi_def_cfa_register " CFI_STRINGIFY(reg) # define CFI_DEF_CFA_OFFSET(off) \ - ".cfi_def_cfa_offset " CFI_STRINGIFY(off) + ".cfi_def_cfa_offset " CFI_STRINGIFY(off) # define CFI_ADJUST_CFA_OFFSET(off) \ - ".cfi_adjust_cfa_offset " CFI_STRINGIFY(off) + ".cfi_adjust_cfa_offset " CFI_STRINGIFY(off) # define CFI_OFFSET(reg, off) \ - ".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) + ".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) # define CFI_REL_OFFSET(reg, off) \ - ".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) + ".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off) # define CFI_REGISTER(r1, r2) \ - ".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2) + ".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2) # define CFI_RETURN_COLUMN(reg) \ - ".cfi_return_column " CFI_STRINGIFY(reg) + ".cfi_return_column " CFI_STRINGIFY(reg) # define CFI_RESTORE(reg) \ - ".cfi_restore " CFI_STRINGIFY(reg) + ".cfi_restore " CFI_STRINGIFY(reg) # define CFI_UNDEFINED(reg) \ - ".cfi_undefined " CFI_STRINGIFY(reg) + ".cfi_undefined " CFI_STRINGIFY(reg) # define CFI_REMEMBER_STATE \ - ".cfi_remember_state" + ".cfi_remember_state" # define CFI_RESTORE_STATE \ - ".cfi_restore_state" + ".cfi_restore_state" # define CFI_WINDOW_SAVE \ - ".cfi_window_save" + ".cfi_window_save" # define CFI_PERSONALITY(enc, exp) \ - ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) + ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) # define CFI_LSDA(enc, exp) \ - ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) + ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) #endif #include "dwarf2.h" diff --git a/utils/memcpy-bench/glibc/sysdep_x86.h b/utils/memcpy-bench/glibc/sysdep_x86.h index 4469ed2e885..1c482cfabb7 100644 --- a/utils/memcpy-bench/glibc/sysdep_x86.h +++ b/utils/memcpy-bench/glibc/sysdep_x86.h @@ -1,22 +1,22 @@ #pragma once /* Assembler macros for x86. - Copyright (C) 2017-2020 Free Software Foundation, Inc. - This file is part of the GNU C Library. + Copyright (C) 2017-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
- You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #ifndef _X86_SYSDEP_H #define _X86_SYSDEP_H 1 @@ -27,11 +27,11 @@ enum cf_protection_level { - CF_NONE = 0, - CF_BRANCH = 1 << 0, - CF_RETURN = 1 << 1, - CF_FULL = CF_BRANCH | CF_RETURN, - CF_SET = 1 << 2 + CF_NONE = 0, + CF_BRANCH = 1 << 0, + CF_RETURN = 1 << 1, + CF_FULL = CF_BRANCH | CF_RETURN, + CF_SET = 1 << 2 }; */ @@ -51,13 +51,13 @@ enum cf_protection_level #endif /* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need - space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be - aligned to 16 bytes for fxsave and 64 bytes for xsave. */ + space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be + aligned to 16 bytes for fxsave and 64 bytes for xsave. */ #define STATE_SAVE_OFFSET (8 * 7 + 8) /* Save SSE, AVX, AVX512, mask and bound registers. */ #define STATE_SAVE_MASK \ - ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) + ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) #ifdef __ASSEMBLER__ @@ -76,31 +76,31 @@ enum cf_protection_level /* Define an entry point visible from C. */ #define ENTRY(name) \ - .globl C_SYMBOL_NAME(name); \ - .type C_SYMBOL_NAME(name),@function; \ - .align ALIGNARG(4); \ - C_LABEL(name) \ - cfi_startproc; \ - _CET_ENDBR; \ - CALL_MCOUNT + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ + .align ALIGNARG(4); \ + C_LABEL(name) \ + cfi_startproc; \ + _CET_ENDBR; \ + CALL_MCOUNT #undef END #define END(name) \ - cfi_endproc; \ - ASM_SIZE_DIRECTIVE(name) + cfi_endproc; \ + ASM_SIZE_DIRECTIVE(name) #define ENTRY_CHK(name) ENTRY (name) #define END_CHK(name) END (name) /* Since C identifiers are not normally prefixed with an underscore - on this system, the asm identifier `syscall_error' intrudes on the - C name space. Make sure we use an innocuous name. */ + on this system, the asm identifier `syscall_error' intrudes on the + C name space. Make sure we use an innocuous name. */ #define syscall_error __syscall_error #define mcount _mcount #undef PSEUDO_END #define PSEUDO_END(name) \ - END (name) + END (name) /* Local label name for asm code. */ #ifndef L
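[Closing note: the ENTRY/END and L() macros reindented in this last patch are the scaffolding every .S file in the benchmark builds on. A minimal sketch of how they compose, assuming only the definitions shown in sysdep_x86.h and sysdep_generic.h above; the function name copy_nothing is hypothetical and appears nowhere in the patches:

    #include "sysdep.h"

    ENTRY (copy_nothing)            /* emits .globl/.type/.align, the label,
                                       cfi_startproc and _CET_ENDBR */
        mov     %RDI_LP, %RAX_LP    /* return dst, as the mempcpy entry points do */
        test    %RDX_LP, %RDX_LP    /* size == 0? */
        jz      L(done)             /* L(done) assembles to the local label .Ldone */
    L(done):
        ret
    END (copy_nothing)              /* cfi_endproc plus the .size directive */

Under the LP macros above, RDI_LP, RAX_LP and RDX_LP expand to the full 64-bit rdi, rax and rdx register names, so the sketch assembles to a handful of instructions plus the symbol and CFI bookkeeping that the real __memmove and __mempcpy variants rely on.]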