Merge pull request #21715 from ClickHouse/memcpy-bench

Add more variants for memcpy benchmark
alexey-milovidov 2021-03-15 20:04:36 +03:00 committed by GitHub
commit 6a455fe71d
18 changed files with 8609 additions and 61 deletions


@@ -1,5 +1,22 @@
 enable_language(ASM)
-add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S)
-#target_compile_options(memcpy-bench PRIVATE -mavx)
-target_link_libraries(memcpy-bench PRIVATE dbms)
+add_executable (memcpy-bench
+    memcpy-bench.cpp
+    FastMemcpy.cpp
+    FastMemcpy_Avx.cpp
+    memcpy_jart.S
+    glibc/memcpy-ssse3.S
+    glibc/memcpy-ssse3-back.S
+    glibc/memmove-sse2-unaligned-erms.S
+    glibc/memmove-avx-unaligned-erms.S
+    glibc/memmove-avx512-unaligned-erms.S
+    glibc/memmove-avx512-no-vzeroupper.S
+)
+target_compile_options(memcpy-bench PRIVATE -fno-tree-loop-distribute-patterns)
+set_source_files_properties(FastMemcpy.cpp PROPERTIES COMPILE_FLAGS "-Wno-old-style-cast")
+set_source_files_properties(FastMemcpy_Avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -Wno-old-style-cast -Wno-cast-qual -Wno-cast-align")
+target_link_libraries(memcpy-bench PRIVATE dbms boost::program_options)
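
A note on the -fno-tree-loop-distribute-patterns flag: GCC's loop-distribution pass recognizes byte-copy loops and replaces them with calls to memcpy. For a benchmark whose variants are themselves such loops (see memcpy_trivial later in this diff), that rewrite would silently measure libc's memcpy instead of the loop under test. A minimal sketch of the kind of loop the pass transforms:

    // Without -fno-tree-loop-distribute-patterns, GCC is allowed to compile
    // this whole loop down to a single call to memcpy.
    static void copy_bytes(char * dst, const char * src, size_t size)
    {
        for (size_t i = 0; i < size; ++i)
            dst[i] = src[i];
    }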


@@ -0,0 +1 @@
#include "FastMemcpy.h"


@@ -93,7 +93,7 @@ static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src)
 /// Attribute is used to avoid an error with undefined behaviour sanitizer
 /// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
 /// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
-__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
+__attribute__((__no_sanitize__("undefined"))) inline void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
 {
     unsigned char *dd = ((unsigned char*)dst) + size;
     const unsigned char *ss = ((const unsigned char*)src) + size;
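
For context on the no_sanitize attribute: when size == 0 and a pointer is null, the two statements above compute nullptr + 0, which UBSan reports as "applying zero offset to null pointer" even though nothing is dereferenced. A minimal reproducer, assuming a UBSan build (clang++ -fsanitize=undefined; whether it fires depends on compiler version):

    #include <cstddef>

    int main()
    {
        char * dst = nullptr;
        size_t size = 0;
        char * dd = dst + size;   // UBSan: applying zero offset to null pointer
        (void)dd;
        return 0;
    }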


@@ -0,0 +1 @@
#include "FastMemcpy_Avx.h"


@@ -0,0 +1,26 @@
#pragma once
/* Definitions for x86 syntax variations.
Copyright (C) 1992-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library. Its master source is NOT part of
the C library, however. The master source lives in the GNU MP Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#undef ALIGN
#define ALIGN(log) .align 1<<log
#undef L
#define L(body) .L##body


@@ -0,0 +1,592 @@
#pragma once
/* Declarations and definitions of codes relating to the DWARF2 symbolic
debugging information format.
Copyright (C) 1992-2020 Free Software Foundation, Inc.
Contributed by Gary Funck (gary@intrepid.com). Derived from the
DWARF 1 implementation written by Ron Guilmette (rfg@monkeys.com).
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _DWARF2_H
#define _DWARF2_H 1
/* This file is derived from the DWARF specification (a public document)
Revision 2.0.0 (July 27, 1993) developed by the UNIX International
Programming Languages Special Interest Group (UI/PLSIG) and distributed
by UNIX International. Copies of this specification are available from
UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */
/* This file is shared between GCC and GDB, and should not contain
prototypes. */
#ifndef __ASSEMBLER__
/* Tag names and codes. */
enum dwarf_tag
{
DW_TAG_padding = 0x00,
DW_TAG_array_type = 0x01,
DW_TAG_class_type = 0x02,
DW_TAG_entry_point = 0x03,
DW_TAG_enumeration_type = 0x04,
DW_TAG_formal_parameter = 0x05,
DW_TAG_imported_declaration = 0x08,
DW_TAG_label = 0x0a,
DW_TAG_lexical_block = 0x0b,
DW_TAG_member = 0x0d,
DW_TAG_pointer_type = 0x0f,
DW_TAG_reference_type = 0x10,
DW_TAG_compile_unit = 0x11,
DW_TAG_string_type = 0x12,
DW_TAG_structure_type = 0x13,
DW_TAG_subroutine_type = 0x15,
DW_TAG_typedef = 0x16,
DW_TAG_union_type = 0x17,
DW_TAG_unspecified_parameters = 0x18,
DW_TAG_variant = 0x19,
DW_TAG_common_block = 0x1a,
DW_TAG_common_inclusion = 0x1b,
DW_TAG_inheritance = 0x1c,
DW_TAG_inlined_subroutine = 0x1d,
DW_TAG_module = 0x1e,
DW_TAG_ptr_to_member_type = 0x1f,
DW_TAG_set_type = 0x20,
DW_TAG_subrange_type = 0x21,
DW_TAG_with_stmt = 0x22,
DW_TAG_access_declaration = 0x23,
DW_TAG_base_type = 0x24,
DW_TAG_catch_block = 0x25,
DW_TAG_const_type = 0x26,
DW_TAG_constant = 0x27,
DW_TAG_enumerator = 0x28,
DW_TAG_file_type = 0x29,
DW_TAG_friend = 0x2a,
DW_TAG_namelist = 0x2b,
DW_TAG_namelist_item = 0x2c,
DW_TAG_packed_type = 0x2d,
DW_TAG_subprogram = 0x2e,
DW_TAG_template_type_param = 0x2f,
DW_TAG_template_value_param = 0x30,
DW_TAG_thrown_type = 0x31,
DW_TAG_try_block = 0x32,
DW_TAG_variant_part = 0x33,
DW_TAG_variable = 0x34,
DW_TAG_volatile_type = 0x35,
/* SGI/MIPS Extensions */
DW_TAG_MIPS_loop = 0x4081,
/* GNU extensions */
DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */
DW_TAG_function_template = 0x4102, /* for C++ */
DW_TAG_class_template = 0x4103, /* for C++ */
DW_TAG_GNU_BINCL = 0x4104,
DW_TAG_GNU_EINCL = 0x4105
};
#define DW_TAG_lo_user 0x4080
#define DW_TAG_hi_user 0xffff
/* flag that tells whether entry has a child or not */
#define DW_children_no 0
#define DW_children_yes 1
/* Form names and codes. */
enum dwarf_form
{
DW_FORM_addr = 0x01,
DW_FORM_block2 = 0x03,
DW_FORM_block4 = 0x04,
DW_FORM_data2 = 0x05,
DW_FORM_data4 = 0x06,
DW_FORM_data8 = 0x07,
DW_FORM_string = 0x08,
DW_FORM_block = 0x09,
DW_FORM_block1 = 0x0a,
DW_FORM_data1 = 0x0b,
DW_FORM_flag = 0x0c,
DW_FORM_sdata = 0x0d,
DW_FORM_strp = 0x0e,
DW_FORM_udata = 0x0f,
DW_FORM_ref_addr = 0x10,
DW_FORM_ref1 = 0x11,
DW_FORM_ref2 = 0x12,
DW_FORM_ref4 = 0x13,
DW_FORM_ref8 = 0x14,
DW_FORM_ref_udata = 0x15,
DW_FORM_indirect = 0x16
};
/* Attribute names and codes. */
enum dwarf_attribute
{
DW_AT_sibling = 0x01,
DW_AT_location = 0x02,
DW_AT_name = 0x03,
DW_AT_ordering = 0x09,
DW_AT_subscr_data = 0x0a,
DW_AT_byte_size = 0x0b,
DW_AT_bit_offset = 0x0c,
DW_AT_bit_size = 0x0d,
DW_AT_element_list = 0x0f,
DW_AT_stmt_list = 0x10,
DW_AT_low_pc = 0x11,
DW_AT_high_pc = 0x12,
DW_AT_language = 0x13,
DW_AT_member = 0x14,
DW_AT_discr = 0x15,
DW_AT_discr_value = 0x16,
DW_AT_visibility = 0x17,
DW_AT_import = 0x18,
DW_AT_string_length = 0x19,
DW_AT_common_reference = 0x1a,
DW_AT_comp_dir = 0x1b,
DW_AT_const_value = 0x1c,
DW_AT_containing_type = 0x1d,
DW_AT_default_value = 0x1e,
DW_AT_inline = 0x20,
DW_AT_is_optional = 0x21,
DW_AT_lower_bound = 0x22,
DW_AT_producer = 0x25,
DW_AT_prototyped = 0x27,
DW_AT_return_addr = 0x2a,
DW_AT_start_scope = 0x2c,
DW_AT_stride_size = 0x2e,
DW_AT_upper_bound = 0x2f,
DW_AT_abstract_origin = 0x31,
DW_AT_accessibility = 0x32,
DW_AT_address_class = 0x33,
DW_AT_artificial = 0x34,
DW_AT_base_types = 0x35,
DW_AT_calling_convention = 0x36,
DW_AT_count = 0x37,
DW_AT_data_member_location = 0x38,
DW_AT_decl_column = 0x39,
DW_AT_decl_file = 0x3a,
DW_AT_decl_line = 0x3b,
DW_AT_declaration = 0x3c,
DW_AT_discr_list = 0x3d,
DW_AT_encoding = 0x3e,
DW_AT_external = 0x3f,
DW_AT_frame_base = 0x40,
DW_AT_friend = 0x41,
DW_AT_identifier_case = 0x42,
DW_AT_macro_info = 0x43,
DW_AT_namelist_items = 0x44,
DW_AT_priority = 0x45,
DW_AT_segment = 0x46,
DW_AT_specification = 0x47,
DW_AT_static_link = 0x48,
DW_AT_type = 0x49,
DW_AT_use_location = 0x4a,
DW_AT_variable_parameter = 0x4b,
DW_AT_virtuality = 0x4c,
DW_AT_vtable_elem_location = 0x4d,
/* SGI/MIPS Extensions */
DW_AT_MIPS_fde = 0x2001,
DW_AT_MIPS_loop_begin = 0x2002,
DW_AT_MIPS_tail_loop_begin = 0x2003,
DW_AT_MIPS_epilog_begin = 0x2004,
DW_AT_MIPS_loop_unroll_factor = 0x2005,
DW_AT_MIPS_software_pipeline_depth = 0x2006,
DW_AT_MIPS_linkage_name = 0x2007,
DW_AT_MIPS_stride = 0x2008,
DW_AT_MIPS_abstract_name = 0x2009,
DW_AT_MIPS_clone_origin = 0x200a,
DW_AT_MIPS_has_inlines = 0x200b,
/* GNU extensions. */
DW_AT_sf_names = 0x2101,
DW_AT_src_info = 0x2102,
DW_AT_mac_info = 0x2103,
DW_AT_src_coords = 0x2104,
DW_AT_body_begin = 0x2105,
DW_AT_body_end = 0x2106
};
#define DW_AT_lo_user 0x2000 /* implementation-defined range start */
#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */
/* Location atom names and codes. */
enum dwarf_location_atom
{
DW_OP_addr = 0x03,
DW_OP_deref = 0x06,
DW_OP_const1u = 0x08,
DW_OP_const1s = 0x09,
DW_OP_const2u = 0x0a,
DW_OP_const2s = 0x0b,
DW_OP_const4u = 0x0c,
DW_OP_const4s = 0x0d,
DW_OP_const8u = 0x0e,
DW_OP_const8s = 0x0f,
DW_OP_constu = 0x10,
DW_OP_consts = 0x11,
DW_OP_dup = 0x12,
DW_OP_drop = 0x13,
DW_OP_over = 0x14,
DW_OP_pick = 0x15,
DW_OP_swap = 0x16,
DW_OP_rot = 0x17,
DW_OP_xderef = 0x18,
DW_OP_abs = 0x19,
DW_OP_and = 0x1a,
DW_OP_div = 0x1b,
DW_OP_minus = 0x1c,
DW_OP_mod = 0x1d,
DW_OP_mul = 0x1e,
DW_OP_neg = 0x1f,
DW_OP_not = 0x20,
DW_OP_or = 0x21,
DW_OP_plus = 0x22,
DW_OP_plus_uconst = 0x23,
DW_OP_shl = 0x24,
DW_OP_shr = 0x25,
DW_OP_shra = 0x26,
DW_OP_xor = 0x27,
DW_OP_bra = 0x28,
DW_OP_eq = 0x29,
DW_OP_ge = 0x2a,
DW_OP_gt = 0x2b,
DW_OP_le = 0x2c,
DW_OP_lt = 0x2d,
DW_OP_ne = 0x2e,
DW_OP_skip = 0x2f,
DW_OP_lit0 = 0x30,
DW_OP_lit1 = 0x31,
DW_OP_lit2 = 0x32,
DW_OP_lit3 = 0x33,
DW_OP_lit4 = 0x34,
DW_OP_lit5 = 0x35,
DW_OP_lit6 = 0x36,
DW_OP_lit7 = 0x37,
DW_OP_lit8 = 0x38,
DW_OP_lit9 = 0x39,
DW_OP_lit10 = 0x3a,
DW_OP_lit11 = 0x3b,
DW_OP_lit12 = 0x3c,
DW_OP_lit13 = 0x3d,
DW_OP_lit14 = 0x3e,
DW_OP_lit15 = 0x3f,
DW_OP_lit16 = 0x40,
DW_OP_lit17 = 0x41,
DW_OP_lit18 = 0x42,
DW_OP_lit19 = 0x43,
DW_OP_lit20 = 0x44,
DW_OP_lit21 = 0x45,
DW_OP_lit22 = 0x46,
DW_OP_lit23 = 0x47,
DW_OP_lit24 = 0x48,
DW_OP_lit25 = 0x49,
DW_OP_lit26 = 0x4a,
DW_OP_lit27 = 0x4b,
DW_OP_lit28 = 0x4c,
DW_OP_lit29 = 0x4d,
DW_OP_lit30 = 0x4e,
DW_OP_lit31 = 0x4f,
DW_OP_reg0 = 0x50,
DW_OP_reg1 = 0x51,
DW_OP_reg2 = 0x52,
DW_OP_reg3 = 0x53,
DW_OP_reg4 = 0x54,
DW_OP_reg5 = 0x55,
DW_OP_reg6 = 0x56,
DW_OP_reg7 = 0x57,
DW_OP_reg8 = 0x58,
DW_OP_reg9 = 0x59,
DW_OP_reg10 = 0x5a,
DW_OP_reg11 = 0x5b,
DW_OP_reg12 = 0x5c,
DW_OP_reg13 = 0x5d,
DW_OP_reg14 = 0x5e,
DW_OP_reg15 = 0x5f,
DW_OP_reg16 = 0x60,
DW_OP_reg17 = 0x61,
DW_OP_reg18 = 0x62,
DW_OP_reg19 = 0x63,
DW_OP_reg20 = 0x64,
DW_OP_reg21 = 0x65,
DW_OP_reg22 = 0x66,
DW_OP_reg23 = 0x67,
DW_OP_reg24 = 0x68,
DW_OP_reg25 = 0x69,
DW_OP_reg26 = 0x6a,
DW_OP_reg27 = 0x6b,
DW_OP_reg28 = 0x6c,
DW_OP_reg29 = 0x6d,
DW_OP_reg30 = 0x6e,
DW_OP_reg31 = 0x6f,
DW_OP_breg0 = 0x70,
DW_OP_breg1 = 0x71,
DW_OP_breg2 = 0x72,
DW_OP_breg3 = 0x73,
DW_OP_breg4 = 0x74,
DW_OP_breg5 = 0x75,
DW_OP_breg6 = 0x76,
DW_OP_breg7 = 0x77,
DW_OP_breg8 = 0x78,
DW_OP_breg9 = 0x79,
DW_OP_breg10 = 0x7a,
DW_OP_breg11 = 0x7b,
DW_OP_breg12 = 0x7c,
DW_OP_breg13 = 0x7d,
DW_OP_breg14 = 0x7e,
DW_OP_breg15 = 0x7f,
DW_OP_breg16 = 0x80,
DW_OP_breg17 = 0x81,
DW_OP_breg18 = 0x82,
DW_OP_breg19 = 0x83,
DW_OP_breg20 = 0x84,
DW_OP_breg21 = 0x85,
DW_OP_breg22 = 0x86,
DW_OP_breg23 = 0x87,
DW_OP_breg24 = 0x88,
DW_OP_breg25 = 0x89,
DW_OP_breg26 = 0x8a,
DW_OP_breg27 = 0x8b,
DW_OP_breg28 = 0x8c,
DW_OP_breg29 = 0x8d,
DW_OP_breg30 = 0x8e,
DW_OP_breg31 = 0x8f,
DW_OP_regx = 0x90,
DW_OP_fbreg = 0x91,
DW_OP_bregx = 0x92,
DW_OP_piece = 0x93,
DW_OP_deref_size = 0x94,
DW_OP_xderef_size = 0x95,
DW_OP_nop = 0x96
};
#define DW_OP_lo_user 0x80 /* implementation-defined range start */
#define DW_OP_hi_user 0xff /* implementation-defined range end */
/* Type encodings. */
enum dwarf_type
{
DW_ATE_void = 0x0,
DW_ATE_address = 0x1,
DW_ATE_boolean = 0x2,
DW_ATE_complex_float = 0x3,
DW_ATE_float = 0x4,
DW_ATE_signed = 0x5,
DW_ATE_signed_char = 0x6,
DW_ATE_unsigned = 0x7,
DW_ATE_unsigned_char = 0x8
};
#define DW_ATE_lo_user 0x80
#define DW_ATE_hi_user 0xff
/* Array ordering names and codes. */
enum dwarf_array_dim_ordering
{
DW_ORD_row_major = 0,
DW_ORD_col_major = 1
};
/* access attribute */
enum dwarf_access_attribute
{
DW_ACCESS_public = 1,
DW_ACCESS_protected = 2,
DW_ACCESS_private = 3
};
/* visibility */
enum dwarf_visibility_attribute
{
DW_VIS_local = 1,
DW_VIS_exported = 2,
DW_VIS_qualified = 3
};
/* virtuality */
enum dwarf_virtuality_attribute
{
DW_VIRTUALITY_none = 0,
DW_VIRTUALITY_virtual = 1,
DW_VIRTUALITY_pure_virtual = 2
};
/* case sensitivity */
enum dwarf_id_case
{
DW_ID_case_sensitive = 0,
DW_ID_up_case = 1,
DW_ID_down_case = 2,
DW_ID_case_insensitive = 3
};
/* calling convention */
enum dwarf_calling_convention
{
DW_CC_normal = 0x1,
DW_CC_program = 0x2,
DW_CC_nocall = 0x3
};
#define DW_CC_lo_user 0x40
#define DW_CC_hi_user 0xff
/* inline attribute */
enum dwarf_inline_attribute
{
DW_INL_not_inlined = 0,
DW_INL_inlined = 1,
DW_INL_declared_not_inlined = 2,
DW_INL_declared_inlined = 3
};
/* discriminant lists */
enum dwarf_discrim_list
{
DW_DSC_label = 0,
DW_DSC_range = 1
};
/* line number opcodes */
enum dwarf_line_number_ops
{
DW_LNS_extended_op = 0,
DW_LNS_copy = 1,
DW_LNS_advance_pc = 2,
DW_LNS_advance_line = 3,
DW_LNS_set_file = 4,
DW_LNS_set_column = 5,
DW_LNS_negate_stmt = 6,
DW_LNS_set_basic_block = 7,
DW_LNS_const_add_pc = 8,
DW_LNS_fixed_advance_pc = 9
};
/* line number extended opcodes */
enum dwarf_line_number_x_ops
{
DW_LNE_end_sequence = 1,
DW_LNE_set_address = 2,
DW_LNE_define_file = 3
};
/* call frame information */
enum dwarf_call_frame_info
{
DW_CFA_advance_loc = 0x40,
DW_CFA_offset = 0x80,
DW_CFA_restore = 0xc0,
DW_CFA_nop = 0x00,
DW_CFA_set_loc = 0x01,
DW_CFA_advance_loc1 = 0x02,
DW_CFA_advance_loc2 = 0x03,
DW_CFA_advance_loc4 = 0x04,
DW_CFA_offset_extended = 0x05,
DW_CFA_restore_extended = 0x06,
DW_CFA_undefined = 0x07,
DW_CFA_same_value = 0x08,
DW_CFA_register = 0x09,
DW_CFA_remember_state = 0x0a,
DW_CFA_restore_state = 0x0b,
DW_CFA_def_cfa = 0x0c,
DW_CFA_def_cfa_register = 0x0d,
DW_CFA_def_cfa_offset = 0x0e,
DW_CFA_def_cfa_expression = 0x0f,
DW_CFA_expression = 0x10,
/* Dwarf 2.1 */
DW_CFA_offset_extended_sf = 0x11,
DW_CFA_def_cfa_sf = 0x12,
DW_CFA_def_cfa_offset_sf = 0x13,
/* SGI/MIPS specific */
DW_CFA_MIPS_advance_loc8 = 0x1d,
/* GNU extensions */
DW_CFA_GNU_window_save = 0x2d,
DW_CFA_GNU_args_size = 0x2e,
DW_CFA_GNU_negative_offset_extended = 0x2f
};
#define DW_CIE_ID 0xffffffff
#define DW_CIE_VERSION 1
#define DW_CFA_extended 0
#define DW_CFA_low_user 0x1c
#define DW_CFA_high_user 0x3f
#define DW_CHILDREN_no 0x00
#define DW_CHILDREN_yes 0x01
#define DW_ADDR_none 0
/* Source language names and codes. */
enum dwarf_source_language
{
DW_LANG_C89 = 0x0001,
DW_LANG_C = 0x0002,
DW_LANG_Ada83 = 0x0003,
DW_LANG_C_plus_plus = 0x0004,
DW_LANG_Cobol74 = 0x0005,
DW_LANG_Cobol85 = 0x0006,
DW_LANG_Fortran77 = 0x0007,
DW_LANG_Fortran90 = 0x0008,
DW_LANG_Pascal83 = 0x0009,
DW_LANG_Modula2 = 0x000a,
DW_LANG_Java = 0x000b,
DW_LANG_Mips_Assembler = 0x8001
};
#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */
#define DW_LANG_hi_user 0xffff /* implementation-defined range end */
/* Names and codes for macro information. */
enum dwarf_macinfo_record_type
{
DW_MACINFO_define = 1,
DW_MACINFO_undef = 2,
DW_MACINFO_start_file = 3,
DW_MACINFO_end_file = 4,
DW_MACINFO_vendor_ext = 255
};
#endif /* !ASSEMBLER */
/* @@@ For use with GNU frame unwind information. */
#define DW_EH_PE_absptr 0x00
#define DW_EH_PE_omit 0xff
#define DW_EH_PE_uleb128 0x01
#define DW_EH_PE_udata2 0x02
#define DW_EH_PE_udata4 0x03
#define DW_EH_PE_udata8 0x04
#define DW_EH_PE_sleb128 0x09
#define DW_EH_PE_sdata2 0x0A
#define DW_EH_PE_sdata4 0x0B
#define DW_EH_PE_sdata8 0x0C
#define DW_EH_PE_signed 0x08
#define DW_EH_PE_pcrel 0x10
#define DW_EH_PE_textrel 0x20
#define DW_EH_PE_datarel 0x30
#define DW_EH_PE_funcrel 0x40
#define DW_EH_PE_aligned 0x50
#define DW_EH_PE_indirect 0x80
#endif /* dwarf2.h */

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,12 @@
#if 1
# define VEC_SIZE 32
# define VEC(i) ymm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
# define SECTION(p) p##.avx
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
# include "memmove-vec-unaligned-erms.S"
#endif
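
This 12-line wrapper is the entire AVX build of the generic copy: memmove-vec-unaligned-erms.S (included at the end, and reproduced later in this diff) is written purely in terms of VEC_SIZE, VEC, VMOVU and friends, so each wrapper instantiates it for a different register width. The token-pasting trick in miniature (an illustrative C++ preprocessor sketch, not code from the PR):

    #define VEC(i) ymm##i        // VEC(0) expands to ymm0: 32-byte AVX vectors
    // ... include the generic body here; it refers only to VEC(0), VEC(1), ...
    #undef VEC
    #define VEC(i) zmm##i        // VEC(0) now expands to zmm0: 64-byte AVX-512 vectors
    // ... include the generic body again for a second, independent instantiation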


@@ -0,0 +1,419 @@
/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
Copyright (C) 2016-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "sysdep.h"
#if 1
# include "asm-syntax.h"
.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)
ENTRY (__mempcpy_avx512_no_vzeroupper)
mov %RDI_LP, %RAX_LP
add %RDX_LP, %RAX_LP
jmp L(start)
END (__mempcpy_avx512_no_vzeroupper)
ENTRY (__memmove_chk_avx512_no_vzeroupper)
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)
ENTRY (__memmove_avx512_no_vzeroupper)
mov %RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
add %RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
# endif
lea (%rsi, %rdx), %rcx
lea (%rdi, %rdx), %r9
cmp $512, %rdx
ja L(512bytesormore)
L(check):
cmp $16, %rdx
jbe L(less_16bytes)
cmp $256, %rdx
jb L(less_256bytes)
vmovups (%rsi), %zmm0
vmovups 0x40(%rsi), %zmm1
vmovups 0x80(%rsi), %zmm2
vmovups 0xC0(%rsi), %zmm3
vmovups -0x100(%rcx), %zmm4
vmovups -0xC0(%rcx), %zmm5
vmovups -0x80(%rcx), %zmm6
vmovups -0x40(%rcx), %zmm7
vmovups %zmm0, (%rdi)
vmovups %zmm1, 0x40(%rdi)
vmovups %zmm2, 0x80(%rdi)
vmovups %zmm3, 0xC0(%rdi)
vmovups %zmm4, -0x100(%r9)
vmovups %zmm5, -0xC0(%r9)
vmovups %zmm6, -0x80(%r9)
vmovups %zmm7, -0x40(%r9)
ret
L(less_256bytes):
cmp $128, %dl
jb L(less_128bytes)
vmovups (%rsi), %zmm0
vmovups 0x40(%rsi), %zmm1
vmovups -0x80(%rcx), %zmm2
vmovups -0x40(%rcx), %zmm3
vmovups %zmm0, (%rdi)
vmovups %zmm1, 0x40(%rdi)
vmovups %zmm2, -0x80(%r9)
vmovups %zmm3, -0x40(%r9)
ret
L(less_128bytes):
cmp $64, %dl
jb L(less_64bytes)
vmovdqu (%rsi), %ymm0
vmovdqu 0x20(%rsi), %ymm1
vmovdqu -0x40(%rcx), %ymm2
vmovdqu -0x20(%rcx), %ymm3
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, 0x20(%rdi)
vmovdqu %ymm2, -0x40(%r9)
vmovdqu %ymm3, -0x20(%r9)
ret
L(less_64bytes):
cmp $32, %dl
jb L(less_32bytes)
vmovdqu (%rsi), %ymm0
vmovdqu -0x20(%rcx), %ymm1
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -0x20(%r9)
ret
L(less_32bytes):
vmovdqu (%rsi), %xmm0
vmovdqu -0x10(%rcx), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -0x10(%r9)
ret
L(less_16bytes):
cmp $8, %dl
jb L(less_8bytes)
movq (%rsi), %rsi
movq -0x8(%rcx), %rcx
movq %rsi, (%rdi)
movq %rcx, -0x8(%r9)
ret
L(less_8bytes):
cmp $4, %dl
jb L(less_4bytes)
mov (%rsi), %esi
mov -0x4(%rcx), %ecx
mov %esi, (%rdi)
mov %ecx, -0x4(%r9)
ret
L(less_4bytes):
cmp $2, %dl
jb L(less_2bytes)
mov (%rsi), %si
mov -0x2(%rcx), %cx
mov %si, (%rdi)
mov %cx, -0x2(%r9)
ret
L(less_2bytes):
cmp $1, %dl
jb L(less_1bytes)
mov (%rsi), %cl
mov %cl, (%rdi)
L(less_1bytes):
ret
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
mov $SHARED_CACHE_SIZE_HALF, %r8
# else
mov __x86_shared_cache_size_half(%rip), %r8
# endif
cmp %r8, %rdx
jae L(preloop_large)
cmp $1024, %rdx
ja L(1024bytesormore)
prefetcht1 (%rsi)
prefetcht1 0x40(%rsi)
prefetcht1 0x80(%rsi)
prefetcht1 0xC0(%rsi)
prefetcht1 0x100(%rsi)
prefetcht1 0x140(%rsi)
prefetcht1 0x180(%rsi)
prefetcht1 0x1C0(%rsi)
prefetcht1 -0x200(%rcx)
prefetcht1 -0x1C0(%rcx)
prefetcht1 -0x180(%rcx)
prefetcht1 -0x140(%rcx)
prefetcht1 -0x100(%rcx)
prefetcht1 -0xC0(%rcx)
prefetcht1 -0x80(%rcx)
prefetcht1 -0x40(%rcx)
vmovups (%rsi), %zmm0
vmovups 0x40(%rsi), %zmm1
vmovups 0x80(%rsi), %zmm2
vmovups 0xC0(%rsi), %zmm3
vmovups 0x100(%rsi), %zmm4
vmovups 0x140(%rsi), %zmm5
vmovups 0x180(%rsi), %zmm6
vmovups 0x1C0(%rsi), %zmm7
vmovups -0x200(%rcx), %zmm8
vmovups -0x1C0(%rcx), %zmm9
vmovups -0x180(%rcx), %zmm10
vmovups -0x140(%rcx), %zmm11
vmovups -0x100(%rcx), %zmm12
vmovups -0xC0(%rcx), %zmm13
vmovups -0x80(%rcx), %zmm14
vmovups -0x40(%rcx), %zmm15
vmovups %zmm0, (%rdi)
vmovups %zmm1, 0x40(%rdi)
vmovups %zmm2, 0x80(%rdi)
vmovups %zmm3, 0xC0(%rdi)
vmovups %zmm4, 0x100(%rdi)
vmovups %zmm5, 0x140(%rdi)
vmovups %zmm6, 0x180(%rdi)
vmovups %zmm7, 0x1C0(%rdi)
vmovups %zmm8, -0x200(%r9)
vmovups %zmm9, -0x1C0(%r9)
vmovups %zmm10, -0x180(%r9)
vmovups %zmm11, -0x140(%r9)
vmovups %zmm12, -0x100(%r9)
vmovups %zmm13, -0xC0(%r9)
vmovups %zmm14, -0x80(%r9)
vmovups %zmm15, -0x40(%r9)
ret
L(1024bytesormore):
cmp %rsi, %rdi
ja L(1024bytesormore_bkw)
sub $512, %r9
vmovups -0x200(%rcx), %zmm8
vmovups -0x1C0(%rcx), %zmm9
vmovups -0x180(%rcx), %zmm10
vmovups -0x140(%rcx), %zmm11
vmovups -0x100(%rcx), %zmm12
vmovups -0xC0(%rcx), %zmm13
vmovups -0x80(%rcx), %zmm14
vmovups -0x40(%rcx), %zmm15
prefetcht1 (%rsi)
prefetcht1 0x40(%rsi)
prefetcht1 0x80(%rsi)
prefetcht1 0xC0(%rsi)
prefetcht1 0x100(%rsi)
prefetcht1 0x140(%rsi)
prefetcht1 0x180(%rsi)
prefetcht1 0x1C0(%rsi)
/* Loop with unaligned memory access. */
L(gobble_512bytes_loop):
vmovups (%rsi), %zmm0
vmovups 0x40(%rsi), %zmm1
vmovups 0x80(%rsi), %zmm2
vmovups 0xC0(%rsi), %zmm3
vmovups 0x100(%rsi), %zmm4
vmovups 0x140(%rsi), %zmm5
vmovups 0x180(%rsi), %zmm6
vmovups 0x1C0(%rsi), %zmm7
add $512, %rsi
prefetcht1 (%rsi)
prefetcht1 0x40(%rsi)
prefetcht1 0x80(%rsi)
prefetcht1 0xC0(%rsi)
prefetcht1 0x100(%rsi)
prefetcht1 0x140(%rsi)
prefetcht1 0x180(%rsi)
prefetcht1 0x1C0(%rsi)
vmovups %zmm0, (%rdi)
vmovups %zmm1, 0x40(%rdi)
vmovups %zmm2, 0x80(%rdi)
vmovups %zmm3, 0xC0(%rdi)
vmovups %zmm4, 0x100(%rdi)
vmovups %zmm5, 0x140(%rdi)
vmovups %zmm6, 0x180(%rdi)
vmovups %zmm7, 0x1C0(%rdi)
add $512, %rdi
cmp %r9, %rdi
jb L(gobble_512bytes_loop)
vmovups %zmm8, (%r9)
vmovups %zmm9, 0x40(%r9)
vmovups %zmm10, 0x80(%r9)
vmovups %zmm11, 0xC0(%r9)
vmovups %zmm12, 0x100(%r9)
vmovups %zmm13, 0x140(%r9)
vmovups %zmm14, 0x180(%r9)
vmovups %zmm15, 0x1C0(%r9)
ret
L(1024bytesormore_bkw):
add $512, %rdi
vmovups 0x1C0(%rsi), %zmm8
vmovups 0x180(%rsi), %zmm9
vmovups 0x140(%rsi), %zmm10
vmovups 0x100(%rsi), %zmm11
vmovups 0xC0(%rsi), %zmm12
vmovups 0x80(%rsi), %zmm13
vmovups 0x40(%rsi), %zmm14
vmovups (%rsi), %zmm15
prefetcht1 -0x40(%rcx)
prefetcht1 -0x80(%rcx)
prefetcht1 -0xC0(%rcx)
prefetcht1 -0x100(%rcx)
prefetcht1 -0x140(%rcx)
prefetcht1 -0x180(%rcx)
prefetcht1 -0x1C0(%rcx)
prefetcht1 -0x200(%rcx)
/* Backward loop with unaligned memory access. */
L(gobble_512bytes_loop_bkw):
vmovups -0x40(%rcx), %zmm0
vmovups -0x80(%rcx), %zmm1
vmovups -0xC0(%rcx), %zmm2
vmovups -0x100(%rcx), %zmm3
vmovups -0x140(%rcx), %zmm4
vmovups -0x180(%rcx), %zmm5
vmovups -0x1C0(%rcx), %zmm6
vmovups -0x200(%rcx), %zmm7
sub $512, %rcx
prefetcht1 -0x40(%rcx)
prefetcht1 -0x80(%rcx)
prefetcht1 -0xC0(%rcx)
prefetcht1 -0x100(%rcx)
prefetcht1 -0x140(%rcx)
prefetcht1 -0x180(%rcx)
prefetcht1 -0x1C0(%rcx)
prefetcht1 -0x200(%rcx)
vmovups %zmm0, -0x40(%r9)
vmovups %zmm1, -0x80(%r9)
vmovups %zmm2, -0xC0(%r9)
vmovups %zmm3, -0x100(%r9)
vmovups %zmm4, -0x140(%r9)
vmovups %zmm5, -0x180(%r9)
vmovups %zmm6, -0x1C0(%r9)
vmovups %zmm7, -0x200(%r9)
sub $512, %r9
cmp %rdi, %r9
ja L(gobble_512bytes_loop_bkw)
vmovups %zmm8, -0x40(%rdi)
vmovups %zmm9, -0x80(%rdi)
vmovups %zmm10, -0xC0(%rdi)
vmovups %zmm11, -0x100(%rdi)
vmovups %zmm12, -0x140(%rdi)
vmovups %zmm13, -0x180(%rdi)
vmovups %zmm14, -0x1C0(%rdi)
vmovups %zmm15, -0x200(%rdi)
ret
L(preloop_large):
cmp %rsi, %rdi
ja L(preloop_large_bkw)
vmovups (%rsi), %zmm4
vmovups 0x40(%rsi), %zmm5
mov %rdi, %r11
/* Align destination for access with non-temporal stores in the loop. */
mov %rdi, %r8
and $-0x80, %rdi
add $0x80, %rdi
sub %rdi, %r8
sub %r8, %rsi
add %r8, %rdx
L(gobble_256bytes_nt_loop):
prefetcht1 0x200(%rsi)
prefetcht1 0x240(%rsi)
prefetcht1 0x280(%rsi)
prefetcht1 0x2C0(%rsi)
prefetcht1 0x300(%rsi)
prefetcht1 0x340(%rsi)
prefetcht1 0x380(%rsi)
prefetcht1 0x3C0(%rsi)
vmovdqu64 (%rsi), %zmm0
vmovdqu64 0x40(%rsi), %zmm1
vmovdqu64 0x80(%rsi), %zmm2
vmovdqu64 0xC0(%rsi), %zmm3
vmovntdq %zmm0, (%rdi)
vmovntdq %zmm1, 0x40(%rdi)
vmovntdq %zmm2, 0x80(%rdi)
vmovntdq %zmm3, 0xC0(%rdi)
sub $256, %rdx
add $256, %rsi
add $256, %rdi
cmp $256, %rdx
ja L(gobble_256bytes_nt_loop)
sfence
vmovups %zmm4, (%r11)
vmovups %zmm5, 0x40(%r11)
jmp L(check)
L(preloop_large_bkw):
vmovups -0x80(%rcx), %zmm4
vmovups -0x40(%rcx), %zmm5
/* Align end of destination for access with non-temporal stores. */
mov %r9, %r8
and $-0x80, %r9
sub %r9, %r8
sub %r8, %rcx
sub %r8, %rdx
add %r9, %r8
L(gobble_256bytes_nt_loop_bkw):
prefetcht1 -0x400(%rcx)
prefetcht1 -0x3C0(%rcx)
prefetcht1 -0x380(%rcx)
prefetcht1 -0x340(%rcx)
prefetcht1 -0x300(%rcx)
prefetcht1 -0x2C0(%rcx)
prefetcht1 -0x280(%rcx)
prefetcht1 -0x240(%rcx)
vmovdqu64 -0x100(%rcx), %zmm0
vmovdqu64 -0xC0(%rcx), %zmm1
vmovdqu64 -0x80(%rcx), %zmm2
vmovdqu64 -0x40(%rcx), %zmm3
vmovntdq %zmm0, -0x100(%r9)
vmovntdq %zmm1, -0xC0(%r9)
vmovntdq %zmm2, -0x80(%r9)
vmovntdq %zmm3, -0x40(%r9)
sub $256, %rdx
sub $256, %rcx
sub $256, %r9
cmp $256, %rdx
ja L(gobble_256bytes_nt_loop_bkw)
sfence
vmovups %zmm4, -0x80(%r8)
vmovups %zmm5, -0x40(%r8)
jmp L(check)
END (__memmove_avx512_no_vzeroupper)
strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif


@@ -0,0 +1,12 @@
#if 1
# define VEC_SIZE 64
# define VEC(i) zmm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
# define SECTION(p) p##.avx512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
# include "memmove-vec-unaligned-erms.S"
#endif


@@ -0,0 +1,33 @@
/* memmove with SSE2.
Copyright (C) 2017-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if 1
# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
#else
weak_alias (__mempcpy, mempcpy)
#endif
#include "memmove.S"
#if defined SHARED
# include <shlib-compat.h>
# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
/* Use __memmove_sse2_unaligned to support overlapping addresses. */
compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
# endif
#endif


@@ -0,0 +1,559 @@
/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
Copyright (C) 2016-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
/* memmove/memcpy/mempcpy is implemented as:
1. Use overlapping load and store to avoid branch.
2. Load all sources into registers and store them together to avoid
possible address overlap between source and destination.
3. If size is 8 * VEC_SIZE or less, load all sources into registers
and store them together.
4. If address of destination > address of source, backward copy
4 * VEC_SIZE at a time with unaligned load and aligned store.
Load the first 4 * VEC and last VEC before the loop and store
them after the loop to support overlapping addresses.
5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
load and aligned store. Load the last 4 * VEC and first VEC
before the loop and store them after the loop to support
overlapping addresses.
6. If size >= __x86_shared_non_temporal_threshold and there is no
overlap between destination and source, use non-temporal store
instead of aligned store. */
#include "sysdep.h"
#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
# else
# define VZEROUPPER
# endif
#endif
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif
/* Assume 64-byte prefetch size. */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
PREFETCH ((offset)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
PREFETCH ((offset)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
# error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
#ifndef SECTION
# error SECTION is not defined!
#endif
.section SECTION(.text),"ax",@progbits
#if defined SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
mov %RDI_LP, %RAX_LP
add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
movq %rdi, %rax
L(start):
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
#if !defined USE_MULTIARCH
L(last_2x_vec):
#endif
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
VZEROUPPER
#if !defined USE_MULTIARCH
L(nop):
#endif
ret
#if defined USE_MULTIARCH
END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)
/* Only used to measure performance of REP MOVSB. */
ENTRY (__mempcpy_erms)
mov %RDI_LP, %RAX_LP
/* Skip zero length. */
test %RDX_LP, %RDX_LP
jz 2f
add %RDX_LP, %RAX_LP
jmp L(start_movsb)
END (__mempcpy_erms)
ENTRY (__memmove_chk_erms)
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)
ENTRY (__memmove_erms)
movq %rdi, %rax
/* Skip zero length. */
test %RDX_LP, %RDX_LP
jz 2f
L(start_movsb):
mov %RDX_LP, %RCX_LP
cmp %RSI_LP, %RDI_LP
jb 1f
/* Source == destination is less common. */
je 2f
lea (%rsi,%rcx), %RDX_LP
cmp %RDX_LP, %RDI_LP
jb L(movsb_backward)
1:
rep movsb
2:
ret
L(movsb_backward):
leaq -1(%rdi,%rcx), %rdi
leaq -1(%rsi,%rcx), %rsi
std
rep movsb
cld
ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
mov %RDI_LP, %RAX_LP
add %RDX_LP, %RAX_LP
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
movq %rdi, %rax
L(start_erms):
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
L(last_2x_vec):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %VEC(0)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
VZEROUPPER
ret
L(movsb):
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
jae L(more_8x_vec)
cmpq %rsi, %rdi
jb 1f
/* Source == destination is less common. */
je L(nop)
leaq (%rsi,%rdx), %r9
cmpq %r9, %rdi
/* Avoid slow backward REP MOVSB. */
jb L(more_8x_vec_backward)
1:
mov %RDX_LP, %RCX_LP
rep movsb
L(nop):
ret
#endif
L(less_vec):
/* Less than 1 VEC. */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
cmpb $32, %dl
jae L(between_32_63)
#endif
#if VEC_SIZE > 16
cmpb $16, %dl
jae L(between_16_31)
#endif
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
jae L(between_4_7)
cmpb $1, %dl
ja L(between_2_3)
jb 1f
movzbl (%rsi), %ecx
movb %cl, (%rdi)
1:
ret
#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
vmovdqu (%rsi), %ymm0
vmovdqu -32(%rsi,%rdx), %ymm1
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -32(%rdi,%rdx)
VZEROUPPER
ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi,%rdx), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -16(%rdi,%rdx)
ret
#endif
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
movq -8(%rsi,%rdx), %rcx
movq (%rsi), %rsi
movq %rcx, -8(%rdi,%rdx)
movq %rsi, (%rdi)
ret
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl -4(%rsi,%rdx), %ecx
movl (%rsi), %esi
movl %ecx, -4(%rdi,%rdx)
movl %esi, (%rdi)
ret
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
movzwl -2(%rsi,%rdx), %ecx
movzwl (%rsi), %esi
movw %cx, -2(%rdi,%rdx)
movw %si, (%rdi)
ret
#if defined USE_MULTIARCH
L(movsb_more_2x_vec):
cmp $REP_MOSB_THRESHOLD, %RDX_LP
ja L(movsb)
#endif
L(more_2x_vec):
/* More than 2 * VEC and there may be overlap between destination
and source. */
cmpq $(VEC_SIZE * 8), %rdx
ja L(more_8x_vec)
cmpq $(VEC_SIZE * 4), %rdx
jb L(last_4x_vec)
/* Copy from 4 * VEC to 8 * VEC, inclusively. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
VZEROUPPER
ret
L(last_4x_vec):
/* Copy from 2 * VEC to 4 * VEC. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
VZEROUPPER
ret
L(more_8x_vec):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
/* Source == destination is less common. */
je L(nop)
/* Load the first VEC and last 4 * VEC to support overlapping
addresses. */
VMOVU (%rsi), %VEC(4)
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
/* Save start and stop of the destination buffer. */
movq %rdi, %r11
leaq -VEC_SIZE(%rdi, %rdx), %rcx
/* Align destination for aligned stores in the loop. Compute
how much destination is misaligned. */
movq %rdi, %r8
andq $(VEC_SIZE - 1), %r8
/* Get the negative of offset for alignment. */
subq $VEC_SIZE, %r8
/* Adjust source. */
subq %r8, %rsi
/* Adjust destination which should be aligned now. */
subq %r8, %rdi
/* Adjust length. */
addq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
/* Check non-temporal store threshold. */
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
ja L(large_forward)
#endif
L(loop_4x_vec_forward):
/* Copy 4 * VEC a time forward. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
addq $(VEC_SIZE * 4), %rsi
subq $(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
addq $(VEC_SIZE * 4), %rdi
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_4x_vec_forward)
/* Store the last 4 * VEC. */
VMOVU %VEC(5), (%rcx)
VMOVU %VEC(6), -VEC_SIZE(%rcx)
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
VZEROUPPER
ret
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
addresses. */
VMOVU (%rsi), %VEC(4)
VMOVU VEC_SIZE(%rsi), %VEC(5)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
/* Save stop of the destination buffer. */
leaq -VEC_SIZE(%rdi, %rdx), %r11
/* Align destination end for aligned stores in the loop. Compute
how much destination end is misaligned. */
leaq -VEC_SIZE(%rsi, %rdx), %rcx
movq %r11, %r9
movq %r11, %r8
andq $(VEC_SIZE - 1), %r8
/* Adjust source. */
subq %r8, %rcx
/* Adjust the end of destination which should be aligned now. */
subq %r8, %r9
/* Adjust length. */
subq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
/* Check non-temporal store threshold. */
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
ja L(large_backward)
#endif
L(loop_4x_vec_backward):
/* Copy 4 * VEC a time backward. */
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
subq $(VEC_SIZE * 4), %rcx
subq $(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%r9)
VMOVA %VEC(1), -VEC_SIZE(%r9)
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
subq $(VEC_SIZE * 4), %r9
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_4x_vec_backward)
/* Store the first 4 * VEC. */
VMOVU %VEC(4), (%rdi)
VMOVU %VEC(5), VEC_SIZE(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
VZEROUPPER
ret
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
L(large_forward):
/* Don't use non-temporal store if there is overlap between
destination and source since destination may be in cache
when source is loaded. */
leaq (%rdi, %rdx), %r10
cmpq %r10, %rsi
jb L(loop_4x_vec_forward)
L(loop_large_forward):
/* Copy 4 * VEC a time forward with non-temporal stores. */
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
addq $PREFETCHED_LOAD_SIZE, %rsi
subq $PREFETCHED_LOAD_SIZE, %rdx
VMOVNT %VEC(0), (%rdi)
VMOVNT %VEC(1), VEC_SIZE(%rdi)
VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
addq $PREFETCHED_LOAD_SIZE, %rdi
cmpq $PREFETCHED_LOAD_SIZE, %rdx
ja L(loop_large_forward)
sfence
/* Store the last 4 * VEC. */
VMOVU %VEC(5), (%rcx)
VMOVU %VEC(6), -VEC_SIZE(%rcx)
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
VZEROUPPER
ret
L(large_backward):
/* Don't use non-temporal store if there is overlap between
destination and source since destination may be in cache
when source is loaded. */
leaq (%rcx, %rdx), %r10
cmpq %r10, %r9
jb L(loop_4x_vec_backward)
L(loop_large_backward):
/* Copy 4 * VEC a time backward with non-temporal stores. */
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
subq $PREFETCHED_LOAD_SIZE, %rcx
subq $PREFETCHED_LOAD_SIZE, %rdx
VMOVNT %VEC(0), (%r9)
VMOVNT %VEC(1), -VEC_SIZE(%r9)
VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
subq $PREFETCHED_LOAD_SIZE, %r9
cmpq $PREFETCHED_LOAD_SIZE, %rdx
ja L(loop_large_backward)
sfence
/* Store the first 4 * VEC. */
VMOVU %VEC(4), (%rdi)
VMOVU %VEC(5), VEC_SIZE(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
VZEROUPPER
ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
#if 1
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
# ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
# endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
MEMCPY_SYMBOL (__memcpy, unaligned))
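
The strategy comment at the top of this file is the heart of the whole family: small sizes are handled branch-free by letting the head and tail copies overlap, and all sources are loaded into registers before any store so that overlapping buffers still copy correctly. Items 1 and 2 of that comment in scalar C++, for sizes in [8, 16] (an illustrative sketch, not code from the PR):

    #include <cstring>
    #include <cstdint>
    #include <cstddef>

    // Branch-free copy of 8..16 bytes: one 8-byte load/store at each end.
    // The two ranges overlap whenever size < 16, which is harmless because
    // both loads complete before either store happens.
    static void copy_8_to_16(void * dst, const void * src, size_t size)
    {
        uint64_t head;
        uint64_t tail;
        std::memcpy(&head, src, 8);
        std::memcpy(&tail, static_cast<const char *>(src) + size - 8, 8);
        std::memcpy(dst, &head, 8);
        std::memcpy(static_cast<char *>(dst) + size - 8, &tail, 8);
    }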


@@ -0,0 +1,71 @@
/* Optimized memmove for x86-64.
Copyright (C) 2016-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include "sysdep.h"
#define VEC_SIZE 16
#define VEC(i) xmm##i
#define PREFETCHNT prefetchnta
#define VMOVNT movntdq
/* Use movups and movaps for smaller code sizes. */
#define VMOVU movups
#define VMOVA movaps
#define SECTION(p) p
#ifdef USE_MULTIARCH
# if 0
# define MEMCPY_SYMBOL(p,s) memcpy
# endif
#else
# if defined SHARED
# define MEMCPY_SYMBOL(p,s) __memcpy
# else
# define MEMCPY_SYMBOL(p,s) memcpy
# endif
#endif
#if !defined USE_MULTIARCH
# define MEMPCPY_SYMBOL(p,s) __mempcpy
#endif
#ifndef MEMMOVE_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) p
# define MEMMOVE_SYMBOL(p,s) memmove
#endif
#include "memmove-vec-unaligned-erms.S"
#ifndef USE_MULTIARCH
libc_hidden_builtin_def (memmove)
# if defined SHARED && IS_IN (libc)
strong_alias (memmove, __memcpy)
libc_hidden_ver (memmove, memcpy)
# endif
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)
# if defined SHARED && IS_IN (libc)
# undef memcpy
# include <shlib-compat.h>
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
# endif
# endif
#endif


@@ -0,0 +1,131 @@
#pragma once
/* Assembler macros for x86-64.
Copyright (C) 2001-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _X86_64_SYSDEP_H
#define _X86_64_SYSDEP_H 1
#include "sysdep_x86.h"
#ifdef __ASSEMBLER__
/* Syntactic details of assembler. */
/* This macro is for setting proper CFI with DW_CFA_expression describing
the register as saved relative to %rsp instead of relative to the CFA.
Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset
from %rsp. */
#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \
0x77, off & 0x7F | 0x80, off >> 7
/* If compiled for profiling, call `mcount' at the start of each function. */
#ifdef PROF
/* The mcount code relies on a normal frame pointer being on the stack
to locate our caller, so push one just for its benefit. */
#define CALL_MCOUNT \
pushq %rbp; \
cfi_adjust_cfa_offset(8); \
movq %rsp, %rbp; \
cfi_def_cfa_register(%rbp); \
call JUMPTARGET(mcount); \
popq %rbp; \
cfi_def_cfa(rsp,8);
#else
#define CALL_MCOUNT /* Do nothing. */
#endif
#define PSEUDO(name, syscall_name, args) \
lose: \
jmp JUMPTARGET(syscall_error) \
.globl syscall_error; \
ENTRY (name) \
DO_CALL (syscall_name, args); \
jb lose
#undef JUMPTARGET
#ifdef SHARED
# ifdef BIND_NOW
# define JUMPTARGET(name) *name##@GOTPCREL(%rip)
# else
# define JUMPTARGET(name) name##@PLT
# endif
#else
/* For static archives, branch to target directly. */
# define JUMPTARGET(name) name
#endif
/* Long and pointer size in bytes. */
#define LP_SIZE 8
/* Instruction to operate on long and pointer. */
#define LP_OP(insn) insn##q
/* Assembler address directive. */
#define ASM_ADDR .quad
/* Registers to hold long and pointer. */
#define RAX_LP rax
#define RBP_LP rbp
#define RBX_LP rbx
#define RCX_LP rcx
#define RDI_LP rdi
#define RDX_LP rdx
#define RSI_LP rsi
#define RSP_LP rsp
#define R8_LP r8
#define R9_LP r9
#define R10_LP r10
#define R11_LP r11
#define R12_LP r12
#define R13_LP r13
#define R14_LP r14
#define R15_LP r15
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */
#define LP_SIZE "8"
/* Instruction to operate on long and pointer. */
#define LP_OP(insn) #insn "q"
/* Assembler address directive. */
#define ASM_ADDR ".quad"
/* Registers to hold long and pointer. */
#define RAX_LP "rax"
#define RBP_LP "rbp"
#define RBX_LP "rbx"
#define RCX_LP "rcx"
#define RDI_LP "rdi"
#define RDX_LP "rdx"
#define RSI_LP "rsi"
#define RSP_LP "rsp"
#define R8_LP "r8"
#define R9_LP "r9"
#define R10_LP "r10"
#define R11_LP "r11"
#define R12_LP "r12"
#define R13_LP "r13"
#define R14_LP "r14"
#define R15_LP "r15"
#endif /* __ASSEMBLER__ */
#endif /* _X86_64_SYSDEP_H */
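
The cfi_offset_rel_rsp macro above is where the dwarf2.h constants from earlier in this diff are spent: 0x10 is DW_CFA_expression, 0x13 is DW_OP_drop (discarding the CFA that the expression evaluator pushes implicitly), and 0x77 is DW_OP_breg7, i.e. %rsp plus a sleb128 offset. A worked decode for hypothetical arguments regn = 3, off = 144:

    #include <cstdint>

    // Bytes emitted by .cfi_escape 0x10, 3, 0x4, 0x13, 0x77, 0x90, 0x01
    // (regn = 3 and off = 144 are made-up example values).
    constexpr uint8_t cfi_escape_bytes[] =
    {
        0x10,   // DW_CFA_expression
        0x03,   // ULEB128 register number (regn)
        0x04,   // ULEB128 expression length: four bytes follow
        0x13,   // DW_OP_drop
        0x77,   // DW_OP_breg7: push %rsp + sleb128 offset
        0x90,   // 144 & 0x7F | 0x80: low seven bits, continuation bit set
        0x01,   // 144 >> 7: remaining bits, no continuation
    };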


@@ -0,0 +1,115 @@
#pragma once
/* Generic asm macros used on many machines.
Copyright (C) 1991-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define C_SYMBOL_NAME(name) name
#define HIDDEN_JUMPTARGET(name) 0x0
#define SHARED_CACHE_SIZE_HALF (1024*1024)
#define DATA_CACHE_SIZE_HALF (1024*32/2)
#define DATA_CACHE_SIZE (1024*32)
#define SHARED_NON_TEMPORAL_THRESHOLD (1024*1024*4)
#define REP_MOSB_THRESHOLD 1024
#define USE_MULTIARCH
#define ASM_LINE_SEP ;
#define strong_alias(original, alias) \
.globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \
C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original)
#ifndef C_LABEL
/* Define a macro we can use to construct the asm name for a C symbol. */
# define C_LABEL(name) name##:
#endif
#ifdef __ASSEMBLER__
/* Mark the end of function named SYM. This is used on some platforms
to generate correct debugging information. */
# ifndef END
# define END(sym)
# endif
# ifndef JUMPTARGET
# define JUMPTARGET(sym) sym
# endif
#endif
/* Macros to generate eh_frame unwind information. */
#ifdef __ASSEMBLER__
# define cfi_startproc .cfi_startproc
# define cfi_endproc .cfi_endproc
# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off
# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg
# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
# define cfi_offset(reg, off) .cfi_offset reg, off
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
# define cfi_register(r1, r2) .cfi_register r1, r2
# define cfi_return_column(reg) .cfi_return_column reg
# define cfi_restore(reg) .cfi_restore reg
# define cfi_same_value(reg) .cfi_same_value reg
# define cfi_undefined(reg) .cfi_undefined reg
# define cfi_remember_state .cfi_remember_state
# define cfi_restore_state .cfi_restore_state
# define cfi_window_save .cfi_window_save
# define cfi_personality(enc, exp) .cfi_personality enc, exp
# define cfi_lsda(enc, exp) .cfi_lsda enc, exp
#else /* ! ASSEMBLER */
# define CFI_STRINGIFY(Name) CFI_STRINGIFY2 (Name)
# define CFI_STRINGIFY2(Name) #Name
# define CFI_STARTPROC ".cfi_startproc"
# define CFI_ENDPROC ".cfi_endproc"
# define CFI_DEF_CFA(reg, off) \
".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_DEF_CFA_REGISTER(reg) \
".cfi_def_cfa_register " CFI_STRINGIFY(reg)
# define CFI_DEF_CFA_OFFSET(off) \
".cfi_def_cfa_offset " CFI_STRINGIFY(off)
# define CFI_ADJUST_CFA_OFFSET(off) \
".cfi_adjust_cfa_offset " CFI_STRINGIFY(off)
# define CFI_OFFSET(reg, off) \
".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_REL_OFFSET(reg, off) \
".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_REGISTER(r1, r2) \
".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2)
# define CFI_RETURN_COLUMN(reg) \
".cfi_return_column " CFI_STRINGIFY(reg)
# define CFI_RESTORE(reg) \
".cfi_restore " CFI_STRINGIFY(reg)
# define CFI_UNDEFINED(reg) \
".cfi_undefined " CFI_STRINGIFY(reg)
# define CFI_REMEMBER_STATE \
".cfi_remember_state"
# define CFI_RESTORE_STATE \
".cfi_restore_state"
# define CFI_WINDOW_SAVE \
".cfi_window_save"
# define CFI_PERSONALITY(enc, exp) \
".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp)
# define CFI_LSDA(enc, exp) \
".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp)
#endif
#include "dwarf2.h"


@@ -0,0 +1,115 @@
#pragma once
/* Assembler macros for x86.
Copyright (C) 2017-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _X86_SYSDEP_H
#define _X86_SYSDEP_H 1
#include "sysdep_generic.h"
/* __CET__ is defined by GCC with Control-Flow Protection values:
enum cf_protection_level
{
CF_NONE = 0,
CF_BRANCH = 1 << 0,
CF_RETURN = 1 << 1,
CF_FULL = CF_BRANCH | CF_RETURN,
CF_SET = 1 << 2
};
*/
/* Set if CF_BRANCH (IBT) is enabled. */
#define X86_FEATURE_1_IBT (1U << 0)
/* Set if CF_RETURN (SHSTK) is enabled. */
#define X86_FEATURE_1_SHSTK (1U << 1)
#ifdef __CET__
# define CET_ENABLED 1
# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT)
# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK)
#else
# define CET_ENABLED 0
# define IBT_ENABLED 0
# define SHSTK_ENABLED 0
#endif
/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need
space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be
aligned to 16 bytes for fxsave and 64 bytes for xsave. */
#define STATE_SAVE_OFFSET (8 * 7 + 8)
/* Save SSE, AVX, AVX512, mask and bound registers. */
#define STATE_SAVE_MASK \
((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
#ifdef __ASSEMBLER__
/* Syntactic details of assembler. */
#ifdef _CET_ENDBR
# define _CET_NOTRACK notrack
#else
# define _CET_ENDBR
# define _CET_NOTRACK
#endif
/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */
#define ALIGNARG(log2) 1<<log2
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
/* Define an entry point visible from C. */
#define ENTRY(name) \
.globl C_SYMBOL_NAME(name); \
.type C_SYMBOL_NAME(name),@function; \
.align ALIGNARG(4); \
C_LABEL(name) \
cfi_startproc; \
_CET_ENDBR; \
CALL_MCOUNT
#undef END
#define END(name) \
cfi_endproc; \
ASM_SIZE_DIRECTIVE(name)
#define ENTRY_CHK(name) ENTRY (name)
#define END_CHK(name) END (name)
/* Since C identifiers are not normally prefixed with an underscore
on this system, the asm identifier `syscall_error' intrudes on the
C name space. Make sure we use an innocuous name. */
#define syscall_error __syscall_error
#define mcount _mcount
#undef PSEUDO_END
#define PSEUDO_END(name) \
END (name)
/* Local label name for asm code. */
#ifndef L
/* ELF-like local names start with `.L'. */
# define L(name) .L##name
#endif
#define atom_text_section .section ".text.atom", "ax"
#endif /* __ASSEMBLER__ */
#endif /* _X86_SYSDEP_H */


@@ -1,5 +1,6 @@
 #include <memory>
 #include <cstddef>
+#include <stdexcept>
 #include <string>
 #include <random>
 #include <iostream>
@@ -14,15 +15,11 @@
 #include <Common/Stopwatch.h>

 #pragma GCC diagnostic ignored "-Wold-style-cast"
-#pragma GCC diagnostic ignored "-Wcast-align"
-#pragma GCC diagnostic ignored "-Wcast-qual"
-
-#include "FastMemcpy.h"
-//#include "FastMemcpy_Avx.h"

 #include <emmintrin.h>
 #include <immintrin.h>

+#include <boost/program_options.hpp>

 template <typename F, typename MemcpyImpl>
 void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl)
@@ -36,6 +33,9 @@ void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl)
         dst += bytes_to_copy;
         src += bytes_to_copy;
         size -= bytes_to_copy;
+
+        /// Execute at least one SSE instruction as a penalty after running AVX code.
+        __asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7");
     }
 }
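
The pxor penalty deserves a word: on Intel CPUs, executing a legacy SSE instruction while the upper halves of the ymm registers are dirty (AVX code that did not end with vzeroupper) triggers a costly state transition, and a loop that only ever runs one AVX variant would never pay it. Touching xmm7 after every copy makes each AVX-based variant bear that cost, as it would in mixed real-world code. For comparison, the conventional way to end an AVX region (illustrative, not part of the benchmark):

    #include <immintrin.h>

    // Zero the upper 128 bits of every ymm register so that subsequent
    // legacy SSE instructions do not incur a transition penalty.
    inline void leave_avx_region()
    {
        _mm256_zeroupper();
    }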
@@ -47,7 +47,7 @@ size_t generatorUniform(RNG & rng) { return rng() % N; };

 template <typename F, typename MemcpyImpl>
-void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl)
+uint64_t test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl, const char * name)
 {
     Stopwatch watch;
@@ -76,15 +76,15 @@ void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl)
     for (auto & thread : threads)
         thread.join();

-    double elapsed_ns = watch.elapsed();
+    uint64_t elapsed_ns = watch.elapsed();

     /// Validation
-    size_t sum = 0;
     for (size_t i = 0; i < size; ++i)
-        sum += dst[i];
+        if (dst[i] != uint8_t(i))
+            throw std::logic_error("Incorrect result");

-    std::cerr << std::fixed << std::setprecision(3)
-        << "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n";
+    std::cout << name;
+    return elapsed_ns;
 }
@ -101,9 +101,30 @@ static void * memcpy_erms(void * dst, const void * src, size_t size)
return dst;
}
static void * memcpy_trivial(void * __restrict dst_, const void * __restrict src_, size_t size)
{
char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);
void * ret = dst;
while (size > 0)
{
*dst = *src;
++dst;
++src;
--size;
}
return ret;
}
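/// Editorial caveat: at higher optimization levels GCC's
/// -ftree-loop-distribute-patterns can recognize this byte loop and
/// replace it with a call to memcpy itself, so the "trivial" baseline is
/// only meaningful when that transformation is disabled for this file.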
extern "C" void * memcpy_jart(void * dst, const void * src, size_t size);
extern "C" void MemCpy(void * dst, const void * src, size_t size);
void * memcpy_fast_sse(void * dst, const void * src, size_t size);
void * memcpy_fast_avx(void * dst, const void * src, size_t size);
void * memcpy_tiny(void * dst, const void * src, size_t size);
static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size)
{
@ -329,7 +350,7 @@ void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t *
if (padding > 0)
{
__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
_mm256_storeu_si256((__m256i*)dst, head);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
@ -539,70 +560,141 @@ tail:
return ret;
}
extern "C" void * __memcpy_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_sse2_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_ssse3(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_ssse3_back(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_no_vzeroupper(void * __restrict destination, const void * __restrict source, size_t size);
#define VARIANT(N, NAME) \
if (memcpy_variant == N) \
return test(dst, src, size, iterations, num_threads, std::forward<F>(generator), NAME, #NAME);
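/// Mechanical expansion example: VARIANT(12, memcpy_my) becomes
///
///     if (memcpy_variant == 12)
///         return test(dst, src, size, iterations, num_threads,
///                     std::forward<F>(generator), memcpy_my, "memcpy_my");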
template <typename F>
void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
uint64_t dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
{
memcpy_type memcpy_libc = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));
memcpy_type memcpy_libc_old = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));
if (memcpy_variant == 1)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy);
if (memcpy_variant == 2)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_libc);
if (memcpy_variant == 3)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_erms);
if (memcpy_variant == 4)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), MemCpy);
if (memcpy_variant == 5)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2);
if (memcpy_variant == 6)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled2);
if (memcpy_variant == 7)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled4);
if (memcpy_variant == 8)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled8);
// if (memcpy_variant == 9)
// test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_fast_avx);
if (memcpy_variant == 10)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_my);
VARIANT(1, memcpy)
VARIANT(2, memcpy_trivial)
VARIANT(3, memcpy_libc_old)
VARIANT(4, memcpy_erms)
VARIANT(5, MemCpy)
VARIANT(6, memcpySSE2)
VARIANT(7, memcpySSE2Unrolled2)
VARIANT(8, memcpySSE2Unrolled4)
VARIANT(9, memcpySSE2Unrolled8)
VARIANT(10, memcpy_fast_sse)
VARIANT(11, memcpy_fast_avx)
VARIANT(12, memcpy_my)
VARIANT(21, __memcpy_erms)
VARIANT(22, __memcpy_sse2_unaligned)
VARIANT(23, __memcpy_ssse3)
VARIANT(24, __memcpy_ssse3_back)
VARIANT(25, __memcpy_avx_unaligned)
VARIANT(26, __memcpy_avx_unaligned_erms)
VARIANT(27, __memcpy_avx512_unaligned)
VARIANT(28, __memcpy_avx512_unaligned_erms)
VARIANT(29, __memcpy_avx512_no_vzeroupper)
return 0;
}
void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
uint64_t dispatchVariants(
size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
{
if (generator_variant == 1)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
if (generator_variant == 2)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
if (generator_variant == 3)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
if (generator_variant == 4)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
if (generator_variant == 5)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);
return 0;
}
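/// The --distribution values 1..5 thus select generatorUniform<N> with
/// N = 16, 256, 4096, 65536, 1048576: every chunk copies rng() % N bytes,
/// so distribution 1 stresses the tiny-copy paths (0..15 bytes) while
/// distribution 5 approaches large streaming copies.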
int main(int argc, char ** argv)
{
size_t size = 1000000000;
if (argc >= 2)
size = std::stoull(argv[1]);
boost::program_options::options_description desc("Allowed options");
desc.add_options()("help,h", "produce help message")
("size", boost::program_options::value<size_t>()->default_value(1000000), "Bytes to copy on every iteration")
("iterations", boost::program_options::value<size_t>(), "Number of iterations")
("threads", boost::program_options::value<size_t>()->default_value(1), "Number of copying threads")
("distribution", boost::program_options::value<size_t>()->default_value(4), "Distribution of chunk sizes to perform copy")
("variant", boost::program_options::value<size_t>(), "Variant of memcpy implementation")
("tsv", "Print result in tab-separated format")
;
size_t iterations = 10;
if (argc >= 3)
iterations = std::stoull(argv[2]);
boost::program_options::variables_map options;
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options);
size_t num_threads = 1;
if (argc >= 4)
num_threads = std::stoull(argv[3]);
if (options.count("help") || !options.count("variant"))
{
std::cout << R"(Usage:
size_t memcpy_variant = 1;
if (argc >= 5)
memcpy_variant = std::stoull(argv[4]);
for size in 4096 16384 50000 65536 100000 1000000 10000000 100000000; do
for threads in 1 2 4 $(($(nproc) / 2)) $(nproc); do
for distribution in 1 2 3 4 5; do
for variant in {1..12} {21..29}; do
for i in {1..10}; do
./memcpy-bench --tsv --size $size --variant $variant --threads $threads --distribution $distribution;
done;
done;
done;
done;
done | tee result.tsv
size_t generator_variant = 1;
if (argc >= 6)
generator_variant = std::stoull(argv[5]);
clickhouse-local --structure '
name String,
size UInt64,
iterations UInt64,
threads UInt16,
generator UInt8,
memcpy UInt8,
elapsed UInt64
' --query "
SELECT
size, name,
avg(1000 * elapsed / size / iterations) AS s,
count() AS c
FROM table
GROUP BY size, name
ORDER BY size ASC, s DESC
" --output-format PrettyCompact < result.tsv
)" << std::endl;
std::cout << desc << std::endl;
return 1;
}
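/// A note on the query in the help text above: elapsed is in nanoseconds,
/// so 1000 * elapsed / size / iterations is the average cost in
/// picoseconds per byte copied; a smaller s means a faster variant.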
size_t size = options["size"].as<size_t>();
size_t num_threads = options["threads"].as<size_t>();
size_t memcpy_variant = options["variant"].as<size_t>();
size_t generator_variant = options["distribution"].as<size_t>();
size_t iterations;
if (options.count("iterations"))
{
iterations = options["iterations"].as<size_t>();
}
else
{
iterations = 10000000000ULL / size;
if (generator_variant == 1)
iterations /= 10;
}
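/// Worked default: with --size 1000000 and no --iterations this yields
/// 10000000000 / 1000000 = 10000 passes, further reduced to 1000 for
/// --distribution 1, where tiny chunks make each pass much slower.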
std::unique_ptr<uint8_t[]> src(new uint8_t[size]);
std::unique_ptr<uint8_t[]> dst(new uint8_t[size]);
@ -614,7 +706,25 @@ int main(int argc, char ** argv)
/// Fill dst to avoid page faults.
memset(dst.get(), 0, size);
dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);
uint64_t elapsed_ns = dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);
std::cout << std::fixed << std::setprecision(3);
if (options.count("tsv"))
{
std::cout
<< '\t' << size
<< '\t' << iterations
<< '\t' << num_threads
<< '\t' << generator_variant
<< '\t' << memcpy_variant
<< '\t' << elapsed_ns
<< '\n';
}
else
{
std::cout << ": processed in " << (elapsed_ns / 1e9) << " sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec\n";
}
return 0;
}