Mirror of https://github.com/ClickHouse/ClickHouse.git (synced 2024-11-10 09:32:06 +00:00)

Merge pull request #21715 from ClickHouse/memcpy-bench
Add more variants for memcpy benchmark

Commit: 6a455fe71d
utils/memcpy-bench/CMakeLists.txt
@@ -1,5 +1,22 @@
 enable_language(ASM)

-add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S)
-#target_compile_options(memcpy-bench PRIVATE -mavx)
-target_link_libraries(memcpy-bench PRIVATE dbms)
+add_executable (memcpy-bench
+    memcpy-bench.cpp
+    FastMemcpy.cpp
+    FastMemcpy_Avx.cpp
+    memcpy_jart.S
+    glibc/memcpy-ssse3.S
+    glibc/memcpy-ssse3-back.S
+    glibc/memmove-sse2-unaligned-erms.S
+    glibc/memmove-avx-unaligned-erms.S
+    glibc/memmove-avx512-unaligned-erms.S
+    glibc/memmove-avx512-no-vzeroupper.S
+    )
+
+add_compile_options(memcpy-bench PRIVATE -fno-tree-loop-distribute-patterns)
+
+set_source_files_properties(FastMemcpy.cpp PROPERTIES COMPILE_FLAGS "-Wno-old-style-cast")
+set_source_files_properties(FastMemcpy_Avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -Wno-old-style-cast -Wno-cast-qual -Wno-cast-align")
+
+target_link_libraries(memcpy-bench PRIVATE dbms boost::program_options)
utils/memcpy-bench/FastMemcpy.cpp (new file, 1 line)
@@ -0,0 +1 @@
#include "FastMemcpy.h"
@@ -93,7 +93,7 @@ static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src)
 /// Attribute is used to avoid an error with undefined behaviour sanitizer
 /// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
 /// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
-__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
+__attribute__((__no_sanitize__("undefined"))) inline void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
 {
     unsigned char *dd = ((unsigned char*)dst) + size;
     const unsigned char *ss = ((const unsigned char*)src) + size;
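The end pointers dd and ss above (destination and source plus size) are what let memcpy_tiny copy fixed-size chunks anchored at the end of the buffer, so the chunks for odd sizes may overlap instead of requiring a byte loop. A minimal C++ sketch of that overlapping-chunk idea (illustrative only; copy_8_to_16 is a made-up name, and this is not the benchmark's code, which dispatches on every size up to 128 bytes):

    #include <cstring>
    #include <cstdint>

    // Copy n bytes, 8 <= n <= 16, with two 8-byte moves; the second move may
    // overlap the first, which removes the need for a per-size branch chain.
    static inline void * copy_8_to_16(void * dst, const void * src, std::size_t n)
    {
        std::uint64_t head;
        std::uint64_t tail;
        std::memcpy(&head, src, 8);
        std::memcpy(&tail, static_cast<const char *>(src) + n - 8, 8);
        std::memcpy(dst, &head, 8);
        std::memcpy(static_cast<char *>(dst) + n - 8, &tail, 8);
        return dst;
    }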
utils/memcpy-bench/FastMemcpy_Avx.cpp (new file, 1 line)
@@ -0,0 +1 @@
#include "FastMemcpy_Avx.h"
utils/memcpy-bench/glibc/asm-syntax.h (new file, 26 lines)
@@ -0,0 +1,26 @@
#pragma once

/* Definitions for x86 syntax variations.
   Copyright (C) 1992-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library. Its master source is NOT part of
   the C library, however. The master source lives in the GNU MP Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#undef ALIGN
#define ALIGN(log) .align 1<<log

#undef L
#define L(body) .L##body
utils/memcpy-bench/glibc/dwarf2.h (new file, 592 lines)
@@ -0,0 +1,592 @@
|
||||
#pragma once
|
||||
|
||||
/* Declarations and definitions of codes relating to the DWARF2 symbolic
|
||||
debugging information format.
|
||||
Copyright (C) 1992-2020 Free Software Foundation, Inc.
|
||||
Contributed by Gary Funck (gary@intrepid.com). Derived from the
|
||||
DWARF 1 implementation written by Ron Guilmette (rfg@monkeys.com).
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifndef _DWARF2_H
|
||||
#define _DWARF2_H 1
|
||||
|
||||
/* This file is derived from the DWARF specification (a public document)
|
||||
Revision 2.0.0 (July 27, 1993) developed by the UNIX International
|
||||
Programming Languages Special Interest Group (UI/PLSIG) and distributed
|
||||
by UNIX International. Copies of this specification are available from
|
||||
UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. */
|
||||
|
||||
/* This file is shared between GCC and GDB, and should not contain
|
||||
prototypes. */
|
||||
|
||||
#ifndef __ASSEMBLER__
|
||||
/* Tag names and codes. */
|
||||
|
||||
enum dwarf_tag
|
||||
{
|
||||
DW_TAG_padding = 0x00,
|
||||
DW_TAG_array_type = 0x01,
|
||||
DW_TAG_class_type = 0x02,
|
||||
DW_TAG_entry_point = 0x03,
|
||||
DW_TAG_enumeration_type = 0x04,
|
||||
DW_TAG_formal_parameter = 0x05,
|
||||
DW_TAG_imported_declaration = 0x08,
|
||||
DW_TAG_label = 0x0a,
|
||||
DW_TAG_lexical_block = 0x0b,
|
||||
DW_TAG_member = 0x0d,
|
||||
DW_TAG_pointer_type = 0x0f,
|
||||
DW_TAG_reference_type = 0x10,
|
||||
DW_TAG_compile_unit = 0x11,
|
||||
DW_TAG_string_type = 0x12,
|
||||
DW_TAG_structure_type = 0x13,
|
||||
DW_TAG_subroutine_type = 0x15,
|
||||
DW_TAG_typedef = 0x16,
|
||||
DW_TAG_union_type = 0x17,
|
||||
DW_TAG_unspecified_parameters = 0x18,
|
||||
DW_TAG_variant = 0x19,
|
||||
DW_TAG_common_block = 0x1a,
|
||||
DW_TAG_common_inclusion = 0x1b,
|
||||
DW_TAG_inheritance = 0x1c,
|
||||
DW_TAG_inlined_subroutine = 0x1d,
|
||||
DW_TAG_module = 0x1e,
|
||||
DW_TAG_ptr_to_member_type = 0x1f,
|
||||
DW_TAG_set_type = 0x20,
|
||||
DW_TAG_subrange_type = 0x21,
|
||||
DW_TAG_with_stmt = 0x22,
|
||||
DW_TAG_access_declaration = 0x23,
|
||||
DW_TAG_base_type = 0x24,
|
||||
DW_TAG_catch_block = 0x25,
|
||||
DW_TAG_const_type = 0x26,
|
||||
DW_TAG_constant = 0x27,
|
||||
DW_TAG_enumerator = 0x28,
|
||||
DW_TAG_file_type = 0x29,
|
||||
DW_TAG_friend = 0x2a,
|
||||
DW_TAG_namelist = 0x2b,
|
||||
DW_TAG_namelist_item = 0x2c,
|
||||
DW_TAG_packed_type = 0x2d,
|
||||
DW_TAG_subprogram = 0x2e,
|
||||
DW_TAG_template_type_param = 0x2f,
|
||||
DW_TAG_template_value_param = 0x30,
|
||||
DW_TAG_thrown_type = 0x31,
|
||||
DW_TAG_try_block = 0x32,
|
||||
DW_TAG_variant_part = 0x33,
|
||||
DW_TAG_variable = 0x34,
|
||||
DW_TAG_volatile_type = 0x35,
|
||||
/* SGI/MIPS Extensions */
|
||||
DW_TAG_MIPS_loop = 0x4081,
|
||||
/* GNU extensions */
|
||||
DW_TAG_format_label = 0x4101, /* for FORTRAN 77 and Fortran 90 */
|
||||
DW_TAG_function_template = 0x4102, /* for C++ */
|
||||
DW_TAG_class_template = 0x4103, /* for C++ */
|
||||
DW_TAG_GNU_BINCL = 0x4104,
|
||||
DW_TAG_GNU_EINCL = 0x4105
|
||||
};
|
||||
|
||||
#define DW_TAG_lo_user 0x4080
|
||||
#define DW_TAG_hi_user 0xffff
|
||||
|
||||
/* flag that tells whether entry has a child or not */
|
||||
#define DW_children_no 0
|
||||
#define DW_children_yes 1
|
||||
|
||||
/* Form names and codes. */
|
||||
enum dwarf_form
|
||||
{
|
||||
DW_FORM_addr = 0x01,
|
||||
DW_FORM_block2 = 0x03,
|
||||
DW_FORM_block4 = 0x04,
|
||||
DW_FORM_data2 = 0x05,
|
||||
DW_FORM_data4 = 0x06,
|
||||
DW_FORM_data8 = 0x07,
|
||||
DW_FORM_string = 0x08,
|
||||
DW_FORM_block = 0x09,
|
||||
DW_FORM_block1 = 0x0a,
|
||||
DW_FORM_data1 = 0x0b,
|
||||
DW_FORM_flag = 0x0c,
|
||||
DW_FORM_sdata = 0x0d,
|
||||
DW_FORM_strp = 0x0e,
|
||||
DW_FORM_udata = 0x0f,
|
||||
DW_FORM_ref_addr = 0x10,
|
||||
DW_FORM_ref1 = 0x11,
|
||||
DW_FORM_ref2 = 0x12,
|
||||
DW_FORM_ref4 = 0x13,
|
||||
DW_FORM_ref8 = 0x14,
|
||||
DW_FORM_ref_udata = 0x15,
|
||||
DW_FORM_indirect = 0x16
|
||||
};
|
||||
|
||||
/* Attribute names and codes. */
|
||||
|
||||
enum dwarf_attribute
|
||||
{
|
||||
DW_AT_sibling = 0x01,
|
||||
DW_AT_location = 0x02,
|
||||
DW_AT_name = 0x03,
|
||||
DW_AT_ordering = 0x09,
|
||||
DW_AT_subscr_data = 0x0a,
|
||||
DW_AT_byte_size = 0x0b,
|
||||
DW_AT_bit_offset = 0x0c,
|
||||
DW_AT_bit_size = 0x0d,
|
||||
DW_AT_element_list = 0x0f,
|
||||
DW_AT_stmt_list = 0x10,
|
||||
DW_AT_low_pc = 0x11,
|
||||
DW_AT_high_pc = 0x12,
|
||||
DW_AT_language = 0x13,
|
||||
DW_AT_member = 0x14,
|
||||
DW_AT_discr = 0x15,
|
||||
DW_AT_discr_value = 0x16,
|
||||
DW_AT_visibility = 0x17,
|
||||
DW_AT_import = 0x18,
|
||||
DW_AT_string_length = 0x19,
|
||||
DW_AT_common_reference = 0x1a,
|
||||
DW_AT_comp_dir = 0x1b,
|
||||
DW_AT_const_value = 0x1c,
|
||||
DW_AT_containing_type = 0x1d,
|
||||
DW_AT_default_value = 0x1e,
|
||||
DW_AT_inline = 0x20,
|
||||
DW_AT_is_optional = 0x21,
|
||||
DW_AT_lower_bound = 0x22,
|
||||
DW_AT_producer = 0x25,
|
||||
DW_AT_prototyped = 0x27,
|
||||
DW_AT_return_addr = 0x2a,
|
||||
DW_AT_start_scope = 0x2c,
|
||||
DW_AT_stride_size = 0x2e,
|
||||
DW_AT_upper_bound = 0x2f,
|
||||
DW_AT_abstract_origin = 0x31,
|
||||
DW_AT_accessibility = 0x32,
|
||||
DW_AT_address_class = 0x33,
|
||||
DW_AT_artificial = 0x34,
|
||||
DW_AT_base_types = 0x35,
|
||||
DW_AT_calling_convention = 0x36,
|
||||
DW_AT_count = 0x37,
|
||||
DW_AT_data_member_location = 0x38,
|
||||
DW_AT_decl_column = 0x39,
|
||||
DW_AT_decl_file = 0x3a,
|
||||
DW_AT_decl_line = 0x3b,
|
||||
DW_AT_declaration = 0x3c,
|
||||
DW_AT_discr_list = 0x3d,
|
||||
DW_AT_encoding = 0x3e,
|
||||
DW_AT_external = 0x3f,
|
||||
DW_AT_frame_base = 0x40,
|
||||
DW_AT_friend = 0x41,
|
||||
DW_AT_identifier_case = 0x42,
|
||||
DW_AT_macro_info = 0x43,
|
||||
DW_AT_namelist_items = 0x44,
|
||||
DW_AT_priority = 0x45,
|
||||
DW_AT_segment = 0x46,
|
||||
DW_AT_specification = 0x47,
|
||||
DW_AT_static_link = 0x48,
|
||||
DW_AT_type = 0x49,
|
||||
DW_AT_use_location = 0x4a,
|
||||
DW_AT_variable_parameter = 0x4b,
|
||||
DW_AT_virtuality = 0x4c,
|
||||
DW_AT_vtable_elem_location = 0x4d,
|
||||
/* SGI/MIPS Extensions */
|
||||
DW_AT_MIPS_fde = 0x2001,
|
||||
DW_AT_MIPS_loop_begin = 0x2002,
|
||||
DW_AT_MIPS_tail_loop_begin = 0x2003,
|
||||
DW_AT_MIPS_epilog_begin = 0x2004,
|
||||
DW_AT_MIPS_loop_unroll_factor = 0x2005,
|
||||
DW_AT_MIPS_software_pipeline_depth = 0x2006,
|
||||
DW_AT_MIPS_linkage_name = 0x2007,
|
||||
DW_AT_MIPS_stride = 0x2008,
|
||||
DW_AT_MIPS_abstract_name = 0x2009,
|
||||
DW_AT_MIPS_clone_origin = 0x200a,
|
||||
DW_AT_MIPS_has_inlines = 0x200b,
|
||||
/* GNU extensions. */
|
||||
DW_AT_sf_names = 0x2101,
|
||||
DW_AT_src_info = 0x2102,
|
||||
DW_AT_mac_info = 0x2103,
|
||||
DW_AT_src_coords = 0x2104,
|
||||
DW_AT_body_begin = 0x2105,
|
||||
DW_AT_body_end = 0x2106
|
||||
};
|
||||
|
||||
#define DW_AT_lo_user 0x2000 /* implementation-defined range start */
|
||||
#define DW_AT_hi_user 0x3ff0 /* implementation-defined range end */
|
||||
|
||||
/* Location atom names and codes. */
|
||||
|
||||
enum dwarf_location_atom
|
||||
{
|
||||
DW_OP_addr = 0x03,
|
||||
DW_OP_deref = 0x06,
|
||||
DW_OP_const1u = 0x08,
|
||||
DW_OP_const1s = 0x09,
|
||||
DW_OP_const2u = 0x0a,
|
||||
DW_OP_const2s = 0x0b,
|
||||
DW_OP_const4u = 0x0c,
|
||||
DW_OP_const4s = 0x0d,
|
||||
DW_OP_const8u = 0x0e,
|
||||
DW_OP_const8s = 0x0f,
|
||||
DW_OP_constu = 0x10,
|
||||
DW_OP_consts = 0x11,
|
||||
DW_OP_dup = 0x12,
|
||||
DW_OP_drop = 0x13,
|
||||
DW_OP_over = 0x14,
|
||||
DW_OP_pick = 0x15,
|
||||
DW_OP_swap = 0x16,
|
||||
DW_OP_rot = 0x17,
|
||||
DW_OP_xderef = 0x18,
|
||||
DW_OP_abs = 0x19,
|
||||
DW_OP_and = 0x1a,
|
||||
DW_OP_div = 0x1b,
|
||||
DW_OP_minus = 0x1c,
|
||||
DW_OP_mod = 0x1d,
|
||||
DW_OP_mul = 0x1e,
|
||||
DW_OP_neg = 0x1f,
|
||||
DW_OP_not = 0x20,
|
||||
DW_OP_or = 0x21,
|
||||
DW_OP_plus = 0x22,
|
||||
DW_OP_plus_uconst = 0x23,
|
||||
DW_OP_shl = 0x24,
|
||||
DW_OP_shr = 0x25,
|
||||
DW_OP_shra = 0x26,
|
||||
DW_OP_xor = 0x27,
|
||||
DW_OP_bra = 0x28,
|
||||
DW_OP_eq = 0x29,
|
||||
DW_OP_ge = 0x2a,
|
||||
DW_OP_gt = 0x2b,
|
||||
DW_OP_le = 0x2c,
|
||||
DW_OP_lt = 0x2d,
|
||||
DW_OP_ne = 0x2e,
|
||||
DW_OP_skip = 0x2f,
|
||||
DW_OP_lit0 = 0x30,
|
||||
DW_OP_lit1 = 0x31,
|
||||
DW_OP_lit2 = 0x32,
|
||||
DW_OP_lit3 = 0x33,
|
||||
DW_OP_lit4 = 0x34,
|
||||
DW_OP_lit5 = 0x35,
|
||||
DW_OP_lit6 = 0x36,
|
||||
DW_OP_lit7 = 0x37,
|
||||
DW_OP_lit8 = 0x38,
|
||||
DW_OP_lit9 = 0x39,
|
||||
DW_OP_lit10 = 0x3a,
|
||||
DW_OP_lit11 = 0x3b,
|
||||
DW_OP_lit12 = 0x3c,
|
||||
DW_OP_lit13 = 0x3d,
|
||||
DW_OP_lit14 = 0x3e,
|
||||
DW_OP_lit15 = 0x3f,
|
||||
DW_OP_lit16 = 0x40,
|
||||
DW_OP_lit17 = 0x41,
|
||||
DW_OP_lit18 = 0x42,
|
||||
DW_OP_lit19 = 0x43,
|
||||
DW_OP_lit20 = 0x44,
|
||||
DW_OP_lit21 = 0x45,
|
||||
DW_OP_lit22 = 0x46,
|
||||
DW_OP_lit23 = 0x47,
|
||||
DW_OP_lit24 = 0x48,
|
||||
DW_OP_lit25 = 0x49,
|
||||
DW_OP_lit26 = 0x4a,
|
||||
DW_OP_lit27 = 0x4b,
|
||||
DW_OP_lit28 = 0x4c,
|
||||
DW_OP_lit29 = 0x4d,
|
||||
DW_OP_lit30 = 0x4e,
|
||||
DW_OP_lit31 = 0x4f,
|
||||
DW_OP_reg0 = 0x50,
|
||||
DW_OP_reg1 = 0x51,
|
||||
DW_OP_reg2 = 0x52,
|
||||
DW_OP_reg3 = 0x53,
|
||||
DW_OP_reg4 = 0x54,
|
||||
DW_OP_reg5 = 0x55,
|
||||
DW_OP_reg6 = 0x56,
|
||||
DW_OP_reg7 = 0x57,
|
||||
DW_OP_reg8 = 0x58,
|
||||
DW_OP_reg9 = 0x59,
|
||||
DW_OP_reg10 = 0x5a,
|
||||
DW_OP_reg11 = 0x5b,
|
||||
DW_OP_reg12 = 0x5c,
|
||||
DW_OP_reg13 = 0x5d,
|
||||
DW_OP_reg14 = 0x5e,
|
||||
DW_OP_reg15 = 0x5f,
|
||||
DW_OP_reg16 = 0x60,
|
||||
DW_OP_reg17 = 0x61,
|
||||
DW_OP_reg18 = 0x62,
|
||||
DW_OP_reg19 = 0x63,
|
||||
DW_OP_reg20 = 0x64,
|
||||
DW_OP_reg21 = 0x65,
|
||||
DW_OP_reg22 = 0x66,
|
||||
DW_OP_reg23 = 0x67,
|
||||
DW_OP_reg24 = 0x68,
|
||||
DW_OP_reg25 = 0x69,
|
||||
DW_OP_reg26 = 0x6a,
|
||||
DW_OP_reg27 = 0x6b,
|
||||
DW_OP_reg28 = 0x6c,
|
||||
DW_OP_reg29 = 0x6d,
|
||||
DW_OP_reg30 = 0x6e,
|
||||
DW_OP_reg31 = 0x6f,
|
||||
DW_OP_breg0 = 0x70,
|
||||
DW_OP_breg1 = 0x71,
|
||||
DW_OP_breg2 = 0x72,
|
||||
DW_OP_breg3 = 0x73,
|
||||
DW_OP_breg4 = 0x74,
|
||||
DW_OP_breg5 = 0x75,
|
||||
DW_OP_breg6 = 0x76,
|
||||
DW_OP_breg7 = 0x77,
|
||||
DW_OP_breg8 = 0x78,
|
||||
DW_OP_breg9 = 0x79,
|
||||
DW_OP_breg10 = 0x7a,
|
||||
DW_OP_breg11 = 0x7b,
|
||||
DW_OP_breg12 = 0x7c,
|
||||
DW_OP_breg13 = 0x7d,
|
||||
DW_OP_breg14 = 0x7e,
|
||||
DW_OP_breg15 = 0x7f,
|
||||
DW_OP_breg16 = 0x80,
|
||||
DW_OP_breg17 = 0x81,
|
||||
DW_OP_breg18 = 0x82,
|
||||
DW_OP_breg19 = 0x83,
|
||||
DW_OP_breg20 = 0x84,
|
||||
DW_OP_breg21 = 0x85,
|
||||
DW_OP_breg22 = 0x86,
|
||||
DW_OP_breg23 = 0x87,
|
||||
DW_OP_breg24 = 0x88,
|
||||
DW_OP_breg25 = 0x89,
|
||||
DW_OP_breg26 = 0x8a,
|
||||
DW_OP_breg27 = 0x8b,
|
||||
DW_OP_breg28 = 0x8c,
|
||||
DW_OP_breg29 = 0x8d,
|
||||
DW_OP_breg30 = 0x8e,
|
||||
DW_OP_breg31 = 0x8f,
|
||||
DW_OP_regx = 0x90,
|
||||
DW_OP_fbreg = 0x91,
|
||||
DW_OP_bregx = 0x92,
|
||||
DW_OP_piece = 0x93,
|
||||
DW_OP_deref_size = 0x94,
|
||||
DW_OP_xderef_size = 0x95,
|
||||
DW_OP_nop = 0x96
|
||||
};
|
||||
|
||||
#define DW_OP_lo_user 0x80 /* implementation-defined range start */
|
||||
#define DW_OP_hi_user 0xff /* implementation-defined range end */
|
||||
|
||||
/* Type encodings. */
|
||||
|
||||
enum dwarf_type
|
||||
{
|
||||
DW_ATE_void = 0x0,
|
||||
DW_ATE_address = 0x1,
|
||||
DW_ATE_boolean = 0x2,
|
||||
DW_ATE_complex_float = 0x3,
|
||||
DW_ATE_float = 0x4,
|
||||
DW_ATE_signed = 0x5,
|
||||
DW_ATE_signed_char = 0x6,
|
||||
DW_ATE_unsigned = 0x7,
|
||||
DW_ATE_unsigned_char = 0x8
|
||||
};
|
||||
|
||||
#define DW_ATE_lo_user 0x80
|
||||
#define DW_ATE_hi_user 0xff
|
||||
|
||||
/* Array ordering names and codes. */
|
||||
enum dwarf_array_dim_ordering
|
||||
{
|
||||
DW_ORD_row_major = 0,
|
||||
DW_ORD_col_major = 1
|
||||
};
|
||||
|
||||
/* access attribute */
|
||||
enum dwarf_access_attribute
|
||||
{
|
||||
DW_ACCESS_public = 1,
|
||||
DW_ACCESS_protected = 2,
|
||||
DW_ACCESS_private = 3
|
||||
};
|
||||
|
||||
/* visibility */
|
||||
enum dwarf_visibility_attribute
|
||||
{
|
||||
DW_VIS_local = 1,
|
||||
DW_VIS_exported = 2,
|
||||
DW_VIS_qualified = 3
|
||||
};
|
||||
|
||||
/* virtuality */
|
||||
enum dwarf_virtuality_attribute
|
||||
{
|
||||
DW_VIRTUALITY_none = 0,
|
||||
DW_VIRTUALITY_virtual = 1,
|
||||
DW_VIRTUALITY_pure_virtual = 2
|
||||
};
|
||||
|
||||
/* case sensitivity */
|
||||
enum dwarf_id_case
|
||||
{
|
||||
DW_ID_case_sensitive = 0,
|
||||
DW_ID_up_case = 1,
|
||||
DW_ID_down_case = 2,
|
||||
DW_ID_case_insensitive = 3
|
||||
};
|
||||
|
||||
/* calling convention */
|
||||
enum dwarf_calling_convention
|
||||
{
|
||||
DW_CC_normal = 0x1,
|
||||
DW_CC_program = 0x2,
|
||||
DW_CC_nocall = 0x3
|
||||
};
|
||||
|
||||
#define DW_CC_lo_user 0x40
|
||||
#define DW_CC_hi_user 0xff
|
||||
|
||||
/* inline attribute */
|
||||
enum dwarf_inline_attribute
|
||||
{
|
||||
DW_INL_not_inlined = 0,
|
||||
DW_INL_inlined = 1,
|
||||
DW_INL_declared_not_inlined = 2,
|
||||
DW_INL_declared_inlined = 3
|
||||
};
|
||||
|
||||
/* discriminant lists */
|
||||
enum dwarf_discrim_list
|
||||
{
|
||||
DW_DSC_label = 0,
|
||||
DW_DSC_range = 1
|
||||
};
|
||||
|
||||
/* line number opcodes */
|
||||
enum dwarf_line_number_ops
|
||||
{
|
||||
DW_LNS_extended_op = 0,
|
||||
DW_LNS_copy = 1,
|
||||
DW_LNS_advance_pc = 2,
|
||||
DW_LNS_advance_line = 3,
|
||||
DW_LNS_set_file = 4,
|
||||
DW_LNS_set_column = 5,
|
||||
DW_LNS_negate_stmt = 6,
|
||||
DW_LNS_set_basic_block = 7,
|
||||
DW_LNS_const_add_pc = 8,
|
||||
DW_LNS_fixed_advance_pc = 9
|
||||
};
|
||||
|
||||
/* line number extended opcodes */
|
||||
enum dwarf_line_number_x_ops
|
||||
{
|
||||
DW_LNE_end_sequence = 1,
|
||||
DW_LNE_set_address = 2,
|
||||
DW_LNE_define_file = 3
|
||||
};
|
||||
|
||||
/* call frame information */
|
||||
enum dwarf_call_frame_info
|
||||
{
|
||||
DW_CFA_advance_loc = 0x40,
|
||||
DW_CFA_offset = 0x80,
|
||||
DW_CFA_restore = 0xc0,
|
||||
DW_CFA_nop = 0x00,
|
||||
DW_CFA_set_loc = 0x01,
|
||||
DW_CFA_advance_loc1 = 0x02,
|
||||
DW_CFA_advance_loc2 = 0x03,
|
||||
DW_CFA_advance_loc4 = 0x04,
|
||||
DW_CFA_offset_extended = 0x05,
|
||||
DW_CFA_restore_extended = 0x06,
|
||||
DW_CFA_undefined = 0x07,
|
||||
DW_CFA_same_value = 0x08,
|
||||
DW_CFA_register = 0x09,
|
||||
DW_CFA_remember_state = 0x0a,
|
||||
DW_CFA_restore_state = 0x0b,
|
||||
DW_CFA_def_cfa = 0x0c,
|
||||
DW_CFA_def_cfa_register = 0x0d,
|
||||
DW_CFA_def_cfa_offset = 0x0e,
|
||||
DW_CFA_def_cfa_expression = 0x0f,
|
||||
DW_CFA_expression = 0x10,
|
||||
/* Dwarf 2.1 */
|
||||
DW_CFA_offset_extended_sf = 0x11,
|
||||
DW_CFA_def_cfa_sf = 0x12,
|
||||
DW_CFA_def_cfa_offset_sf = 0x13,
|
||||
|
||||
/* SGI/MIPS specific */
|
||||
DW_CFA_MIPS_advance_loc8 = 0x1d,
|
||||
|
||||
/* GNU extensions */
|
||||
DW_CFA_GNU_window_save = 0x2d,
|
||||
DW_CFA_GNU_args_size = 0x2e,
|
||||
DW_CFA_GNU_negative_offset_extended = 0x2f
|
||||
};
|
||||
|
||||
#define DW_CIE_ID 0xffffffff
|
||||
#define DW_CIE_VERSION 1
|
||||
|
||||
#define DW_CFA_extended 0
|
||||
#define DW_CFA_low_user 0x1c
|
||||
#define DW_CFA_high_user 0x3f
|
||||
|
||||
#define DW_CHILDREN_no 0x00
|
||||
#define DW_CHILDREN_yes 0x01
|
||||
|
||||
#define DW_ADDR_none 0
|
||||
|
||||
/* Source language names and codes. */
|
||||
|
||||
enum dwarf_source_language
|
||||
{
|
||||
DW_LANG_C89 = 0x0001,
|
||||
DW_LANG_C = 0x0002,
|
||||
DW_LANG_Ada83 = 0x0003,
|
||||
DW_LANG_C_plus_plus = 0x0004,
|
||||
DW_LANG_Cobol74 = 0x0005,
|
||||
DW_LANG_Cobol85 = 0x0006,
|
||||
DW_LANG_Fortran77 = 0x0007,
|
||||
DW_LANG_Fortran90 = 0x0008,
|
||||
DW_LANG_Pascal83 = 0x0009,
|
||||
DW_LANG_Modula2 = 0x000a,
|
||||
DW_LANG_Java = 0x000b,
|
||||
DW_LANG_Mips_Assembler = 0x8001
|
||||
};
|
||||
|
||||
|
||||
#define DW_LANG_lo_user 0x8000 /* implementation-defined range start */
|
||||
#define DW_LANG_hi_user 0xffff /* implementation-defined range start */
|
||||
|
||||
/* Names and codes for macro information. */
|
||||
|
||||
enum dwarf_macinfo_record_type
|
||||
{
|
||||
DW_MACINFO_define = 1,
|
||||
DW_MACINFO_undef = 2,
|
||||
DW_MACINFO_start_file = 3,
|
||||
DW_MACINFO_end_file = 4,
|
||||
DW_MACINFO_vendor_ext = 255
|
||||
};
|
||||
|
||||
#endif /* !ASSEMBLER */
|
||||
|
||||
/* @@@ For use with GNU frame unwind information. */
|
||||
|
||||
#define DW_EH_PE_absptr 0x00
|
||||
#define DW_EH_PE_omit 0xff
|
||||
|
||||
#define DW_EH_PE_uleb128 0x01
|
||||
#define DW_EH_PE_udata2 0x02
|
||||
#define DW_EH_PE_udata4 0x03
|
||||
#define DW_EH_PE_udata8 0x04
|
||||
#define DW_EH_PE_sleb128 0x09
|
||||
#define DW_EH_PE_sdata2 0x0A
|
||||
#define DW_EH_PE_sdata4 0x0B
|
||||
#define DW_EH_PE_sdata8 0x0C
|
||||
#define DW_EH_PE_signed 0x08
|
||||
|
||||
#define DW_EH_PE_pcrel 0x10
|
||||
#define DW_EH_PE_textrel 0x20
|
||||
#define DW_EH_PE_datarel 0x30
|
||||
#define DW_EH_PE_funcrel 0x40
|
||||
#define DW_EH_PE_aligned 0x50
|
||||
|
||||
#define DW_EH_PE_indirect 0x80
|
||||
|
||||
#endif /* dwarf2.h */
|
utils/memcpy-bench/glibc/memcpy-ssse3-back.S (new file, 3182 lines; diff suppressed because it is too large)
utils/memcpy-bench/glibc/memcpy-ssse3.S (new file, 3152 lines; diff suppressed because it is too large)
utils/memcpy-bench/glibc/memmove-avx-unaligned-erms.S (new file, 12 lines)
@@ -0,0 +1,12 @@
#if 1
# define VEC_SIZE 32
# define VEC(i) ymm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa

# define SECTION(p) p##.avx
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s

# include "memmove-vec-unaligned-erms.S"
#endif
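This wrapper, like the avx512 and sse2 counterparts further down, only pins down the vector width, the register family and the symbol suffix, then pulls in the shared body from memmove-vec-unaligned-erms.S. A rough C++ analogy of that one-body/many-instantiations pattern (illustrative only; copy_vec_chunks is a made-up name and nothing below comes from glibc):

    #include <cstddef>
    #include <cstring>

    // Simplified stand-in for the shared implementation: copy whole VecSize
    // chunks, then a tail chunk anchored at the end that may overlap the last
    // full chunk (assumes n >= VecSize).
    template <std::size_t VecSize>
    void copy_vec_chunks(char * dst, const char * src, std::size_t n)
    {
        std::size_t i = 0;
        for (; i + VecSize <= n; i += VecSize)
            std::memcpy(dst + i, src + i, VecSize);
        if (i < n)
            std::memcpy(dst + n - VecSize, src + n - VecSize, VecSize);
    }

    // Explicit instantiations, analogous to the _sse2_/_avx_/_avx512_ symbol variants.
    template void copy_vec_chunks<16>(char *, const char *, std::size_t);
    template void copy_vec_chunks<32>(char *, const char *, std::size_t);
    template void copy_vec_chunks<64>(char *, const char *, std::size_t);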
utils/memcpy-bench/glibc/memmove-avx512-no-vzeroupper.S (new file, 419 lines)
@@ -0,0 +1,419 @@
|
||||
/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
|
||||
Copyright (C) 2016-2020 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sysdep.h"
|
||||
|
||||
#if 1
|
||||
|
||||
# include "asm-syntax.h"
|
||||
|
||||
.section .text.avx512,"ax",@progbits
|
||||
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__mempcpy_chk_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__mempcpy_avx512_no_vzeroupper)
|
||||
mov %RDI_LP, %RAX_LP
|
||||
add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (__mempcpy_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__memmove_chk_avx512_no_vzeroupper)
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__memmove_chk_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__memmove_avx512_no_vzeroupper)
|
||||
mov %RDI_LP, %RAX_LP
|
||||
# ifdef USE_AS_MEMPCPY
|
||||
add %RDX_LP, %RAX_LP
|
||||
# endif
|
||||
L(start):
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
mov %edx, %edx
|
||||
# endif
|
||||
lea (%rsi, %rdx), %rcx
|
||||
lea (%rdi, %rdx), %r9
|
||||
cmp $512, %rdx
|
||||
ja L(512bytesormore)
|
||||
|
||||
L(check):
|
||||
cmp $16, %rdx
|
||||
jbe L(less_16bytes)
|
||||
cmp $256, %rdx
|
||||
jb L(less_256bytes)
|
||||
vmovups (%rsi), %zmm0
|
||||
vmovups 0x40(%rsi), %zmm1
|
||||
vmovups 0x80(%rsi), %zmm2
|
||||
vmovups 0xC0(%rsi), %zmm3
|
||||
vmovups -0x100(%rcx), %zmm4
|
||||
vmovups -0xC0(%rcx), %zmm5
|
||||
vmovups -0x80(%rcx), %zmm6
|
||||
vmovups -0x40(%rcx), %zmm7
|
||||
vmovups %zmm0, (%rdi)
|
||||
vmovups %zmm1, 0x40(%rdi)
|
||||
vmovups %zmm2, 0x80(%rdi)
|
||||
vmovups %zmm3, 0xC0(%rdi)
|
||||
vmovups %zmm4, -0x100(%r9)
|
||||
vmovups %zmm5, -0xC0(%r9)
|
||||
vmovups %zmm6, -0x80(%r9)
|
||||
vmovups %zmm7, -0x40(%r9)
|
||||
ret
|
||||
|
||||
L(less_256bytes):
|
||||
cmp $128, %dl
|
||||
jb L(less_128bytes)
|
||||
vmovups (%rsi), %zmm0
|
||||
vmovups 0x40(%rsi), %zmm1
|
||||
vmovups -0x80(%rcx), %zmm2
|
||||
vmovups -0x40(%rcx), %zmm3
|
||||
vmovups %zmm0, (%rdi)
|
||||
vmovups %zmm1, 0x40(%rdi)
|
||||
vmovups %zmm2, -0x80(%r9)
|
||||
vmovups %zmm3, -0x40(%r9)
|
||||
ret
|
||||
|
||||
L(less_128bytes):
|
||||
cmp $64, %dl
|
||||
jb L(less_64bytes)
|
||||
vmovdqu (%rsi), %ymm0
|
||||
vmovdqu 0x20(%rsi), %ymm1
|
||||
vmovdqu -0x40(%rcx), %ymm2
|
||||
vmovdqu -0x20(%rcx), %ymm3
|
||||
vmovdqu %ymm0, (%rdi)
|
||||
vmovdqu %ymm1, 0x20(%rdi)
|
||||
vmovdqu %ymm2, -0x40(%r9)
|
||||
vmovdqu %ymm3, -0x20(%r9)
|
||||
ret
|
||||
|
||||
L(less_64bytes):
|
||||
cmp $32, %dl
|
||||
jb L(less_32bytes)
|
||||
vmovdqu (%rsi), %ymm0
|
||||
vmovdqu -0x20(%rcx), %ymm1
|
||||
vmovdqu %ymm0, (%rdi)
|
||||
vmovdqu %ymm1, -0x20(%r9)
|
||||
ret
|
||||
|
||||
L(less_32bytes):
|
||||
vmovdqu (%rsi), %xmm0
|
||||
vmovdqu -0x10(%rcx), %xmm1
|
||||
vmovdqu %xmm0, (%rdi)
|
||||
vmovdqu %xmm1, -0x10(%r9)
|
||||
ret
|
||||
|
||||
L(less_16bytes):
|
||||
cmp $8, %dl
|
||||
jb L(less_8bytes)
|
||||
movq (%rsi), %rsi
|
||||
movq -0x8(%rcx), %rcx
|
||||
movq %rsi, (%rdi)
|
||||
movq %rcx, -0x8(%r9)
|
||||
ret
|
||||
|
||||
L(less_8bytes):
|
||||
cmp $4, %dl
|
||||
jb L(less_4bytes)
|
||||
mov (%rsi), %esi
|
||||
mov -0x4(%rcx), %ecx
|
||||
mov %esi, (%rdi)
|
||||
mov %ecx, -0x4(%r9)
|
||||
ret
|
||||
|
||||
L(less_4bytes):
|
||||
cmp $2, %dl
|
||||
jb L(less_2bytes)
|
||||
mov (%rsi), %si
|
||||
mov -0x2(%rcx), %cx
|
||||
mov %si, (%rdi)
|
||||
mov %cx, -0x2(%r9)
|
||||
ret
|
||||
|
||||
L(less_2bytes):
|
||||
cmp $1, %dl
|
||||
jb L(less_1bytes)
|
||||
mov (%rsi), %cl
|
||||
mov %cl, (%rdi)
|
||||
L(less_1bytes):
|
||||
ret
|
||||
|
||||
L(512bytesormore):
|
||||
# ifdef SHARED_CACHE_SIZE_HALF
|
||||
mov $SHARED_CACHE_SIZE_HALF, %r8
|
||||
# else
|
||||
mov __x86_shared_cache_size_half(%rip), %r8
|
||||
# endif
|
||||
cmp %r8, %rdx
|
||||
jae L(preloop_large)
|
||||
cmp $1024, %rdx
|
||||
ja L(1024bytesormore)
|
||||
prefetcht1 (%rsi)
|
||||
prefetcht1 0x40(%rsi)
|
||||
prefetcht1 0x80(%rsi)
|
||||
prefetcht1 0xC0(%rsi)
|
||||
prefetcht1 0x100(%rsi)
|
||||
prefetcht1 0x140(%rsi)
|
||||
prefetcht1 0x180(%rsi)
|
||||
prefetcht1 0x1C0(%rsi)
|
||||
prefetcht1 -0x200(%rcx)
|
||||
prefetcht1 -0x1C0(%rcx)
|
||||
prefetcht1 -0x180(%rcx)
|
||||
prefetcht1 -0x140(%rcx)
|
||||
prefetcht1 -0x100(%rcx)
|
||||
prefetcht1 -0xC0(%rcx)
|
||||
prefetcht1 -0x80(%rcx)
|
||||
prefetcht1 -0x40(%rcx)
|
||||
vmovups (%rsi), %zmm0
|
||||
vmovups 0x40(%rsi), %zmm1
|
||||
vmovups 0x80(%rsi), %zmm2
|
||||
vmovups 0xC0(%rsi), %zmm3
|
||||
vmovups 0x100(%rsi), %zmm4
|
||||
vmovups 0x140(%rsi), %zmm5
|
||||
vmovups 0x180(%rsi), %zmm6
|
||||
vmovups 0x1C0(%rsi), %zmm7
|
||||
vmovups -0x200(%rcx), %zmm8
|
||||
vmovups -0x1C0(%rcx), %zmm9
|
||||
vmovups -0x180(%rcx), %zmm10
|
||||
vmovups -0x140(%rcx), %zmm11
|
||||
vmovups -0x100(%rcx), %zmm12
|
||||
vmovups -0xC0(%rcx), %zmm13
|
||||
vmovups -0x80(%rcx), %zmm14
|
||||
vmovups -0x40(%rcx), %zmm15
|
||||
vmovups %zmm0, (%rdi)
|
||||
vmovups %zmm1, 0x40(%rdi)
|
||||
vmovups %zmm2, 0x80(%rdi)
|
||||
vmovups %zmm3, 0xC0(%rdi)
|
||||
vmovups %zmm4, 0x100(%rdi)
|
||||
vmovups %zmm5, 0x140(%rdi)
|
||||
vmovups %zmm6, 0x180(%rdi)
|
||||
vmovups %zmm7, 0x1C0(%rdi)
|
||||
vmovups %zmm8, -0x200(%r9)
|
||||
vmovups %zmm9, -0x1C0(%r9)
|
||||
vmovups %zmm10, -0x180(%r9)
|
||||
vmovups %zmm11, -0x140(%r9)
|
||||
vmovups %zmm12, -0x100(%r9)
|
||||
vmovups %zmm13, -0xC0(%r9)
|
||||
vmovups %zmm14, -0x80(%r9)
|
||||
vmovups %zmm15, -0x40(%r9)
|
||||
ret
|
||||
|
||||
L(1024bytesormore):
|
||||
cmp %rsi, %rdi
|
||||
ja L(1024bytesormore_bkw)
|
||||
sub $512, %r9
|
||||
vmovups -0x200(%rcx), %zmm8
|
||||
vmovups -0x1C0(%rcx), %zmm9
|
||||
vmovups -0x180(%rcx), %zmm10
|
||||
vmovups -0x140(%rcx), %zmm11
|
||||
vmovups -0x100(%rcx), %zmm12
|
||||
vmovups -0xC0(%rcx), %zmm13
|
||||
vmovups -0x80(%rcx), %zmm14
|
||||
vmovups -0x40(%rcx), %zmm15
|
||||
prefetcht1 (%rsi)
|
||||
prefetcht1 0x40(%rsi)
|
||||
prefetcht1 0x80(%rsi)
|
||||
prefetcht1 0xC0(%rsi)
|
||||
prefetcht1 0x100(%rsi)
|
||||
prefetcht1 0x140(%rsi)
|
||||
prefetcht1 0x180(%rsi)
|
||||
prefetcht1 0x1C0(%rsi)
|
||||
|
||||
/* Loop with unaligned memory access. */
|
||||
L(gobble_512bytes_loop):
|
||||
vmovups (%rsi), %zmm0
|
||||
vmovups 0x40(%rsi), %zmm1
|
||||
vmovups 0x80(%rsi), %zmm2
|
||||
vmovups 0xC0(%rsi), %zmm3
|
||||
vmovups 0x100(%rsi), %zmm4
|
||||
vmovups 0x140(%rsi), %zmm5
|
||||
vmovups 0x180(%rsi), %zmm6
|
||||
vmovups 0x1C0(%rsi), %zmm7
|
||||
add $512, %rsi
|
||||
prefetcht1 (%rsi)
|
||||
prefetcht1 0x40(%rsi)
|
||||
prefetcht1 0x80(%rsi)
|
||||
prefetcht1 0xC0(%rsi)
|
||||
prefetcht1 0x100(%rsi)
|
||||
prefetcht1 0x140(%rsi)
|
||||
prefetcht1 0x180(%rsi)
|
||||
prefetcht1 0x1C0(%rsi)
|
||||
vmovups %zmm0, (%rdi)
|
||||
vmovups %zmm1, 0x40(%rdi)
|
||||
vmovups %zmm2, 0x80(%rdi)
|
||||
vmovups %zmm3, 0xC0(%rdi)
|
||||
vmovups %zmm4, 0x100(%rdi)
|
||||
vmovups %zmm5, 0x140(%rdi)
|
||||
vmovups %zmm6, 0x180(%rdi)
|
||||
vmovups %zmm7, 0x1C0(%rdi)
|
||||
add $512, %rdi
|
||||
cmp %r9, %rdi
|
||||
jb L(gobble_512bytes_loop)
|
||||
vmovups %zmm8, (%r9)
|
||||
vmovups %zmm9, 0x40(%r9)
|
||||
vmovups %zmm10, 0x80(%r9)
|
||||
vmovups %zmm11, 0xC0(%r9)
|
||||
vmovups %zmm12, 0x100(%r9)
|
||||
vmovups %zmm13, 0x140(%r9)
|
||||
vmovups %zmm14, 0x180(%r9)
|
||||
vmovups %zmm15, 0x1C0(%r9)
|
||||
ret
|
||||
|
||||
L(1024bytesormore_bkw):
|
||||
add $512, %rdi
|
||||
vmovups 0x1C0(%rsi), %zmm8
|
||||
vmovups 0x180(%rsi), %zmm9
|
||||
vmovups 0x140(%rsi), %zmm10
|
||||
vmovups 0x100(%rsi), %zmm11
|
||||
vmovups 0xC0(%rsi), %zmm12
|
||||
vmovups 0x80(%rsi), %zmm13
|
||||
vmovups 0x40(%rsi), %zmm14
|
||||
vmovups (%rsi), %zmm15
|
||||
prefetcht1 -0x40(%rcx)
|
||||
prefetcht1 -0x80(%rcx)
|
||||
prefetcht1 -0xC0(%rcx)
|
||||
prefetcht1 -0x100(%rcx)
|
||||
prefetcht1 -0x140(%rcx)
|
||||
prefetcht1 -0x180(%rcx)
|
||||
prefetcht1 -0x1C0(%rcx)
|
||||
prefetcht1 -0x200(%rcx)
|
||||
|
||||
/* Backward loop with unaligned memory access. */
|
||||
L(gobble_512bytes_loop_bkw):
|
||||
vmovups -0x40(%rcx), %zmm0
|
||||
vmovups -0x80(%rcx), %zmm1
|
||||
vmovups -0xC0(%rcx), %zmm2
|
||||
vmovups -0x100(%rcx), %zmm3
|
||||
vmovups -0x140(%rcx), %zmm4
|
||||
vmovups -0x180(%rcx), %zmm5
|
||||
vmovups -0x1C0(%rcx), %zmm6
|
||||
vmovups -0x200(%rcx), %zmm7
|
||||
sub $512, %rcx
|
||||
prefetcht1 -0x40(%rcx)
|
||||
prefetcht1 -0x80(%rcx)
|
||||
prefetcht1 -0xC0(%rcx)
|
||||
prefetcht1 -0x100(%rcx)
|
||||
prefetcht1 -0x140(%rcx)
|
||||
prefetcht1 -0x180(%rcx)
|
||||
prefetcht1 -0x1C0(%rcx)
|
||||
prefetcht1 -0x200(%rcx)
|
||||
vmovups %zmm0, -0x40(%r9)
|
||||
vmovups %zmm1, -0x80(%r9)
|
||||
vmovups %zmm2, -0xC0(%r9)
|
||||
vmovups %zmm3, -0x100(%r9)
|
||||
vmovups %zmm4, -0x140(%r9)
|
||||
vmovups %zmm5, -0x180(%r9)
|
||||
vmovups %zmm6, -0x1C0(%r9)
|
||||
vmovups %zmm7, -0x200(%r9)
|
||||
sub $512, %r9
|
||||
cmp %rdi, %r9
|
||||
ja L(gobble_512bytes_loop_bkw)
|
||||
vmovups %zmm8, -0x40(%rdi)
|
||||
vmovups %zmm9, -0x80(%rdi)
|
||||
vmovups %zmm10, -0xC0(%rdi)
|
||||
vmovups %zmm11, -0x100(%rdi)
|
||||
vmovups %zmm12, -0x140(%rdi)
|
||||
vmovups %zmm13, -0x180(%rdi)
|
||||
vmovups %zmm14, -0x1C0(%rdi)
|
||||
vmovups %zmm15, -0x200(%rdi)
|
||||
ret
|
||||
|
||||
L(preloop_large):
|
||||
cmp %rsi, %rdi
|
||||
ja L(preloop_large_bkw)
|
||||
vmovups (%rsi), %zmm4
|
||||
vmovups 0x40(%rsi), %zmm5
|
||||
|
||||
mov %rdi, %r11
|
||||
/* Align destination for access with non-temporal stores in the loop. */
|
||||
mov %rdi, %r8
|
||||
and $-0x80, %rdi
|
||||
add $0x80, %rdi
|
||||
sub %rdi, %r8
|
||||
sub %r8, %rsi
|
||||
add %r8, %rdx
|
||||
L(gobble_256bytes_nt_loop):
|
||||
prefetcht1 0x200(%rsi)
|
||||
prefetcht1 0x240(%rsi)
|
||||
prefetcht1 0x280(%rsi)
|
||||
prefetcht1 0x2C0(%rsi)
|
||||
prefetcht1 0x300(%rsi)
|
||||
prefetcht1 0x340(%rsi)
|
||||
prefetcht1 0x380(%rsi)
|
||||
prefetcht1 0x3C0(%rsi)
|
||||
vmovdqu64 (%rsi), %zmm0
|
||||
vmovdqu64 0x40(%rsi), %zmm1
|
||||
vmovdqu64 0x80(%rsi), %zmm2
|
||||
vmovdqu64 0xC0(%rsi), %zmm3
|
||||
vmovntdq %zmm0, (%rdi)
|
||||
vmovntdq %zmm1, 0x40(%rdi)
|
||||
vmovntdq %zmm2, 0x80(%rdi)
|
||||
vmovntdq %zmm3, 0xC0(%rdi)
|
||||
sub $256, %rdx
|
||||
add $256, %rsi
|
||||
add $256, %rdi
|
||||
cmp $256, %rdx
|
||||
ja L(gobble_256bytes_nt_loop)
|
||||
sfence
|
||||
vmovups %zmm4, (%r11)
|
||||
vmovups %zmm5, 0x40(%r11)
|
||||
jmp L(check)
|
||||
|
||||
L(preloop_large_bkw):
|
||||
vmovups -0x80(%rcx), %zmm4
|
||||
vmovups -0x40(%rcx), %zmm5
|
||||
|
||||
/* Align end of destination for access with non-temporal stores. */
|
||||
mov %r9, %r8
|
||||
and $-0x80, %r9
|
||||
sub %r9, %r8
|
||||
sub %r8, %rcx
|
||||
sub %r8, %rdx
|
||||
add %r9, %r8
|
||||
L(gobble_256bytes_nt_loop_bkw):
|
||||
prefetcht1 -0x400(%rcx)
|
||||
prefetcht1 -0x3C0(%rcx)
|
||||
prefetcht1 -0x380(%rcx)
|
||||
prefetcht1 -0x340(%rcx)
|
||||
prefetcht1 -0x300(%rcx)
|
||||
prefetcht1 -0x2C0(%rcx)
|
||||
prefetcht1 -0x280(%rcx)
|
||||
prefetcht1 -0x240(%rcx)
|
||||
vmovdqu64 -0x100(%rcx), %zmm0
|
||||
vmovdqu64 -0xC0(%rcx), %zmm1
|
||||
vmovdqu64 -0x80(%rcx), %zmm2
|
||||
vmovdqu64 -0x40(%rcx), %zmm3
|
||||
vmovntdq %zmm0, -0x100(%r9)
|
||||
vmovntdq %zmm1, -0xC0(%r9)
|
||||
vmovntdq %zmm2, -0x80(%r9)
|
||||
vmovntdq %zmm3, -0x40(%r9)
|
||||
sub $256, %rdx
|
||||
sub $256, %rcx
|
||||
sub $256, %r9
|
||||
cmp $256, %rdx
|
||||
ja L(gobble_256bytes_nt_loop_bkw)
|
||||
sfence
|
||||
vmovups %zmm4, -0x80(%r8)
|
||||
vmovups %zmm5, -0x40(%r8)
|
||||
jmp L(check)
|
||||
END (__memmove_avx512_no_vzeroupper)
|
||||
|
||||
strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
|
||||
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
|
||||
#endif
|
utils/memcpy-bench/glibc/memmove-avx512-unaligned-erms.S (new file, 12 lines)
@@ -0,0 +1,12 @@
#if 1
# define VEC_SIZE 64
# define VEC(i) zmm##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

# define SECTION(p) p##.avx512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s

# include "memmove-vec-unaligned-erms.S"
#endif
utils/memcpy-bench/glibc/memmove-sse2-unaligned-erms.S (new file, 33 lines)
@@ -0,0 +1,33 @@
/* memmove with SSE2.
   Copyright (C) 2017-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#if 1
# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
#else
weak_alias (__mempcpy, mempcpy)
#endif

#include "memmove.S"

#if defined SHARED
# include <shlib-compat.h>
# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
/* Use __memmove_sse2_unaligned to support overlapping addresses. */
compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
# endif
#endif
utils/memcpy-bench/glibc/memmove-vec-unaligned-erms.S (new file, 559 lines)
@@ -0,0 +1,559 @@
/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store. Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store. */

#include "sysdep.h"
|
||||
|
||||
#ifndef MEMCPY_SYMBOL
|
||||
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
#ifndef MEMPCPY_SYMBOL
|
||||
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
#ifndef MEMMOVE_CHK_SYMBOL
|
||||
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
#ifndef VZEROUPPER
|
||||
# if VEC_SIZE > 16
|
||||
# define VZEROUPPER vzeroupper
|
||||
# else
|
||||
# define VZEROUPPER
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifndef PREFETCH
|
||||
# define PREFETCH(addr) prefetcht0 addr
|
||||
#endif
|
||||
|
||||
/* Assume 64-byte prefetch size. */
|
||||
#ifndef PREFETCH_SIZE
|
||||
# define PREFETCH_SIZE 64
|
||||
#endif
|
||||
|
||||
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
|
||||
|
||||
#if PREFETCH_SIZE == 64
|
||||
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
|
||||
# define PREFETCH_ONE_SET(dir, base, offset) \
|
||||
PREFETCH ((offset)base)
|
||||
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
|
||||
# define PREFETCH_ONE_SET(dir, base, offset) \
|
||||
PREFETCH ((offset)base); \
|
||||
PREFETCH ((offset + dir * PREFETCH_SIZE)base)
|
||||
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
|
||||
# define PREFETCH_ONE_SET(dir, base, offset) \
|
||||
PREFETCH ((offset)base); \
|
||||
PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
|
||||
PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
|
||||
PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
|
||||
# else
|
||||
# error Unsupported PREFETCHED_LOAD_SIZE!
|
||||
# endif
|
||||
#else
|
||||
# error Unsupported PREFETCH_SIZE!
|
||||
#endif
|
||||
|
||||
#ifndef SECTION
|
||||
# error SECTION is not defined!
|
||||
#endif
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
#if defined SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
||||
#endif
|
||||
|
||||
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
||||
mov %RDI_LP, %RAX_LP
|
||||
add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
||||
|
||||
#if defined SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
#endif
|
||||
|
||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
|
||||
movq %rdi, %rax
|
||||
L(start):
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
# endif
|
||||
cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(more_2x_vec)
|
||||
#if !defined USE_MULTIARCH
|
||||
L(last_2x_vec):
|
||||
#endif
|
||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
#if !defined USE_MULTIARCH
|
||||
L(nop):
|
||||
#endif
|
||||
ret
|
||||
#if defined USE_MULTIARCH
|
||||
END (MEMMOVE_SYMBOL (__memmove, unaligned))
|
||||
|
||||
# if VEC_SIZE == 16
|
||||
ENTRY (__mempcpy_chk_erms)
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__mempcpy_chk_erms)
|
||||
|
||||
/* Only used to measure performance of REP MOVSB. */
|
||||
ENTRY (__mempcpy_erms)
|
||||
mov %RDI_LP, %RAX_LP
|
||||
/* Skip zero length. */
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz 2f
|
||||
add %RDX_LP, %RAX_LP
|
||||
jmp L(start_movsb)
|
||||
END (__mempcpy_erms)
|
||||
|
||||
ENTRY (__memmove_chk_erms)
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__memmove_chk_erms)
|
||||
|
||||
ENTRY (__memmove_erms)
|
||||
movq %rdi, %rax
|
||||
/* Skip zero length. */
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz 2f
|
||||
L(start_movsb):
|
||||
mov %RDX_LP, %RCX_LP
|
||||
cmp %RSI_LP, %RDI_LP
|
||||
jb 1f
|
||||
/* Source == destination is less common. */
|
||||
je 2f
|
||||
lea (%rsi,%rcx), %RDX_LP
|
||||
cmp %RDX_LP, %RDI_LP
|
||||
jb L(movsb_backward)
|
||||
1:
|
||||
rep movsb
|
||||
2:
|
||||
ret
|
||||
L(movsb_backward):
|
||||
leaq -1(%rdi,%rcx), %rdi
|
||||
leaq -1(%rsi,%rcx), %rsi
|
||||
std
|
||||
rep movsb
|
||||
cld
|
||||
ret
|
||||
END (__memmove_erms)
|
||||
strong_alias (__memmove_erms, __memcpy_erms)
|
||||
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
|
||||
# endif
|
||||
|
||||
# ifdef SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
||||
mov %RDI_LP, %RAX_LP
|
||||
add %RDX_LP, %RAX_LP
|
||||
jmp L(start_erms)
|
||||
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
||||
|
||||
# ifdef SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
||||
movq %rdi, %rax
|
||||
L(start_erms):
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
# endif
|
||||
cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(movsb_more_2x_vec)
|
||||
L(last_2x_vec):
|
||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
|
||||
L(return):
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
L(movsb):
|
||||
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
|
||||
jae L(more_8x_vec)
|
||||
cmpq %rsi, %rdi
|
||||
jb 1f
|
||||
/* Source == destination is less common. */
|
||||
je L(nop)
|
||||
leaq (%rsi,%rdx), %r9
|
||||
cmpq %r9, %rdi
|
||||
/* Avoid slow backward REP MOVSB. */
|
||||
jb L(more_8x_vec_backward)
|
||||
1:
|
||||
mov %RDX_LP, %RCX_LP
|
||||
rep movsb
|
||||
L(nop):
|
||||
ret
|
||||
#endif
|
||||
|
||||
L(less_vec):
|
||||
/* Less than 1 VEC. */
|
||||
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||
# error Unsupported VEC_SIZE!
|
||||
#endif
|
||||
#if VEC_SIZE > 32
|
||||
cmpb $32, %dl
|
||||
jae L(between_32_63)
|
||||
#endif
|
||||
#if VEC_SIZE > 16
|
||||
cmpb $16, %dl
|
||||
jae L(between_16_31)
|
||||
#endif
|
||||
cmpb $8, %dl
|
||||
jae L(between_8_15)
|
||||
cmpb $4, %dl
|
||||
jae L(between_4_7)
|
||||
cmpb $1, %dl
|
||||
ja L(between_2_3)
|
||||
jb 1f
|
||||
movzbl (%rsi), %ecx
|
||||
movb %cl, (%rdi)
|
||||
1:
|
||||
ret
|
||||
#if VEC_SIZE > 32
|
||||
L(between_32_63):
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
vmovdqu (%rsi), %ymm0
|
||||
vmovdqu -32(%rsi,%rdx), %ymm1
|
||||
vmovdqu %ymm0, (%rdi)
|
||||
vmovdqu %ymm1, -32(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
#endif
|
||||
#if VEC_SIZE > 16
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
L(between_16_31):
|
||||
vmovdqu (%rsi), %xmm0
|
||||
vmovdqu -16(%rsi,%rdx), %xmm1
|
||||
vmovdqu %xmm0, (%rdi)
|
||||
vmovdqu %xmm1, -16(%rdi,%rdx)
|
||||
ret
|
||||
#endif
|
||||
L(between_8_15):
|
||||
/* From 8 to 15. No branch when size == 8. */
|
||||
movq -8(%rsi,%rdx), %rcx
|
||||
movq (%rsi), %rsi
|
||||
movq %rcx, -8(%rdi,%rdx)
|
||||
movq %rsi, (%rdi)
|
||||
ret
|
||||
L(between_4_7):
|
||||
/* From 4 to 7. No branch when size == 4. */
|
||||
movl -4(%rsi,%rdx), %ecx
|
||||
movl (%rsi), %esi
|
||||
movl %ecx, -4(%rdi,%rdx)
|
||||
movl %esi, (%rdi)
|
||||
ret
|
||||
L(between_2_3):
|
||||
/* From 2 to 3. No branch when size == 2. */
|
||||
movzwl -2(%rsi,%rdx), %ecx
|
||||
movzwl (%rsi), %esi
|
||||
movw %cx, -2(%rdi,%rdx)
|
||||
movw %si, (%rdi)
|
||||
ret
|
||||
|
||||
#if defined USE_MULTIARCH
|
||||
L(movsb_more_2x_vec):
|
||||
cmp $REP_MOSB_THRESHOLD, %RDX_LP
|
||||
ja L(movsb)
|
||||
#endif
|
||||
L(more_2x_vec):
|
||||
/* More than 2 * VEC and there may be overlap between destination
|
||||
and source. */
|
||||
cmpq $(VEC_SIZE * 8), %rdx
|
||||
ja L(more_8x_vec)
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
jb L(last_4x_vec)
|
||||
/* Copy from 4 * VEC to 8 * VEC, inclusively. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
|
||||
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
|
||||
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
|
||||
VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
|
||||
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
|
||||
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
L(last_4x_vec):
|
||||
/* Copy from 2 * VEC to 4 * VEC. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
|
||||
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
|
||||
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
L(more_8x_vec):
|
||||
cmpq %rsi, %rdi
|
||||
ja L(more_8x_vec_backward)
|
||||
/* Source == destination is less common. */
|
||||
je L(nop)
|
||||
/* Load the first VEC and last 4 * VEC to support overlapping
|
||||
addresses. */
|
||||
VMOVU (%rsi), %VEC(4)
|
||||
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
|
||||
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
|
||||
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
|
||||
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
|
||||
/* Save start and stop of the destination buffer. */
|
||||
movq %rdi, %r11
|
||||
leaq -VEC_SIZE(%rdi, %rdx), %rcx
|
||||
/* Align destination for aligned stores in the loop. Compute
|
||||
how much destination is misaligned. */
|
||||
movq %rdi, %r8
|
||||
andq $(VEC_SIZE - 1), %r8
|
||||
/* Get the negative of offset for alignment. */
|
||||
subq $VEC_SIZE, %r8
|
||||
/* Adjust source. */
|
||||
subq %r8, %rsi
|
||||
/* Adjust destination which should be aligned now. */
|
||||
subq %r8, %rdi
|
||||
/* Adjust length. */
|
||||
addq %r8, %rdx
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
|
||||
/* Check non-temporal store threshold. */
|
||||
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
|
||||
ja L(large_forward)
|
||||
#endif
|
||||
L(loop_4x_vec_forward):
|
||||
/* Copy 4 * VEC a time forward. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||||
addq $(VEC_SIZE * 4), %rsi
|
||||
subq $(VEC_SIZE * 4), %rdx
|
||||
VMOVA %VEC(0), (%rdi)
|
||||
VMOVA %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
addq $(VEC_SIZE * 4), %rdi
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
ja L(loop_4x_vec_forward)
|
||||
/* Store the last 4 * VEC. */
|
||||
VMOVU %VEC(5), (%rcx)
|
||||
VMOVU %VEC(6), -VEC_SIZE(%rcx)
|
||||
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
|
||||
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
||||
/* Store the first VEC. */
|
||||
VMOVU %VEC(4), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
L(more_8x_vec_backward):
|
||||
/* Load the first 4 * VEC and last VEC to support overlapping
|
||||
addresses. */
|
||||
VMOVU (%rsi), %VEC(4)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(5)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
|
||||
/* Save stop of the destination buffer. */
|
||||
leaq -VEC_SIZE(%rdi, %rdx), %r11
|
||||
/* Align destination end for aligned stores in the loop. Compute
|
||||
how much destination end is misaligned. */
|
||||
leaq -VEC_SIZE(%rsi, %rdx), %rcx
|
||||
movq %r11, %r9
|
||||
movq %r11, %r8
|
||||
andq $(VEC_SIZE - 1), %r8
|
||||
/* Adjust source. */
|
||||
subq %r8, %rcx
|
||||
/* Adjust the end of destination which should be aligned now. */
|
||||
subq %r8, %r9
|
||||
/* Adjust length. */
|
||||
subq %r8, %rdx
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
|
||||
/* Check non-temporal store threshold. */
|
||||
cmp $SHARED_NON_TEMPORAL_THRESHOLD, %RDX_LP
|
||||
ja L(large_backward)
|
||||
#endif
|
||||
L(loop_4x_vec_backward):
|
||||
/* Copy 4 * VEC a time backward. */
|
||||
VMOVU (%rcx), %VEC(0)
|
||||
VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
||||
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
||||
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
||||
subq $(VEC_SIZE * 4), %rcx
|
||||
subq $(VEC_SIZE * 4), %rdx
|
||||
VMOVA %VEC(0), (%r9)
|
||||
VMOVA %VEC(1), -VEC_SIZE(%r9)
|
||||
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
|
||||
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
|
||||
subq $(VEC_SIZE * 4), %r9
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
ja L(loop_4x_vec_backward)
|
||||
/* Store the first 4 * VEC. */
|
||||
VMOVU %VEC(4), (%rdi)
|
||||
VMOVU %VEC(5), VEC_SIZE(%rdi)
|
||||
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
||||
/* Store the last VEC. */
|
||||
VMOVU %VEC(8), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16)
|
||||
L(large_forward):
|
||||
/* Don't use non-temporal store if there is overlap between
|
||||
destination and source since destination may be in cache
|
||||
when source is loaded. */
|
||||
leaq (%rdi, %rdx), %r10
|
||||
cmpq %r10, %rsi
|
||||
jb L(loop_4x_vec_forward)
|
||||
L(loop_large_forward):
|
||||
/* Copy 4 * VEC a time forward with non-temporal stores. */
|
||||
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
||||
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||||
addq $PREFETCHED_LOAD_SIZE, %rsi
|
||||
subq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
VMOVNT %VEC(0), (%rdi)
|
||||
VMOVNT %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
addq $PREFETCHED_LOAD_SIZE, %rdi
|
||||
cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
ja L(loop_large_forward)
|
||||
sfence
|
||||
/* Store the last 4 * VEC. */
|
||||
VMOVU %VEC(5), (%rcx)
|
||||
VMOVU %VEC(6), -VEC_SIZE(%rcx)
|
||||
VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
|
||||
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
||||
/* Store the first VEC. */
|
||||
VMOVU %VEC(4), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
|
||||
L(large_backward):
|
||||
/* Don't use non-temporal store if there is overlap between
|
||||
destination and source since destination may be in cache
|
||||
when source is loaded. */
|
||||
leaq (%rcx, %rdx), %r10
|
||||
cmpq %r10, %r9
|
||||
jb L(loop_4x_vec_backward)
|
||||
L(loop_large_backward):
|
||||
/* Copy 4 * VEC a time backward with non-temporal stores. */
|
||||
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
|
||||
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
|
||||
VMOVU (%rcx), %VEC(0)
|
||||
VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
||||
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
||||
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
||||
subq $PREFETCHED_LOAD_SIZE, %rcx
|
||||
subq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
VMOVNT %VEC(0), (%r9)
|
||||
VMOVNT %VEC(1), -VEC_SIZE(%r9)
|
||||
VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
|
||||
VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
|
||||
subq $PREFETCHED_LOAD_SIZE, %r9
|
||||
cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
ja L(loop_large_backward)
|
||||
sfence
|
||||
/* Store the first 4 * VEC. */
|
||||
VMOVU %VEC(4), (%rdi)
|
||||
VMOVU %VEC(5), VEC_SIZE(%rdi)
|
||||
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
||||
/* Store the last VEC. */
|
||||
VMOVU %VEC(8), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
#endif
|
||||
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
||||
|
||||
#if 1
|
||||
# ifdef USE_MULTIARCH
|
||||
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
|
||||
MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
|
||||
# ifdef SHARED
|
||||
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
|
||||
MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
|
||||
# endif
|
||||
# endif
|
||||
# ifdef SHARED
|
||||
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
|
||||
MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
|
||||
# endif
|
||||
#endif
|
||||
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
|
||||
MEMCPY_SYMBOL (__memcpy, unaligned))
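The forward loop above (L(loop_4x_vec_forward)) follows the plan from the header comment of this file: save the first VEC and the last 4 * VEC up front, move 4 vectors per iteration with unaligned loads, and store the saved vectors after the loop so overlapping tails need no extra branches. A rough AVX2 C++ sketch of that loop shape (illustrative only; copy_forward_4x_vec is a made-up name, it uses unaligned stores instead of aligning the destination, and it omits the small-size, backward and non-temporal paths of the real code):

    #include <immintrin.h>
    #include <cstddef>

    // Forward path only: assumes n >= 128 and that dst does not overlap src
    // from above (the real code switches to a backward loop in that case).
    void copy_forward_4x_vec(char * dst, const char * src, std::size_t n)
    {
        // Save the first vector and the last 4 vectors before the loop, so the
        // tail and the head can be stored after the loop without branches.
        const __m256i first = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
        __m256i tail[4];
        for (int k = 0; k < 4; ++k)
            tail[k] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + n - 32 * (4 - k)));

        std::size_t i = 0;
        while (i + 128 <= n)  // 4 x 32-byte vectors per iteration
        {
            __m256i v0 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
            __m256i v1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i + 32));
            __m256i v2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i + 64));
            __m256i v3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i + 96));
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i), v0);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i + 32), v1);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i + 64), v2);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i + 96), v3);
            i += 128;
        }

        // Store the saved last 4 vectors and the saved first vector; these may
        // overlap bytes already written by the loop, which is harmless.
        for (int k = 0; k < 4; ++k)
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + n - 32 * (4 - k)), tail[k]);
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), first);
    }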
utils/memcpy-bench/glibc/memmove.S (new file, 71 lines)
@@ -0,0 +1,71 @@
/* Optimized memmove for x86-64.
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include "sysdep.h"

#define VEC_SIZE 16
#define VEC(i) xmm##i
#define PREFETCHNT prefetchnta
#define VMOVNT movntdq
/* Use movups and movaps for smaller code sizes. */
#define VMOVU movups
#define VMOVA movaps

#define SECTION(p) p

#ifdef USE_MULTIARCH
# if 0
# define MEMCPY_SYMBOL(p,s) memcpy
# endif
#else
# if defined SHARED
# define MEMCPY_SYMBOL(p,s) __memcpy
# else
# define MEMCPY_SYMBOL(p,s) memcpy
# endif
#endif
#if !defined USE_MULTIARCH
# define MEMPCPY_SYMBOL(p,s) __mempcpy
#endif
#ifndef MEMMOVE_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) p
# define MEMMOVE_SYMBOL(p,s) memmove
#endif

#include "memmove-vec-unaligned-erms.S"

#ifndef USE_MULTIARCH
libc_hidden_builtin_def (memmove)
# if defined SHARED && IS_IN (libc)
strong_alias (memmove, __memcpy)
libc_hidden_ver (memmove, memcpy)
# endif
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)

# if defined SHARED && IS_IN (libc)
# undef memcpy
# include <shlib-compat.h>
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);

# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
# endif
# endif
#endif
131
utils/memcpy-bench/glibc/sysdep.h
Normal file
@ -0,0 +1,131 @@
#pragma once

/* Assembler macros for x86-64.
Copyright (C) 2001-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

#ifndef _X86_64_SYSDEP_H
#define _X86_64_SYSDEP_H 1

#include "sysdep_x86.h"

#ifdef __ASSEMBLER__

/* Syntactic details of assembler. */

/* This macro is for setting proper CFI with DW_CFA_expression describing
the register as saved relative to %rsp instead of relative to the CFA.
Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset
from %rsp. */
#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \
0x77, off & 0x7F | 0x80, off >> 7

/* If compiled for profiling, call `mcount' at the start of each function. */
#ifdef PROF
/* The mcount code relies on a normal frame pointer being on the stack
to locate our caller, so push one just for its benefit. */
#define CALL_MCOUNT \
pushq %rbp; \
cfi_adjust_cfa_offset(8); \
movq %rsp, %rbp; \
cfi_def_cfa_register(%rbp); \
call JUMPTARGET(mcount); \
popq %rbp; \
cfi_def_cfa(rsp,8);
#else
#define CALL_MCOUNT /* Do nothing. */
#endif

#define PSEUDO(name, syscall_name, args) \
lose: \
jmp JUMPTARGET(syscall_error) \
.globl syscall_error; \
ENTRY (name) \
DO_CALL (syscall_name, args); \
jb lose

#undef JUMPTARGET
#ifdef SHARED
# ifdef BIND_NOW
# define JUMPTARGET(name) *name##@GOTPCREL(%rip)
# else
# define JUMPTARGET(name) name##@PLT
# endif
#else
/* For static archives, branch to target directly. */
# define JUMPTARGET(name) name
#endif

/* Long and pointer size in bytes. */
#define LP_SIZE 8

/* Instruction to operate on long and pointer. */
#define LP_OP(insn) insn##q

/* Assembler address directive. */
#define ASM_ADDR .quad

/* Registers to hold long and pointer. */
#define RAX_LP rax
#define RBP_LP rbp
#define RBX_LP rbx
#define RCX_LP rcx
#define RDI_LP rdi
#define RDX_LP rdx
#define RSI_LP rsi
#define RSP_LP rsp
#define R8_LP r8
#define R9_LP r9
#define R10_LP r10
#define R11_LP r11
#define R12_LP r12
#define R13_LP r13
#define R14_LP r14
#define R15_LP r15

#else /* __ASSEMBLER__ */

/* Long and pointer size in bytes. */
#define LP_SIZE "8"

/* Instruction to operate on long and pointer. */
#define LP_OP(insn) #insn "q"

/* Assembler address directive. */
#define ASM_ADDR ".quad"

/* Registers to hold long and pointer. */
#define RAX_LP "rax"
#define RBP_LP "rbp"
#define RBX_LP "rbx"
#define RCX_LP "rcx"
#define RDI_LP "rdi"
#define RDX_LP "rdx"
#define RSI_LP "rsi"
#define RSP_LP "rsp"
#define R8_LP "r8"
#define R9_LP "r9"
#define R10_LP "r10"
#define R11_LP "r11"
#define R12_LP "r12"
#define R13_LP "r13"
#define R14_LP "r14"
#define R15_LP "r15"

#endif /* __ASSEMBLER__ */

#endif /* _X86_64_SYSDEP_H */
115
utils/memcpy-bench/glibc/sysdep_generic.h
Normal file
@ -0,0 +1,115 @@
#pragma once

/* Generic asm macros used on many machines.
Copyright (C) 1991-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

#define C_SYMBOL_NAME(name) name
#define HIDDEN_JUMPTARGET(name) 0x0
#define SHARED_CACHE_SIZE_HALF (1024*1024)
#define DATA_CACHE_SIZE_HALF (1024*32/2)
#define DATA_CACHE_SIZE (1024*32)
#define SHARED_NON_TEMPORAL_THRESHOLD (1024*1024*4)
#define REP_MOSB_THRESHOLD 1024

#define USE_MULTIARCH

#define ASM_LINE_SEP ;

#define strong_alias(original, alias) \
.globl C_SYMBOL_NAME (alias) ASM_LINE_SEP \
C_SYMBOL_NAME (alias) = C_SYMBOL_NAME (original)

#ifndef C_LABEL

/* Define a macro we can use to construct the asm name for a C symbol. */
# define C_LABEL(name) name##:

#endif

#ifdef __ASSEMBLER__
/* Mark the end of function named SYM. This is used on some platforms
to generate correct debugging information. */
# ifndef END
# define END(sym)
# endif

# ifndef JUMPTARGET
# define JUMPTARGET(sym) sym
# endif
#endif

/* Macros to generate eh_frame unwind information. */
#ifdef __ASSEMBLER__
# define cfi_startproc .cfi_startproc
# define cfi_endproc .cfi_endproc
# define cfi_def_cfa(reg, off) .cfi_def_cfa reg, off
# define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg
# define cfi_def_cfa_offset(off) .cfi_def_cfa_offset off
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
# define cfi_offset(reg, off) .cfi_offset reg, off
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
# define cfi_register(r1, r2) .cfi_register r1, r2
# define cfi_return_column(reg) .cfi_return_column reg
# define cfi_restore(reg) .cfi_restore reg
# define cfi_same_value(reg) .cfi_same_value reg
# define cfi_undefined(reg) .cfi_undefined reg
# define cfi_remember_state .cfi_remember_state
# define cfi_restore_state .cfi_restore_state
# define cfi_window_save .cfi_window_save
# define cfi_personality(enc, exp) .cfi_personality enc, exp
# define cfi_lsda(enc, exp) .cfi_lsda enc, exp

#else /* ! ASSEMBLER */

# define CFI_STRINGIFY(Name) CFI_STRINGIFY2 (Name)
# define CFI_STRINGIFY2(Name) #Name
# define CFI_STARTPROC ".cfi_startproc"
# define CFI_ENDPROC ".cfi_endproc"
# define CFI_DEF_CFA(reg, off) \
".cfi_def_cfa " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_DEF_CFA_REGISTER(reg) \
".cfi_def_cfa_register " CFI_STRINGIFY(reg)
# define CFI_DEF_CFA_OFFSET(off) \
".cfi_def_cfa_offset " CFI_STRINGIFY(off)
# define CFI_ADJUST_CFA_OFFSET(off) \
".cfi_adjust_cfa_offset " CFI_STRINGIFY(off)
# define CFI_OFFSET(reg, off) \
".cfi_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_REL_OFFSET(reg, off) \
".cfi_rel_offset " CFI_STRINGIFY(reg) "," CFI_STRINGIFY(off)
# define CFI_REGISTER(r1, r2) \
".cfi_register " CFI_STRINGIFY(r1) "," CFI_STRINGIFY(r2)
# define CFI_RETURN_COLUMN(reg) \
".cfi_return_column " CFI_STRINGIFY(reg)
# define CFI_RESTORE(reg) \
".cfi_restore " CFI_STRINGIFY(reg)
# define CFI_UNDEFINED(reg) \
".cfi_undefined " CFI_STRINGIFY(reg)
# define CFI_REMEMBER_STATE \
".cfi_remember_state"
# define CFI_RESTORE_STATE \
".cfi_restore_state"
# define CFI_WINDOW_SAVE \
".cfi_window_save"
# define CFI_PERSONALITY(enc, exp) \
".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp)
# define CFI_LSDA(enc, exp) \
".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp)
#endif

#include "dwarf2.h"
115
utils/memcpy-bench/glibc/sysdep_x86.h
Normal file
@ -0,0 +1,115 @@
#pragma once

/* Assembler macros for x86.
Copyright (C) 2017-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

#ifndef _X86_SYSDEP_H
#define _X86_SYSDEP_H 1

#include "sysdep_generic.h"

/* __CET__ is defined by GCC with Control-Flow Protection values:

enum cf_protection_level
{
CF_NONE = 0,
CF_BRANCH = 1 << 0,
CF_RETURN = 1 << 1,
CF_FULL = CF_BRANCH | CF_RETURN,
CF_SET = 1 << 2
};
*/

/* Set if CF_BRANCH (IBT) is enabled. */
#define X86_FEATURE_1_IBT (1U << 0)
/* Set if CF_RETURN (SHSTK) is enabled. */
#define X86_FEATURE_1_SHSTK (1U << 1)

#ifdef __CET__
# define CET_ENABLED 1
# define IBT_ENABLED (__CET__ & X86_FEATURE_1_IBT)
# define SHSTK_ENABLED (__CET__ & X86_FEATURE_1_SHSTK)
#else
# define CET_ENABLED 0
# define IBT_ENABLED 0
# define SHSTK_ENABLED 0
#endif

/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need
space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be
aligned to 16 bytes for fxsave and 64 bytes for xsave. */
#define STATE_SAVE_OFFSET (8 * 7 + 8)

/* Save SSE, AVX, AVX512, mask and bound registers. */
#define STATE_SAVE_MASK \
((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))

#ifdef __ASSEMBLER__

/* Syntactic details of assembler. */

#ifdef _CET_ENDBR
# define _CET_NOTRACK notrack
#else
# define _CET_ENDBR
# define _CET_NOTRACK
#endif

/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */
#define ALIGNARG(log2) 1<<log2
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;

/* Define an entry point visible from C. */
#define ENTRY(name) \
.globl C_SYMBOL_NAME(name); \
.type C_SYMBOL_NAME(name),@function; \
.align ALIGNARG(4); \
C_LABEL(name) \
cfi_startproc; \
_CET_ENDBR; \
CALL_MCOUNT

#undef END
#define END(name) \
cfi_endproc; \
ASM_SIZE_DIRECTIVE(name)

#define ENTRY_CHK(name) ENTRY (name)
#define END_CHK(name) END (name)

/* Since C identifiers are not normally prefixed with an underscore
on this system, the asm identifier `syscall_error' intrudes on the
C name space. Make sure we use an innocuous name. */
#define syscall_error __syscall_error
#define mcount _mcount

#undef PSEUDO_END
#define PSEUDO_END(name) \
END (name)

/* Local label name for asm code. */
#ifndef L
/* ELF-like local names start with `.L'. */
# define L(name) .L##name
#endif

#define atom_text_section .section ".text.atom", "ax"

#endif /* __ASSEMBLER__ */

#endif /* _X86_SYSDEP_H */
@ -1,5 +1,6 @@
#include <memory>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <random>
#include <iostream>
@ -14,15 +15,11 @@

#include <Common/Stopwatch.h>

#pragma GCC diagnostic ignored "-Wold-style-cast"
#pragma GCC diagnostic ignored "-Wcast-align"
#pragma GCC diagnostic ignored "-Wcast-qual"
#include "FastMemcpy.h"
//#include "FastMemcpy_Avx.h"

#include <emmintrin.h>
#include <immintrin.h>

#include <boost/program_options.hpp>


template <typename F, typename MemcpyImpl>
void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl)
@ -36,6 +33,9 @@ void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_d
dst += bytes_to_copy;
src += bytes_to_copy;
size -= bytes_to_copy;

/// Execute at least one SSE instruction as a penalty after running AVX code.
__asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7");
}
}

@ -47,7 +47,7 @@ size_t generatorUniform(RNG & rng) { return rng() % N; };


template <typename F, typename MemcpyImpl>
void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl)
uint64_t test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl, const char * name)
{
Stopwatch watch;

@ -76,15 +76,15 @@ void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t n
for (auto & thread : threads)
thread.join();

double elapsed_ns = watch.elapsed();
uint64_t elapsed_ns = watch.elapsed();

/// Validation
size_t sum = 0;
for (size_t i = 0; i < size; ++i)
sum += dst[i];
if (dst[i] != uint8_t(i))
throw std::logic_error("Incorrect result");

std::cerr << std::fixed << std::setprecision(3)
<< "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n";
std::cout << name;
return elapsed_ns;
}


@ -101,9 +101,30 @@ static void * memcpy_erms(void * dst, const void * src, size_t size)
return dst;
}

static void * memcpy_trivial(void * __restrict dst_, const void * __restrict src_, size_t size)
{
char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);
void * ret = dst;

while (size > 0)
{
*dst = *src;
++dst;
++src;
--size;
}

return ret;
}

extern "C" void * memcpy_jart(void * dst, const void * src, size_t size);
extern "C" void MemCpy(void * dst, const void * src, size_t size);

void * memcpy_fast_sse(void * dst, const void * src, size_t size);
void * memcpy_fast_avx(void * dst, const void * src, size_t size);
void * memcpy_tiny(void * dst, const void * src, size_t size);


static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size)
{
@ -329,7 +350,7 @@ void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t *
if (padding > 0)
{
__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
_mm256_storeu_si256((__m256i*)dst, head);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
@ -539,70 +560,141 @@ tail:
return ret;
}

extern "C" void * __memcpy_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_sse2_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_ssse3(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_ssse3_back(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_avx512_no_vzeroupper(void * __restrict destination, const void * __restrict source, size_t size);


#define VARIANT(N, NAME) \
if (memcpy_variant == N) \
return test(dst, src, size, iterations, num_threads, std::forward<F>(generator), NAME, #NAME);

template <typename F>
void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
uint64_t dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
{
memcpy_type memcpy_libc = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));
memcpy_type memcpy_libc_old = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));

if (memcpy_variant == 1)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy);
if (memcpy_variant == 2)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_libc);
if (memcpy_variant == 3)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_erms);
if (memcpy_variant == 4)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), MemCpy);
if (memcpy_variant == 5)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2);
if (memcpy_variant == 6)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled2);
if (memcpy_variant == 7)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled4);
if (memcpy_variant == 8)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled8);
// if (memcpy_variant == 9)
// test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_fast_avx);
if (memcpy_variant == 10)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_my);
VARIANT(1, memcpy)
VARIANT(2, memcpy_trivial)
VARIANT(3, memcpy_libc_old)
VARIANT(4, memcpy_erms)
VARIANT(5, MemCpy)
VARIANT(6, memcpySSE2)
VARIANT(7, memcpySSE2Unrolled2)
VARIANT(8, memcpySSE2Unrolled4)
VARIANT(9, memcpySSE2Unrolled8)
VARIANT(10, memcpy_fast_sse)
VARIANT(11, memcpy_fast_avx)
VARIANT(12, memcpy_my)

VARIANT(21, __memcpy_erms)
VARIANT(22, __memcpy_sse2_unaligned)
VARIANT(23, __memcpy_ssse3)
VARIANT(24, __memcpy_ssse3_back)
VARIANT(25, __memcpy_avx_unaligned)
VARIANT(26, __memcpy_avx_unaligned_erms)
VARIANT(27, __memcpy_avx512_unaligned)
VARIANT(28, __memcpy_avx512_unaligned_erms)
VARIANT(29, __memcpy_avx512_no_vzeroupper)

return 0;
}
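For reference, each VARIANT(N, NAME) line above is shorthand: given the macro defined earlier in this diff, it expands into a guarded call of test() that also passes the stringized implementation name. A sketch of one expansion (illustrative, derived from that macro):

// VARIANT(12, memcpy_my) expands roughly to:
if (memcpy_variant == 12)
    return test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_my, "memcpy_my");

This is why selecting --variant 12 on the command line runs the memcpy_my implementation and prints its name in the report.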

void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
uint64_t dispatchVariants(
size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
{
if (generator_variant == 1)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
if (generator_variant == 2)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
if (generator_variant == 3)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
if (generator_variant == 4)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
if (generator_variant == 5)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);
return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);

return 0;
}


int main(int argc, char ** argv)
{
size_t size = 1000000000;
if (argc >= 2)
size = std::stoull(argv[1]);
boost::program_options::options_description desc("Allowed options");
desc.add_options()("help,h", "produce help message")
("size", boost::program_options::value<size_t>()->default_value(1000000), "Bytes to copy on every iteration")
("iterations", boost::program_options::value<size_t>(), "Number of iterations")
("threads", boost::program_options::value<size_t>()->default_value(1), "Number of copying threads")
("distribution", boost::program_options::value<size_t>()->default_value(4), "Distribution of chunk sizes to perform copy")
("variant", boost::program_options::value<size_t>(), "Variant of memcpy implementation")
("tsv", "Print result in tab-separated format")
;

size_t iterations = 10;
if (argc >= 3)
iterations = std::stoull(argv[2]);
boost::program_options::variables_map options;
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options);

size_t num_threads = 1;
if (argc >= 4)
num_threads = std::stoull(argv[3]);
if (options.count("help") || !options.count("variant"))
{
std::cout << R"(Usage:

size_t memcpy_variant = 1;
if (argc >= 5)
memcpy_variant = std::stoull(argv[4]);
for size in 4096 16384 50000 65536 100000 1000000 10000000 100000000; do
for threads in 1 2 4 $(($(nproc) / 2)) $(nproc); do
for distribution in 1 2 3 4 5; do
for variant in {1..12} {21..29}; do
for i in {1..10}; do
./memcpy-bench --tsv --size $size --variant $variant --threads $threads --distribution $distribution;
done;
done;
done;
done;
done | tee result.tsv

size_t generator_variant = 1;
if (argc >= 6)
generator_variant = std::stoull(argv[5]);
clickhouse-local --structure '
name String,
size UInt64,
iterations UInt64,
threads UInt16,
generator UInt8,
memcpy UInt8,
elapsed UInt64
' --query "
SELECT
size, name,
avg(1000 * elapsed / size / iterations) AS s,
count() AS c
FROM table
GROUP BY size, name
ORDER BY size ASC, s DESC
" --output-format PrettyCompact < result.tsv

)" << std::endl;
std::cout << desc << std::endl;
return 1;
}

size_t size = options["size"].as<size_t>();
size_t num_threads = options["threads"].as<size_t>();
size_t memcpy_variant = options["variant"].as<size_t>();
size_t generator_variant = options["distribution"].as<size_t>();

size_t iterations;
if (options.count("iterations"))
{
iterations = options["iterations"].as<size_t>();
}
else
{
iterations = 10000000000ULL / size;

if (generator_variant == 1)
iterations /= 10;
}

std::unique_ptr<uint8_t[]> src(new uint8_t[size]);
std::unique_ptr<uint8_t[]> dst(new uint8_t[size]);
@ -614,7 +706,25 @@ int main(int argc, char ** argv)
/// Fill dst to avoid page faults.
memset(dst.get(), 0, size);

dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);
uint64_t elapsed_ns = dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);

std::cout << std::fixed << std::setprecision(3);

if (options.count("tsv"))
{
std::cout
<< '\t' << size
<< '\t' << iterations
<< '\t' << num_threads
<< '\t' << generator_variant
<< '\t' << memcpy_variant
<< '\t' << elapsed_ns
<< '\n';
}
else
{
std::cout << ": processed in " << (elapsed_ns / 1e9) << " sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec\n";
}

return 0;
}
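After building, a single invocation can serve as a quick smoke test before launching the full sweep shown in the help text above; the flags match the options defined in main(), and the concrete numbers are only illustrative (variant 12 selects memcpy_my, distribution 4 selects generatorUniform<65536>):

./memcpy-bench --tsv --size 1000000 --variant 12 --threads 4 --distribution 4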