This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug rtl-optimization/56511] New: memcpy misses chance to use AVX instructions
- From: "jyasskin at gcc dot gnu.org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Sun, 03 Mar 2013 06:14:02 +0000
- Subject: [Bug rtl-optimization/56511] New: memcpy misses chance to use AVX instructions
- Auto-submitted: auto-generated
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56511
Bug #: 56511
Summary: memcpy misses chance to use AVX instructions
Classification: Unclassified
Product: gcc
Version: 4.7.2
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: jyasskin@gcc.gnu.org
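When operating on sufficiently aligned storage, memcpy should be able to use
vector instructions. In the test case below, the plain vector assignment in
assign_vec compiles to a single 32-byte vmovaps load/store pair, but the two
memcpy calls of the same size (memcpy_vec and memcpy_char) are each expanded
into four 8-byte movq load/store pairs, even though their destinations are
declared with the same alignment as the vector type.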
$ cat test.c
#include <string.h>
typedef float vec __attribute__((vector_size(32)));
typedef struct S {
vec v;
char __attribute__((aligned(__alignof__(vec)))) c[sizeof(vec)];
} S;
void assign_vec(S* s, const vec* v) { s->v = *v; }
void memcpy_vec(S* s, const vec* v) { memcpy(&s->v, v, sizeof(vec)); }
void memcpy_char(S* s, const vec* v) { memcpy(s->c, v, sizeof(vec)); }
$ gcc -mavx -S test.c -O2 -Wall -o -
.file "test.c"
.text
.p2align 4,,15
.globl assign_vec
.type assign_vec, @function
assign_vec:
.LFB12:
.cfi_startproc
vmovaps (%rsi), %ymm0
vmovaps %ymm0, (%rdi)
vzeroupper
ret
.cfi_endproc
.LFE12:
.size assign_vec, .-assign_vec
.p2align 4,,15
.globl memcpy_vec
.type memcpy_vec, @function
memcpy_vec:
.LFB13:
.cfi_startproc
movq (%rsi), %rax
movq %rax, (%rdi)
movq 8(%rsi), %rax
movq %rax, 8(%rdi)
movq 16(%rsi), %rax
movq %rax, 16(%rdi)
movq 24(%rsi), %rax
movq %rax, 24(%rdi)
ret
.cfi_endproc
.LFE13:
.size memcpy_vec, .-memcpy_vec
.p2align 4,,15
.globl memcpy_char
.type memcpy_char, @function
memcpy_char:
.LFB14:
.cfi_startproc
movq (%rsi), %rdx
movq %rdx, 32(%rdi)
movq 8(%rsi), %rdx
movq %rdx, 40(%rdi)
movq 16(%rsi), %rdx
movq %rdx, 48(%rdi)
movq 24(%rsi), %rdx
movq %rdx, 56(%rdi)
ret
.cfi_endproc
.LFE14:
.size memcpy_char, .-memcpy_char
I don't have a gcc-4.8 around to test with, but I believe it's also missing
this optimization.