This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
clearing many bytes variables (could use one machine instruction)?
- From: Basile Starynkevitch <basile at starynkevitch dot net>
- To: gcc at gcc dot gnu dot org
- Date: Tue, 9 Mar 2010 10:58:07 +0100
- Subject: clearing many bytes variables (could use one machine instruction)?
- Reply-to: basile at starynkevitch dot net
Hello All,
With a recently compiled gcc-trunk on x86-64/linux, I am compiling the following example:
#################
/* file testmanychar.c */
/* Test case: eight char locals are set to zero and their addresses are then
   passed to an external function, so the compiler has to give each of them
   a memory location. */
/* g is only declared here; its definition is not part of this file. */
extern void g (int, char *, char *, char *);
void
f (void)
{
char x0, x1, x2, x3, x4, x5, x6, x7;
/* Assuming x0 is word-aligned on an x86_64, and the variables are bytes in memory, we could clear all of them with a single machine instruction. */
x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = (char) 0;
/* Each call passes the addresses of three of the locals; across the four
   calls every one of x0..x7 has its address taken at least once. */
g (10, &x0, &x1, &x2);
g (20, &x2, &x3, &x4);
g (30, &x4, &x5, &x6);
g (40, &x6, &x7, &x0);
}
#################
My intuition was that GCC could place x0 at a 64-bit-aligned address, x1 immediately after it, and so on, and then clear all eight bytes at once using a single machine instruction [clearing a 64-bit word].
But this is not the case, since
gcc-trunk -S -O3 -fverbose-asm testmanychar.c
gives the following code
#################
.type f, @function
# Reviewer annotations: the full-line "#" comments in this listing were added
# in review and are not GCC output; the trailing "#," / "# tmpNN" / "# xN"
# comments come from -fverbose-asm.
# Layout shown by the movb lines: x7..x0 occupy the contiguous bytes at
# offsets 8 through 15 from %rsp (x7 lowest, x0 highest).
# Arguments to g are passed in %edi, %rsi, %rdx, %rcx (SysV AMD64 order).
# Addresses needed by more than one call are kept in callee-saved registers:
# %rbp = &x0, %r12 = &x4, %rbx = &x2 and later &x6.
f:
.LFB0:
.cfi_startproc
# Save %rbx, %rbp and %r12 below %rsp before the frame is allocated. After
# the subq $40 these slots are at offsets 16, 24 and 32 from %rsp, which is
# where the epilogue reloads them from.
movq %rbx, -24(%rsp) #,
movq %rbp, -16(%rsp) #,
# First argument of the first call to g, scheduled between the saves.
movl $10, %edi #,
movq %r12, -8(%rsp) #,
subq $40, %rsp #,
.cfi_def_cfa_offset 48
# Compute the addresses used by the first call (and kept for later calls).
leaq 13(%rsp), %rbx #, tmp58
# With the CFA at %rsp+48, these record the save slots of %r12 (reg 12),
# %rbp (reg 6) and %rbx (reg 3) at CFA-16, CFA-24 and CFA-32.
.cfi_offset 12, -16
.cfi_offset 6, -24
.cfi_offset 3, -32
leaq 15(%rsp), %rbp #, tmp60
leaq 14(%rsp), %rdx #, tmp59
leaq 11(%rsp), %r12 #, tmp61
# Eight separate one-byte stores clear x7..x0. Together they write the eight
# consecutive bytes starting at offset 8 from %rsp, i.e. the same bytes one
# 8-byte store to that address would cover.
# NOTE(review): assuming the SysV entry condition %rsp % 16 == 8, that
# address is 8-byte aligned after the subq $40 -- confirm.
movb $0, 8(%rsp) #, x7
movb $0, 9(%rsp) #, x6
# Argument setup for the first call is interleaved with the stores.
movq %rbx, %rcx # tmp58,
movq %rbp, %rsi # tmp60,
movb $0, 10(%rsp) #, x5
movb $0, 11(%rsp) #, x4
movb $0, 12(%rsp) #, x3
movb $0, 13(%rsp) #, x2
movb $0, 14(%rsp) #, x1
movb $0, 15(%rsp) #, x0
# g (10, &x0, &x1, &x2)
call g #
leaq 12(%rsp), %rdx #, tmp62
movq %r12, %rcx # tmp61,
movq %rbx, %rsi # tmp58,
movl $20, %edi #,
# %rbx is reused for &x6 once &x2 has been copied into %rsi.
leaq 9(%rsp), %rbx #, tmp64
# g (20, &x2, &x3, &x4)
call g #
leaq 10(%rsp), %rdx #, tmp65
movq %rbx, %rcx # tmp64,
movq %r12, %rsi # tmp61,
movl $30, %edi #,
# g (30, &x4, &x5, &x6)
call g #
leaq 8(%rsp), %rdx #, tmp68
movq %rbp, %rcx # tmp60,
movq %rbx, %rsi # tmp64,
movl $40, %edi #,
# g (40, &x6, &x7, &x0)
call g #
# Epilogue: reload the saved registers and release the 40-byte frame.
movq 16(%rsp), %rbx #,
movq 24(%rsp), %rbp #,
movq 32(%rsp), %r12 #,
addq $40, %rsp #,
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size f, .-f
.ident "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision 157303]"
#####################
With
gcc-trunk -S -O3 -fverbose-asm -march=core2 -mtune=core2 testmanychar.c
I am still getting
##################
# options passed: testmanychar.c -march=core2 -mtune=core2 -O3
# Reviewer annotations: apart from the "options passed" line above, the
# full-line "#" comments in this listing were added in review and are not
# GCC output; the trailing "#," / "# tmpNN" / "# xN" comments come from
# -fverbose-asm.
# Same instructions as the listing without -march/-mtune, in a different
# order: the register saves, the address computations and the eight byte
# stores are each grouped together here. The clearing is still done with
# eight separate movb stores.
# Layout shown by the movb lines: x7..x0 occupy the contiguous bytes at
# offsets 8 through 15 from %rsp (x7 lowest, x0 highest).
.globl f
.type f, @function
f:
.LFB0:
.cfi_startproc
# Save %rbx, %rbp and %r12 below %rsp before the frame is allocated. After
# the subq $40 these slots are at offsets 16, 24 and 32 from %rsp, which is
# where the epilogue reloads them from.
movq %rbx, -24(%rsp) #,
movq %rbp, -16(%rsp) #,
movq %r12, -8(%rsp) #,
# First argument of the first call to g.
movl $10, %edi #,
subq $40, %rsp #,
.cfi_def_cfa_offset 48
# Addresses kept in callee-saved registers are reused by later calls:
# %rbx = &x2, %rbp = &x0, %r12 = &x4.
leaq 13(%rsp), %rbx #, tmp58
# With the CFA at %rsp+48, these record the save slots of %r12 (reg 12),
# %rbp (reg 6) and %rbx (reg 3) at CFA-16, CFA-24 and CFA-32.
.cfi_offset 12, -16
.cfi_offset 6, -24
.cfi_offset 3, -32
leaq 15(%rsp), %rbp #, tmp60
leaq 11(%rsp), %r12 #, tmp61
leaq 14(%rsp), %rdx #, tmp59
movq %rbx, %rcx # tmp58,
movq %rbp, %rsi # tmp60,
# Eight separate one-byte stores clear x7..x0. Together they write the eight
# consecutive bytes starting at offset 8 from %rsp, i.e. the same bytes one
# 8-byte store to that address would cover.
# NOTE(review): assuming the SysV entry condition %rsp % 16 == 8, that
# address is 8-byte aligned after the subq $40 -- confirm.
movb $0, 8(%rsp) #, x7
movb $0, 9(%rsp) #, x6
movb $0, 10(%rsp) #, x5
movb $0, 11(%rsp) #, x4
movb $0, 12(%rsp) #, x3
movb $0, 13(%rsp) #, x2
movb $0, 14(%rsp) #, x1
movb $0, 15(%rsp) #, x0
# g (10, &x0, &x1, &x2)
call g #
leaq 12(%rsp), %rdx #, tmp62
movq %r12, %rcx # tmp61,
movq %rbx, %rsi # tmp58,
movl $20, %edi #,
# %rbx is reused for &x6 once &x2 has been copied into %rsi.
leaq 9(%rsp), %rbx #, tmp64
# g (20, &x2, &x3, &x4)
call g #
leaq 10(%rsp), %rdx #, tmp65
movq %rbx, %rcx # tmp64,
movq %r12, %rsi # tmp61,
movl $30, %edi #,
# g (30, &x4, &x5, &x6)
call g #
leaq 8(%rsp), %rdx #, tmp68
movq %rbp, %rcx # tmp60,
movq %rbx, %rsi # tmp64,
movl $40, %edi #,
# g (40, &x6, &x7, &x0)
call g #
# Epilogue: reload the saved registers and release the 40-byte frame.
movq 16(%rsp), %rbx #,
movq 24(%rsp), %rbp #,
movq 32(%rsp), %r12 #,
addq $40, %rsp #,
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size f, .-f
.ident "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision 157303]"
####
I was hoping that
movb $0, 8(%rsp) #, x7
movb $0, 9(%rsp) #, x6
movb $0, 10(%rsp) #, x5
movb $0, 11(%rsp) #, x4
movb $0, 12(%rsp) #, x3
movb $0, 13(%rsp) #, x2
movb $0, 14(%rsp) #, x1
movb $0, 15(%rsp) #, x0
could be just something like
movq $0, 8(%rsp)
or something similar.
I do realize that such an optimization is difficult to implement
(probably messing with the register allocator, etc.). Or is the Core2 processor
sufficiently smart to execute a sequence of 8 consecutive byte
moves exactly as fast as a single 8-byte word move?
Regards.
--
Basile STARYNKEVITCH http://starynkevitch.net/Basile/
email: basile<at>starynkevitch<dot>net mobile: +33 6 8501 2359
8, rue de la Faiencerie, 92340 Bourg La Reine, France
*** opinions {are only mines, sont seulement les miennes} ***