This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Question about GCC -O optimization with sse2 programming.
- From: "haibo" <chbchb1130 at sina dot com>
- To: "gcc" <gcc at gcc dot gnu dot org>
- Date: Tue, 9 Mar 2004 11:19:51 +0800
- Subject: Question about GCC -O optimization with sse2 programming.
Hi:
I am trying to compile some SSE2 code with gcc-3.4-0225 (a prerelease version),
because compiling some SSE2 programs with gcc-3.3.3 may end with a compilation error.
When I compile my program with no optimization (-O0), I get the right result, but if I
turn on -O1 the result is wrong. From the documentation, I understand that -O (-O1) is equivalent to the following
optimization options:
-fdefer-pop
-fmerge-constants
-fthread-jumps
-floop-optimize
-fcrossjumping
-fif-conversion
-fif-conversion2
-fdelayed-branch
-fguess-branch-probability
-fcprop-registers
-fomit-frame-pointer
But when I turn on the options explicitly, using the following compile options:
CFLAGS =-march=pentium4 -msse2 \
-fdefer-pop \
-fmerge-constants \
-fthread-jumps \
-floop-optimize \
-fcrossjumping \
-fif-conversion \
-fif-conversion2 \
-fguess-branch-probability \
-fcprop-registers \
-fomit-frame-pointer
I find that the effect is the same as -O0 rather than -O (-O1).
The following is my test program, followed by the assembly generated with -O1:
#include <emmintrin.h>
#include <stdio.h>
/*
 * Store lane `col` of each of the eight mask vectors into column `col` of `a`.
 *
 * The lane index of _mm_extract_epi16 must be a compile-time constant, so the
 * stores are generated by a macro invoked with a literal column number rather
 * than by a run-time loop.  Relies on the locals `a` and `mark` of foo().
 */
#define FOO_STORE_COLUMN(col)                              \
    do {                                                   \
        a[0][col] = _mm_extract_epi16(mark[0], col);       \
        a[1][col] = _mm_extract_epi16(mark[1], col);       \
        a[2][col] = _mm_extract_epi16(mark[2], col);       \
        a[3][col] = _mm_extract_epi16(mark[3], col);       \
        a[4][col] = _mm_extract_epi16(mark[4], col);       \
        a[5][col] = _mm_extract_epi16(mark[5], col);       \
        a[6][col] = _mm_extract_epi16(mark[6], col);       \
        a[7][col] = _mm_extract_epi16(mark[7], col);       \
    } while (0)

/*
 * Fill the 8x8 matrix `a` from eight SSE2 mask vectors.
 *
 * mark[k] holds eight 16-bit lanes in which lanes 0..k are zero and the
 * remaining lanes have all bits set.  a[k][j] receives lane j of mark[k]
 * (zero-extended by _mm_extract_epi16), so the expected result is
 *     a[k][j] == 0       for j <= k
 *     a[k][j] == 65535   for j >  k
 *
 * a: output matrix; all 64 elements are overwritten.
 */
void foo(int a[8][8]){
    __m128i mark[8];

    /* -1 sets every bit of each 16-bit lane without narrowing 0xffff to short. */
    mark[0] = _mm_insert_epi16(_mm_set1_epi16(-1), 0, 0);

    /* Each mask is the previous one with one more lane cleared. */
    mark[1] = _mm_insert_epi16(mark[0], 0, 1);
    mark[2] = _mm_insert_epi16(mark[1], 0, 2);
    mark[3] = _mm_insert_epi16(mark[2], 0, 3);
    mark[4] = _mm_insert_epi16(mark[3], 0, 4);
    mark[5] = _mm_insert_epi16(mark[4], 0, 5);
    mark[6] = _mm_insert_epi16(mark[5], 0, 6);
    mark[7] = _mm_insert_epi16(mark[6], 0, 7);

    /* Column by column, same store order as the original test case. */
    FOO_STORE_COLUMN(0);
    FOO_STORE_COLUMN(1);
    FOO_STORE_COLUMN(2);
    FOO_STORE_COLUMN(3);
    FOO_STORE_COLUMN(4);
    FOO_STORE_COLUMN(5);
    FOO_STORE_COLUMN(6);
    FOO_STORE_COLUMN(7);
}

#undef FOO_STORE_COLUMN
/*
 * Test driver: fills an 8x8 matrix with foo() and prints it, one row per
 * line with the elements separated by tabs.  printf is declared by <stdio.h>.
 */
int main(void){
    int a[8][8];
    int i, j;

    foo(a);
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            printf("%d\t", a[i][j]);
        }
        printf("\n");
    }
    /* Exit status 1 is kept from the original post so that the source still
     * matches the -O1 listing, which loads $1 into %eax before returning. */
    return 1;
}
The assembly generated with -O1:
.file "pcm.c"
.text
# foo as emitted at -O1.  The body contains no SSE2 instructions: it only
# stores the constant 0 into all 64 ints of the matrix.  According to the C
# source, a[i][j] should be 65535 (0xffff) whenever j > i, so these stores
# are the wrong result described in the report.
.globl foo
.type foo, @function
foo:
# Frame setup, then %eax = first stack argument (the pointer to a).
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
# Element a[r][c] is at byte offset 32*r + 4*c from %eax.
# Column 0: a[0][0] .. a[7][0]
movl $0, (%eax)
movl $0, 32(%eax)
movl $0, 64(%eax)
movl $0, 96(%eax)
movl $0, 128(%eax)
movl $0, 160(%eax)
movl $0, 192(%eax)
movl $0, 224(%eax)
# Column 1: a[0][1] .. a[7][1]
movl $0, 4(%eax)
movl $0, 36(%eax)
movl $0, 68(%eax)
movl $0, 100(%eax)
movl $0, 132(%eax)
movl $0, 164(%eax)
movl $0, 196(%eax)
movl $0, 228(%eax)
# Column 2: a[0][2] .. a[7][2]
movl $0, 8(%eax)
movl $0, 40(%eax)
movl $0, 72(%eax)
movl $0, 104(%eax)
movl $0, 136(%eax)
movl $0, 168(%eax)
movl $0, 200(%eax)
movl $0, 232(%eax)
# Column 3: a[0][3] .. a[7][3]
movl $0, 12(%eax)
movl $0, 44(%eax)
movl $0, 76(%eax)
movl $0, 108(%eax)
movl $0, 140(%eax)
movl $0, 172(%eax)
movl $0, 204(%eax)
movl $0, 236(%eax)
# Column 4: a[0][4] .. a[7][4]
movl $0, 16(%eax)
movl $0, 48(%eax)
movl $0, 80(%eax)
movl $0, 112(%eax)
movl $0, 144(%eax)
movl $0, 176(%eax)
movl $0, 208(%eax)
movl $0, 240(%eax)
# Column 5: a[0][5] .. a[7][5]
movl $0, 20(%eax)
movl $0, 52(%eax)
movl $0, 84(%eax)
movl $0, 116(%eax)
movl $0, 148(%eax)
movl $0, 180(%eax)
movl $0, 212(%eax)
movl $0, 244(%eax)
# Column 6: a[0][6] .. a[7][6]
movl $0, 24(%eax)
movl $0, 56(%eax)
movl $0, 88(%eax)
movl $0, 120(%eax)
movl $0, 152(%eax)
movl $0, 184(%eax)
movl $0, 216(%eax)
movl $0, 248(%eax)
# Column 7: a[0][7] .. a[7][7]
movl $0, 28(%eax)
movl $0, 60(%eax)
movl $0, 92(%eax)
movl $0, 124(%eax)
movl $0, 156(%eax)
movl $0, 188(%eax)
movl $0, 220(%eax)
movl $0, 252(%eax)
# Restore the caller's frame pointer and return.
popl %ebp
ret
.size foo, .-foo
.section .rodata.str1.1,"aMS",@progbits,1
.LC3:
.string "%d\t"
.text
# main as emitted at -O1: calls foo on a stack-allocated matrix and prints it
# with two nested counted loops (%edi = i, %ebx = j).
.globl main
.type main, @function
main:
# Prologue: set up the frame and save the registers used by the loops.
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
# Reserve stack space for the locals, align %esp to 16 bytes, and make room
# for outgoing call arguments.
subl $284, %esp
andl $-16, %esp
subl $16, %esp
# The matrix a starts at -280(%ebp); pass its address to foo.
leal -280(%ebp), %eax
movl %eax, (%esp)
call foo
# i = 0
movl $0, %edi
.L10:
# Outer loop body: j = 0, %esi = i*8 (index of the first int in row i).
movl $0, %ebx
leal 0(,%edi,8), %esi
.L9:
# Inner loop body: load a[i][j] (int index i*8 + j) and call
# printf(.LC3, a[i][j]), where .LC3 is the "%d\t" format string.
leal (%esi,%ebx), %eax
movl -280(%ebp,%eax,4), %eax
movl %eax, 4(%esp)
movl $.LC3, (%esp)
call printf
# j++; repeat while j <= 7.
addl $1, %ebx
cmpl $7, %ebx
jle .L9
# printf("\n") from the source appears here as putchar(10).
movl $10, (%esp)
call putchar
# i++; repeat while i <= 7.
addl $1, %edi
cmpl $7, %edi
jle .L10
# Return value 1, as in the source's "return 1".
movl $1, %eax
# Epilogue: point %esp at the saved registers and restore them.
leal -12(%ebp), %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.size main, .-main
.section .note.GNU-stack,"",@progbits
.ident "GCC: (GNU) 3.4.0 20040225 (prerelease)"
haibo
chbchb1130@sina.com
2004-03-09