This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Question about GCC -O optimization with sse2 programming.


Hi:
	I am trying to compile some SSE2 code with gcc-3.4-0225 (a prerelease version),
because compiling some SSE2 programs with gcc-3.3.3 can end in a compilation error.
When I compile my program with no optimization (-O0), I get the right result, but if I
turn on -O1, the result is wrong. From the documentation, I know that -O (-O1) is equivalent to the following
optimization options:
 		  -fdefer-pop 
          -fmerge-constants 
          -fthread-jumps 
          -floop-optimize 
          -fcrossjumping 
          -fif-conversion 
          -fif-conversion2 
          -fdelayed-branch 
          -fguess-branch-probability 
          -fcprop-registers
		  -fomit-frame-pointer 

But when I turn on all of these options individually, using the following compile options:
# Compiler flags for the experiment: target Pentium 4 with SSE2 and enable the
# individual optimization flags by name, without passing any -O level.
# NOTE(review): -fdelayed-branch appears in the documented -O list above but is
# not included here; every other flag from that list is present.
CFLAGS =-march=pentium4 -msse2 \
        -fdefer-pop \
          -fmerge-constants \
          -fthread-jumps \
          -floop-optimize \
          -fcrossjumping \
          -fif-conversion \
          -fif-conversion2 \
          -fguess-branch-probability \
          -fcprop-registers     \
          -fomit-frame-pointer
I find that the result is the same as with -O0 rather than with -O1.
The following is my test program, followed by the assembly generated with -O1:

#include <emmintrin.h>

/*
 * Fill the 8x8 matrix `a` from eight SSE2 mask vectors.
 *
 * mark0 starts with all eight 16-bit lanes set to 0xffff and then has lane 0
 * cleared.  Each following markK is a copy of mark(K-1) with lane K cleared
 * as well, so markK has lanes 0..K equal to 0 and lanes K+1..7 still 0xffff.
 *
 * a[i][j] is then assigned lane j of mark<i>.  As written, the source
 * therefore stores 0 where j <= i; for j > i it stores the extracted 0xffff
 * lane (65535, assuming _mm_extract_epi16 zero-extends the 16-bit lane as
 * Intel documents -- confirm against the header in use).
 *
 * The lane index passed to _mm_insert_epi16 / _mm_extract_epi16 is a literal
 * constant in every call, which is why the 64 assignments are written out
 * individually.
 */
void foo(int a[8][8]){
		__m128i mark0,mark1,mark2,mark3,mark4,mark5,mark6,mark7;
		/* NOTE(review): i is declared but never used in this function. */
		int i;
		/* Build the masks: each step clears one more low lane. */
		mark0=_mm_set1_epi16(0xffff);
		mark0=_mm_insert_epi16(mark0,0,0);
		mark1=mark0;
		mark1=_mm_insert_epi16(mark1,0,1);
		mark2=mark1;
		mark2=_mm_insert_epi16(mark2,0,2);
		mark3=mark2;
		mark3=_mm_insert_epi16(mark3,0,3);
		mark4=mark3;
		mark4=_mm_insert_epi16(mark4,0,4);
		mark5=mark4;
		mark5=_mm_insert_epi16(mark5,0,5);
		mark6=mark5;
		mark6=_mm_insert_epi16(mark6,0,6);
		mark7=mark6;
		mark7=_mm_insert_epi16(mark7,0,7);        
		/* Column 0: lane 0 of every mask. */
		a[0][0]= _mm_extract_epi16( mark0, 0);
		a[1][0]= _mm_extract_epi16( mark1, 0);
		a[2][0]= _mm_extract_epi16( mark2, 0);
		a[3][0]= _mm_extract_epi16( mark3, 0);
		a[4][0]= _mm_extract_epi16( mark4, 0);
		a[5][0]= _mm_extract_epi16( mark5, 0);
		a[6][0]= _mm_extract_epi16( mark6, 0);
		a[7][0]= _mm_extract_epi16( mark7, 0);

		/* Column 1: lane 1 of every mask. */
		a[0][1]= _mm_extract_epi16( mark0, 1);
		a[1][1]= _mm_extract_epi16( mark1, 1);
		a[2][1]= _mm_extract_epi16( mark2, 1);
		a[3][1]= _mm_extract_epi16( mark3, 1);
		a[4][1]= _mm_extract_epi16( mark4, 1);
		a[5][1]= _mm_extract_epi16( mark5, 1);
		a[6][1]= _mm_extract_epi16( mark6, 1);
		a[7][1]= _mm_extract_epi16( mark7, 1);

		/* Column 2: lane 2 of every mask. */
		a[0][2]= _mm_extract_epi16( mark0, 2);
		a[1][2]= _mm_extract_epi16( mark1, 2);
		a[2][2]= _mm_extract_epi16( mark2, 2);
		a[3][2]= _mm_extract_epi16( mark3, 2);
		a[4][2]= _mm_extract_epi16( mark4, 2);
		a[5][2]= _mm_extract_epi16( mark5, 2);
		a[6][2]= _mm_extract_epi16( mark6, 2);
		a[7][2]= _mm_extract_epi16( mark7, 2);

		/* Column 3: lane 3 of every mask. */
		a[0][3]= _mm_extract_epi16( mark0, 3);
		a[1][3]= _mm_extract_epi16( mark1, 3);
		a[2][3]= _mm_extract_epi16( mark2, 3);
		a[3][3]= _mm_extract_epi16( mark3, 3);
		a[4][3]= _mm_extract_epi16( mark4, 3);
		a[5][3]= _mm_extract_epi16( mark5, 3);
		a[6][3]= _mm_extract_epi16( mark6, 3);
		a[7][3]= _mm_extract_epi16( mark7, 3);

		/* Column 4: lane 4 of every mask. */
		a[0][4]= _mm_extract_epi16( mark0, 4);
		a[1][4]= _mm_extract_epi16( mark1, 4);
		a[2][4]= _mm_extract_epi16( mark2, 4);
		a[3][4]= _mm_extract_epi16( mark3, 4);
		a[4][4]= _mm_extract_epi16( mark4, 4);
		a[5][4]= _mm_extract_epi16( mark5, 4);
		a[6][4]= _mm_extract_epi16( mark6, 4);
		a[7][4]= _mm_extract_epi16( mark7, 4);

		/* Column 5: lane 5 of every mask. */
		a[0][5]= _mm_extract_epi16( mark0, 5);
		a[1][5]= _mm_extract_epi16( mark1, 5);
		a[2][5]= _mm_extract_epi16( mark2, 5);
		a[3][5]= _mm_extract_epi16( mark3, 5);
		a[4][5]= _mm_extract_epi16( mark4, 5);
		a[5][5]= _mm_extract_epi16( mark5, 5);
		a[6][5]= _mm_extract_epi16( mark6, 5);
		a[7][5]= _mm_extract_epi16( mark7, 5);

		/* Column 6: lane 6 of every mask. */
		a[0][6]= _mm_extract_epi16( mark0, 6);
		a[1][6]= _mm_extract_epi16( mark1, 6);
		a[2][6]= _mm_extract_epi16( mark2, 6);
		a[3][6]= _mm_extract_epi16( mark3, 6);
		a[4][6]= _mm_extract_epi16( mark4, 6);
		a[5][6]= _mm_extract_epi16( mark5, 6);
		a[6][6]= _mm_extract_epi16( mark6, 6);
		a[7][6]= _mm_extract_epi16( mark7, 6);

		/* Column 7: lane 7 of every mask. */
		a[0][7]= _mm_extract_epi16( mark0, 7);
		a[1][7]= _mm_extract_epi16( mark1, 7);
		a[2][7]= _mm_extract_epi16( mark2, 7);
		a[3][7]= _mm_extract_epi16( mark3, 7);
		a[4][7]= _mm_extract_epi16( mark4, 7);
		a[5][7]= _mm_extract_epi16( mark5, 7);
		a[6][7]= _mm_extract_epi16( mark6, 7);
		a[7][7]= _mm_extract_epi16( mark7, 7);
    
}

/*
 * Test driver: let foo() fill an 8x8 matrix, then print it one row per line
 * with the elements of a row separated by tabs.
 *
 * NOTE(review): printf is called here, but <emmintrin.h> is the only header
 * included in the program as shown; <stdio.h> is presumably needed for a
 * proper declaration -- confirm.
 * NOTE(review): main returns 1, which is conventionally reported as a failure
 * exit status even though nothing has failed.
 */
int main(){
	int a[8][8],i,j;        
	foo(a);
	/* i walks the rows, j the columns within a row. */
	for(i=0;i < 8 ; i ++){
		for(j= 0; j < 8 ; j++){
			printf("%d\t",a[i][j]);
		}
		/* End the current row. */
		printf("\n");
	}
	return 1;	
}

The assembly generated with -O1:

	.file	"pcm.c"
	.text
# ----------------------------------------------------------------------
# void foo(int a[8][8])            (compiler output, 32-bit x86, AT&T syntax)
# In:    8(%ebp) = a, the single stack argument
# Uses:  %eax (holds a for the whole body); %ebp is saved and restored
#
# The body is 64 immediate stores of 0.  Element a[i][j] sits at byte offset
# 32*i + 4*j from a, and the stores are grouped by column j with the row i
# advancing in steps of 32 inside each group.  No SSE2 instruction appears:
# every element is written as the constant 0, including the elements with
# j > i, which the C source takes from mask lanes that were never cleared.
# ----------------------------------------------------------------------
.globl foo
	.type	foo, @function
foo:
	pushl	%ebp
	movl	%esp, %ebp
	# eax = a
	movl	8(%ebp), %eax
	# a[0..7][0] = 0
	movl	$0, (%eax)
	movl	$0, 32(%eax)
	movl	$0, 64(%eax)
	movl	$0, 96(%eax)
	movl	$0, 128(%eax)
	movl	$0, 160(%eax)
	movl	$0, 192(%eax)
	movl	$0, 224(%eax)
	# a[0..7][1] = 0
	movl	$0, 4(%eax)
	movl	$0, 36(%eax)
	movl	$0, 68(%eax)
	movl	$0, 100(%eax)
	movl	$0, 132(%eax)
	movl	$0, 164(%eax)
	movl	$0, 196(%eax)
	movl	$0, 228(%eax)
	# a[0..7][2] = 0
	movl	$0, 8(%eax)
	movl	$0, 40(%eax)
	movl	$0, 72(%eax)
	movl	$0, 104(%eax)
	movl	$0, 136(%eax)
	movl	$0, 168(%eax)
	movl	$0, 200(%eax)
	movl	$0, 232(%eax)
	# a[0..7][3] = 0
	movl	$0, 12(%eax)
	movl	$0, 44(%eax)
	movl	$0, 76(%eax)
	movl	$0, 108(%eax)
	movl	$0, 140(%eax)
	movl	$0, 172(%eax)
	movl	$0, 204(%eax)
	movl	$0, 236(%eax)
	# a[0..7][4] = 0
	movl	$0, 16(%eax)
	movl	$0, 48(%eax)
	movl	$0, 80(%eax)
	movl	$0, 112(%eax)
	movl	$0, 144(%eax)
	movl	$0, 176(%eax)
	movl	$0, 208(%eax)
	movl	$0, 240(%eax)
	# a[0..7][5] = 0
	movl	$0, 20(%eax)
	movl	$0, 52(%eax)
	movl	$0, 84(%eax)
	movl	$0, 116(%eax)
	movl	$0, 148(%eax)
	movl	$0, 180(%eax)
	movl	$0, 212(%eax)
	movl	$0, 244(%eax)
	# a[0..7][6] = 0
	movl	$0, 24(%eax)
	movl	$0, 56(%eax)
	movl	$0, 88(%eax)
	movl	$0, 120(%eax)
	movl	$0, 152(%eax)
	movl	$0, 184(%eax)
	movl	$0, 216(%eax)
	movl	$0, 248(%eax)
	# a[0..7][7] = 0
	movl	$0, 28(%eax)
	movl	$0, 60(%eax)
	movl	$0, 92(%eax)
	movl	$0, 124(%eax)
	movl	$0, 156(%eax)
	movl	$0, 188(%eax)
	movl	$0, 220(%eax)
	movl	$0, 252(%eax)
	popl	%ebp
	ret
	.size	foo, .-foo
	.section	.rodata.str1.1,"aMS",@progbits,1
# Format string for the per-element printf call in main.
.LC3:
	.string	"%d\t"
	.text
# ----------------------------------------------------------------------
# int main(void)                   (compiler output, 32-bit x86, AT&T syntax)
# Out:   %eax = 1
# Regs:  %edi = row index i, %ebx = column index j, %esi = i*8;
#        all three are pushed on entry and popped before returning
# Stack: the matrix a starts at -280(%ebp); outgoing call arguments are
#        written to (%esp) and 4(%esp) instead of being pushed
# ----------------------------------------------------------------------
.globl main
	.type	main, @function
main:
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	subl	$284, %esp
	# Round %esp down to a multiple of 16, then reserve 16 more bytes.
	andl	$-16, %esp
	subl	$16, %esp
	# foo(a)
	leal	-280(%ebp), %eax
	movl	%eax, (%esp)
	call	foo
	# i = 0
	movl	$0, %edi
.L10:
	# Outer loop body: j = 0, esi = i*8 (index of the first element of row i)
	movl	$0, %ebx
	leal	0(,%edi,8), %esi
.L9:
	# printf("%d\t", a[i][j]); the element is loaded from a + 4*(i*8 + j)
	leal	(%esi,%ebx), %eax
	movl	-280(%ebp,%eax,4), %eax
	movl	%eax, 4(%esp)
	movl	$.LC3, (%esp)
	call	printf
	# j++, repeat while j <= 7
	addl	$1, %ebx
	cmpl	$7, %ebx
	jle	.L9
	# The source's printf("\n") appears here as putchar(10).
	movl	$10, (%esp)
	call	putchar
	# i++, repeat while i <= 7
	addl	$1, %edi
	cmpl	$7, %edi
	jle	.L10
	# return 1
	movl	$1, %eax
	# Point %esp at the three saved registers (12 bytes below %ebp), undoing
	# the alignment adjustment made on entry.
	leal	-12(%ebp), %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
	.size	main, .-main
	.section	.note.GNU-stack,"",@progbits
	.ident	"GCC: (GNU) 3.4.0 20040225 (prerelease)"
 				

        haibo
        chbchb1130@sina.com
          2004-03-09

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]