This is the mail archive of the mailing list for the GCC project.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

mainline performance regression (observed on SPARC)

Comparing the performance of g++-3.1 and mainline on the oopack
benchmark (the link to its source is missing from this archived copy)
on a sparc-sun-solaris2.7 machine with an Ultra-IIi processor
reveals a performance regression in mainline for
the "Complex" benchmark.
I don't know whether this is SPARC-specific; I haven't tried it on any
other system.


g++-3.1
                         Seconds       Mflops         
Test       Iterations     C    OOP     C    OOP  Ratio
----       ----------  -----------  -----------  -----
Max            100000    3.1   3.4   32.5  29.2    1.1
Matrix           1000    3.8   4.0   66.0  63.0    1.0
Complex        100000    8.8  30.8   91.4  26.0    3.5
Iterator       100000    2.4   2.4   83.3  82.6    1.0

mainline g++
                         Seconds       Mflops         
Test       Iterations     C    OOP     C    OOP  Ratio
----       ----------  -----------  -----------  -----
Max            100000    3.0   3.7   33.0  26.7    1.2
Matrix           1000    3.8   3.8   66.5  65.6    1.0
Complex        100000    8.7  41.6   91.5  19.2    4.8
Iterator       100000    2.4   2.4   83.0  82.6    1.0

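The assembly for the offending function is below, with the output of
the two compilers side by side. The difference is just one extra "add"
instruction (the left-hand column has six, the right-hand column five)
and a different schedule.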
The code was compiled with: -O3 -mcpu=ultrasparc 

_ZNK16ComplexBenchmark9oop_styleEv:        _ZNK16ComplexBenchmark9oop_styleEv:     
.LLFB19:                                   .LLFB19:                                 
	!#PROLOGUE# 0                      	!#PROLOGUE# 0                   
	save	%sp, -208, %sp             	save	%sp, -208, %sp          
.LLCFI10:                                  .LLCFI12:                               
	!#PROLOGUE# 1                      	!#PROLOGUE# 1                   
	sethi	%hi(.LLC37), %g1           	sethi	%hi(.LLC36), %i2        
	sethi	%hi(.LLC36), %i1           	sethi	%hi(.LLC37), %g1        
	ldd	[%g1+%lo(.LLC37)], %f0     	sethi	%hi(Y), %i1             
	sethi	%hi(Y), %i0                	sethi	%hi(X), %i0             
	sethi	%hi(X), %g1                	ldd	[%i2+%lo(.LLC36)], %f0  
	ldd	[%i1+%lo(.LLC36)], %f2     	or	%i1, %lo(Y), %o7        
	or	%i0, %lo(Y), %o7           	or	%i0, %lo(X), %l0        
	or	%g1, %lo(X), %l0           	std	%f0, [%fp-32]           
	std	%f0, [%fp-24]              	ldd	[%g1+%lo(.LLC37)], %f0  
	mov	0, %i0                     	mov	0, %g1                  
	std	%f2, [%fp-32]              	std	%f0, [%fp-24]           
	ldd	[%fp-32], %o0              	ldd	[%fp-32], %o4           
	ldd	[%fp-24], %o2              	ldd	[%fp-24], %o2           
.LL184:                                    .LL185:                                 
	sll	%i0, 4, %i1                	sll	%g1, 4, %i5             
	std	%o0, [%fp-96]              	std	%o4, [%fp-96]           
	add	%i0, 1, %i0                	add	%l0, %i5, %i2           
	std	%o2, [%fp-88]              	add	%i5, %l0, %i4           
	add	%l0, %i1, %i3              	add	%o7, %i5, %l1           
	add	%i1, %l0, %g1              	add	%g1, 1, %g1             
	ldd	[%i3], %i4                 	cmp	%g1, 999                
	cmp	%i0, 999                   	ldd	[%i2], %i0              
	ldd	[%fp-96], %f28             	std	%o2, [%fp-88]           
	std	%i4, [%fp-112]             	ldd	[%l1], %o0              
	add	%o7, %i1, %i4              	ldd	[%i4+8], %i2            
	ldd	[%g1+8], %i2               	add	%i5, %o7, %i4           
	add	%i1, %o7, %g1              	std	%i0, [%fp-112]          
	ldd	[%i4], %o4                 	std	%i2, [%fp-104]          
	std	%i2, [%fp-104]             	ldd	[%fp-96], %f28          
	add	%o7, %i1, %i2              	ldd	[%i4+8], %i0            
	ldd	[%fp-112], %f8             	std	%o0, [%fp-64]           
	ldd	[%fp-88], %f30             	std	%i0, [%fp-56]           
	ldd	[%fp-104], %f14            	ldd	[%fp-88], %f24          
	fmuld	%f28, %f8, %f26            	ldd	[%fp-112], %f26         
	fmuld	%f30, %f8, %f24            	ldd	[%fp-104], %f8          
	ldd	[%g1+8], %i4               	fmuld	%f28, %f26, %f20        
	fmuld	%f30, %f14, %f12           	fmuld	%f24, %f8, %f22         
	std	%o4, [%fp-64]              	fmuld	%f28, %f8, %f16         
	fmuld	%f28, %f14, %f22           	ldd	[%fp-64], %f12          
	ldd	[%fp-64], %f18             	fmuld	%f24, %f26, %f18        
	std	%i4, [%fp-56]              	ldd	[%fp-56], %f6           
	fsubd	%f26, %f12, %f20           	fsubd	%f20, %f22, %f14        
	ldd	[%fp-56], %f6              	faddd	%f16, %f18, %f10        
	faddd	%f22, %f24, %f16           	faddd	%f12, %f14, %f4         
	faddd	%f18, %f20, %f10           	std	%f4, [%fp-48]           
	std	%f10, [%fp-48]             	faddd	%f6, %f10, %f2          
	ldd	[%fp-48], %i4              	std	%f2, [%fp-40]           
	faddd	%f6, %f16, %f4             	ldd	[%fp-48], %i0           
	std	%f4, [%fp-40]              	std	%i0, [%l1]              
	std	%i4, [%i2]                 	ldd	[%fp-40], %i2           
	ldd	[%fp-40], %i2              	ble,pt	%icc, .LL185            
	ble,pt	%icc, .LL184               	std	%i2, [%i4+8]            
	std	%i2, [%g1+8]               	nop                             
	nop                                	return	%i7+8                   
	return	%i7+8                      	nop                             

Any idea what the cause of this is?

The generated code is rather silly: it moves values back and forth
between the integer and FP registers via stack slots, probably an
artifact of the SPARC calling conventions (doubles are passed in the
integer registers). But shouldn't inlining get rid of this?

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]