int main(void) { // static const unsigned long __flash br[8] = {0,}; // case1: 194 bytes static const __uint24 __flash br[8] = {0,}; // case2: 190 bytes (should be case1 minus 8 bytes) unsigned char i=0; for (;;i++) { *((volatile unsigned char*)100) = br[i%8]; } return (0); } //case1: avr-gcc-4.7.2 -S -mmcu=atmega8 -Os main.c ldi r24,0 .L2: mov r30,r24 andi r30,lo8(7) ldi r25,lo8(4) mul r30,r25 movw r30,r0 clr __zero_reg__ subi r30,lo8(-(br.1322)) sbci r31,hi8(-(br.1322)) lpm r25,Z sts 100,r25 subi r24,lo8(-(1)) rjmp .L2 //case2: avr-gcc-4.7.2 -S -mmcu=atmega8 -Os main.c ldi r18,0 .L2: mov r24,r18 andi r24,lo8(7) ldi r25,0 movw r30,r24 lsl r30 rol r31 add r30,r24 adc r31,r25 subi r30,lo8(-(br.1322)) sbci r31,hi8(-(br.1322)) lpm r24,Z sts 100,r24 subi r18,lo8(-(1)) rjmp .L2
It would be optimal to move all the loop-invariant instructions outside the loop, and to make more active use of the temporary register r0:

        ldi r24,0
        ldi r25,lo8(4)            // outside loop
.L2:
        mov r30,r24
        andi r30,lo8(7)
        mul r30,r25
        movw r30,r0
        subi r30,lo8(-(br.1322))
        sbci r31,hi8(-(br.1322))
        lpm r0,Z                  // use r0 as temp
        sts 100,r0
        subi r24,lo8(-(1))
        rjmp .L2
        clr __zero_reg__          // outside loop
Oops, r0 is call-clobbered :(
Closed as WON'T FIX. It may very well be the case that code using __[u]int24 (PSImode) variables is not compiled as efficiently as it could be. Maybe some generic optimizations also work for PSImode, so that you'll see better code for PSI operations as a byproduct of some generic optimizations in the future. But sorry, no PSI-specific optimization hacks in the avr backend for now. Remember that R0 is special and does not survive an insn (a single instruction pattern in the compiler's machine description), i.e. it is used like a scratch register.