This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: rtlopt loop unroller question





Hello ,

>> However, if we complicate the example a little bit:
>>
>> {
>>   int A[N];
>>   int B[N];
>>   int C[N];
>>   int i;
>>
>>   for (i=0; i<N; i++)
>>     A[i]=B[i]+C[i];
>>   return A;
>> }
>>
>> we still get the inefficient addressing calculations.
>> Using static variables instead of local ones yields
>> much better code with or without your patch.

>I am not sure what you refer to here (could you please demonstrate on
>assembler)?

The thing is that for the above code we get the following asm code:

; Compiler output for the version with stack-allocated arrays.
; The loop body is unrolled 8 times (8 stores per pass) and executed
; 1250 times via CTR; each copy stores the sum of two loaded words.
_foo:
      ; Prologue: save r17-r31 below the incoming stack pointer, then
      ; allocate the frame. lis/ori build r2 = 0xfffe2ae0 (-120096);
      ; stwux stores the old r1 at r1+r2 and updates r1 to that address.
      lis r3,0xfffe
      stmw r17,-60(r1)
      ori r2,r3,10976
      li r0,1250
      stwux r1,r1,r2
      ; CTR = 1250 passes; bdnz at the bottom counts it down.
      mtctr r0
      ; r12 = r1 + 24 + 0x10000: the one pointer the loop advances.
      addi r9,r1,24
      addis r12,r9,0x1
L5:
      ; Per-copy address temporaries: r8, r7, ..., r2 = r12 + 4, 8, ..., 28.
      ; These are recomputed on every pass.
      addi r8,r12,4
      addi r7,r12,8
      addi r6,r12,12
      addi r5,r12,16
      addi r4,r12,20
      addi r3,r12,24
      addi r2,r12,28
      ; Loads, adds and store-base computations, interleaved:
      ;  - lwz ...,-25528(rX) and lwz ...,14472(rX) fetch the two operands
      ;    of each sum (first pass: r1+40032 and r1+80032 for copy 0).
      ;  - add rD,rA,rB forms the sum for one copy.
      ;  - addis rN,rM,0xffff computes rM - 0x10000 as the base for that
      ;    copy's store; several registers (r19, r17, r10, r8..r4) are
      ;    reused for this once their previous value is dead.
      lwz r19,-25528(r12)
      lwz r17,14472(r12)
      lwz r18,14472(r8)
      lwz r10,14472(r7)
      add r24,r19,r17
      lwz r25,-25528(r8)
      addis r19,r12,0xffff
      lwz r26,-25528(r7)
      addis r17,r2,0xffff
      lwz r27,-25528(r6)
      add r25,r25,r18
      lwz r20,14472(r6)
      add r26,r26,r10
      lwz r28,-25528(r5)
      addis r10,r8,0xffff
      lwz r21,14472(r5)
      addis r8,r7,0xffff
      lwz r29,-25528(r4)
      addis r7,r6,0xffff
      lwz r22,14472(r4)
      addis r6,r5,0xffff
      lwz r11,-25528(r3)
      addis r5,r4,0xffff
      lwz r23,14472(r3)
      add r27,r27,r20
      lwz r0,-25528(r2)
      add r28,r28,r21
      lwz r9,14472(r2)
      add r29,r29,r22
      add r11,r11,r23
      addis r4,r3,0xffff
      add r18,r0,r9
      ; Stores: each sum goes to 8(base), i.e. (copy address - 0x10000 + 8);
      ; on the first pass the first store lands at r1+32.
      stw r24,8(r19)
      ; Advance the loop pointer by 32 bytes (8 words) for the next pass.
      ; The store bases were computed above from the old r12.
      addi r12,r12,32
      stw r25,8(r10)
      stw r26,8(r8)
      stw r27,8(r7)
      stw r28,8(r6)
      stw r29,8(r5)
      stw r11,8(r4)
      stw r18,8(r17)
      ; Decrement CTR and loop while it is non-zero.
      bdnz L5
      ; Epilogue: r3 = r1+32 (the address of the first word stored by the
      ; loop), restore r1 from the word saved by stwux, reload r17-r31, return.
      addi r3,r1,32
      lwz r1,0(r1)
      lmw r17,-60(r1)
      blr
      ; Directives introducing the next symbol (_main); its body is not
      ; part of this listing.
      .align 2
      .globl _main


Here we still get the inefficient addressing calculations (the per-copy
addi/addis instructions in the loop body), even after applying your first patch.

Notice that when the arrays are declared static, we get the following asm
code, which does not have the inefficient address calculations (apart from
the redundant adds):

; Compiler output for the version with static arrays (_A.0, _B.1, _C.2).
; The loop body is unrolled 4 times (4 stores per pass) and executed
; 2500 times via CTR. All accesses are indexed (base register + byte offset).
_foo:
      ; Keep the caller's LR in r3; the bcl below overwrites LR.
      mflr r3
      ; Branch-and-link to the next instruction so LR = address of L1$pb.
      bcl 20,31,L1$pb
L1$pb:
      ; Save r28-r31 below the stack pointer.
      stmw r28,-16(r1)
      li r0,2500
      ; r31 = address of L1$pb, the base for the (sym - L1$pb) offsets below.
      mflr r31
      ; CTR = 2500 passes; bdnz at the bottom counts it down.
      mtctr r0
      ; Form the three array addresses from r31 using ha16/lo16 pairs:
      ; r30 = _A.0, r2 = _B.1, r12 = _C.2.
      addis r2,r31,ha16(_A.0-L1$pb)
      addis r11,r31,ha16(_B.1-L1$pb)
      addis r9,r31,ha16(_C.2-L1$pb)
      la r30,lo16(_A.0-L1$pb)(r2)
      ; Store the caller's LR at 8(r1); it is reloaded before returning.
      stw r3,8(r1)
      la r2,lo16(_B.1-L1$pb)(r11)
      la r12,lo16(_C.2-L1$pb)(r9)
      ; r28 = running byte offset, started at -4 so the first addi yields 0.
      li r28,-4
L5:
      ; Offsets for the four copies form a chain of addi's, each 4 bytes
      ; past the previous: r8, r7, r6, then r28 (which carries over to the
      ; next pass).
      ; lwzx rD,off,r12 loads from _C.2+off; lwzx rD,off,r2 from _B.1+off.
      addi r8,r28,4
      addi r7,r8,4
      lwzx r29,r8,r12
      addi r6,r7,4
      lwzx r10,r8,r2
      addi r28,r6,4
      lwzx r4,r6,r12
      lwzx r11,r7,r2
      add r10,r10,r29
      lwzx r3,r7,r12
      lwzx r9,r6,r2
      lwzx r0,r28,r2
      add r11,r11,r3
      lwzx r5,r28,r12
      add r29,r9,r4
      ; stwx rS,off,r30 stores each sum to _A.0+off, using the same offset
      ; register that indexed its loads.
      stwx r10,r8,r30
      add r4,r0,r5
      stwx r11,r7,r30
      stwx r29,r6,r30
      stwx r4,r28,r30
      ; Decrement CTR and loop while it is non-zero.
      bdnz L5
      ; Epilogue: reload the saved LR, r3 = address of _A.0 (from r30),
      ; restore r28-r31, put the saved value back in LR and return.
      lwz r12,8(r1)
      mr r3,r30
      lmw r28,-16(r1)
      mtlr r12
      blr







Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]