This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [Patch,AVR] Light-weight DImode implementation.


On 11/22/2011 01:15 AM, Georg-Johann Lay wrote:

ldi r30,lo8(1) ; 25 *movqi/2 [length = 1] cp r10,r18 ; 26 *cmpqi/2 [length = 1] brlo .L2 ; 27 branch [length = 1] ldi r30,lo8(0) ; 28 *movqi/1 [length = 1] .L2: add r11,r19 ; 30 addqi3/1 [length = 1]

(WARNING: there are a lot of "if"s in this message. However, some of the ideas here might give better code in general).


I noticed that the AVR backend does not have cstore, movcc, addcc. These are quite useful in general, and especially with the kind of code that expand_binop produces. For example all of these can be implemented in terms of adc or sbc:

    x = (a < b)            (cstore ltu, using ldi+adc)
    x = (a < b) ? -1 : 0   (movcc ltu, using ldi+sbc)
    x += (a < b) ? 1 : 0   (addcc ltu, using adc)
    x += (a < b) ? -1 : 0  (addcc ltu, using sbc)

In the code above, cstore could change the brlo/ldi pair to a single adc or sbc. addcc could also change a ldi/cp/brlo/ldi/add to cp+adc. expand_binop does not try to use addcc currently, but it could do so instead of cstore+or in the computation of the carry.

Then if final can successfully combine the "add" and "cp" instructions, you'd get something like:

    add r10,r18             ; sum 0
    ldi r30,lo8(0)          ; carryout 0
    adc r30,__zero_reg__
    add r11,r19             ; sum 1
    ldi r18,lo8(0)          ; carryout 1
    adc r18,__zero_reg__
    mov r19,r30             ; sum carry 1
    add r19,r11
    adc r18,__zero_reg__    ; carryout 1
    add r12,r20             ; sum 2
    ldi r30,lo8(0)          ; carryout 2
    adc r30,__zero_reg__
    mov r20,r18             ; sum carry 2
    add r20,r12
    adc r30,__zero_reg__    ; carryout 2
    add r13,r21             ; sum 3
    ldi r18,lo8(0)          ; carryout 3
    adc r18,__zero_reg__
    mov r21,r30             ; sum carry 3
    add r21,r13
    adc r18,__zero_reg__    ; carryout 3
    add r14,r22             ; sum 4
    ldi r30,lo8(0)          ; carryout 4
    adc r30,__zero_reg__
    mov r22,r18             ; sum carry 4
    add r22,r14
    adc r30,__zero_reg__    ; carryout 4
    add r15,r23             ; sum 5
    ldi r18,lo8(0)          ; carryout 5
    adc r18,__zero_reg__
    mov r23,r30             ; sum carry 5
    add r23,r15
    adc r18,__zero_reg__    ; carryout 5
    add r16,r24             ; sum 6
    ldi r30,lo8(0)          ; carryout 6
    adc r30,__zero_reg__
    mov r24,r18             ; sum carry 6
    add r24,r16
    adc r30,__zero_reg__    ; carryout 6
    add r25,r17             ; sum 7
    mov r18,r10
    add r25,r30             ; sum carry 7

Still suboptimal, but already much better than what you get now.

expand_binop could also sum the carry first, and the operand second, which would avoid overlapping live ranges for the carry:

    add r18,r10             ; sum 0
    ldi r30,lo8(0)          ; carryout 0
    adc r30,__zero_reg__
    add r19,r30             ; sum carry 1
    ldi r30,lo8(0)          ; carryout 1
    adc r30,__zero_reg__
    add r19,r11             ; sum 1
    adc r30,__zero_reg__    ; carryout 1
    add r20,r30             ; sum carry 2
    ldi r30,lo8(0)          ; carryout 2
    adc r30,__zero_reg__
    add r20,r12             ; sum 2
    adc r30,__zero_reg__    ; carryout 2
    add r21,r30             ; sum carry 3
    ldi r30,lo8(0)          ; carryout 3
    adc r30,__zero_reg__
    add r21,r13             ; sum 3
    adc r30,__zero_reg__    ; carryout 3
    add r22,r30             ; sum carry 4
    ldi r30,lo8(0)          ; carryout 4
    adc r30,__zero_reg__
    add r22,r14             ; sum 4
    adc r30,__zero_reg__    ; carryout 4
    add r23,r30             ; sum carry 5
    ldi r30,lo8(0)          ; carryout 5
    adc r30,__zero_reg__
    add r23,r15             ; sum 5
    adc r30,__zero_reg__    ; carryout 5
    add r24,r30             ; sum carry 6
    ldi r30,lo8(0)          ; carryout 6
    adc r30,__zero_reg__
    add r24,r16             ; sum 6
    adc r30,__zero_reg__    ; carryout 6
    add r25,r30             ; sum carry 7
    add r25,r17             ; sum 7

The code now is very regular and r30 dies often, so that playing with peephole2 could give decent code. This code:

    (cp rAA, rBB)                          ; eliminated by final
    ldi r30,lo8(0)          ; carryout 0
    adc r30,__zero_reg__
    add rXX,r30             ; sum carry 1
    (cp rXX, r30)                          ; eliminated by final
    ldi r30,lo8(0)          ; carryout 1
    adc r30,__zero_reg__
    add rXX,rZZ             ; sum 1
    (cp rXX, rZZ)                          ; eliminated by final
    adc r30,__zero_reg__    ; carryout 1
    add rYY,r30             ; sum carry 2

can be rewritten to

    (cp rAA, rBB)                          ; eliminated by final
    adc rXX,rZZ
    (cp rXX, rZZ)                          ; eliminated by final
    ldi r30,lo8(0)          ; carryout 1
    adc r30,__zero_reg__
    add rYY,r30             ; sum carry 2

Now, in theory the final four instructions would chain again into the same pattern (not sure peephole2 does this properly because it scan backwards; if it doesn't, you have to code the peepholes from the bottom) until you have

    add r18,r10
    adc r19,r11
    adc r20,r12
    adc r21,r13
    adc r22,r14
    adc r23,r15
    adc r24,r16
    ldi r30,lo8(0)          ; carryout 6
    adc r30,__zero_reg__
    add r25,r30             ; sum carry 7
    add r25,r17             ; sum 7

and finally with a simpler peephole (cp/ldi+adc/add/add to cp/adc):

    add r18,r10
    adc r19,r11
    adc r20,r12
    adc r21,r13
    adc r22,r14
    adc r23,r15
    adc r24,r16
    adc r25,r17

Paolo


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]