This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [Patch,AVR] Light-weight DImode implementation.

From: Paolo Bonzini <bonzini at gnu dot org>
To: Georg-Johann Lay <avr at gjlay dot de>
Cc: Richard Henderson <rth at redhat dot com>, gcc-patches at gcc dot gnu dot org, Denis Chertykov <chertykov at gmail dot com>, Eric Weddington <eric dot weddington at atmel dot com>, Anatoly Sokolov <aesok at post dot ru>
Date: Wed, 30 Nov 2011 11:41:49 +0100
Subject: Re: [Patch,AVR] Light-weight DImode implementation.
Newsgroups: gmane.comp.gcc.patches
References: <4ECAA70E.5050205@gjlay.de> <4ECAD332.9020101@redhat.com> <4ECAE9AB.5060902@gjlay.de>

On 11/22/2011 01:15 AM, Georg-Johann Lay wrote:


     ldi r30,lo8(1)     ;  25    *movqi/2    [length = 1]
     cp r10,r18     ;  26    *cmpqi/2    [length = 1]
     brlo .L2     ;  27    branch    [length = 1]
     ldi r30,lo8(0)     ;  28    *movqi/1    [length = 1]
.L2:
     add r11,r19     ;  30    addqi3/1    [length = 1]

(WARNING: there are a lot of "if"s in this message. However, some of the ideas here might give better code in general).

I noticed that the AVR backend does not have cstore, movcc, addcc. These are quite useful in general, and especially with the kind of code that expand_binop produces. For example all of these can be implemented in terms of adc or sbc:

    x = (a < b)            (cstore ltu, using ldi+adc)
    x = (a < b) ? -1 : 0   (movcc ltu, using ldi+sbc)
    x += (a < b) ? 1 : 0   (addcc ltu, using adc)
    x += (a < b) ? -1 : 0  (addcc ltu, using sbc)

In the code above, cstore could change the brlo/ldi pair to a single adc or sbc. addcc could also change a ldi/cp/brlo/ldi/add to cp+adc. expand_binop does not try to use addcc currently, but it could do so instead of cstore+or in the computation of the carry.

Then if final can successfully combine the "add" and "cp" instructions, you'd get something like:

    add r10,r18             ; sum 0
    ldi r30,lo8(0)          ; carryout 0
    adc r30,__zero_reg__
    add r11,r19             ; sum 1
    ldi r18,lo8(0)          ; carryout 1
    adc r18,__zero_reg__
    mov r19,r30             ; sum carry 1
    add r19,r11
    adc r18,__zero_reg__    ; carryout 1
    add r12,r20             ; sum 2
    ldi r30,lo8(0)          ; carryout 2
    adc r30,__zero_reg__
    mov r20,r18             ; sum carry 2
    add r20,r12
    adc r30,__zero_reg__    ; carryout 2
    add r13,r21             ; sum 3
    ldi r18,lo8(0)          ; carryout 3
    adc r18,__zero_reg__
    mov r21,r30             ; sum carry 3
    add r21,r13
    adc r18,__zero_reg__    ; carryout 3
    add r14,r22             ; sum 4
    ldi r30,lo8(0)          ; carryout 4
    adc r30,__zero_reg__
    mov r22,r18             ; sum carry 4
    add r22,r14
    adc r30,__zero_reg__    ; carryout 4
    add r15,r23             ; sum 5
    ldi r18,lo8(0)          ; carryout 5
    adc r18,__zero_reg__
    mov r23,r30             ; sum carry 5
    add r23,r15
    adc r18,__zero_reg__    ; carryout 5
    add r16,r24             ; sum 6
    ldi r30,lo8(0)          ; carryout 6
    adc r30,__zero_reg__
    mov r24,r18             ; sum carry 6
    add r24,r16
    adc r30,__zero_reg__    ; carryout 6
    add r25,r17             ; sum 7
    mov r18,r10
    add r25,r30             ; sum carry 7

Still suboptimal, but already much better than what you get now.

expand_binop could also sum the carry first, and the operand second, which would avoid overlapping live ranges for the carry:

    add r18,r10             ; sum 0
    ldi r30,lo8(0)          ; carryout 0
    adc r30,__zero_reg__
    add r19,r30             ; sum carry 1
    ldi r30,lo8(0)          ; carryout 1
    adc r30,__zero_reg__
    add r19,r11             ; sum 1
    adc r30,__zero_reg__    ; carryout 1
    add r20,r30             ; sum carry 2
    ldi r30,lo8(0)          ; carryout 2
    adc r30,__zero_reg__
    add r20,r12             ; sum 2
    adc r30,__zero_reg__    ; carryout 2
    add r21,r30             ; sum carry 3
    ldi r30,lo8(0)          ; carryout 3
    adc r30,__zero_reg__
    add r21,r13             ; sum 3
    adc r30,__zero_reg__    ; carryout 3
    add r22,r30             ; sum carry 4
    ldi r30,lo8(0)          ; carryout 4
    adc r30,__zero_reg__
    add r22,r14             ; sum 4
    adc r30,__zero_reg__    ; carryout 4
    add r23,r30             ; sum carry 5
    ldi r30,lo8(0)          ; carryout 5
    adc r30,__zero_reg__
    add r23,r15             ; sum 5
    adc r30,__zero_reg__    ; carryout 5
    add r24,r30             ; sum carry 6
    ldi r30,lo8(0)          ; carryout 6
    adc r30,__zero_reg__
    add r24,r16             ; sum 6
    adc r30,__zero_reg__    ; carryout 6
    add r25,r30             ; sum carry 7
    add r25,r17             ; sum 7

The code now is very regular and r30 dies often, so that playing with peephole2 could give decent code. This code:

    (cp rAA, rBB)                          ; eliminated by final
    ldi r30,lo8(0)          ; carryout 0
    adc r30,__zero_reg__
    add rXX,r30             ; sum carry 1
    (cp rXX, r30)                          ; eliminated by final
    ldi r30,lo8(0)          ; carryout 1
    adc r30,__zero_reg__
    add rXX,rZZ             ; sum 1
    (cp rXX, rZZ)                          ; eliminated by final
    adc r30,__zero_reg__    ; carryout 1
    add rYY,r30             ; sum carry 2

can be rewritten to

    (cp rAA, rBB)                          ; eliminated by final
    adc rXX,rZZ
    (cp rXX, rZZ)                          ; eliminated by final
    ldi r30,lo8(0)          ; carryout 1
    adc r30,__zero_reg__
    add rYY,r30             ; sum carry 2

Now, in theory the final four instructions would chain again into the same pattern (not sure peephole2 does this properly because it scan backwards; if it doesn't, you have to code the peepholes from the bottom) until you have

    add r18,r10
    adc r19,r11
    adc r20,r12
    adc r21,r13
    adc r22,r14
    adc r23,r15
    adc r24,r16
    ldi r30,lo8(0)          ; carryout 6
    adc r30,__zero_reg__
    add r25,r30             ; sum carry 7
    add r25,r17             ; sum 7

and finally with a simpler peephole (cp/ldi+adc/add/add to cp/adc):

    add r18,r10
    adc r19,r11
    adc r20,r12
    adc r21,r13
    adc r22,r14
    adc r23,r15
    adc r24,r16
    adc r25,r17

Paolo

References:
- [Patch,AVR] Light-weight DImode implementation.
  - From: Georg-Johann Lay
- Re: [Patch,AVR] Light-weight DImode implementation.
  - From: Richard Henderson
- Re: [Patch,AVR] Light-weight DImode implementation.
  - From: Georg-Johann Lay

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]