This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [Patch,AVR] Light-weight DImode implementation.

From: Georg-Johann Lay <avr at gjlay dot de>
Cc: Richard Henderson <rth at redhat dot com>, gcc-patches at gcc dot gnu dot org, Denis Chertykov <chertykov at gmail dot com>, Eric Weddington <eric dot weddington at atmel dot com>, Anatoly Sokolov <aesok at post dot ru>
Date: Tue, 22 Nov 2011 01:15:39 +0100
Subject: Re: [Patch,AVR] Light-weight DImode implementation.
References: <4ECAA70E.5050205@gjlay.de> <4ECAD332.9020101@redhat.com>

Richard Henderson schrieb:

On 11/21/2011 11:31 AM, Georg-Johann Lay wrote:

;; The caveat is that if there are insns for some mode, there must also be a
;; respective move insn that describes reloads.  Therefore, this
;; implementation uses an accumulator-based model with two hard-coded,
;; accumulator-like registers
;;
;;    A[] = reg:DI 18
;;    B[] = reg:DI 10
;;
;; so that no DImode insn contains pseudos or needs reloading.

Well, rtl loop optimization will not work, but given that SSE


You mean "won't optimize" or "gives wrong code"?
What's SSE? I definitely need a GCC glossary.

optimizations ought to have been performed, that's probably
acceptable.

It's definitely a hack, but perhaps you'll be able to get away with
it.

Yes, I'm aware it's a hack. But the extremely bloated code -- see below -- is one of the reasons for the bad reputation of avr-gcc, even though only very few people use 64-bit arithmetic.

I do wonder if you might even get smaller code if you force DImode
quantities into the stack (just hack use_register_for_decl locally
while testing; a new target hook if that pans out), and pass pointers
to the variables instead.  At the moment you're having to use 8*3
insns inline to put the quantities in place and take them back out
again.  With pointers this would seem to drop to 2*3.

I already thought about using pointers, but keep in mind that AVR has only 3 pointer registers. Moreover, I remember a post on gcc@ or gcc-help@ where someone asked how to write an addition or similar operation that works /only/ on memory, and the answer was "it's not possible", IIRC.

Anyways, if you compare the new code with /some/ move insns against the old code for, say,

/* Test case: returns the sum of two long long (64-bit) values. */
long long add64 (long long a, long long b)
{
    return a + b;
}

that compiles with -Os to

; add64: 64-bit addition built from byte-wide adds (compiler output, -Os).
; Reads one operand from r18..r25 and the other from r10..r17, one byte per
; register with the least significant byte in r18 / r10; the sum is left in
; r18..r25.  (Presumably a = r18..r25 and b = r10..r17 -- confirm against
; the avr-gcc calling convention.)
; r10..r17 are pushed on entry and popped before returning; r30 is used as
; scratch and is not saved.
; No add-with-carry is used: after each add, the carry out is rebuilt as a
; 0/1 value by an unsigned compare (cp/brlo) of the sum against an addend.
add64:
	push r10	 ;  222	pushqi1/1	[length = 1]
	push r11	 ;  223	pushqi1/1	[length = 1]
	push r12	 ;  224	pushqi1/1	[length = 1]
	push r13	 ;  225	pushqi1/1	[length = 1]
	push r14	 ;  226	pushqi1/1	[length = 1]
	push r15	 ;  227	pushqi1/1	[length = 1]
	push r16	 ;  228	pushqi1/1	[length = 1]
	push r17	 ;  229	pushqi1/1	[length = 1]
/* prologue: function */
/* frame size = 0 */
/* stack size = 8 */
.L__stack_usage = 8
; Byte 0: r10 += r18; r30 = 1 if the sum wrapped (r10 < r18), else 0.
	add r10,r18	 ;  24	addqi3/1	[length = 1]
	ldi r30,lo8(1)	 ;  25	*movqi/2	[length = 1]
	cp r10,r18	 ;  26	*cmpqi/2	[length = 1]
	brlo .L2	 ;  27	branch	[length = 1]
	ldi r30,lo8(0)	 ;  28	*movqi/1	[length = 1]
.L2:
; Byte 1, step 1: r11 += r19; r18 = 1 if that add wrapped, else 0.
	add r11,r19	 ;  30	addqi3/1	[length = 1]
	ldi r18,lo8(1)	 ;  31	*movqi/2	[length = 1]
	cp r11,r19	 ;  32	*cmpqi/2	[length = 1]
	brlo .L3	 ;  33	branch	[length = 1]
	ldi r18,lo8(0)	 ;  34	*movqi/1	[length = 1]
.L3:
; Byte 1, step 2: r19 = carry-in (r30) + r11; r30 = 1 if this add wrapped.
; The final `or` leaves the carry out of byte 1 in r18 (either step wrapped).
	mov r19,r30	 ;  216	*movqi/1	[length = 1]
	add r19,r11	 ;  36	addqi3/1	[length = 1]
	ldi r30,lo8(1)	 ;  37	*movqi/2	[length = 1]
	cp r19,r11	 ;  38	*cmpqi/2	[length = 1]
	brlo .L4	 ;  39	branch	[length = 1]
	ldi r30,lo8(0)	 ;  40	*movqi/1	[length = 1]
.L4:
	or r18,r30	 ;  42	iorqi3/1	[length = 1]
; Bytes 2..6 repeat the byte-1 pattern; r18 and r30 swap roles as
; carry-in / carry-out on each successive byte.
	add r12,r20	 ;  44	addqi3/1	[length = 1]
	ldi r30,lo8(1)	 ;  45	*movqi/2	[length = 1]
	cp r12,r20	 ;  46	*cmpqi/2	[length = 1]
	brlo .L5	 ;  47	branch	[length = 1]
	ldi r30,lo8(0)	 ;  48	*movqi/1	[length = 1]
.L5:
	mov r20,r18	 ;  217	*movqi/1	[length = 1]
	add r20,r12	 ;  50	addqi3/1	[length = 1]
	ldi r18,lo8(1)	 ;  51	*movqi/2	[length = 1]
	cp r20,r12	 ;  52	*cmpqi/2	[length = 1]
	brlo .L6	 ;  53	branch	[length = 1]
	ldi r18,lo8(0)	 ;  54	*movqi/1	[length = 1]
.L6:
	or r30,r18	 ;  56	iorqi3/1	[length = 1]
	add r13,r21	 ;  58	addqi3/1	[length = 1]
	ldi r18,lo8(1)	 ;  59	*movqi/2	[length = 1]
	cp r13,r21	 ;  60	*cmpqi/2	[length = 1]
	brlo .L7	 ;  61	branch	[length = 1]
	ldi r18,lo8(0)	 ;  62	*movqi/1	[length = 1]
.L7:
	mov r21,r30	 ;  218	*movqi/1	[length = 1]
	add r21,r13	 ;  64	addqi3/1	[length = 1]
	ldi r30,lo8(1)	 ;  65	*movqi/2	[length = 1]
	cp r21,r13	 ;  66	*cmpqi/2	[length = 1]
	brlo .L8	 ;  67	branch	[length = 1]
	ldi r30,lo8(0)	 ;  68	*movqi/1	[length = 1]
.L8:
	or r18,r30	 ;  70	iorqi3/1	[length = 1]
	add r14,r22	 ;  72	addqi3/1	[length = 1]
	ldi r30,lo8(1)	 ;  73	*movqi/2	[length = 1]
	cp r14,r22	 ;  74	*cmpqi/2	[length = 1]
	brlo .L9	 ;  75	branch	[length = 1]
	ldi r30,lo8(0)	 ;  76	*movqi/1	[length = 1]
.L9:
	mov r22,r18	 ;  219	*movqi/1	[length = 1]
	add r22,r14	 ;  78	addqi3/1	[length = 1]
	ldi r18,lo8(1)	 ;  79	*movqi/2	[length = 1]
	cp r22,r14	 ;  80	*cmpqi/2	[length = 1]
	brlo .L10	 ;  81	branch	[length = 1]
	ldi r18,lo8(0)	 ;  82	*movqi/1	[length = 1]
.L10:
	or r30,r18	 ;  84	iorqi3/1	[length = 1]
	add r15,r23	 ;  86	addqi3/1	[length = 1]
	ldi r18,lo8(1)	 ;  87	*movqi/2	[length = 1]
	cp r15,r23	 ;  88	*cmpqi/2	[length = 1]
	brlo .L11	 ;  89	branch	[length = 1]
	ldi r18,lo8(0)	 ;  90	*movqi/1	[length = 1]
.L11:
	mov r23,r30	 ;  220	*movqi/1	[length = 1]
	add r23,r15	 ;  92	addqi3/1	[length = 1]
	ldi r30,lo8(1)	 ;  93	*movqi/2	[length = 1]
	cp r23,r15	 ;  94	*cmpqi/2	[length = 1]
	brlo .L12	 ;  95	branch	[length = 1]
	ldi r30,lo8(0)	 ;  96	*movqi/1	[length = 1]
.L12:
	or r18,r30	 ;  98	iorqi3/1	[length = 1]
	add r16,r24	 ;  100	addqi3/1	[length = 1]
	ldi r30,lo8(1)	 ;  101	*movqi/2	[length = 1]
	cp r16,r24	 ;  102	*cmpqi/2	[length = 1]
	brlo .L13	 ;  103	branch	[length = 1]
	ldi r30,lo8(0)	 ;  104	*movqi/1	[length = 1]
.L13:
	mov r24,r18	 ;  221	*movqi/1	[length = 1]
	add r24,r16	 ;  106	addqi3/1	[length = 1]
	ldi r18,lo8(1)	 ;  107	*movqi/2	[length = 1]
	cp r24,r16	 ;  108	*cmpqi/2	[length = 1]
	brlo .L14	 ;  109	branch	[length = 1]
	ldi r18,lo8(0)	 ;  110	*movqi/1	[length = 1]
.L14:
	or r30,r18	 ;  112	iorqi3/1	[length = 1]
; Byte 7: r25 += r17, then the carry-in from r30 is added; no carry out of
; the top byte is computed.  r18 receives the byte-0 sum held in r10.
	add r25,r17	 ;  114	addqi3/1	[length = 1]
	mov r18,r10	 ;  138	*movqi/1	[length = 1]
	add r25,r30	 ;  145	addqi3/1	[length = 1]
/* epilogue start */
	pop r17	 ;  232	popqi	[length = 1]
	pop r16	 ;  233	popqi	[length = 1]
	pop r15	 ;  234	popqi	[length = 1]
	pop r14	 ;  235	popqi	[length = 1]
	pop r13	 ;  236	popqi	[length = 1]
	pop r12	 ;  237	popqi	[length = 1]
	pop r11	 ;  238	popqi	[length = 1]
	pop r10	 ;  239	popqi	[length = 1]
	ret	 ;  240	return_from_epilogue	[length = 1]

I'd say that the new code is way better -- even with 24 move instructions. And if there are several DI operations in a row, some moves might vanish because only registers 18 and 10 are used.

I would even say that this approach is no worse than supplying movdi and letting IRA/reload do the work -- at least that's my impression from the code I often see from IRA, for example in PR50775.

Johann

Follow-Ups:
- Re: [Patch,AVR] Light-weight DImode implementation.
  - From: Paolo Bonzini

References:
- [Patch,AVR] Light-weight DImode implementation.
  - From: Georg-Johann Lay
- Re: [Patch,AVR] Light-weight DImode implementation.
  - From: Richard Henderson

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]