Bug 27663 - missed-optimization transforming a byte array to unsigned long
Summary: missed-optimization transforming a byte array to unsigned long
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 4.2.0
: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2006-05-18 17:58 UTC by berndtrog
Modified: 2023-05-31 08:28 UTC (History)
6 users (show)

See Also:
Host:
Target: avr
Build:
Known to work: 14.0
Known to fail: 4.2.0, 4.2.1, 5.1.0
Last reconfirmed: 2023-05-30 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description berndtrog 2006-05-18 17:58:17 UTC
Hi,
this code:

unsigned long f (unsigned char  *P)
{
  unsigned long C;
  C  = ((unsigned long)P[1] << 24)
     | ((unsigned long)P[2] << 16)
     | ((unsigned long)P[3] <<  8)
     | ((unsigned long)P[4] <<  0);
  return C;
}

compiles to this:

00000000 <f>:
   0:   f9 2f           mov     r31, r25
   2:   e8 2f           mov     r30, r24
   4:   61 81           ldd     r22, Z+1        ; 0x01
   6:   77 27           eor     r23, r23
   8:   88 27           eor     r24, r24
   a:   99 27           eor     r25, r25
   c:   96 2f           mov     r25, r22
   e:   88 27           eor     r24, r24
  10:   77 27           eor     r23, r23
  12:   66 27           eor     r22, r22
  14:   22 81           ldd     r18, Z+2        ; 0x02
  16:   33 27           eor     r19, r19
  18:   44 27           eor     r20, r20
  1a:   55 27           eor     r21, r21
  1c:   53 2f           mov     r21, r19
  1e:   42 2f           mov     r20, r18
  20:   33 27           eor     r19, r19
  22:   22 27           eor     r18, r18
  24:   62 2b           or      r22, r18
  26:   73 2b           or      r23, r19
  28:   84 2b           or      r24, r20
  2a:   95 2b           or      r25, r21
  2c:   24 81           ldd     r18, Z+4        ; 0x04
  2e:   33 27           eor     r19, r19
  30:   44 27           eor     r20, r20
  32:   55 27           eor     r21, r21
  34:   62 2b           or      r22, r18
  36:   73 2b           or      r23, r19
  38:   84 2b           or      r24, r20
  3a:   95 2b           or      r25, r21
  3c:   23 81           ldd     r18, Z+3        ; 0x03
  3e:   33 27           eor     r19, r19
  40:   44 27           eor     r20, r20
  42:   55 27           eor     r21, r21
  44:   54 2f           mov     r21, r20
  46:   43 2f           mov     r20, r19
  48:   32 2f           mov     r19, r18
  4a:   22 27           eor     r18, r18
  4c:   62 2b           or      r22, r18
  4e:   73 2b           or      r23, r19
  50:   84 2b           or      r24, r20
  52:   95 2b           or      r25, r21
  54:   08 95           ret

using this cmd line:
avr-gcc -c -Os f.c

IMO, most of the or, eor and mov instructions are unnecessary.
Comment 1 Eric Weddington 2007-04-09 23:33:20 UTC
Bernd, what mcu type was this compiled for?
Comment 2 Eric Weddington 2007-07-25 17:29:16 UTC
Confirmed on 4.2.1.
Comment 3 Urja Rannikko 2009-04-24 20:24:25 UTC
Confirmed on 4.3.2 - it's a bit different and actually worse (longer):
(Please add 4.3.2 to known to fail - I can't)
00000000 <f>:
   0:   e8 2f           mov     r30, r24
   2:   f9 2f           mov     r31, r25
   4:   21 81           ldd     r18, Z+1        ; 0x01
   6:   30 e0           ldi     r19, 0x00       ; 0
   8:   40 e0           ldi     r20, 0x00       ; 0
   a:   50 e0           ldi     r21, 0x00       ; 0
   c:   52 2f           mov     r21, r18
   e:   44 27           eor     r20, r20
  10:   33 27           eor     r19, r19
  12:   22 27           eor     r18, r18
  14:   82 81           ldd     r24, Z+2        ; 0x02
  16:   90 e0           ldi     r25, 0x00       ; 0
  18:   a0 e0           ldi     r26, 0x00       ; 0
  1a:   b0 e0           ldi     r27, 0x00       ; 0
  1c:   a8 2f           mov     r26, r24
  1e:   b9 2f           mov     r27, r25
  20:   99 27           eor     r25, r25
  22:   88 27           eor     r24, r24
  24:   28 2b           or      r18, r24
  26:   39 2b           or      r19, r25
  28:   4a 2b           or      r20, r26
  2a:   5b 2b           or      r21, r27
  2c:   84 81           ldd     r24, Z+4        ; 0x04
  2e:   90 e0           ldi     r25, 0x00       ; 0
  30:   a0 e0           ldi     r26, 0x00       ; 0
  32:   b0 e0           ldi     r27, 0x00       ; 0
  34:   28 2b           or      r18, r24
  36:   39 2b           or      r19, r25
  38:   4a 2b           or      r20, r26
  3a:   5b 2b           or      r21, r27
  3c:   83 81           ldd     r24, Z+3        ; 0x03
  3e:   90 e0           ldi     r25, 0x00       ; 0
  40:   a0 e0           ldi     r26, 0x00       ; 0
  42:   b0 e0           ldi     r27, 0x00       ; 0
  44:   ba 2f           mov     r27, r26
  46:   a9 2f           mov     r26, r25
  48:   98 2f           mov     r25, r24
  4a:   88 27           eor     r24, r24
  4c:   28 2b           or      r18, r24
  4e:   39 2b           or      r19, r25
  50:   4a 2b           or      r20, r26
  52:   5b 2b           or      r21, r27
  54:   62 2f           mov     r22, r18
  56:   73 2f           mov     r23, r19
  58:   84 2f           mov     r24, r20
  5a:   95 2f           mov     r25, r21
  5c:   08 95           ret
Comment 4 Richard Biener 2009-04-24 20:39:18 UTC
It's a target independent issue.  On non-strict alignment targets we can do a
32bit load instead of 4 byte loads.  This is what you ask for, correct?

combine unfortunately does not see enough insns to catch it.  Within a single
stmt we can teach fold to do it, otherwise forwprop is our tree level combiner.
Comment 5 Richard Biener 2009-04-24 20:43:04 UTC
Maybe it fits within the byteswap recognition pass (certainly the loads may be
there because an on-the-fly byteswap is requested).
Comment 6 Georg-Johann Lay 2011-05-16 14:20:25 UTC
Author: gjl
Date: Mon May 16 14:20:19 2011
New Revision: 173792

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=173792
Log:
	PR target/27663
	PR target/41076
	* config/avr/predicates.md (const_8_16_24_operand): New predicate.
	* config/avr/avr.md ("*ior<mode>qi.byte0",
	"*ior<mode>qi.byte1-3"): New define_insn_and_split patterns.


Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/config/avr/avr.md
    trunk/gcc/config/avr/predicates.md
Comment 7 Georg-Johann Lay 2011-05-16 15:05:41 UTC
The patch tries to fix the middle-end flaw in the BE by introducing some combine patterns that recognize byte-insert.

Wouldn't it be possible to recognize such situations in the middle-end and map them to something like (set (zero_extract:QI (reg:SI) ...)) or (set (subreg:QI (reg:SI) ...))?

Even if the bytes inserted do not come from consecutive memory locations, such a recognition would help.

The patch does not lead to optimal code, there is still room for improvement:

With -Os -mmcu=atmega8:

f:
	push r16
	push r17
/* prologue: function */
/* frame size = 0 */
/* stack size = 2 */
.L__stack_usage = 2
	movw r30,r24
	ldd r24,Z+1
	ldd r16,Z+2
	ldi r17,lo8(0)
	ldi r18,lo8(0)
	ldi r19,hi8(0)
	movw r18,r16
	clr r17
	clr r16
	or r19,r24
	ldd r24,Z+4
	or r16,r24
	ldd r24,Z+3
	or r17,r24
	movw r22,r16
	movw r24,r18
/* epilogue start */
	pop r17
	pop r16
	ret

The usage of r16/r17 might be an artifact of IRA because only half of a SI reg is call-saved, the other half is call-used. There is the following comment in ira-color.c:

	/* We need to save/restore the hard register in
	   epilogue/prologue.  Therefore we increase the cost.  */
	{
	  /* ??? If only part is call clobbered.  */

Despite subreg lowering, the call-used r26/r27 are not used.

Maybe you should also try to disable subreg lowering by means of -fno-split-wide-types. For the code in question that gives:

With -Os -mmcu=atmega8 -fno-split-wide-types:

f:
/* prologue: function */
/* frame size = 0 */
/* stack size = 0 */
.L__stack_usage = 0
	movw r30,r24
	ldd r18,Z+1
	ldd r22,Z+2
	mov r24,r22
	ldi r25,lo8(0)
	ldi r26,lo8(0)
	ldi r27,hi8(0)
	clr r23
	clr r22
	or r25,r18
	ldd r18,Z+4
	or r22,r18
	ldd r18,Z+3
	or r23,r18
/* epilogue start */
	ret

What I do not understand are the insns clearing r26/r27 because they are dead (which is not detected). It is an HI insn that looks like that:

; (insn 32 34 42 (set (reg:HI 26 r26 [ MEM[(unsigned char *)P_1(D) + 2B]+2 ])
;         (const_int 0 [0])) insert-byte.c:5 10 {*movhi}
;      (nil))
	ldi r26,lo8(0)	 ;  32	*movhi/1	[length = 2]
	ldi r27,hi8(0)
Comment 8 Jackie Rosen 2014-02-16 13:16:19 UTC Comment hidden (spam)
Comment 9 Andrew Pinski 2023-05-31 04:44:35 UTC
Starting around GCC 5 or so, a call to __bswapsi2 is done here.

My bet is that if the avr target adds a bswapsi2 pattern (which either expands or splits into the best moves), this will be optimized correctly.
Comment 10 Georg-Johann Lay 2023-05-31 08:27:47 UTC
bswap recognition appears to have improved since then.  When I -Os -mmcu=atmega8 -dp with 14.0.0 20230530:

unsigned long f_mem1 (unsigned char *P)
{
  unsigned long c;
  c = ((unsigned long) P[1] << 24)
    | ((unsigned long) P[2] << 16)
    | ((unsigned long) P[3] << 8)
    | ((unsigned long) P[4] << 0);
  return c;
}

unsigned long f_mem2 (unsigned char *P)
{
  unsigned long c;
  c = ((unsigned long) P[4] << 0)
    | ((unsigned long) P[3] << 8)
    | ((unsigned long) P[2] << 16)
    | ((unsigned long) P[1] << 24);
  return c;
}

unsigned long f_reg (unsigned long p)
{
  unsigned long c;
  c = (p << 24)
    | (0xff0000 & (p << 8))
    | (0xff00 & (p >> 8))
    | (p >> 24);
  return c;
}


I am getting:


f_mem1:
	movw r30,r24	 ;  33	[c=4 l=1]  *movhi/0
	ldd r22,Z+1	 ;  34	[c=16 l=4]  *movsi/2
	ldd r23,Z+2
	ldd r24,Z+3
	ldd r25,Z+4
	rcall __bswapsi2	 ;  35	[c=16 l=1]  *bswapsi2.libgcc
	ret		 ;  38	[c=0 l=1]  return

f_mem2:
	movw r30,r24	 ;  33	[c=4 l=1]  *movhi/0
	ldd r22,Z+1	 ;  34	[c=16 l=4]  *movsi/2
	ldd r23,Z+2
	ldd r24,Z+3
	ldd r25,Z+4
	rcall __bswapsi2	 ;  35	[c=16 l=1]  *bswapsi2.libgcc
	ret		 ;  38	[c=0 l=1]  return

f_reg:
	rcall __bswapsi2	 ;  42	[c=16 l=1]  *bswapsi2.libgcc
	ret		 ;  45	[c=0 l=1]  return

The RCALL + RET will be optimized to RJMP by -mrelax.

Splitting into a zoo of subregs is usually not a good idea because the register allocator won't manage to optimize them, resulting in bunch of moves and register pressure that are more expensive than the bswap itself.

Notice the calls are transparent, i.e. their footprint is just as big as required:

(define_insn "*bswapsi2.libgcc"
  [(set (reg:SI 22)
        (bswap:SI (reg:SI 22)))
   (clobber (reg:CC REG_CC))]
  "reload_completed"
  "%~call __bswapsi2"
  [(set_attr "type" "xcall")])
Comment 11 Georg-Johann Lay 2023-05-31 08:28:51 UTC
(In reply to Georg-Johann Lay from comment #10)
> bswap recognition appears to have improved since then [...]

Closed thusly.