Bug 47556 - x86: fails to take advantage of high-byte addressing mode
Summary: x86: fails to take advantage of high-byte addressing mode
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization (show other bugs)
Version: 4.5.1
: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2011-01-31 21:44 UTC by Jeremy Fitzhardinge
Modified: 2011-08-25 15:22 UTC (History)
4 users (show)

See Also:
Host:
Target: i?86-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2011-07-16 15:12:15


Attachments
Preprocessed file (812 bytes, application/octet-stream)
2011-01-31 21:44 UTC, Jeremy Fitzhardinge
Details

Note You need to log in before you can comment on or make changes to this bug.
Description Jeremy Fitzhardinge 2011-01-31 21:44:39 UTC
Created attachment 23188 [details]
Preprocessed file

When comparing two bytes in a 16-bit value, gcc introduces an unnecessary temp register rather than just directly comparing the two byte halves.

In the attached source, the code generated for

  if (inc.head == inc.tail)
   goto out;

is
        movzbl  %ah, %edx
        cmpb    %al, %dl

rather than simply
        cmpb %ah, %al

If it did, then the code would be identical to the hand-written asm it is intended to replace.

I'm compiling with "gcc -S -O2 -Wall halfreg-codegen.i"

The gcc version is the standard Fedora 14 package:
$ gcc -v
Using built-in specs.
COLLECT_GCC=/usr/bin/gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/4.5.1/lto-wrapper
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-linker-build-id --enable-languages=c,c++,objc,obj-c++,java,fortran,ada,lto --enable-plugin --enable-java-awt=gtk --disable-dssi --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-1.5.0.0/jre --enable-libgcj-multifile --enable-java-maintainer-mode --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --disable-libjava-multilib --with-ppl --with-cloog --with-tune=generic --with-arch_32=i686 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.5.1 20100924 (Red Hat 4.5.1-4) (GCC) 

$ rpm -q gcc
gcc-4.5.1-4.fc14.x86_64
Comment 1 Andrew Pinski 2011-01-31 21:47:32 UTC
Sometimes these are done to avoid a stall due to partial use of the register in newer processors.
Comment 2 Jeremy Fitzhardinge 2011-01-31 22:16:54 UTC
Hm, yes, I see.  The hand-written asm, which uses %ah, does appear to run into false partial register stalls according to 3.5.2.3 in the Intel Optimisation Reference Manual.

On the other hand, the code generated by the C version appears to be slightly slower in measurement on a Nehalem system.  Since the code in question is all in the slow path (its the spin loop for a spinlock), perhaps the increased icache pressure from the increased code size is more significant than the register stalls.

Compiling with -Os rather than -O2 makes no difference.
Comment 3 H.J. Lu 2011-07-16 15:12:15 UTC
Here is the simplified testcase:

--
[hjl@gnu-6 pr47556]$ cat x.c
typedef unsigned short int uint16_t;
typedef unsigned char uint8_t;
typedef uint8_t  __ticket_t;
typedef uint16_t  __ticketpair_t;
typedef struct arch_spinlock {
  union {
    __ticketpair_t head_tail;
    struct __raw_tickets {
      __ticket_t head, tail;
    } tickets;
  };
} arch_spinlock_t;
static struct __raw_tickets __ticket_spin_claim(struct arch_spinlock *lock)
{
  register struct __raw_tickets tickets = { .tail = 1 };
  if (sizeof(lock->tickets.head) == sizeof(uint8_t))
    asm volatile ("lock; " "xaddw %w0, %1\n"
		  : "+r" (tickets), "+m" (lock->tickets)
		  : : "memory", "cc");
  else
    asm volatile ("lock; " "xaddl %0, %1\n"
		  : "+r" (tickets), "+m" (lock->tickets)
		  : : "memory", "cc");
  return tickets;
}
void __ticket_spin_lock(struct arch_spinlock *lock)
{
  register struct __raw_tickets inc;
  inc = __ticket_spin_claim(lock);
  for (;;) {
    if (inc.head == inc.tail)
      goto out;
    asm volatile ("pause");
    inc.head = (*(volatile typeof(lock->tickets.head) *)&(lock->tickets.head));
  }
out:
  asm volatile ("" : : : "memory");
}
[hjl@gnu-6 pr47556]$ 
---

The generated code is

---
__ticket_spin_lock:
.LFB1:
	.cfi_startproc
	movl	$256, %eax
#APP
# 17 "x.c" 1
	lock; xaddw %ax, (%rdi)

# 0 "" 2
#NO_APP
	movzbl	%ah, %edx
	cmpb	%al, %dl
	je	.L2
	.p2align 4,,10
	.p2align 3
.L4:
#APP
# 33 "x.c" 1
	pause
# 0 "" 2
#NO_APP
	movzbl	(%rdi), %eax
	cmpb	%dl, %al
	jne	.L4
.L2:
	ret
	.cfi_endproc
---

The main issue is

	cmpb	%dl, %al
	jne	.L4

But %Xh registers aren't really allocated by register allocator.
They are expressed via

        (set (reg:CCZ 17 flags)
            (compare:CCZ (subreg:QI (zero_extract:SI (subreg:DI (reg/v:HI 62 [ tickets ]) 0)
                        (const_int 8 [0x8])
                        (const_int 8 [0x8])) 0)
                (subreg:QI (reg/v:HI 62 [ tickets ]) 0)))

combine doesn't turn

(insn 10 9 11 2 (set (subreg:SI (reg:QI 61 [ tickets$tail ]) 0)
        (zero_extract:SI (subreg:SI (reg/v:HI 62 [ tickets ]) 0)
            (const_int 8 [0x8])
            (const_int 8 [0x8]))) x.c:17 89 {*movsi_extzv_1}
     (nil))

(insn 11 10 12 2 (set (reg:CCZ 17 flags)
        (compare:CCZ (reg:QI 61 [ tickets$tail ])
            (subreg:QI (reg/v:HI 62 [ tickets ]) 0))) x.c:31 4 {*cmpqi_1}
     (expr_list:REG_DEAD (reg/v:HI 62 [ tickets ])
        (nil)))
....
(insn 17 15 18 3 (set (reg:CCZ 17 flags)
        (compare:CCZ (reg:QI 59 [ inc$head ])
            (reg:QI 61 [ tickets$tail ]))) x.c:31 4 {*cmpqi_1}
     (expr_list:REG_DEAD (reg:QI 59 [ inc$head ])
        (nil)))

into

        (set (reg:CCZ 17 flags)
            (compare:CCZ (subreg:QI (zero_extract:SI (subreg:DI (reg/v:HI 62 [ tickets ]) 0)
                        (const_int 8 [0x8])
                        (const_int 8 [0x8])) 0)
                (subreg:QI (reg/v:HI 62 [ tickets ]) 0)))

....
        (set (reg:CCZ 17 flags)
            (compare:CCZ (reg:QI 59 [ inc$head ])
                (subreg:QI (zero_extract:SI (subreg:DI (reg/v:HI 62 [ tickets ]) 0)
                        (const_int 8 [0x8])
                        (const_int 8 [0x8])) 0)))