[Bug inline-asm/46615] New: [4.6 regression] possibly-invalid x86-64 inline asm miscompilation

Tue Nov 23 09:42:00 GMT 2010

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46615

           Summary: [4.6 regression] possibly-invalid x86-64 inline asm
                    miscompilation
           Product: gcc
           Version: 4.6.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: inline-asm
        AssignedTo: unassigned@gcc.gnu.org
        ReportedBy: astrange@ithinksw.com

gcc 4.6 miscompiles this source from ffmpeg on x86-64-apple-darwin10, whereas
previous compilers worked. I'm not sure if the asm is legal, but it's existed
in the wild for a long time.

/*
 * 2x4 table of 64-bit constants, declared with 8-byte alignment and
 * external linkage.  It is referenced by the inline asm in
 * bgr24ToUV_mmx_MMX2() through an "m" (memory) operand.
 */
const unsigned long long __attribute__((aligned(8))) ff_bgr24toUV[2][4] =
{
    {0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL, 0xF6E40000D0E3F6E4ULL,
0x3838D0E300003838ULL},
    {0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL , 0x38380000D0E33838ULL,
0xF6E4D0E30000F6E4ULL},
};

/*
 * Loads one quadword from ff_bgr24toUV into %mm6.
 *
 * The asm operand is the single element ff_bgr24toUV[f == 0][0], i.e. the
 * first element of row 1 when f is 0 and of row 0 otherwise.  The asm
 * template then prepends a literal "24+" to whatever address the compiler
 * substitutes for %0, so the instruction only addresses valid data if %0
 * is the address of that element inside the full array (24 bytes past it
 * is the row's fourth 8-byte entry on x86-64).
 */
static void 
bgr24ToUV_mmx_MMX2(int f)
{
    __asm__ volatile(
    /* "24+%0" is a textual offset added to operand 0's address. */
    "movq 24+%0, %%mm6 \n\t"
    /* No outputs; one memory input: the first element of the selected row. */
    :: "m"(ff_bgr24toUV[f == 0][0]));
}

/*
 * Calls bgr24ToUV_mmx_MMX2() with the constant argument 1, so the
 * expression (f == 0) in the callee is 0 and row 0 of the table is used.
 */
void 
rgb24ToUV_MMX2()
{
    bgr24ToUV_mmx_MMX2(1);
}

> gcc -v
Using built-in specs.
COLLECT_GCC=/usr/local/gcc46/bin/gcc
COLLECT_LTO_WRAPPER=/usr/local/gcc46/libexec/gcc/x86_64-apple-darwin10.5.0/4.6.0/lto-wrapper
Target: x86_64-apple-darwin10.5.0
Configured with: ../../src/gcc/configure --prefix=/usr/local/gcc46
--with-arch=native --with-tune=native --disable-nls --with-gmp=/sw
--disable-bootstrap --enable-checking --enable-languages=c,c++,lto,objc,obj-c++
Thread model: posix
gcc version 4.6.0 20101122 (experimental) (GCC) 
> gcc -O -o swscale-fails.s -S swscale.i 
swscale.i: In function 'rgb24ToUV_MMX2':
swscale.i:10:2: warning: use of memory input without lvalue in asm operand 0 is
deprecated [enabled by default]

Working asm (4.2):
_rgb24ToUV_MMX2:
    pushq    %rbp
    movq    %rsp, %rbp
    movq 24+_ff_bgr24toUV(%rip), %mm6 
    leave
    ret
.globl _ff_bgr24toUV
    .const
    .align 3
_ff_bgr24toUV:
    .quad    4050987868490315832
    .quad    -1369135209168966401
    .quad    -656399642184648988
    .quad    4051217538195929144
    .quad    -1369375758026740481
    .quad    4051228417348089912
    .quad    4050987868324313144
    .quad    -656169972313032988
    .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support

Non-working asm (4.6):
_rgb24ToUV_MMX2:
    movq 24+LC0(%rip), %mm6     
    ret
    .globl _ff_bgr24toUV
    .const
    .align 3
_ff_bgr24toUV:
    .quad    4050987868490315832
    .quad    -1369135209168966401
    .quad    -656399642184648988
    .quad    4051217538195929144
    .quad    -1369375758026740481
    .quad    4051228417348089912
    .quad    4050987868324313144
    .quad    -656169972313032988
    .literal8
    .align 3
LC0:
    .quad    4050987868490315832
    .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support

24+_ff_bgr24toUV(%rip) is fine, but LC0 is only a single 8-byte literal, so
24+LC0(%rip) points past its end at nothing, and ld breaks:

ld: in /var/folders/MY/MYkVh2TwHgKZhNFIG8M3wU+++TI/-Tmp-//cc9dJIWa.o, in
section __TEXT,__text reloc 0: local relocation for address 0x0000000C in
section __text does not target section __literal8

I'm going to fix the asm since it looks fragile anyway, but that won't fix
existing releases of ffmpeg.

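Note that creating LC0 is not even an optimization, since it doesn't save any
space: the full array still has to be emitted (because the array is
__attribute__((used)) in the original ffmpeg source, and in the reduced test
case above it has external linkage).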