[Bug rtl-optimization/102178] [12 Regression] SPECFP 2006 470.lbm regressions on AMD Zen CPUs after r12-897-gde56f95afaaa22

Jan Hubicka hubicka@kam.mff.cuni.cz
Thu Jan 27 12:04:34 GMT 2022


> I would say so.  It saves code size and also uop space unless the two
> can magically fuse to an immediate to %xmm move (I doubt that).
I made a simple benchmark:

double a=10;
int
main()
{
        long int i;
        double sum=0,val1,val2,val3,val4;
        for (i=0;i<1000000000;i++)
        {
#if 1
#if 1
                asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq   %%r8, %0": "=x"(val1): :"r8","xmm11");
                asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq   %%r8, %0": "=x"(val2): :"r8","xmm11");
                asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq   %%r8, %0": "=x"(val3): :"r8","xmm11");
                asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq   %%r8, %0": "=x"(val4): :"r8","xmm11");
#else
                asm __volatile__("movq %1, %%r8;vmovq   %%r8, %0": "=x"(val1):"m"(a) :"r8","xmm11");
                asm __volatile__("movq %1, %%r8;vmovq   %%r8, %0": "=x"(val2):"m"(a) :"r8","xmm11");
                asm __volatile__("movq %1, %%r8;vmovq   %%r8, %0": "=x"(val3):"m"(a) :"r8","xmm11");
                asm __volatile__("movq %1, %%r8;vmovq   %%r8, %0": "=x"(val4):"m"(a) :"r8","xmm11");
#endif
#else
                asm __volatile__("vmovq   %1, %0": "=x"(val1):"m"(a) :"r8","xmm11");
                asm __volatile__("vmovq   %1, %0": "=x"(val2):"m"(a) :"r8","xmm11");
                asm __volatile__("vmovq   %1, %0": "=x"(val3):"m"(a) :"r8","xmm11");
                asm __volatile__("vmovq   %1, %0": "=x"(val4):"m"(a) :"r8","xmm11");
#endif
                sum+=val1+val2+val3+val4;
        }
        return sum;
}
and indeed the third variant runs in 1.2s while the first two take an equal
2.4s on my Zen 2 laptop.
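
For reference, the three sequences exercised per value are roughly the
following (a summary of the asm statements above; the destination register
and the a(%rip) addressing are only illustrative, since the benchmark lets
the compiler pick the output register through the "=x" constraint and passes
a as an "m" operand):

        # variant 1: build the 64-bit constant in a GPR, then move it to the vector register
        movabsq $0x3ff03db8fde2ef4e, %r8
        vmovq   %r8, %xmm0

        # variant 2: load the constant from memory through a GPR
        movq    a(%rip), %r8
        vmovq   %r8, %xmm0

        # variant 3: load the constant from memory directly into the vector register
        vmovq   a(%rip), %xmm0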

