Bug 19463

Summary:	387 constants still emitted with -mno-80387 & -mfpmath=sse
Product:	gcc	Reporter:	tbp <tbptbp>
Component:	target	Assignee:	Not yet assigned to anyone <unassigned>
Status:	RESOLVED FIXED
Severity:	enhancement	CC:	gcc-bugs
Priority:	P3	Keywords:	missed-optimization, ssemmx
Version:	4.0.0
Target Milestone:	4.0.0
Host:	cygwin	Target:
Build:		Known to work:
Known to fail:		Last reconfirmed:

Description tbp 2005-01-15 20:11:19 UTC

Under elusive conditions (hence the rather cluttered testcase), I'm seeing some
387 constants being used here and there; it doesn't have a big performance
impact, but it is neither uncommon nor correct :)

Happens on gcc-40102 and a 2 day old cvs version (post SSE audit).

With: -O2 -march=k8 -fomit-frame-pointer -mfpmath=sse -mno-80387
[-mno-fancy-math-387]
#include <math.h>

// Minimal 3-component float vector used by the two reproducer functions below.
struct vec_t {
       float x,y,z;
       // Default constructor does nothing, so x, y and z are left uninitialized.
       vec_t() {}
       // Component-wise constructor.
       vec_t(const float f1,const float f2,const float f3) : x(f1),y(f2),z(f3) {}

       // Scale every component by f.
       vec_t operator *(const float f)  const { return vec_t(x*f,y*f,z*f); }
       // Component-wise difference (*this - v).
       vec_t operator -(const vec_t &v) const { return vec_t(x-v.x,y-v.y,z-v.z); }
       // Euclidean length, computed in single precision via sqrtf.
       float mag() const { return sqrtf(x*x + y*y + z*z); }
       // Scale by the reciprocal of the length; no guard against mag() == 0.
       // NOTE(review): the 1.f constant here is presumably what shows up as
       // fld1 in the reported disassembly -- not confirmed in the report.
       vec_t normalize() const { return *this * (1.f/mag()); }
       // Cross product (*this x v).
       vec_t cross(const vec_t &v) const { return vec_t(y*v.z - z*v.y,z*v.x
- x*v.z, x*v.y - y*v.x); }
};

// Reproducer for the stray 387 constant load (fld1 ... fstps in the report).
// Builds three normalized vectors from the inputs and writes them back
// through the references.
// needs -O >= 2
void fpu_constant(vec_t &v1, vec_t &v2, vec_t &v3) {
       const vec_t
               d       = (v1-v2).normalize(),
               r       = v3.cross(d).normalize(),
               u       = d.cross(r).normalize();

       // All three inputs are overwritten with the computed vectors.
       v1 = d; v2 = r; v3 = u;
}

// Reproducer for the 387 load/store sequence (flds ... fsts/fstps in the
// report). Same as fpu_constant except that u is left default-constructed.
// while making the testcase also found that; needs -O >=1
void fpu_load_store(vec_t &v1, vec_t &v2, vec_t &v3) {
       const vec_t
               d       = (v1-v2).normalize(),
               r       = v3.cross(d).normalize(),
               // Initializer is commented out: u uses vec_t(), which leaves
               // its members uninitialized.
               u;//    = d.cross(r).normalize();

       // v3 therefore receives u's uninitialized components.
       v1 = d; v2 = r; v3 = u;
}

// Empty entry point: neither reproducer function is called from here.
// NOTE(review): presumably present only so the testcase links into an
// executable (the report quotes absolute addresses) -- not stated in the report.
int main() { return 0; }

In fpu_constant i'm seeing:
 401073:       fld1
...
 4010ba:       movss  0x402000,%xmm1
 4010c2:       sqrtss %xmm0,%xmm0
 4010c6:       divss  %xmm0,%xmm1
...
 401123:       fstps  (%esp)
...
 4011c0:       movss  (%esp),%xmm1
 4011c5:       sqrtss %xmm0,%xmm0
 4011c9:       divss  %xmm0,%xmm1

While making that testcase, i've stumbled upon fpu_load_store:
 401220:       flds   0x402004
...
 401325:       fsts   0x8(%ecx)
 401328:       fsts   0x4(%ecx)
 40132b:       fstps  (%ecx)
 40132d:       add    $0x8,%esp
 401330:       pop    %ebx
 401331:       ret

I've also spotted an fldz on occasion (but not in this testcase).

Comment 1 Andrew Pinski 2005-01-15 20:39:55 UTC

I doubt this is target related; it looks more like the register allocator doing a poor job.

Comment 2 Andrew Pinski 2005-01-18 15:22:23 UTC

This has been fixed now on the mainline:

        subss   8(%rsi), %xmm7
        movss   .LC1(%rip), %xmm6


        movss   .LC0(%rip), %xmm11
        movss   4(%rdx), %xmm2

Comment 3 Andrew Pinski 2005-01-18 15:26:25 UTC

(In reply to comment #2)
> This has been fixed now on the mainline:
Note I copied the wrong asm:
        pushl   %ebx
.LCFI5: 
        movl    $0x3f800000, %ebx
        subl    $8, %esp
.LCFI6: 
        movl    16(%esp), %eax
        movl    20(%esp), %edx
        movl    24(%esp), %ecx
        movl    %ebx, 4(%esp)
        movss   4(%eax), %xmm4

 pushl   %ebx
.LCFI3: 
        movl    $0x3f800000, %ebx
        subl    $16, %esp
.LCFI4: 
        movl    24(%esp), %eax
        movl    28(%esp), %edx
        movl    32(%esp), %ecx
        movss   4(%eax), %xmm2
        movss   (%eax), %xmm4
        subss   4(%edx), %xmm2


But it is still fixed.