[patch] reload1.c for incorrect code generation

Mike Stump mrs@apple.com
Tue Apr 24 20:01:00 GMT 2007


On Apr 24, 2007, at 11:33 AM, Ian Lance Taylor wrote:
> This is OK, but please move the comment out of the conditional up with
> the other comments.

Here is the final version I put in.  The testcase is identical to
the last one, and the first two comment blocks say the same thing as
before, though I took the opportunity to re-wrap them, as they were
too long, and to merge them into a single comment block.

Thanks for the fast turnaround.

Doing diffs in .:
--- ./reload1.c.~1~	2007-04-21 15:48:10.000000000 -0700
+++ ./reload1.c	2007-04-24 11:59:41.000000000 -0700
@@ -6323,15 +6323,23 @@ merge_assigned_reloads (rtx insn)
  		transfer_replacements (i, j);
  	      }

-	  /* If this is now RELOAD_OTHER, look for any reloads that load
-	     parts of this operand and set them to RELOAD_FOR_OTHER_ADDRESS
-	     if they were for inputs, RELOAD_OTHER for outputs.  Note that
-	     this test is equivalent to looking for reloads for this operand
-	     number.  */
-	  /* We must take special care with RELOAD_FOR_OUTPUT_ADDRESS; it may
-	     share registers with a RELOAD_FOR_INPUT, so we can not change it
-	     to RELOAD_FOR_OTHER_ADDRESS.  We should never need to, since we
-	     do not modify RELOAD_FOR_OUTPUT.  */
+	  /* If this is now RELOAD_OTHER, look for any reloads that
+	     load parts of this operand and set them to
+	     RELOAD_FOR_OTHER_ADDRESS if they were for inputs,
+	     RELOAD_OTHER for outputs.  Note that this test is
+	     equivalent to looking for reloads for this operand
+	     number.
+
+	     We must take special care with RELOAD_FOR_OUTPUT_ADDRESS;
+	     it may share registers with a RELOAD_FOR_INPUT, so we can
+	     not change it to RELOAD_FOR_OTHER_ADDRESS.  We should
+	     never need to, since we do not modify RELOAD_FOR_OUTPUT.
+
+	     It is possible that the RELOAD_FOR_OPERAND_ADDRESS
+	     instruction is assigned the same register as the earlier
+	     RELOAD_FOR_OTHER_ADDRESS instruction.  Merging these two
+	     instructions will cause the RELOAD_FOR_OTHER_ADDRESS
+	     instruction to be deleted later on.  */

  	  if (rld[i].when_needed == RELOAD_OTHER)
  	    for (j = 0; j < n_reloads; j++)
@@ -6339,6 +6347,7 @@ merge_assigned_reloads (rtx insn)
  		  && rld[j].when_needed != RELOAD_OTHER
  		  && rld[j].when_needed != RELOAD_FOR_OTHER_ADDRESS
  		  && rld[j].when_needed != RELOAD_FOR_OUTPUT_ADDRESS
+		  && rld[j].when_needed != RELOAD_FOR_OPERAND_ADDRESS
  		  && (! conflicting_input
  		      || rld[j].when_needed == RELOAD_FOR_INPUT_ADDRESS
  		      || rld[j].when_needed == RELOAD_FOR_INPADDR_ADDRESS)
--- ./testsuite/gcc.target/i386/reload-1.c.~1~	2007-04-24 11:50:47.000000000 -0700
+++ ./testsuite/gcc.target/i386/reload-1.c	2007-04-24 11:50:47.000000000 -0700
@@ -0,0 +1,109 @@
+/* { dg-do compile { target i?86-*-* } } */
+/* { dg-options "-O3 -msse2 -fdump-rtl-csa" } */
+/* { dg-skip-if "" { i?86-*-* } { "-m64" } { "" } } */
+/* { dg-final { scan-file-not reload-1.c.167r.csa "deleted 1 dead insns" } }*/
+#include <emmintrin.h>
+typedef __SIZE_TYPE__ size_t;
+typedef float vFloat __attribute__ ((__vector_size__ (16)));
+typedef double vDouble __attribute__ ((__vector_size__ (16)));
+typedef struct buf
+{
+  void *data;
+  unsigned long h;
+  unsigned long  w;
+  size_t bytes;
+} buf;
+
+typedef struct job
+{
+  struct Job *next;
+  void * info;
+  long (*func)(struct Job *job);
+  long error;
+} job;
+
+typedef struct fj
+{
+    job hd;
+    buf src;
+    buf dest;
+    float g;
+    unsigned int flags;
+} fj;
+
+static const double r[256], t[256];
+
+long bar (const buf *src, const buf *dest, float g, unsigned int flags)
+{
+  float *d0 = (float*) src->data;
+  float *d1 = (float*) dest->data;
+  uintptr_t w = dest->w;
+  uintptr_t idx;
+  vFloat p0;
+  static const vFloat m0;
+  static const vDouble p[3], m, b;
+  float *sr = d0;
+  float *dr = d1;
+  for( idx = 0; idx + 8 <= w; idx += 8 )
+  {
+    vFloat f0 = _mm_loadu_ps (sr);
+    vFloat f1 = _mm_loadu_ps (sr + 4);
+    sr += 8;
+    vFloat fa0 = _mm_andnot_ps (m0, f0);
+    vFloat fa1 = _mm_andnot_ps (m0, f1);
+    vDouble v0 = _mm_cvtps_pd (fa0);
+    vDouble v1 = _mm_cvtps_pd (_mm_movehl_ps (fa0, fa0));
+    vDouble v2 = _mm_cvtps_pd (fa1);
+    vDouble v3 = _mm_cvtps_pd (_mm_movehl_ps (fa1, fa1));
+    vDouble  vi0, vi1, vi2, vi3;
+    __m128i b0, b1, b2, b3;
+    b0 = _mm_packs_epi32 (_mm_packs_epi32 (b0, b1), _mm_packs_epi32 (b2, b3));
+    b1 = _mm_srli_epi64 (b0, 32);
+    unsigned int i0 = _mm_cvtsi128_si32 (b0);
+    unsigned int i2 = _mm_cvtsi128_si32 (b1);
+    v0 -= _mm_loadh_pd (_mm_load_sd (r + (i0 & 0xff)), r + (i0 >> 16));
+    v1 -= _mm_loadh_pd (_mm_load_sd (r + (i2 & 0xff)), r + (i2 >> 16));
+    b0 = _mm_unpackhi_epi64 (b0, b0);
+    b1 = _mm_unpackhi_epi64 (b1, b1);
+    unsigned int i4 = _mm_cvtsi128_si32 (b0);
+    unsigned int i6 = _mm_cvtsi128_si32 (b1);
+    v2 -= _mm_loadh_pd (_mm_load_sd (r + (i4 & 0xff)), r + (i4 >> 16));
+    v3 -= _mm_loadh_pd (_mm_load_sd (r + (i6 & 0xff)), r + (i6 >> 16));
+    v0 = p[0] + (p[1] + p[2] * v0) * v0;
+    v1 = p[0] + (p[1] + p[2] * v1) * v1;
+    v2 = p[0] + (p[1] + p[2] * v2) * v2;
+    v3 = p[0] + (p[1] + p[2] * v3) * v3;
+    vi0 = (vDouble) _mm_slli_epi64 ((__m128i)((vi0 + b) + m), 52);
+    vi1 = (vDouble) _mm_slli_epi64 ((__m128i)((vi1 + b) + m), 52);
+    vi2 = (vDouble) _mm_slli_epi64 ((__m128i)((vi2 + b) + m), 52);
+    vi3 = (vDouble) _mm_slli_epi64 ((__m128i)((vi3 + b) + m), 52);
+    vi0 *= _mm_loadh_pd (_mm_load_sd (t + (i0 & 0xff)), t + (i0 >> 16));
+    vi1 *= _mm_loadh_pd (_mm_load_sd (t + (i2 & 0xff)), t + (i2 >> 16));
+    vi2 *= _mm_loadh_pd (_mm_load_sd (t + (i4 & 0xff)), t + (i4 >> 16));
+    vi3 *= _mm_loadh_pd (_mm_load_sd (t + (i6 & 0xff)), t + (i6 >> 16));
+    v0 *= vi0;
+    v1 *= vi1;
+    v2 *= vi2;
+    v3 *= vi3;
+    vFloat r0 = _mm_movelh_ps (_mm_cvtpd_ps( v0 ), _mm_cvtpd_ps (v1));
+    vFloat r1 = _mm_movelh_ps (_mm_cvtpd_ps( v2 ), _mm_cvtpd_ps (v3));
+    vFloat z0 = _mm_cmpeq_ps (f0, _mm_setzero_ps());
+    vFloat z1 = _mm_cmpeq_ps (f1, _mm_setzero_ps());
+    r0 = _mm_andnot_ps (z0, r0);
+    r1 = _mm_andnot_ps (z1, r1);
+    z0 = _mm_and_ps (z0, p0);
+    z1 = _mm_and_ps (z1, p0);
+    r0 = _mm_or_ps (r0, z0);
+    r1 = _mm_or_ps (r1, z1);
+    _mm_storeu_ps (dr, r0);
+    _mm_storeu_ps (dr + 4, r1);
+    dr += 8;
+  }
+  return 0;
+}
+
+long foo (job *j )
+{
+  fj *jd = (fj*) j;
+  return bar (&jd->src, &jd->dest, jd->g, jd->flags);
+}
--------------



More information about the Gcc-patches mailing list