[PATCH, RS6000 PR target/94954] Fix wrong codegen for vec_pack_to_short_fp32() builtin

Fri Jun 12 14:46:05 GMT 2020

On Fri, 2020-06-12 at 10:24 -0400, David Edelsohn wrote:
> Hi, Will

Hi, 

> 
> On Fri, Jun 12, 2020 at 12:22 AM will schmidt <
> will_schmidt@vnet.ibm.com> wrote:
> > 
> > 
> > Hi,
> >   Fix codegen implementation for the builtin
> > vec_pack_to_short_fp32.
> > 
> >   Regtests are underway against powerpc64
> > (power7be,power8le,power9le).
> >   (this is a power9 builtin, so should be a noop for most of
> > those).
> >   OK for trunk and backports?
> > 
> >   Thanks
> >   -Will
> > 
> > 
> >     [gcc]
> >       target pr/94954
> > 
> >         * config/rs6000/altivec.h
> > (vec_pack_to_short_fp32):  Update.
> >         * config/rs6000/altivec.md (UNSPEC_CONVERT_4F32_8F16):  New
> > unspec.
> >         (convert_4f32_8f16):  New define_expand.
> >         * config/rs6000/rs6000-builtin.def (convert_4f32_8f16): New
> > builtin define
> >         and overload.
> >         * config/rs6000/rs6000-call.c
> > (P9V_BUILTIN_VEC_CONVERT_4F32_8F16): New
> >         overloaded builtin entry.
> >         * config/rs6000/vsx.md (UNSPEC_VSX_XVCVSPHP): New unspec.
> >         (vsx_xvcvsphp): New define_insn.
> > 
> >     [testsuite]
> >         * testsuite/gcc.target/powerpc/builtins-1-p9-runnable.c:
> > Update.
> > 
> > diff --git a/gcc/config/rs6000/altivec.h
> > b/gcc/config/rs6000/altivec.h
> > index 0a7e8ab..ab10025 100644
> > --- a/gcc/config/rs6000/altivec.h
> > +++ b/gcc/config/rs6000/altivec.h
> > @@ -431,11 +431,11 @@
> >  /* Vector additions added in ISA 3.0.  */
> >  #define vec_first_match_index __builtin_vec_first_match_index
> >  #define vec_first_match_or_eos_index
> > __builtin_vec_first_match_or_eos_index
> >  #define vec_first_mismatch_index
> > __builtin_vec_first_mismatch_index
> >  #define vec_first_mismatch_or_eos_index
> > __builtin_vec_first_mismatch_or_eos_index
> > -#define vec_pack_to_short_fp32 __builtin_vec_convert_4f32_8i16
> > +#define vec_pack_to_short_fp32 __builtin_vec_convert_4f32_8f16
> >  #define vec_parity_lsbb __builtin_vec_vparity_lsbb
> >  #define vec_vctz __builtin_vec_vctz
> >  #define vec_cnttz __builtin_vec_vctz
> >  #define vec_vctzb __builtin_vec_vctzb
> >  #define vec_vctzd __builtin_vec_vctzd
> > diff --git a/gcc/config/rs6000/altivec.md
> > b/gcc/config/rs6000/altivec.md
> > index 159f24e..855e7cc 100644
> > --- a/gcc/config/rs6000/altivec.md
> > +++ b/gcc/config/rs6000/altivec.md
> > @@ -78,10 +78,11 @@
> >     UNSPEC_VUNPACK_HI_SIGN_DIRECT
> >     UNSPEC_VUNPACK_LO_SIGN_DIRECT
> >     UNSPEC_VUPKHPX
> >     UNSPEC_VUPKLPX
> >     UNSPEC_CONVERT_4F32_8I16
> > +   UNSPEC_CONVERT_4F32_8F16
> >     UNSPEC_DST
> >     UNSPEC_DSTT
> >     UNSPEC_DSTST
> >     UNSPEC_DSTSTT
> >     UNSPEC_LVSL
> > @@ -3215,10 +3216,32 @@
> >    emit_insn (gen_altivec_vctuxs (rtx_tmp_lo, operands[2],
> > const0_rtx));
> >    emit_insn (gen_altivec_vpkswss (operands[0], rtx_tmp_hi,
> > rtx_tmp_lo));
> >    DONE;
> >  })
> > 
> > +
> > +;; Convert two vector F32 to packed vector f16.
> 
> Just for consistency, the comment above probably should be F16
> (capitalized) above.

Ok. I've fixed that locally. 

> Why does this pattern use V8HI mode instead of V8HF if this is for
> 8F16 in contrast to the existing 8I16 pattern?  Other than that
> (fundamental) question, the rest looks good.

Yeah, so even though the contents of the returned vector consists of
packed float data, the return type is still vector unsigned short.

I can't speak to the reasoning behind why the builtin was defined that
way. 

Thanks for the review, 

thanks,
-Will

> 
> Thanks, David
> 
> > +(define_expand "convert_4f32_8f16"
> > +  [(set (match_operand:V8HI 0 "register_operand" "=v")
> > +       (unspec:V8HI [(match_operand:V4SF 1 "register_operand" "v")
> > +                     (match_operand:V4SF 2 "register_operand"
> > "v")]
> > +                    UNSPEC_CONVERT_4F32_8F16))]
> > +  "TARGET_P9_VECTOR"
> > +{
> > +  rtx rtx_tmp_hi = gen_reg_rtx (V4SImode);
> > +  rtx rtx_tmp_lo = gen_reg_rtx (V4SImode);
> > +
> > +  emit_insn (gen_vsx_xvcvsphp (rtx_tmp_hi, operands[1] ));
> > +  emit_insn (gen_vsx_xvcvsphp (rtx_tmp_lo, operands[2] ));
> > +  if (BYTES_BIG_ENDIAN)
> > +    emit_insn (gen_altivec_vpkuwum (operands[0], rtx_tmp_lo,
> > rtx_tmp_hi));
> > +  else
> > +    emit_insn (gen_altivec_vpkuwum (operands[0], rtx_tmp_hi,
> > rtx_tmp_lo));
> > +  DONE;
> > +})
> > +
> > +
> >  ;; Generate
> >  ;;    xxlxor/vxor SCRATCH0,SCRATCH0,SCRATCH0
> >  ;;    vsubu?m SCRATCH2,SCRATCH1,%1
> >  ;;    vmaxs? %0,%1,SCRATCH2"
> >  (define_expand "abs<mode>2"
> > diff --git a/gcc/config/rs6000/rs6000-builtin.def
> > b/gcc/config/rs6000/rs6000-builtin.def
> > index 8b1ddb0..47e9137 100644
> > --- a/gcc/config/rs6000/rs6000-builtin.def
> > +++ b/gcc/config/rs6000/rs6000-builtin.def
> > @@ -2208,10 +2208,11 @@ BU_P8V_OVERLOAD_3 (VPERMXOR,   "vpermxor")
> > 
> >  /* ISA 3.0 vector overloaded 2-argument functions. */
> >  BU_P9V_AV_2 (VSLV,             "vslv",                 CONST,
> > vslv)
> >  BU_P9V_AV_2 (VSRV,             "vsrv",                 CONST,
> > vsrv)
> >  BU_P9V_AV_2 (CONVERT_4F32_8I16, "convert_4f32_8i16", CONST,
> > convert_4f32_8i16)
> > +BU_P9V_AV_2 (CONVERT_4F32_8F16, "convert_4f32_8f16", CONST,
> > convert_4f32_8f16)
> > 
> >  BU_P9V_AV_2 (VFIRSTMATCHINDEX_V16QI, "first_match_index_v16qi",
> >              CONST, first_match_index_v16qi)
> >  BU_P9V_AV_2 (VFIRSTMATCHINDEX_V8HI, "first_match_index_v8hi",
> >              CONST, first_match_index_v8hi)
> > @@ -2238,10 +2239,11 @@ BU_P9V_AV_2 (VFIRSTMISMATCHOREOSINDEX_V4SI,
> > "first_mismatch_or_eos_index_v4si",
> > 
> >  /* ISA 3.0 vector overloaded 2-argument functions. */
> >  BU_P9V_OVERLOAD_2 (VSLV,       "vslv")
> >  BU_P9V_OVERLOAD_2 (VSRV,       "vsrv")
> >  BU_P9V_OVERLOAD_2 (CONVERT_4F32_8I16, "convert_4f32_8i16")
> > +BU_P9V_OVERLOAD_2 (CONVERT_4F32_8F16, "convert_4f32_8f16")
> > 
> >  /* 2 argument vector functions added in ISA 3.0 (power9). */
> >  BU_P9V_AV_2
> > (VADUB,            "vadub",                CONST,  vaduv16qi3)
> >  BU_P9V_AV_2
> > (VADUH,            "vaduh",                CONST,  vaduv8hi3)
> >  BU_P9V_AV_2
> > (VADUW,            "vaduw",                CONST,  vaduv4si3)
> > diff --git a/gcc/config/rs6000/rs6000-call.c
> > b/gcc/config/rs6000/rs6000-call.c
> > index 0ac8054..9708d7e 100644
> > --- a/gcc/config/rs6000/rs6000-call.c
> > +++ b/gcc/config/rs6000/rs6000-call.c
> > @@ -1975,10 +1975,12 @@ const struct altivec_builtin_types
> > altivec_overloaded_builtins[] = {
> >    { P8V_BUILTIN_VEC_NEG, P8V_BUILTIN_NEG_V2DF,
> >      RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
> > 
> >    { P9V_BUILTIN_VEC_CONVERT_4F32_8I16,
> > P9V_BUILTIN_CONVERT_4F32_8I16,
> >      RS6000_BTI_unsigned_V8HI, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0
> > },
> > +  { P9V_BUILTIN_VEC_CONVERT_4F32_8F16,
> > P9V_BUILTIN_CONVERT_4F32_8F16,
> > +    RS6000_BTI_unsigned_V8HI, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0
> > },
> > 
> >    { P9V_BUILTIN_VEC_VFIRSTMATCHINDEX,
> > P9V_BUILTIN_VFIRSTMATCHINDEX_V16QI,
> >      RS6000_BTI_UINTSI, RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0 },
> >    { P9V_BUILTIN_VEC_VFIRSTMATCHINDEX,
> > P9V_BUILTIN_VFIRSTMATCHINDEX_V16QI,
> >      RS6000_BTI_UINTSI, RS6000_BTI_unsigned_V16QI,
> > RS6000_BTI_unsigned_V16QI, 0 },
> > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> > index 2a28215..38908a5 100644
> > --- a/gcc/config/rs6000/vsx.md
> > +++ b/gcc/config/rs6000/vsx.md
> > @@ -295,10 +295,11 @@
> >     UNSPEC_VSX_DIVSD
> >     UNSPEC_VSX_DIVUD
> >     UNSPEC_VSX_MULSD
> >     UNSPEC_VSX_SIGN_EXTEND
> >     UNSPEC_VSX_XVCVSPSXDS
> > +   UNSPEC_VSX_XVCVSPHP
> >     UNSPEC_VSX_VSLO
> >     UNSPEC_VSX_EXTRACT
> >     UNSPEC_VSX_SXEXPDP
> >     UNSPEC_VSX_SXSIG
> >     UNSPEC_VSX_SIEXPDP
> > @@ -2177,10 +2178,19 @@
> >                      UNSPEC_VSX_CVHPSP))]
> >    "TARGET_P9_VECTOR"
> >    "xvcvhpsp %x0,%x1"
> >    [(set_attr "type" "vecfloat")])
> > 
> > +;; Generate xvcvsphp
> > +(define_insn "vsx_xvcvsphp"
> > +  [(set (match_operand:V4SI 0 "register_operand" "=wa")
> > +        (unspec:V4SI [(match_operand:V4SF 1 "vsx_register_operand"
> > "wa")]
> > +                    UNSPEC_VSX_XVCVSPHP))]
> > +  "TARGET_P9_VECTOR"
> > +  "xvcvsphp %x0,%x1"
> > +[(set_attr "type" "vecfloat")])
> > +
> >  ;; xscvdpsp used for splat'ing a scalar to V4SF, knowing that the
> > internal SF
> >  ;; format of scalars is actually DF.
> >  (define_insn "vsx_xscvdpsp_scalar"
> >    [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
> >         (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand"
> > "wa")]
> > diff --git a/gcc/testsuite/gcc.target/powerpc/builtins-1-p9-
> > runnable.c b/gcc/testsuite/gcc.target/powerpc/builtins-1-p9-
> > runnable.c
> > index 0e4ab48..038ba44 100644
> > --- a/gcc/testsuite/gcc.target/powerpc/builtins-1-p9-runnable.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/builtins-1-p9-runnable.c
> > @@ -1,25 +1,37 @@
> >  /* { dg-do run { target { powerpc*-*-linux* && { lp64 &&
> > p9vector_hw } } } } */
> >  /* { dg-require-effective-target powerpc_p9vector_ok } */
> >  /* { dg-options "-O2 -mdejagnu-cpu=power9" } */
> > 
> >  #include <altivec.h>
> > +#include <stdio.h>
> > 
> >  void abort (void);
> > 
> >  int main() {
> >    int i;
> >    vector float vfa, vfb;
> > -  vector unsigned short vur, vuexpt;
> > +  vector unsigned short vur, vresult;
> > 
> > -  vfa = (vector float){3.4, 5.0, 20.0, 50.9 };
> > -  vfb = (vector float){10.0, 40.0, 70.0, 100.0 };
> > -  vuexpt = (vector unsigned short){ 3, 5, 20, 50,
> > -                                    10, 40, 70, 100};
> > +  vfa = (vector float){0.4, 1.6, 20.0, 99.9 };
> > +  vfb = (vector float){10.0, -2.0, 70.0, 999.0 };
> > 
> > +  /* Results from converting those single-precion floats to half-
> > precision packed values. */
> > +  vresult = (vector unsigned short) { 0x3666, 0x3e66, 0x4d00,
> > 0x563e,
> > +                                     0x4900, 0xc000, 0x5460,
> > 0x63ce };
> > +
> > +  /* convert from single-precion float to unsigned short and pack
> > */
> >    vur = vec_pack_to_short_fp32 (vfa, vfb);
> > 
> > +#ifdef DEBUG
> > +  for(i = 0; i< 4; i++) { printf("i=[%d] %f  \n",i,vfa[i]); }
> > +  for(i = 0; i< 4; i++) { printf("i=[%d] %f  \n",i+4,vfb[i]); }
> > +  for(i = 0; i< 8; i++) { printf("i=[%d] %d  \n",i,vur[i]); }
> > +#endif
> > +
> >    for(i = 0; i< 8; i++) {
> > -    if (vur[i] != vuexpt[i])
> > +    if (vur[i] != vresult[i]) {
> > +           printf("i=[%d] 0x%x != 0x%x \n",i,vur[i],vresult[i]);
> >        abort();
> > +    }
> >    }
> >  }
> >