[AArch64] Enable generation of FRINTNZ instructions

Andre Vieira (lists) andre.simoesdiasvieira@arm.com
Thu Nov 25 13:53:08 GMT 2021


On 22/11/2021 11:41, Richard Biener wrote:
>
>> On 18/11/2021 11:05, Richard Biener wrote:
>>> This is a good shout and made me think about something I hadn't before... I
>>> thought I could handle the vector forms later, but the problem is if I add
>>> support for the scalar, it will stop the vectorizer. It seems
>>> vectorizable_call expects all arguments to have the same type, which doesn't
>>> work with passing the integer type as an operand work around.
> We already special case some IFNs there (masked load/store and gather)
> to ignore some args, so that would just add to this set.
>
> Richard.
Hi,

Reworked it to add support of the new IFN to the vectorizer. Was 
initially trying to make vectorizable_call and 
vectorizable_internal_function handle IFNs with different inputs more 
generically, using the information we have in the <IFN>_direct structs 
regarding what operands to get the modes from. Unfortunately, that 
wasn't straightforward because of how vectorizable_call assumes operands 
have the same type and uses the type of the DEF_STMT_INFO of the 
non-constant operands (either output operand or non-constant inputs) to 
determine the type of constants. I assume there is some reason why we 
use the DEF_STMT_INFO and not always use get_vectype_for_scalar_type on 
the argument types. That is why I ended up with this sort of half-way 
mix of both, which still allows room to add more IFNs that don't take 
inputs of the same type, but require adding a bit of special casing 
similar to the IFN_FTRUNC_INT and masking ones.

Bootstrapped on aarch64-none-linux.

OK for trunk?

gcc/ChangeLog:

         * config/aarch64/aarch64.md (ftrunc<mode><frintnz_mode>2): New 
pattern.
         * config/aarch64/iterators.md (FRINTNZ): New iterator.
         (frintnz_mode): New int attribute.
         (VSFDF): Make iterator conditional.
         * internal-fn.def (FTRUNC_INT): New IFN.
         * internal-fn.c (ftrunc_int_direct): New define.
         (expand_ftrunc_int_optab_fn): New custom expander.
         (direct_ftrunc_int_optab_supported_p): New supported_p.
         * match.pd: Add to the existing TRUNC pattern match.
         * optabs.def (ftrunc_int): New entry.
         * stor-layout.h (element_precision): Moved from here...
         * tree.h (element_precision): ... to here.
         (element_type): New declaration.
         * tree.c (element_type): New function.
         (element_precision): Changed to use element_type.
         * tree-vect-stmts.c (vectorizable_internal_function): Add 
support for
         IFNs with different input types.
         (vectorizable_call): Teach to handle IFN_FTRUNC_INT.
         * doc/md.texi: New entry for ftrunc pattern name.
         * doc/sourcebuild.texi (aarch64_frintnzx_ok): New target.

gcc/testsuite/ChangeLog:

         * gcc.target/aarch64/merge_trunc1.c: Adapted to skip if frintNz 
instructions are available.
         * lib/target-supports.exp: Added aarch64_frintnzx_ok target.
         * gcc.target/aarch64/frintnz.c: New test.
         * gcc.target/aarch64/frintnz_vec.c: New test.
-------------- next part --------------
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 4035e061706793849c68ae09bcb2e4b9580ab7b6..c5c60e7a810e22b0ea9ed6bf056ddd6431d60269 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7345,12 +7345,18 @@ (define_insn "despeculate_simpleti"
    (set_attr "speculation_barrier" "true")]
 )
 
+(define_expand "ftrunc<mode><frintnz_mode>2"
+  [(set (match_operand:VSFDF 0 "register_operand" "=w")
+        (unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")]
+		      FRINTNZ))]
+  "TARGET_FRINT"
+)
+
 (define_insn "aarch64_<frintnzs_op><mode>"
   [(set (match_operand:VSFDF 0 "register_operand" "=w")
 	(unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")]
 		      FRINTNZX))]
-  "TARGET_FRINT && TARGET_FLOAT
-   && !(VECTOR_MODE_P (<MODE>mode) && !TARGET_SIMD)"
+  "TARGET_FRINT"
   "<frintnzs_op>\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
   [(set_attr "type" "f_rint<stype>")]
 )
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index bdc8ba3576cf2c9b4ae96b45a382234e4e25b13f..51f00344b02d0d1d4adf97463f6a46f9fd0fb43f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -160,7 +160,11 @@ (define_mode_iterator VHSDF_HSDF [(V4HF "TARGET_SIMD_F16INST")
 				  SF DF])
 
 ;; Scalar and vetor modes for SF, DF.
-(define_mode_iterator VSFDF [V2SF V4SF V2DF DF SF])
+(define_mode_iterator VSFDF [ (V2SF "TARGET_SIMD")
+			      (V4SF "TARGET_SIMD")
+			      (V2DF "TARGET_SIMD")
+			      (DF "TARGET_FLOAT")
+			      (SF "TARGET_FLOAT")])
 
 ;; Advanced SIMD single Float modes.
 (define_mode_iterator VDQSF [V2SF V4SF])
@@ -3067,6 +3071,8 @@ (define_int_iterator FCMLA [UNSPEC_FCMLA
 (define_int_iterator FRINTNZX [UNSPEC_FRINT32Z UNSPEC_FRINT32X
 			       UNSPEC_FRINT64Z UNSPEC_FRINT64X])
 
+(define_int_iterator FRINTNZ [UNSPEC_FRINT32Z UNSPEC_FRINT64Z])
+
 (define_int_iterator SVE_BRK_UNARY [UNSPEC_BRKA UNSPEC_BRKB])
 
 (define_int_iterator SVE_BRK_BINARY [UNSPEC_BRKN UNSPEC_BRKPA UNSPEC_BRKPB])
@@ -3482,6 +3488,8 @@ (define_int_attr f16mac1 [(UNSPEC_FMLAL "a") (UNSPEC_FMLSL "s")
 (define_int_attr frintnzs_op [(UNSPEC_FRINT32Z "frint32z") (UNSPEC_FRINT32X "frint32x")
 			      (UNSPEC_FRINT64Z "frint64z") (UNSPEC_FRINT64X "frint64x")])
 
+(define_int_attr frintnz_mode [(UNSPEC_FRINT32Z "si") (UNSPEC_FRINT64Z "di")])
+
 ;; The condition associated with an UNSPEC_COND_<xx>.
 (define_int_attr cmp_op [(UNSPEC_COND_CMPEQ_WIDE "eq")
 			 (UNSPEC_COND_CMPGE_WIDE "ge")
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 41f1850bf6e95005647ca97a495a97d7e184d137..d50d09b0ae60d98537b9aece4396a490f33f174c 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6175,6 +6175,15 @@ operands; otherwise, it may not.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{ftrunc@var{m}@var{n}2} instruction pattern
+@item @samp{ftrunc@var{m}@var{n}2}
+Truncate operand 1 to a @var{n} mode signed integer, towards zero, and store
+the result in operand 0.  Both operands have mode @var{m}, which is a scalar or
+vector floating-point mode.  An exception must be raised if operand 1 does not
+fit in a @var{n} mode signed integer, as it would if the truncation were
+performed via a separate floating-point to integer conversion.
+
+
 @cindex @code{round@var{m}2} instruction pattern
 @item @samp{round@var{m}2}
 Round operand 1 to the nearest integer, rounding away from zero in the
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 40b1e0d816789b225089c4143fb63e62a6af817a..15d4de24d15cce6793b3bb61d728e61cea00924d 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2282,6 +2282,10 @@ Like @code{aarch64_sve_hw}, but also test for an exact hardware vector length.
 @item aarch64_fjcvtzs_hw
 AArch64 target that is able to generate and execute armv8.3-a FJCVTZS
 instruction.
+
+@item aarch64_frintnzx_ok
+AArch64 target that is able to generate the Armv8.5-a FRINT32Z, FRINT64Z,
+FRINT32X and FRINT64X instructions.
 @end table
 
 @subsubsection MIPS-specific attributes
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 0cba95411a63423484dda5b1251f47de24e926ba..60b404ef44360c8ae0cda1176fb888302ddbc98d 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -130,6 +130,7 @@ init_internal_fns ()
 #define fold_left_direct { 1, 1, false }
 #define mask_fold_left_direct { 1, 1, false }
 #define check_ptrs_direct { 0, 0, false }
+#define ftrunc_int_direct { 0, 1, true }
 
 const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = {
 #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct,
@@ -156,6 +157,29 @@ get_multi_vector_move (tree array_type, convert_optab optab)
   return convert_optab_handler (optab, imode, vmode);
 }
 
+/* Expand FTRUNC_INT call STMT using optab OPTAB.  */
+
+static void
+expand_ftrunc_int_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+{
+  class expand_operand ops[2];
+  tree lhs, float_type, int_type;
+  rtx target, op;
+
+  lhs = gimple_call_lhs (stmt);
+  target = expand_normal (lhs);
+  op = expand_normal (gimple_call_arg (stmt, 0));
+
+  float_type = TREE_TYPE (lhs);
+  int_type = element_type (gimple_call_arg (stmt, 1));
+
+  create_output_operand (&ops[0], target, TYPE_MODE (float_type));
+  create_input_operand (&ops[1], op, TYPE_MODE (float_type));
+
+  expand_insn (convert_optab_handler (optab, TYPE_MODE (float_type),
+				      TYPE_MODE (int_type)), 2, ops);
+}
+
 /* Expand LOAD_LANES call STMT using optab OPTAB.  */
 
 static void
@@ -3688,6 +3712,15 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 	  != CODE_FOR_nothing);
 }
 
+static bool direct_ftrunc_int_optab_supported_p (convert_optab optab,
+						 tree_pair types,
+						 optimization_type opt_type)
+{
+  return (convert_optab_handler (optab, TYPE_MODE (types.first),
+				TYPE_MODE (element_type (types.second)),
+				opt_type) != CODE_FOR_nothing);
+}
+
 #define direct_unary_optab_supported_p direct_optab_supported_p
 #define direct_binary_optab_supported_p direct_optab_supported_p
 #define direct_ternary_optab_supported_p direct_optab_supported_p
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index bb13c6cce1bf55633760bc14980402f1f0ac1689..e58891e3d3ebc805dd55ac6f70bbda617b7302b7 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -66,6 +66,9 @@ along with GCC; see the file COPYING3.  If not see
 
    - fold_left: for scalar = FN (scalar, vector), keyed off the vector mode
    - check_ptrs: used for check_{raw,war}_ptrs
+   - ftrunc_int: a unary conversion optab that takes and returns values of the
+     same mode, but internally converts via another mode.  This second mode is
+     specified using a dummy final function argument.
 
    DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that
    maps to one of two optabs, depending on the signedness of an input.
@@ -269,6 +272,7 @@ DEF_INTERNAL_FLT_FLOATN_FN (RINT, ECF_CONST, rint, unary)
 DEF_INTERNAL_FLT_FLOATN_FN (ROUND, ECF_CONST, round, unary)
 DEF_INTERNAL_FLT_FLOATN_FN (ROUNDEVEN, ECF_CONST, roundeven, unary)
 DEF_INTERNAL_FLT_FLOATN_FN (TRUNC, ECF_CONST, btrunc, unary)
+DEF_INTERNAL_OPTAB_FN (FTRUNC_INT, ECF_CONST, ftruncint, ftrunc_int)
 
 /* Binary math functions.  */
 DEF_INTERNAL_FLT_FN (ATAN2, ECF_CONST, atan2, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index a319aefa8081ac177981ad425c461f8a771128f4..80660e6fd40bc6934e1fa0329c0fbcab1658ed44 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3713,12 +3713,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    trapping behaviour, so require !flag_trapping_math. */
 #if GIMPLE
 (simplify
-   (float (fix_trunc @0))
-   (if (!flag_trapping_math
-	&& types_match (type, TREE_TYPE (@0))
-	&& direct_internal_fn_supported_p (IFN_TRUNC, type,
-					  OPTIMIZE_FOR_BOTH))
-      (IFN_TRUNC @0)))
+   (float (fix_trunc@1 @0))
+   (if (types_match (type, TREE_TYPE (@0)))
+    (with {
+      tree int_type = element_type (@1);
+     }
+     (if (TYPE_SIGN (TREE_TYPE (@1)) == SIGNED
+	  && direct_internal_fn_supported_p (IFN_FTRUNC_INT, type, int_type,
+					     OPTIMIZE_FOR_BOTH))
+      (IFN_FTRUNC_INT @0 {
+       wide_int_to_tree (int_type, wi::max_value (TYPE_PRECISION (int_type),
+						  SIGNED)); })
+      (if (!flag_trapping_math
+	   && direct_internal_fn_supported_p (IFN_TRUNC, type,
+					      OPTIMIZE_FOR_BOTH))
+       (IFN_TRUNC @0))))))
 #endif
 
 /* If we have a narrowing conversion to an integral type that is fed by a
diff --git a/gcc/optabs.def b/gcc/optabs.def
index b889ad2e5a08613db51d16d072080ac6cb48404f..57d259d33409265df3af1646d123e4ab216c34c8 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -63,6 +63,7 @@ OPTAB_CX(fractuns_optab, "fractuns$Q$b$I$a2")
 OPTAB_CL(satfract_optab, "satfract$b$Q$a2", SAT_FRACT, "satfract", gen_satfract_conv_libfunc)
 OPTAB_CL(satfractuns_optab, "satfractuns$I$b$Q$a2", UNSIGNED_SAT_FRACT, "satfractuns", gen_satfractuns_conv_libfunc)
 
+OPTAB_CD(ftruncint_optab, "ftrunc$a$b2")
 OPTAB_CD(sfixtrunc_optab, "fix_trunc$F$b$I$a2")
 OPTAB_CD(ufixtrunc_optab, "fixuns_trunc$F$b$I$a2")
 
diff --git a/gcc/stor-layout.h b/gcc/stor-layout.h
index 9e892e50c8559e497fcae1b77a36401df82fabe2..165a592d4d2c7bf525060dd51ce6094eb4f4f68a 100644
--- a/gcc/stor-layout.h
+++ b/gcc/stor-layout.h
@@ -36,7 +36,6 @@ extern void place_field (record_layout_info, tree);
 extern void compute_record_mode (tree);
 extern void finish_bitfield_layout (tree);
 extern void finish_record_layout (record_layout_info, int);
-extern unsigned int element_precision (const_tree);
 extern void finalize_size_functions (void);
 extern void fixup_unsigned_type (tree);
 extern void initialize_sizetypes (void);
diff --git a/gcc/testsuite/gcc.target/aarch64/frintnz.c b/gcc/testsuite/gcc.target/aarch64/frintnz.c
new file mode 100644
index 0000000000000000000000000000000000000000..008e1cf9f4a1b0148128c65c9ea0d1bb111467b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/frintnz.c
@@ -0,0 +1,91 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.5-a" } */
+/* { dg-require-effective-target aarch64_frintnzx_ok } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/*
+** f1:
+**	frint32z	s0, s0
+**	ret
+*/
+float
+f1 (float x)
+{
+  int y = x;
+  return (float) y;
+}
+
+/*
+** f2:
+**	frint64z	s0, s0
+**	ret
+*/
+float
+f2 (float x)
+{
+  long long int y = x;
+  return (float) y;
+}
+
+/*
+** f3:
+**	frint32z	d0, d0
+**	ret
+*/
+double
+f3 (double x)
+{
+  int y = x;
+  return (double) y;
+}
+
+/*
+** f4:
+**	frint64z	d0, d0
+**	ret
+*/
+double
+f4 (double x)
+{
+  long long int y = x;
+  return (double) y;
+}
+
+float
+f1_dont (float x)
+{
+  unsigned int y = x;
+  return (float) y;
+}
+
+float
+f2_dont (float x)
+{
+  unsigned long long int y = x;
+  return (float) y;
+}
+
+double
+f3_dont (double x)
+{
+  unsigned int y = x;
+  return (double) y;
+}
+
+double
+f4_dont (double x)
+{
+  unsigned long long int y = x;
+  return (double) y;
+}
+
+double
+f5_dont (double x)
+{
+  signed short y = x;
+  return (double) y;
+}
+
+/* Make sure the 'dont's don't generate any frintNz.  */
+/* { dg-final { scan-assembler-times {frint32z} 2 } } */
+/* { dg-final { scan-assembler-times {frint64z} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/frintnz_vec.c b/gcc/testsuite/gcc.target/aarch64/frintnz_vec.c
new file mode 100644
index 0000000000000000000000000000000000000000..b93304eb2acb3d3d954eebee51d77ff23fee68ac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/frintnz_vec.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.5-a" } */
+/* { dg-require-effective-target aarch64_frintnzx_ok } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define TEST(name,float_type,int_type)					\
+void									\
+name (float_type * __restrict__ x, float_type * __restrict__ y, int n)  \
+{									\
+  for (int i = 0; i < n; ++i)					      \
+    {								      \
+      int_type x_i = x[i];					      \
+      y[i] = (float_type) x_i;					      \
+    }								      \
+}
+
+/*
+** f1:
+**	...
+**	frint32z	v0.4s, v0.4s
+**	...
+*/
+TEST(f1, float, int)
+
+/*
+** f2:
+**	...
+**	frint64z	v0.4s, v0.4s
+**	...
+*/
+TEST(f2, float, long long)
+
+/*
+** f3:
+**	...
+**	frint32z	v0.2d, v0.2d
+**	...
+*/
+TEST(f3, double, int)
+
+/*
+** f4:
+**	...
+**	frint64z	v0.2d, v0.2d
+**	...
+*/
+TEST(f4, double, long long)
diff --git a/gcc/testsuite/gcc.target/aarch64/merge_trunc1.c b/gcc/testsuite/gcc.target/aarch64/merge_trunc1.c
index 07217064e2ba54fcf4f5edc440e6ec19ddae66e1..3d80871c4cebd5fb5cac0714b3feee27038f05fd 100644
--- a/gcc/testsuite/gcc.target/aarch64/merge_trunc1.c
+++ b/gcc/testsuite/gcc.target/aarch64/merge_trunc1.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -ffast-math" } */
+/* { dg-skip-if "" { aarch64_frintnzx_ok } } */
 
 float
 f1 (float x)
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 8cbda192fe0fae59ea208ee43696b4d22c43e61e..450ca78230faeba40b89fc7987af27b6bf0a0d53 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -11365,6 +11365,32 @@ proc check_effective_target_arm_v8_3a_bkey_directive { } {
 	}]
 }
 
+# Return 1 if the target supports Armv8.5-A scalar and Advanced SIMD
+# FRINT32[ZX] and FRINT64[ZX] instructions, 0 otherwise. The test is valid for
+# AArch64.
+proc check_effective_target_aarch64_frintnzx_ok_nocache { } {
+
+    if { ![istarget aarch64*-*-*] } {
+        return 0;
+    }
+
+    if { [check_no_compiler_messages_nocache \
+	      aarch64_frintnzx_ok assembly {
+	#if !defined (__ARM_FEATURE_FRINT)
+	#error "__ARM_FEATURE_FRINT not defined"
+	#endif
+    } [current_compiler_flags]] } {
+	return 1;
+    }
+
+    return 0;
+}
+
+proc check_effective_target_aarch64_frintnzx_ok { } {
+    return [check_cached_effective_target aarch64_frintnzx_ok \
+                check_effective_target_aarch64_frintnzx_ok_nocache] 
+}
+
 # Return 1 if the target supports executing the Armv8.1-M Mainline Low
 # Overhead Loop, 0 otherwise.  The test is valid for ARM.
 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 03cc7267cf80d4ce73c0d89ab86b07e84752456a..35bb1f70f7b173ad0d1e9f70ce0ac9da891dbe62 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1625,7 +1625,8 @@ vect_finish_stmt_generation (vec_info *vinfo,
 
 static internal_fn
 vectorizable_internal_function (combined_fn cfn, tree fndecl,
-				tree vectype_out, tree vectype_in)
+				tree vectype_out, tree vectype_in,
+				tree *vectypes)
 {
   internal_fn ifn;
   if (internal_fn_p (cfn))
@@ -1637,8 +1638,12 @@ vectorizable_internal_function (combined_fn cfn, tree fndecl,
       const direct_internal_fn_info &info = direct_internal_fn (ifn);
       if (info.vectorizable)
 	{
-	  tree type0 = (info.type0 < 0 ? vectype_out : vectype_in);
-	  tree type1 = (info.type1 < 0 ? vectype_out : vectype_in);
+	  tree type0 = (info.type0 < 0 ? vectype_out : vectypes[info.type0]);
+	  if (!type0)
+	    type0 = vectype_in;
+	  tree type1 = (info.type1 < 0 ? vectype_out : vectypes[info.type1]);
+	  if (!type1)
+	    type1 = vectype_in;
 	  if (direct_internal_fn_supported_p (ifn, tree_pair (type0, type1),
 					      OPTIMIZE_FOR_SPEED))
 	    return ifn;
@@ -3252,16 +3257,31 @@ vectorizable_call (vec_info *vinfo,
       rhs_type = unsigned_type_node;
     }
 
-  int mask_opno = -1;
+  /* The argument that is not of the same type as the others.  */
+  int diff_opno = -1;
+  bool masked = false;
   if (internal_fn_p (cfn))
-    mask_opno = internal_fn_mask_index (as_internal_fn (cfn));
+    {
+      if (cfn == CFN_FTRUNC_INT)
+	/* For FTRUNC this represents the argument that carries the type of the
+	   intermediate signed integer.  */
+	diff_opno = 1;
+      else
+	{
+	  /* For masked operations this represents the argument that carries the
+	     mask.  */
+	  diff_opno = internal_fn_mask_index (as_internal_fn (cfn));
+	  masked = diff_opno >=  0;
+	}
+    }
 
   for (i = 0; i < nargs; i++)
     {
-      if ((int) i == mask_opno)
+      if ((int) i == diff_opno && masked)
 	{
-	  if (!vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_opno,
-				       &op, &slp_op[i], &dt[i], &vectypes[i]))
+	  if (!vect_check_scalar_mask (vinfo, stmt_info, slp_node,
+				       diff_opno, &op, &slp_op[i], &dt[i],
+				       &vectypes[i]))
 	    return false;
 	  continue;
 	}
@@ -3275,27 +3295,35 @@ vectorizable_call (vec_info *vinfo,
 	  return false;
 	}
 
-      /* We can only handle calls with arguments of the same type.  */
-      if (rhs_type
-	  && !types_compatible_p (rhs_type, TREE_TYPE (op)))
+      if ((int) i != diff_opno)
 	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "argument types differ.\n");
-	  return false;
-	}
-      if (!rhs_type)
-	rhs_type = TREE_TYPE (op);
+	  /* We can only handle calls with arguments of the same type.  */
+	  if (rhs_type
+	      && !types_compatible_p (rhs_type, TREE_TYPE (op)))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "argument types differ.\n");
+	      return false;
+	    }
+	  if (!rhs_type)
+	    rhs_type = TREE_TYPE (op);
 
-      if (!vectype_in)
-	vectype_in = vectypes[i];
-      else if (vectypes[i]
-	       && !types_compatible_p (vectypes[i], vectype_in))
+	  if (!vectype_in)
+	    vectype_in = vectypes[i];
+	  else if (vectypes[i]
+		   && !types_compatible_p (vectypes[i], vectype_in))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "argument vector types differ.\n");
+	      return false;
+	    }
+	}
+      else
 	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "argument vector types differ.\n");
-	  return false;
+	  vectypes[i] = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op),
+						     slp_node);
 	}
     }
   /* If all arguments are external or constant defs, infer the vector type
@@ -3371,8 +3399,8 @@ vectorizable_call (vec_info *vinfo,
 	  || (modifier == NARROW
 	      && simple_integer_narrowing (vectype_out, vectype_in,
 					   &convert_code))))
-    ifn = vectorizable_internal_function (cfn, callee, vectype_out,
-					  vectype_in);
+    ifn = vectorizable_internal_function (cfn, callee, vectype_out, vectype_in,
+					  &vectypes[0]);
 
   /* If that fails, try asking for a target-specific built-in function.  */
   if (ifn == IFN_LAST)
@@ -3446,12 +3474,12 @@ vectorizable_call (vec_info *vinfo,
 	record_stmt_cost (cost_vec, ncopies / 2,
 			  vec_promote_demote, stmt_info, 0, vect_body);
 
-      if (loop_vinfo && mask_opno >= 0)
+      if (loop_vinfo && masked)
 	{
 	  unsigned int nvectors = (slp_node
 				   ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
 				   : ncopies);
-	  tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
+	  tree scalar_mask = gimple_call_arg (stmt_info->stmt, diff_opno);
 	  vect_record_loop_mask (loop_vinfo, masks, nvectors,
 				 vectype_out, scalar_mask);
 	}
@@ -3499,7 +3527,7 @@ vectorizable_call (vec_info *vinfo,
 		    {
 		      /* We don't define any narrowing conditional functions
 			 at present.  */
-		      gcc_assert (mask_opno < 0);
+		      gcc_assert (!masked);
 		      tree half_res = make_ssa_name (vectype_in);
 		      gcall *call
 			= gimple_build_call_internal_vec (ifn, vargs);
@@ -3519,15 +3547,15 @@ vectorizable_call (vec_info *vinfo,
 		    }
 		  else
 		    {
-		      if (mask_opno >= 0 && masked_loop_p)
+		      if (masked && masked_loop_p)
 			{
 			  unsigned int vec_num = vec_oprnds0.length ();
 			  /* Always true for SLP.  */
 			  gcc_assert (ncopies == 1);
 			  tree mask = vect_get_loop_mask (gsi, masks, vec_num,
 							  vectype_out, i);
-			  vargs[mask_opno] = prepare_load_store_mask
-			    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
+			  vargs[diff_opno] = prepare_load_store_mask
+			    (TREE_TYPE (mask), mask, vargs[diff_opno], gsi);
 			}
 
 		      gcall *call;
@@ -3559,13 +3587,13 @@ vectorizable_call (vec_info *vinfo,
 	      orig_vargs[i] = vargs[i] = vec_defs[i][j];
 	    }
 
-	  if (mask_opno >= 0 && masked_loop_p)
+	  if (masked && masked_loop_p)
 	    {
 	      tree mask = vect_get_loop_mask (gsi, masks, ncopies,
 					      vectype_out, j);
-	      vargs[mask_opno]
+	      vargs[diff_opno]
 		= prepare_load_store_mask (TREE_TYPE (mask), mask,
-					   vargs[mask_opno], gsi);
+					   vargs[diff_opno], gsi);
 	    }
 
 	  gimple *new_stmt;
@@ -3584,7 +3612,7 @@ vectorizable_call (vec_info *vinfo,
 	    {
 	      /* We don't define any narrowing conditional functions at
 		 present.  */
-	      gcc_assert (mask_opno < 0);
+	      gcc_assert (!masked);
 	      tree half_res = make_ssa_name (vectype_in);
 	      gcall *call = gimple_build_call_internal_vec (ifn, vargs);
 	      gimple_call_set_lhs (call, half_res);
@@ -3628,7 +3656,7 @@ vectorizable_call (vec_info *vinfo,
     {
       auto_vec<vec<tree> > vec_defs (nargs);
       /* We don't define any narrowing conditional functions at present.  */
-      gcc_assert (mask_opno < 0);
+      gcc_assert (!masked);
       for (j = 0; j < ncopies; ++j)
 	{
 	  /* Build argument list for the vectorized call.  */
diff --git a/gcc/tree.h b/gcc/tree.h
index f62c00bc8707029db52e2f3fe529948755235d3d..31ce45a84cc267ea2022c8ca6323368fbe15eb8b 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -6547,4 +6547,12 @@ extern unsigned fndecl_dealloc_argno (tree);
    object or pointer.  Otherwise return null.  */
 extern tree get_attr_nonstring_decl (tree, tree * = NULL);
 
+/* Return the type, or for a complex or vector type the type of its
+   elements.  */
+extern tree element_type (const_tree);
+
+/* Return the precision of the type, or for a complex or vector type the
+   precision of the type of its elements.  */
+extern unsigned int element_precision (const_tree);
+
 #endif  /* GCC_TREE_H  */
diff --git a/gcc/tree.c b/gcc/tree.c
index 845228a055b2cfac0c9ca8c0cda1b9df4b0095c6..f1e9a1eb48769cb11aa69730e2480ed5522f78c1 100644
--- a/gcc/tree.c
+++ b/gcc/tree.c
@@ -6645,11 +6645,11 @@ valid_constant_size_p (const_tree size, cst_size_error *perr /* = NULL */)
   return true;
 }
 
-/* Return the precision of the type, or for a complex or vector type the
-   precision of the type of its elements.  */
+/* Return the type, or for a complex or vector type the type of its
+   elements.  */
 
-unsigned int
-element_precision (const_tree type)
+tree
+element_type (const_tree type)
 {
   if (!TYPE_P (type))
     type = TREE_TYPE (type);
@@ -6657,7 +6657,16 @@ element_precision (const_tree type)
   if (code == COMPLEX_TYPE || code == VECTOR_TYPE)
     type = TREE_TYPE (type);
 
-  return TYPE_PRECISION (type);
+  return (tree) type;
+}
+
+/* Return the precision of the type, or for a complex or vector type the
+   precision of the type of its elements.  */
+
+unsigned int
+element_precision (const_tree type)
+{
+  return TYPE_PRECISION (element_type (type));
 }
 
 /* Return true if CODE represents an associative tree code.  Otherwise


More information about the Gcc-patches mailing list