This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] External vectorization library (target) support


This is the alternative way to tackle vectorization of intrinsics
(the other way is libgcc-math which lacks reviewers, approvers,
political willingness?).  I expect the set of libraries supported
to grow, but as only ACML provides the "simple" two-value wrappers
for sin, cos, etc. it is a natural start.

The patch doesn't add automagic linking - only the interface is
specified (so OSS implementations are possible; of course, the primary
target would be ACML itself or an off-gcc libgcc-math).

Using more of the routines in these kinds of libraries requires
changes to the vectorizer infrastructure (support for v4df and
v8sf mode interfaces, support for whole-array functions, which
Intel MKL also supports (sin(n, double*, double*)-style), more
idiom recognition to dispatch to the blas/lapack routines, etc.).

I made this a target option - we can move it to common.opt if ppc
folks or others want to use it, too, but -march and -mtune are
also target specific, so I just followed that example.  A mechanism
for automatic linking to the libraries via some configury can be
added later (or omitted), likewise selecting a (configurable) default.

Richard.


2006-12-10  Richard Guenther  <rguenther@suse.de>

	* doc/invoke.texi (-mveclib): Document new target option.
	* config/i386/i386.opt (-mveclib): New target option.
	* config/i386/i386.c (ix86_veclib_handler): Handler for
	vectorization library support.
	(override_options): Handle the -mveclib option, initialize
	the vectorization library handler.
	(ix86_builtin_vectorized_function): As fallback call the
	vectorization library handler, if set.
	(ix86_veclib_acml): New static function for ACML style
	vectorization support.

Index: doc/invoke.texi
===================================================================
*** doc/invoke.texi	(revision 119706)
--- doc/invoke.texi	(working copy)
*************** Objective-C and Objective-C++ Dialects}.
*** 537,543 ****
  -mthreads  -mno-align-stringops  -minline-all-stringops @gol
  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
  -m96bit-long-double  -mregparm=@var{num}  -mx87regparm @gol
! -msseregparm @gol  -mstackrealign @gol
  -momit-leaf-frame-pointer  -mno-red-zone -mno-tls-direct-seg-refs @gol
  -mcmodel=@var{code-model} @gol
  -m32  -m64 -mlarge-data-threshold=@var{num}}
--- 537,543 ----
  -mthreads  -mno-align-stringops  -minline-all-stringops @gol
  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
  -m96bit-long-double  -mregparm=@var{num}  -mx87regparm @gol
! -msseregparm -mveclib=@var{type} -mstackrealign @gol
  -momit-leaf-frame-pointer  -mno-red-zone -mno-tls-direct-seg-refs @gol
  -mcmodel=@var{code-model} @gol
  -m32  -m64 -mlarge-data-threshold=@var{num}}
*************** supported architecture, using the approp
*** 9693,9698 ****
--- 9693,9709 ----
  the file containing the CPU detection code should be compiled without
  these options.
  
+ @item -mveclib=@var{type}
+ @opindex mveclib
+ Specifies the ABI type to use for vectorizing intrinsics using an
+ external library.  Supported types are @code{acml} for the AMD
+ math core library style of interfacing.  GCC will currently emit
+ calls to @code{__vrd2_sin}, @code{__vrd2_cos}, @code{__vrd2_exp},
+ @code{__vrd2_log}, @code{__vrd2_log2}, @code{__vrd2_log10},
+ @code{__vrs4_sinf}, @code{__vrs4_cosf}, @code{__vrs4_expf},
+ @code{__vrs4_logf}, @code{__vrs4_log2f}, @code{__vrs4_log10f}
+ and @code{__vrs4_powf} when using this type.
+ 
  @item -mpush-args
  @itemx -mno-push-args
  @opindex mpush-args
Index: config/i386/i386.opt
===================================================================
*** config/i386/i386.opt	(revision 119706)
--- config/i386/i386.opt	(working copy)
*************** mtune=
*** 241,245 ****
--- 241,249 ----
  Target RejectNegative Joined Var(ix86_tune_string)
  Schedule code for given CPU
  
+ mveclib=
+ Target RejectNegative Joined Var(ix86_veclib_string)
+ Vector library interface to use
+ 
  ;; Support Athlon 3Dnow builtins
  Mask(3DNOW_A)
Index: config/i386/i386.c
===================================================================
*** config/i386/i386.c	(revision 119706)
--- config/i386/i386.c	(working copy)
*************** static void x86_64_elf_unique_section (t
*** 1403,1408 ****
--- 1403,1412 ----
  static section *x86_64_elf_select_section (tree decl, int reloc,
  					   unsigned HOST_WIDE_INT align)
  					     ATTRIBUTE_UNUSED;
+ 
+ /* Vectorization library interface and handlers.  */
+ tree (*ix86_veclib_handler)(enum built_in_function, tree) = NULL;
+ static tree ix86_veclib_acml (enum built_in_function, tree);
  
  /* Initialize the GCC target structure.  */
  #undef TARGET_ATTRIBUTE_TABLE
*************** override_options (void)
*** 2199,2204 ****
--- 2203,2218 ----
    if (!TARGET_80387)
      target_flags &= ~MASK_FLOAT_RETURNS;
  
+   /* Use external vectorized library in vectorizing intrinsics.  */
+   if (ix86_veclib_string)
+     {
+       if (strcmp (ix86_veclib_string, "acml") == 0)
+         ix86_veclib_handler = ix86_veclib_acml;
+       else
+ 	error ("unknown vectorization library type (%s) for -mveclib= switch",
+ 	       ix86_veclib_string);
+     }
+ 
    if ((x86_accumulate_outgoing_args & TUNEMASK)
        && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
        && !optimize_size)
*************** ix86_builtin_vectorized_function (enum b
*** 17625,17644 ****
      case BUILT_IN_SQRT:
        if (el_mode == DFmode && n == 2)
  	return ix86_builtins[IX86_BUILTIN_SQRTPD];
!       return NULL_TREE;
  
      case BUILT_IN_SQRTF:
        if (el_mode == SFmode && n == 4)
  	return ix86_builtins[IX86_BUILTIN_SQRTPS];
!       return NULL_TREE;
  
      default:
        ;
      }
  
    return NULL_TREE;
  }
  
  /* Store OPERAND to the memory after reload is completed.  This means
     that we can't easily use assign_stack_local.  */
  rtx
--- 17639,17741 ----
      case BUILT_IN_SQRT:
        if (el_mode == DFmode && n == 2)
  	return ix86_builtins[IX86_BUILTIN_SQRTPD];
!       break;
  
      case BUILT_IN_SQRTF:
        if (el_mode == SFmode && n == 4)
  	return ix86_builtins[IX86_BUILTIN_SQRTPS];
!       break;
  
      default:
        ;
      }
  
+   /* Dispatch to a handler for a vectorization library.  */
+   if (ix86_veclib_handler)
+     return (*ix86_veclib_handler)(fn, type);
+ 
    return NULL_TREE;
  }
  
+ /* Handler for an ACML-style interface to a library with vectorized
+    intrinsics.  */
+ 
+ static tree
+ ix86_veclib_acml (enum built_in_function fn, tree type)
+ {
+   char name[20] = "__vr.._";
+   tree fntype, new_fndecl, args;
+   unsigned arity;
+   const char *bname;
+   enum machine_mode el_mode;
+   int n;
+ 
+   /* The ACML is 64bits only and suitable for unsafe math only as
+      it does not correctly support parts of IEEE with the required
+      precision such as denormals.  */
+   if (!TARGET_64BIT
+       || !flag_unsafe_math_optimizations)
+     return NULL_TREE;
+ 
+   el_mode = TYPE_MODE (TREE_TYPE (type));
+   n = TYPE_VECTOR_SUBPARTS (type);
+ 
+   switch (fn)
+     {
+     case BUILT_IN_SIN:
+     case BUILT_IN_COS:
+     case BUILT_IN_EXP:
+     case BUILT_IN_LOG:
+     case BUILT_IN_LOG2:
+     case BUILT_IN_LOG10:
+       name[4] = 'd';
+       name[5] = '2';
+       if (el_mode != DFmode
+ 	  || n != 2)
+ 	return NULL_TREE;
+       break;
+ 
+     case BUILT_IN_SINF:
+     case BUILT_IN_COSF:
+     case BUILT_IN_EXPF:
+     case BUILT_IN_POWF:
+     case BUILT_IN_LOGF:
+     case BUILT_IN_LOG2F:
+     case BUILT_IN_LOG10F:
+       name[4] = 's';
+       name[5] = '4';
+       if (el_mode != SFmode
+ 	  || n != 4)
+ 	return NULL_TREE;
+       break;
+ 
+     default:
+       return NULL_TREE;
+     }
+ 
+   bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));
+   sprintf (name + 7, "%s", bname+10);
+ 
+   arity = 0;
+   for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
+        args = TREE_CHAIN (args))
+     arity++;
+ 
+   if (arity == 1)
+     fntype = build_function_type_list (type, type, NULL);
+   else
+     fntype = build_function_type_list (type, type, type, NULL);
+ 
+   /* Build a function declaration for the vectorized function.  */
+   new_fndecl = build_decl (FUNCTION_DECL, get_identifier (name), fntype);
+   TREE_PUBLIC (new_fndecl) = 1;
+   DECL_EXTERNAL (new_fndecl) = 1;
+   DECL_IS_NOVOPS (new_fndecl) = 1;
+   TREE_READONLY (new_fndecl) = 1;
+ 
+   return new_fndecl;
+ }
+ 
  /* Store OPERAND to the memory after reload is completed.  This means
     that we can't easily use assign_stack_local.  */
  rtx


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]