Target Specific Optimization

The target specific optimization has several internal stages. These stages can be delivered in different GCC releases. The first two stages are geared towards people who need to build high performance libraries that must span several different underlying architectures, while the third stage is meant to be usable by the majority of programmers, since it will not involve source code modifications to use. While the focus of this work is to allow ix86 programmers to code for various AMD and Intel platforms, other GCC backends will be able to use target specific optimization by adding the appropriate machine dependent parts.

The stages are:

Stage1: Compile single function with specific options

Stage1: Objective of compiling a single function with specific options

Stage1: Details of compiling a single function with specific options

Stage1: Syntax for target specific optimization

There are two alternative methods to specify target specific optimization. Using attributes on a per-function basis is the traditional GCC method for controlling functions, and it works inside of macros. Using pragmas mirrors the ways other compilers support such options, and is likely to be more comfortable for the potential audience to use. Using #pragma, however, allows preprocessor macros to be adjusted, so that you can include files like bmmintrin.h if you have SSE5 options enabled. If the wider community prefers one method over another, we can restrict the work proposal to just that method.

Stage1: attribute syntax

I propose we add the following attributes to the compiler:

Stage1: pragma syntax

All of these pragmas would turn on the equivalent *attribute* support for the succeeding functions until they are reset. If a function has an attribute declared, it would override the *#pragma*:

The following pragmas will allow header files to turn on/off the options in a global fashion:

Stage1: Example using #pragma

Here is an example of how you might use target specific functions using *#pragma*. It uses the common compiler intrinsics include files (and needs pragma because bmmintrin.h and smmintrin.h check for SSE5 and SSE4_1 being defined). The code calculates a minimum of a vector of 32-bit signed integers, using the pcomd and pcmov instructions under SSE5 and the pminsd instruction under SSE4.1.

   1 #pragma GCC push-options
   2 #pragma GCC sse5
   3 #include <bmmintrin.h>
   4 
   5 void sse5_min (__m128i *a, __m128i *b, __m128i *c, int n) {
   6     int i;
   7     for (i = 0; i < n; i++) {
   8         __m128i test = _mm_comlt_epi32 (b[i], c[i]);
   9         a[i] = _mm_cmov_si128 (b[i], c[i], test);
  10     }
  11 }
  12 
  13 #pragma GCC initial-options
  14 #pragma GCC sse4_1
  15 #include <smmintrin.h>
  16 
  17 void sse4_1_min (__m128i *a, __m128i *b, __m128i *c, int n) {
  18     int i;
  19     for (i = 0; i < n; i++) {
  20         a[i] = _mm_min_epi32 (b[i], c[i]);
  21     }
  22 }
  23 
  24 #pragma GCC pop-options
  25 void generic_min (__m128i *a, __m128i *b, __m128i *c, int n) {
  26     int i;
  27     int n_int = 4 * n;
  28     int *a_int = (int *) a;
  29     int *b_int = (int *) b;
  30     int *c_int = (int *) c;
  31     for (i = 0; i < n_int; i++) {
  32         a_int[i] = (b_int[i] < c_int[i]) ? b_int[i] : c_int[i];
  33     }
  34 }
  35 void do_min (__m128i *a, __m128i *b, __m128i *c, int n) {
  36     if (HAVE_SSE5) {
  37         sse5_min (a, b, c, n);
  38     } else if (HAVE_SSE4_1) {
  39         sse4_1_min (a, b, c, n);
  40     } else {
  41         generic_min (a, b, c, n);
  42     }
  43 }

Stage1: Example using attribute

Here is an example of how you might use target specific functions using attributes. It uses the GCC intrinsics. The code calculates a minimum of a vector of 32-bit signed integers, using the pcomd and pcmov instructions under SSE5 and the pminsd instruction under SSE4.1.

   1 typedef int __v4si __attribute__ ((__vector_size__ (16), __may_alias__));
   2 void sse5_min (__v4si *, __v4si *, __v4si *, int) __attribute__ ((__sse5__));
   3 void sse4_1_min (__v4si *, __v4si *, __v4si *, int) __attribute__ ((__sse4_1__));
   4 void generic_min (__v4si *, __v4si *, __v4si *, int);
   5 void sse5_min (__v4si *a, __v4si *b, __v4si *c, int n) {
   6     int i;
   7     for (i = 0; i < n; i++) {
   8         __v4si test = __builtin_ia32_pcomltd (b[i], c[i]);
   9         a[i] = __builtin_ia32_pcmov_v4si (b[i], c[i], test);
  10     }
  11 }
  12 void sse4_1_min (__v4si *a, __v4si *b, __v4si *c, int n) {
  13     int i;
  14     for (i = 0; i < n; i++) {
  15         a[i] = __builtin_ia32_pminsd (b[i], c[i]);
  16     }
  17 }
  18 void generic_min (__v4si *a, __v4si *b, __v4si *c, int n) {
  19     int i;
  20     int n_int = 4 * n;
  21     int *a_int = (int *) a;
  22     int *b_int = (int *) b;
  23     int *c_int = (int *) c;
  24     for (i = 0; i < n_int; i++) {
  25         a_int[i] = (b_int[i] < c_int[i]) ? b_int[i] : c_int[i];
  26     }
  27 }
  28 void do_min (__v4si *a, __v4si *b, __v4si *c, int n) {
  29     if (HAVE_SSE5) {
  30         sse5_min (a, b, c, n);
  31     } else if (HAVE_SSE4_1) {
  32         sse4_1_min (a, b, c, n);
  33     } else {
  34         generic_min (a, b, c, n);
  35     }
  36 }

Stage1: Work items

This section is an attempt to break down the stage1 work into smaller chunks, with separate deliverables.

Stage1: Create a branch.

A subversion branch will be created at the FSF to host this project. All work will be done in this branch. All people contributing to this branch must have the appropriate FSF paperwork so that their work can be incorporated into the mainstream GCC. All FSF coding guidelines will be used. Merges from the mainline will occur at least monthly. It will take 1 day to create the branch. It is anticipated that each merge will take 1 day, including any updates to the target specific work that are needed.

Stage1: Move command line options into a global structure.

Currently, each individual command line option is a separate external variable. This work item will modify the opt*.awk scripts so that all of the options are collected into one global structure. Each field will be an option, and each name that previously was a global variable will be a *#define* so that the rest of the compiler will not need source modifications. I expect this work item to take about 1 week of time. When the 4.4 tree opens up, this work item will be migrated to the mainline.

Stage1: Add target hook support for changing options

We will add target hooks that allow the backend to be notified when the user issues a *#pragma* or *attribute* that changes the current set of optimization and warning options. In addition, in the ix86 backend, we will add the ix86 support for the various *-msse* type options. I expect this to take 2 weeks of work.

Stage1: Add #pragma support

We will add the necessary *#pragma* support to add function specific optimizations, calling the appropriate target hooks where needed, pushing/popping the options as needed. I expect this to take 3 weeks of work.

Stage1: Add attribute support

Once #pragma support is added, the same work will be done for attributes. I would expect this to take 1 week of time, since the #pragma support will have ironed out the bugs.

Stage1: Add support in the tree/RTL structure for remembering the options used

We will add support in the tree and RTL structures for remembering what the current options are. This work should interface with the LTO team so that these options can be saved and used as part of the LTO work. I would estimate that it will take 4 weeks of investigation and negotiation with the other groups to come up with a workable design. The design should be general enough so that in the future, if desired, we can have if {...} blocks that use different compilation options than the main function.

Stage1: Teach the inliner about target specific functions

We will teach the inliner not to inline functions compiled with target specific optimizations inside of a general function. However, a function that has target specific optimizations should be able to inline normal functions, or functions compiled with the same set of target specific optimizations. I estimate that this should take 2 weeks of time.

Stage1: Convert ix86 intrinsics to know about target specific optimizations

We will rewrite the ix86 intrinsic code so that all intrinsics are added to the symbol table at compiler startup, but when the intrinsic is invoked, it will check whether the current compilation options allow it to be generated. I estimate this will take 2 weeks of time.

Stage1: Ix86 preprocessor macro support

The ix86 backend will define/undefine the appropriate processor specific macros (like SSE) based on the current function optimization options. It is anticipated that the ix86 backend will do this in the target hook created above, and there may be some modifications to the preprocessor. I estimate that this will take 2 weeks of time.

Stage1: Merge into mainline

Assuming all of this works, it will be merged into the mainline in pieces. I anticipate that this may take 4 weeks of effort.

Stage2: Details of compiling a single function multiple times manually

Stage2: Example

If you have a function declared as a clone, such as:

   1 void my_min (int *, int *, int *, int) __attribute__((__clone__));
   2 void my_min (int *a, int *b, int *c, int n) {
   3     int i;
   4     for (i = 0; i < n; i++) {
   5         a[i] = (b[i] < c[i]) ? b[i] : c[i];
   6     }
   7 }

The compiler would logically generate code that would be equivalent to:

   1 static void __do_cpuid (void) __attribute__ ((__constructor__));
   2 static void my_min__clone_generic (int *, int *, int *, int);
   3 static void my_min__clone_sse5 (int *, int *, int *, int) __attribute__((__sse5__));
   4 static void my_min__clone_sse4_1 (int *, int *, int *, int) __attribute__((__sse4_1__));
   5 static void (*my_min__clone_ptr)(int *, int *, int *, int) = my_min__clone_generic;
   6 static void __do_cpuid (void) {
   7     int have_sse5;
   8     int have_sse4_1;
   9     /* code to initialize have_sse5 and have_sse4_1 via CPUID.  */
  10     /* Update all clone pointers generated in this module */
  11     if (have_sse5) {
  12        my_min__clone_ptr = my_min__clone_sse5;
  13     } else if (have_sse4_1) {
  14         my_min__clone_ptr = my_min__clone_sse4_1;
  15     } else {
  16         my_min__clone_ptr = my_min__clone_generic;
  17     }
  18 }
  19 void my_min (int *a, int *b, int *c, int n) {
  20     (* my_min__clone_ptr) (a, b, c, n);
  21 }
  22 static void my_min__clone_generic (int *a, int *b, int *c, int n) {
  23     int i;
  24     for (i = 0; i < n; i++) {
  25         a[i] = (b[i] < c[i]) ? b[i] : c[i];
  26     }
  27 }
  28 /* compile with -msse5 as per the attribute in the declaration.  */
  29 static void my_min__clone_sse5 (int *a, int *b, int *c, int n) {
  30     int i;
  31     for (i = 0; i < n; i++) {
  32         a[i] = (b[i] < c[i]) ? b[i] : c[i];
  33     }
  34 }
  35 /* compile with -msse4.1 as per the attribute in the declaration.  */
  36 static void my_min__clone_sse4_1 (int *a, int *b, int *c, int n) {
  37     int i;
  38     for (i = 0; i < n; i++) {
  39         a[i] = (b[i] < c[i]) ? b[i] : c[i];
  40     }
  41 }

Stage3: Compile functions with multiple different options automatically

Stage3: Objective of compiling a single function multiple times automatically

Branch


CategoryTemplate

None: FunctionSpecificOpt (last edited 2008-01-10 19:39:00 by localhost)