Index: vectorization.html =================================================================== RCS file: /cvs/gcc/wwwdocs/htdocs/projects/tree-ssa/vectorization.html,v retrieving revision 1.4 diff -c -3 -p -r1.4 vectorization.html *** vectorization.html 17 Apr 2004 22:22:18 -0000 1.4 --- vectorization.html 21 Jul 2004 13:45:51 -0000 *************** *** 1,89 **** Auto-vectorization in GCC ! !

Auto-vectorization in GCC:
! A preliminary high level plan of implementation

! !

This document outlines a high level plan for the ! implementation of auto-vectorization in the tree-ssa branch of GCC. The main goals of this ! document are to:

!

This document tries to serve the above goals by,

!
    !
  1. providing a list all the issues that vectorization may ! need to handle,
  2. !
  3. breaking the implementation into independent tasks as ! much as possible, and
  4. !
  5. proposing a gradual implementation that consists of a ! very limited vectorizer as a first step, which will be ! gradually extend in different directions.
  6. !

Related discussion threads: http://gcc.gnu.org/ml/gcc/2003-07/msg01309.html.
! This web page is managed by Dorit Naishlos <dorit at il dot ibm dot com>.

!

General Vectorization Scheme

The table below outlines the high level vectorization scheme along with a proposal for an implementation scheme, as --- 1,804 ---- Auto-vectorization in GCC ! !

Auto-vectorization in GCC

! !

The goal of this project is to develop a loop vectorizer in ! GCC, based on the tree-ssa framework. This ! work is taking place in the lno-branch.

! !

Table of Contents

!

Using the Vectorizer

!

Vectorization is enabled by the flag -ftree-vectorize. To ! allow vectorization on powerpc platforms also use -maltivec. On ! i?86 and x86_64 platforms use -msse/-msse2. The vectorizer test ! cases demonstrate the currrent vectorization capabilities; ! these can be found under ! gcc/gcc/testsuite/gcc.dg/tree-ssa-vect/. Information on which ! loops were or were not vectorized and why, can be obtained ! using the flag -fdump-tree-vect-stats. For example, this is the ! output that the vectorizer generated for test case ! tree-ssa-vect-1.c:

!
! loop at tree-ssa-vect-1.c:35: not vectorized: unsupported scalar cycle.
! loop at tree-ssa-vect-1.c:45: not vectorized: multi-dimensional array.
! loop at tree-ssa-vect-1.c:53: LOOP VECTORIZED.
! loop at tree-ssa-vect-1.c:60: LOOP VECTORIZED.
! loop at tree-ssa-vect-1.c:67: not vectorized: complicated access pattern.
! loop at tree-ssa-vect-1.c:75: LOOP VECTORIZED.
! loop at tree-ssa-vect-1.c:86: not vectorized: mixed data-types
! loop at tree-ssa-vect-1.c:95: not vectorized: dependence between refs to same array: a
! vectorized 3 loops in function.
! 
! !

Vectorizable ! Loops

! !

Examples of loops that can currently be vectorized

!
! example1:
! int a[256], b[256], c[256];
! foo () {
!   for (i=0; i<256; i++){
!     a[i] = b[i] + c[i];
!   }
! } 
! 
! example2:
! int a[256], b[256], c[256];
! foo (int n, int x) {
! 
!    /* new feature: support for unknown loop bound  */
!    /* new feature: support for loop invariants  */
!    for (i=0; i<n; i++)
!       b[i] = x;
!    }
! 
!    /* new feature: general loop exit condition  */
!    /* new feature: support for bitwise operations  */
!    while (n--){
!       a[i] = b[i]&c[i]; i++;
!    }
! }
! 
! example3:
! typedef int aint __attribute__ ((__aligned__(16)));
! foo (int n, aint * __restricted__ p, aint * __restricted q) {
! 
!    /* new feature: support for pointer accesses.  */
!    while (n--){
!       *p++ = *q++;
!    }
! }
! 
! example4 (on apple-ppc-branch only):
! typedef int aint __attribute__ ((__aligned__(16)));
! int a[256], b[256], c[256];
! foo (int n, aint * __restricted__ p, aint * __restricted__ q) {
! 
!    /* features: support for pointer accesses and constants  */
!    while (n--){
!       *p++ = *q++ + 5;
!    }
! 
!    /* new feature: support for read accesses with a compile time known misalignment  */
!    for (i=0; i<n; i++){
!       a[i] = b[i+1] + c[i+3];
!    }
! 
!    /* new feature: support for if-conversion  */
!    for (i=0; i<n; i++){
!       j = a[i];
!       b[i] = (j > MAX ? MAX : 0);
!    }
! }
! 
! example5:
! for (i = 0; i < N; i++)
!   {
!     /* new feature: struct access  */
!     s.ca[i] = 5;
!   }
! 
!
! example6 (gfortran):DIMENSION A(1000000), B(1000000), C(1000000)
! READ*, X, Y
! A = LOG(X); B = LOG(Y); C = A + B
! PRINT*, C(500000)
! END
! 
!
! example7 (on apple-ppc-branch only):
! int a[256], b[256];
! foo (int x) {
! 
!    /* new feature: support for read accesses with an unknown misalignment  */
!    for (i=0; i<N; i++){
!       a[i] = b[i+x]; 
!    }
! }
! 
! example8 (on apple-ppc-branch only):
! typedef unsigned char auchar __attribute__ ((__aligned__(16)));
! foo (auchar * __restricted__ srcA,
!      auchar * __restricted__ srcB,
!      auchar * __restricted__ dest, unsigned char alpha){
!   unsigned char not_alpha = 255 - alpha;
! 
!   /* new feature: pattern recognition  */
!   for (i = 0; i < N; i++){
!      unsigned short a1 = srcA[i] * alpha;
!      unsigned short a2 = srcB[i] * not_alpha;
!      unsigned char b1 = a1 >> 8;
!      unsigned char b2 = a2 >> 8;
!      dest[i] = b1 + b2;
!   }
! }
! 
! !

Unvectorizable ! Loops

! !

Examples of loops that currently cannot be vectorized:

!
! example1: uncountable loop:
! while (*p != NULL) {
!   *q++ = *p++;
! }
! 
!
! example2: reduction, induction:
! diff = 0;
! for (i = 0; i < N; i++) {
!   diff += (a[i] - b[i]);
! }
! 
! for (i = 0; i < N; i++) {
!   a[i] = i;
! }
! 
!
! example3: strided access (interleaved data):
! for (i = 0; i < N/2; i++){
!   a[i] = b[2*i+1] * c[2*i+1] - b[2*i] * c[2*i];
!   d[i] = b[2*i] * c[2*i+1] + b[2*i+1] * c[2*i];
! }
! 
!
! example4: multiple data-types, type casting:
! short *sa, *sb, *sc;
! int *ia, *ib, *ic;
! for (i = 0; i < N; i++) {
!   ia[i] = ib[i] + ic[i];
!   sa[i] = sb[i] + sc[i];
! }
! 
! for (i = 0; i < N; i++) {
!   ia[i] = (int) sb[i];
! }
! 
! !

Status

! !
!
The main development branch is the lno-branch. In some ! cases, a certain functionality is first implemented in ! apple-ppc-branch, as a preliminary step before committing ! it to lno-branch in a more generic form.
!
!
!
2004-07-20, lno-branch
!
! Description of vectorizable loops: ! !
    !
  1. Inner most loops, that consist of a single ! basic block (i.e, straight-line code, no ! if-then-else).
  2. ! !
  3. The loop has to be countable - i.e, the number ! of iterations can be evaluated before the loop ! starts to execute.
  4. ! !
  5. The loop bound (number of iterations) can be ! unknown.
  6. ! !
  7. New: Supported memory accesses ! are one-dimensional arrays, which base can ! be a struct field, and which alignment can ! be forced (not extern arrays), and aligned pointers ! that are annotated as __restricted__.
  8. ! !
  9. All memory accesses are consecutive (stride=1) ! and aligned.
  10. ! !
  11. Supportable operations include plus/minus/mult, ! as well as bitwise operations - ! and/or/xor/1's-complement, according to available ! vector support in the target platform.
  12. ! !
  13. All operations operate on data types of the ! same size.
  14. ! !
  15. No reductions (sum += a[i]) or inductions (a[i] ! = i).
  16. ! !
  17. Constants and invariants are supported (a[i] = ! 5, a[i] = x).
  18. ! !
  19. New: first gfortran program ! vectorized.
  20. !
! !

Examples of newly vectorizable loops: loop, loop

! !

New test-cases:

! ! !
!
! !
!
2004-06-25, apple-ppc-branch
! !
! Description of vectorizable loops: ! !
    !
  1. Inner most loops, that consist of a single ! basic block (i.e, straight-line code, no ! if-then-else).
  2. ! !
  3. The loop has to be countable - i.e, the number ! of iterations can be evaluated before the loop ! starts to execute.
  4. ! !
  5. The loop bound (number of iterations) can be ! unknown.
  6. ! !
  7. Supported memory accesses are one-dimensional ! arrays, and pointers. If more than one memory ! access is present in the loop, any pointers that ! are used in the loop have to be annotated as ! __restricted__.
  8. ! !
  9. Store (memory-write) accesses have to be ! aligned.
  10. ! !
  11. New: Loads (memory-reads) can ! be unaligned by an unknown amount (e.g. ! access a[i+x], where the value of x is ! unknown). Misalignment support for ! loads was also made more efficient.
  12. ! !
  13. All memory accesses are consecutive ! (stride=1).
  14. ! !
  15. Supportable operations include plus/minus/mult, ! as well as bitwise operations - ! and/or/xor/1's-complement, according to available ! vector support in the target platform.
  16. ! !
  17. All operations operate on data types of the ! same size.
  18. ! !
  19. Some forms of if-the-else patterns can be ! vectorized.
  20. ! !
  21. New: Infrastructure for idiom ! recognition is added. The first computation idiom ! that is recognized and vectorized is a ! multiplication of unsigned-chars, which (unsigned ! short) product is converted back to unsigned char ! (a similar computation comes up in pixel blending; ! we support a simplified version that does not ! require operating on different data ! types).
  22. ! !
  23. No reductions (sum += a[i]) or inductions (a[i] ! = i).
  24. ! !
  25. Constants and invariants are supported (a[i] = ! 5, a[i] = x). New: Support for invariants ! was made more efficient.
  26. !
! !

Examples of newly vectorizable loops: loop, loop

!
!
! !
!
2004-06-23, lno-branch
! !
! Description of vectorizable loops: ! !
    !
  1. Inner most loops, that consist of a single ! basic block (i.e, straight-line code, no ! if-then-else).
  2. ! !
  3. The loop has to be countable - i.e, the number ! of iterations can be evaluated before the loop ! starts to execute.
  4. ! !
  5. The loop bound (number of iterations) can be ! unknown.
  6. ! !
  7. Supported memory accesses are one-dimensional ! arrays, which alignment can be forced (not extern ! arrays).
  8. ! !
  9. New: Memory accesses can also be ! pointer based. If more than one memory access is ! present in the loop, any pointers that are used in ! the loop have to be annotated as __restricted__. ! The pointers have to point to an aligned ! address.
  10. ! !
  11. All memory accesses are consecutive (stride=1) ! and aligned.
  12. ! !
  13. Supportable operations include plus/minus/mult, ! as well as bitwise operations - ! and/or/xor/1's-complement, according to available ! vector support in the target platform.
  14. ! !
  15. All operations operate on data types of the ! same size.
  16. ! !
  17. No reductions (sum += a[i]) or inductions (a[i] ! = i).
  18. ! !
  19. Constants and invariants are supported (a[i] = ! 5, a[i] = x).
  20. !
! !

Examples of newly vectorizable loops: loop

! !

New test-cases:

! ! !
!
! !
!
2004-06-17, apple-ppc-branch
! !
! Description of vectorizable loops: ! !
    !
  1. Inner most loops, that consist of a single ! basic block (i.e, straight-line code, no ! if-then-else).
  2. ! !
  3. The loop has to be countable - i.e, the number ! of iterations can be evaluated before the loop ! starts to execute.
  4. ! !
  5. The loop bound (number of iterations) can be ! unknown.
  6. ! !
  7. Memory accesses are one dimensional-arrays, ! which alignment can be forced (not extern ! arrays).
  8. ! !
  9. New: Memory accesses can also be ! pointer based. If more than one memory access is ! present in the loop, any pointers that are used in ! the loop have to be annotated as __restricted__. ! (new experimental feature)
  10. ! !
  11. New: Loads (memory reads) can be ! unaligned by a known amount (e.g. access a[i+1], ! where array 'a' is aligned and i starts from ! 0). Stores (memory writes) still have to ! be aligned.
  12. ! !
  13. All memory accesses are consecutive ! (stride=1).
  14. ! !
  15. Supportable operations include plus/minus/mult, ! as well as bitwise operations - ! and/or/xor/1's-complement, according to available ! vector support in the target platform.
  16. ! !
  17. All operations operate on data types of the ! same size.
  18. ! !
  19. New: Some forms of if-the-else patterns ! can be vectorized. (new experimental ! feature).
  20. ! !
  21. No reductions (sum += a[i]) or inductions (a[i] ! = i).
  22. ! !
  23. Constants and invariants are supported (a[i] = ! 5, a[i] = x).
  24. !
! !

Examples of newly vectorizable loops: loop

!
!
! !
!
2004-06-04, lno-branch (and ! apple-ppc-branch)
! !
! Description of vectorizable loops: ! !
    !
  1. Inner most loops, that consist of a single ! basic block (i.e, straight-line code, no ! if-then-else).
  2. ! !
  3. The loop has to be countable - i.e, the number ! of iterations can be evaluated before the loop ! starts to execute.
  4. ! !
  5. New: No restrictions on the form of the ! loop index and the loop exit ! condition.
  6. ! !
  7. New: The loop bound (number of ! iterations) can be unknown. Currently ! known loop-bounds still need to be divisible by the ! vectorization factor.
  8. ! !
  9. All memory accesses are one dimensional-arrays, ! which alignment can be forced (not extern ! arrays).
  10. ! !
  11. All array accesses are consecutive and aligned; ! i,e. all the array references are of the form ! 'a[i]', where 'i' is updated from 0 to N in steps ! of 1.
  12. ! !
  13. New: Supportable operations ! include plus/minus/mult, as well as bitwise ! operations - and/or/xor/1's-complement, ! according to available vector support in the target ! platform.
  14. ! !
  15. All operations operate on data types of the ! same size.
  16. ! !
  17. No reductions (sum += a[i]) or inductions (a[i] ! = i).
  18. ! !
  19. New: Constants and invariants are ! supported (a[i] = 5, a[i] = x).
  20. !
! !

Examples of newly vectorizable loops: loop

! !

New test-cases:

! ! !
!
! !
!
2004-01-01, lno-branch
! !
! Description of vectorizable loops: ! !
    !
  1. Inner most loops, that consist of a single ! basic block (i.e, straight-line code, no ! if-then-else).
  2. ! !
  3. The loop has to be countable - i.e, the number ! of iterations can be evaluated before the loop ! starts to execute.
  4. ! !
  5. Known (constant) loop bound, divisible by the ! vectorization factor.
  6. ! !
  7. The loop index 'i' is updated from 0 to N in ! steps of 1, and the loop exit condition is of the ! form "i<N".
  8. ! !
  9. All memory accesses are one dimensional-arrays, ! which alignment can be forced (not extern ! arrays).
  10. ! !
  11. All array accesses are consecutive (stride=1) ! and aligned.
  12. ! !
  13. Supportable operations include plus/minus/mult, ! according to available vector support in the target ! platform.
  14. ! !
  15. All operations operate on data types of the ! same size.
  16. ! !
  17. No reductions (sum += a[i]) or inductions (a[i] ! = i).
  18. ! !
  19. No Constants or invariants in the loop (a[i] = ! 5).
  20. !
! !

Examples of vectorizable loops: loop

! !

New test-cases:

! ! !
!
! !

Related Links

Related discussion threads: http://gcc.gnu.org/ml/gcc/2003-07/msg01309.html.

!

GCC summit paper: http://people.redhat.com/lockhart/.gcc04/MasterGCC-2side.pdf

! !

This web page is managed by Dorit Naishlos <dorit at il dot ibm dot com>.

!

In The Works

! !

Current contributors to this project include Olga ! Golovanevsky, Devang Patel, Ayal Zaks and Dorit Naishlos. ! Currently in the works:

! !
    !
  1. Merge the vectorizer from lno-branch to mainline
  2. ! !
  3. Port new functionality from apple-ppc-branch to ! lno-branch
  4. ! !
  5. Loop peeling to force alignment of accesses in the ! loop.
  6. ! !
  7. Loop versioning, to create a version of the loop in ! which all accesses are aligned.
  8. ! !
  9. Support for reduction and induction
  10. !
! !

High-Level Plan of ! Implementation

The table below outlines the high level vectorization scheme along with a proposal for an implementation scheme, as *************** *** 104,111 **** vectorizer on candidate loops. Loops that are considered vectorizable by the basic-vectorizer are of the form: for(i=0; i<N; i++) {a[i] = b[i] + ! c[i]; }. The "basic vectorizer" is implemented ! by  Dorit Naishlos.

  • --- 819,826 ---- vectorizer on candidate loops. Loops that are considered vectorizable by the basic-vectorizer are of the form: for(i=0; i<N; i++) {a[i] = b[i] + ! c[i]; }. The "basic vectorizer" is implemented by ! Dorit Naishlos.

  • *************** *** 259,265 ****
  • Support operations that can't be expressed using existing (scalar) tree-codes; this requires developing  a mechanism to expose to the compiler that such operations are supportable.
  • --- 974,980 ----
  • Support operations that can't be expressed using existing (scalar) tree-codes; this requires developing a mechanism to expose to the compiler that such operations are supportable.
  • *************** *** 436,442 **** !

    List of Vectorization Related Tasks

    The following is a list of independent directions by which the basic vectorizer can be enhanced. It should be possible for --- 1151,1158 ---- !

    List of Vectorization Related ! Tasks (Todo List)

    The following is a list of independent directions by which the basic vectorizer can be enhanced. It should be possible for *************** *** 511,517 **** integers, etc.).

    Status: Open for discussion. Contact: Dorit ! Naishlos; Martin Brain.
    Related discussion: http://gcc.gnu.org/ml/gcc-patches/2003-09/msg00469.html

    --- 1227,1233 ---- integers, etc.).

    Status: Open for discussion. Contact: Dorit ! Naishlos.
    Related discussion: http://gcc.gnu.org/ml/gcc-patches/2003-09/msg00469.html

    *************** *** 550,557 **** time tests to decide which version of the loop to execute (scalar or vectorized).

    !

    Status: Open for discussion. Contact: Martin ! Brain.
    Related discussion: http://gcc.gnu.org/ml/gcc-patches/2003-09/msg00469.html

    --- 1266,1273 ---- time tests to decide which version of the loop to execute (scalar or vectorized).

    !

    Status: Open for discussion. Contact:Dorit ! Naishlos.
    Related discussion: http://gcc.gnu.org/ml/gcc-patches/2003-09/msg00469.html

    *************** *** 695,701 ****

    Status: Currently, the access function is determined using the evolution analyzer. Statements ! that are only used for address computations will bw detected and handled accordingly. Contact: Dorit Naishlos.

    --- 1411,1417 ----

    Status: Currently, the access function is determined using the evolution analyzer. Statements ! that are only used for address computations will be detected and handled accordingly. Contact: Dorit Naishlos.

    *************** *** 789,797 **** the RTL level. !

    Status: Some of this functionality can be ! ported from the rtlopt branch. Contact: Steven ! Bosscher.

  • --- 1505,1514 ---- the RTL level.
  • !

    Status: Preliminary implementation of some of ! this functionality is in apple-ppc-branch. Peeling and ! versioning for alignment are in the works. Contact: ! Dorit Naishlos, Olga Golovanevsky.

  • *************** *** 838,849 **** "http://gcc.gnu.org/ml/gcc/2003-07/msg01355.html"> http://gcc.gnu.org/ml/gcc/2003-07/msg01355.html): !

     for (n = 0; n < HASH_SIZE; n++) {
    !   m = head[n];
    !   head[n] = (Pos)(m >= 32768 ? ! m-32768 : 0);
    !  }

  • Detect saturation idioms, and map them to the --- 1555,1566 ---- "http://gcc.gnu.org/ml/gcc/2003-07/msg01355.html"> http://gcc.gnu.org/ml/gcc/2003-07/msg01355.html): !

    for (n = 0; n < HASH_SIZE; n++) {
    ! m = head[n];
    ! head[n] = (Pos)(m >= 32768 ? m-32768 : ! 0);
    ! }

  • Detect saturation idioms, and map them to the *************** *** 892,901 ****
    1. Support general loop bound (unknown, or doesn't ! divide by the vector size).
    2. Support more complex forms of loop termination ! condition and loop index update.
    3. Support outer-loop vectorization (unroll and jam).
    4. --- 1609,1619 ----
      1. Support general loop bound (unknown, or doesn't ! divide by the vector size). [done; contact: Olga ! Golovanevsky]
      2. Support more complex forms of loop termination ! condition and loop index update. [done]
      3. Support outer-loop vectorization (unroll and jam).
      4. *************** *** 929,937 ****
      5. Aliasing and Virtual def-use Chains

        !

         Address the item from the tree-ssa ! todo list - "SSA information for arrays : The existing implementation treats arrays as an opaque object. A definition to an array location is treated as a definition for the whole array"

        --- 1647,1655 ----
      6. Aliasing and Virtual def-use Chains

        !

        Address the item from the tree-ssa ! todo list - "SSA information for arrays : The existing implementation treats arrays as an opaque object. A definition to an array location is treated as a definition for the whole array"

        *************** *** 990,998 ****
      7. loop unswitching.
      8. !

        Status: Some of this functionality can be ! ported from the rtlopt branch. Daniel Berlin is working ! on loop interchange.

      9. --- 1708,1715 ----
      10. loop unswitching.
      11. !

        Status:Linear loop transformations are ! implemented by Daniel Berlin.