[PATCH] Function specific optimization for mainline

Michael Meissner michael.meissner@amd.com
Wed Jun 11 21:21:00 GMT 2008


In the next 4 patches, I will be submitting the changes for the function
specific optimizations that Karthik Kumar <karthikkumar@gmail.com> and I have
been working on.  These changes allow 386 users to change what compiler options
are used on a particular function.

I want to thank Karthik for his work on this project, and the pre-reviewers
who helped me refine the work.

The patches will cover the machine independent changes, the x86 specific
changes, the documentation changes, and the new testsuite files that I and
Karthik wrote.

The wiki page for the project, which I have updated to the current syntax and
work, is at:
http://gcc.gnu.org/wiki/FunctionSpecificOpt

These patches are for the first level of the work, which adds function specific
optimization attributes for a given function.  I hope to get to the second
level shortly, to add the #pragma syntax in addition to the attributes.

Here is a simple example that multiplies two vectors and adds a third.  If you
compile it in 32-bit and with -O3, it will show off the utility of function
specific optimization.

The compiler will generate normal 387 code for the fma_generic function (flds,
fmuls, fadds, and fstps, because the 32-bit ABI predates the SSE2 instructions).

The fma_sse2 function will contain the parallel multiplies and adds (movaps,
mulps, addps) that are part of the SSE2 instruction set.

The fma_sse5 function will contain the fused multiply/add (movaps, fmaddps)
that are part of the SSE5 extended instruction set.

/*
 * Test program to demonstrate function specific optimization options.
 */

#include <time.h>
#include <unistd.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include "cpuid.h"

/* Alignment attribute applied to the test arrays.  Guarded by #ifndef so it
   can be overridden from the command line (e.g. -DALIGN=...).  */
#ifndef ALIGN
#define ALIGN __attribute__((__aligned__(32)))
#endif

/* Marks a parameter as intentionally unused (used on main's arguments).  */
#ifndef UNUSED
#define UNUSED __attribute__((__unused__))
#endif

/* Number of elements in each test array.  The default, 2 << 26, is 2^27
   (134217728) elements; assuming 4-byte floats that is 512 MiB per array and
   2 GiB for the four arrays together, so -DSIZE=... may be needed on
   memory-constrained or 32-bit hosts.  */
#ifndef SIZE
#define SIZE (2 << 26)
#endif

/* a[] receives the results; b[], c[] and d[] are the operands of
   a[i] = (b[i] * c[i]) + d[i].  */
static float a[SIZE] ALIGN;
static float b[SIZE] ALIGN;
static float c[SIZE] ALIGN;
static float d[SIZE] ALIGN;

/* Forward declarations.  The three fma_* functions have identical bodies;
   fma_sse2 and fma_sse5 differ only in the __option__ attribute attached to
   their declarations here, which requests the named instruction set for that
   one function.  */
static void initialize (void);
static int have_sse2_instructions (void);
static int have_sse5_instructions (void);
static void fma_generic (void);
static void fma_sse2 (void) __attribute__((__option__("sse2")));
static void fma_sse5 (void) __attribute__((__option__("sse5")));

/*
 * Prepare the test data: clear the result array a[] and fill the operand
 * arrays b[], c[] and d[] with pseudo-random values in the range
 * [1.0, 101.0].  The generator is not seeded here, so it runs from its
 * default state.
 */
static void
initialize (void)
{
  /* random () is specified to return a value in [0, 2^31 - 1].  RAND_MAX is
     the bound for rand (), not random (), and may be much smaller on some
     systems, which would push the scaled values far outside the intended
     range.  Scale by the real maximum of random () instead.  */
  const double random_max = 2147483647.0;
  int i;

  for (i = 0; i < SIZE; i++)
    {
      a[i] = 0.0f;
      b[i] = (float) (((((double) random ()) / random_max) * 100.0) + 1.0);
      c[i] = (float) (((((double) random ()) / random_max) * 100.0) + 1.0);
      d[i] = (float) (((((double) random ()) / random_max) * 100.0) + 1.0);
    }
}

/*
 * Report whether the SSE5 test may be run on this host.
 * Returns 1 when CPUID leaf 0x80000001 could be queried and the bit_SSE5
 * flag is set in ECX, and 0 otherwise.
 */
static int
have_sse5_instructions (void)
{
  unsigned int reg_a, reg_b, reg_c, reg_d;
  int supported = 0;

  /* A zero result from __get_cpuid is treated as "no SSE5 support".  */
  if (__get_cpuid (0x80000001, &reg_a, &reg_b, &reg_c, &reg_d))
    supported = (reg_c & bit_SSE5) != 0;

  return supported;
}

/*
 * Report whether the SSE2 test may be run on this host.
 * Returns 1 when CPUID leaf 1 could be queried and the bit_SSE2 flag is set
 * in EDX, and 0 otherwise.
 */
static int
have_sse2_instructions (void)
{
  unsigned int reg_a, reg_b, reg_c, reg_d;
  int supported = 0;

  /* A zero result from __get_cpuid is treated as "no SSE2 support".  */
  if (__get_cpuid (1, &reg_a, &reg_b, &reg_c, &reg_d))
    supported = (reg_d & bit_SSE2) != 0;

  return supported;
}

/*
 * Baseline multiply/accumulate kernel, built with the default options of the
 * translation unit: a[k] = (b[k] * c[k]) + d[k] for every element.  In 32-bit
 * mode this is expected to use the 387 floating point stack without
 * vectorization.
 */

static void
fma_generic (void)
{
  int k = 0;

  while (k < SIZE)
    {
      a[k] = (b[k] * c[k]) + d[k];
      k++;
    }
}

/*
 * Multiply/accumulate kernel declared with the "sse2" option attribute:
 * a[k] = (b[k] * c[k]) + d[k] for every element.  The attribute is meant to
 * let the compiler vectorize the loop, doing 4 operations at a time with the
 * mulps/addps instructions.
 */

static void
fma_sse2 (void)
{
  int k = 0;

  while (k < SIZE)
    {
      a[k] = (b[k] * c[k]) + d[k];
      k++;
    }
}

/*
 * Multiply/accumulate kernel declared with the "sse5" option attribute:
 * a[k] = (b[k] * c[k]) + d[k] for every element.  The attribute is meant to
 * let the compiler vectorize the loop, doing 4 operations at a time with the
 * multiply and add fused into a single fmaddps instruction.
 */

static void
fma_sse5 (void)
{
  int k = 0;

  while (k < SIZE)
    {
      a[k] = (b[k] * c[k]) + d[k];
      k++;
    }
}

/*
 * Run FUNC once and print the processor time it consumed, in seconds, as
 * measured by clock ().  STR is the label printed before and after the run.
 */

void
time_test (void (*func) (void), const char *str)
{
  clock_t began, finished;
  double seconds;

  printf ("\n%s start\n", str);

  began = clock ();
  func ();
  finished = clock ();

  /* Convert clock ticks to seconds.  */
  seconds = ((double) (finished - began)) / ((double) CLOCKS_PER_SEC);
  printf ("%s time is %g\n", str, seconds);
}

/*
 * Program entry point.  Times the data initialization and the generic
 * kernel, then times each instruction-set specific kernel only when the host
 * reports support for it; otherwise prints a message and skips that kernel.
 * The command line arguments are ignored and the exit status is always 0.
 */

int
main (int argc UNUSED, const char *argv[] UNUSED)
{
  time_test (initialize, "Initialize");
  time_test (fma_generic, "Generic");

  if (!have_sse2_instructions ())
    printf ("\nMachine does not have sse2 instructions.\n");
  else
    time_test (fma_sse2, "Sse2");

  if (!have_sse5_instructions ())
    printf ("\nMachine does not have sse5 instructions.\n");
  else
    time_test (fma_sse5, "Sse5");

  return 0;
}

-- 
Michael Meissner, AMD
90 Central Street, MS 83-29, Boxborough, MA, 01719, USA
michael.meissner@amd.com



More information about the Gcc-patches mailing list