[PATCH] Function specific optimization for mainline
Michael Meissner
michael.meissner@amd.com
Wed Jun 11 21:21:00 GMT 2008
In the next 4 patches, I will be submitting the changes for the function
specific optimizations that Karthik Kumar <karthikkumar@gmail.com> and I have
been working on.  These changes allow 386 users to change which compiler
options are used on a particular function.
I want to thank Karthik for his work on this project, and the pre-reviewers
who helped me refine the work.
The patches will cover the machine independent changes, the x86 specific
changes, the documentation changes, and the new testsuite files that I and
Karthik wrote.
The twiki for the project is at the URL below; I have updated it to reflect
the current syntax and work:
http://gcc.gnu.org/wiki/FunctionSpecificOpt
These patches are for the first level of the work, which adds function specific
optimization attributes for a given function. I hope to get to the second level
shortly, to add the #pragma syntax as well as the attributes.
Here is a simple example that multiplies two vectors and adds a third. If you
compile it in 32-bit and with -O3, it will show off the utility of function
specific optimization.
The compiler will generate normal 387 code for the fma_generic function (flds,
fmuls, fadds, and fstps, because the 32-bit ABI predates the SSE2 instructions).
The fma_sse2 function will contain the parallel multiplies and adds (movaps,
mulps, addps) that are part of the SSE2 instruction set.
The fma_sse5 function will contain the fused multiply/add (movaps, fmaddps)
that are part of the SSE5 extended instruction set.
/*
* Test program to demonstrate function specific optimization options.
*/
#include <time.h>
#include <unistd.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include "cpuid.h"
/* Alignment attribute applied to the benchmark arrays.  Requests 32-byte
   alignment unless ALIGN was already defined before this point.  */
#ifndef ALIGN
#define ALIGN __attribute__((__aligned__(32)))
#endif
/* Marks a declaration as intentionally unused; applied to main's
   parameters.  */
#ifndef UNUSED
#define UNUSED __attribute__((__unused__))
#endif
/* Number of elements in each benchmark array.  The default, 2 << 26, is
   134,217,728 elements; it can be overridden by defining SIZE beforehand
   (for example with -DSIZE=...).  */
#ifndef SIZE
#define SIZE (2 << 26)
#endif
/* a receives the results; b, c and d are the operands of
   a[i] = (b[i] * c[i]) + d[i].  */
static float a[SIZE] ALIGN;
static float b[SIZE] ALIGN;
static float c[SIZE] ALIGN;
static float d[SIZE] ALIGN;
/* Forward declarations.  fma_sse2 and fma_sse5 carry the function specific
   __option__ attribute, naming the instruction set to enable for that one
   function only.  */
static void initialize (void);
static int have_sse2_instructions (void);
static int have_sse5_instructions (void);
static void fma_generic (void);
static void fma_sse2 (void) __attribute__((__option__("sse2")));
static void fma_sse5 (void) __attribute__((__option__("sse5")));
/*
 * Return a pseudo-random operand in the range [1.0, 101.0].
 *
 * rand () is used rather than random () because RAND_MAX is the documented
 * upper bound of rand ().  Scaling random ()'s result by RAND_MAX only gives
 * a value in [0, 1] on systems where the two ranges happen to coincide.
 */
static float
random_operand (void)
{
  return (float) ((((double) rand ()) / (double) RAND_MAX) * 100.0 + 1.0);
}

/*
 * Clear the result array a and fill the operand arrays b, c and d with
 * pseudo-random values, one element at a time, for all SIZE elements.
 */
static void
initialize (void)
{
  int i;

  for (i = 0; i < SIZE; i++)
    {
      a[i] = 0.0f;
      b[i] = random_operand ();
      c[i] = random_operand ();
      d[i] = random_operand ();
    }
}
/*
 * Report whether the host CPU advertises SSE5 support.
 *
 * Queries CPUID leaf 0x80000001 and tests the bit_SSE5 flag in ECX.
 * Returns 1 when the flag is set, and 0 when it is clear or when
 * __get_cpuid reports failure for that leaf.
 */
static int
have_sse5_instructions (void)
{
  unsigned int eax, ebx, ecx, edx;
  int leaf_ok = __get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx);

  /* Only look at ECX when the query succeeded.  */
  return (leaf_ok && (ecx & bit_SSE5) != 0) ? 1 : 0;
}
/*
 * Report whether the host CPU advertises SSE2 support.
 *
 * Queries CPUID leaf 1 and tests the bit_SSE2 flag in EDX.  Returns 1 when
 * the flag is set, and 0 when it is clear or when __get_cpuid reports
 * failure for that leaf.
 */
static int
have_sse2_instructions (void)
{
  unsigned int eax, ebx, ecx, edx;
  int leaf_ok = __get_cpuid (1, &eax, &ebx, &ecx, &edx);

  /* Only look at EDX when the query succeeded.  */
  return (leaf_ok && (edx & bit_SSE2) != 0) ? 1 : 0;
}
/*
 * Baseline multiply/accumulate kernel: a[k] = (b[k] * c[k]) + d[k] for every
 * element, built with the default compiler options.  In 32-bit mode this is
 * done on the 387 floating point stack and is not vectorized.
 */
static void
fma_generic (void)
{
  int k;

  for (k = 0; k < SIZE; k++)
    {
      a[k] = (b[k] * c[k]) + d[k];
    }
}
/*
 * Same multiply/accumulate kernel as the generic version, but declared with
 * the "sse2" option attribute.  With SSE2 enabled the multiplies and adds can
 * be vectorized to handle 4 elements at a time via mulps/addps.
 */
static void
fma_sse2 (void)
{
  int k;

  for (k = 0; k < SIZE; k++)
    {
      a[k] = (b[k] * c[k]) + d[k];
    }
}
/*
 * Same multiply/accumulate kernel as the generic version, but declared with
 * the "sse5" option attribute.  With SSE5 enabled the loop can be vectorized
 * 4 elements at a time, with each multiply and add fused into a single
 * fmaddps instruction.
 */
static void
fma_sse5 (void)
{
  int k;

  for (k = 0; k < SIZE; k++)
    {
      a[k] = (b[k] * c[k]) + d[k];
    }
}
/*
 * Run FUNC once and report how much processor time it used.
 *
 * STR is the label printed before the run and again alongside the result.
 * The elapsed time is the difference between two clock () readings taken
 * immediately around the call, converted to seconds with CLOCKS_PER_SEC.
 */
void
time_test (void (*func) (void), const char *str)
{
  clock_t before;
  clock_t after;
  double seconds;

  printf ("\n%s start\n", str);

  before = clock ();
  func ();
  after = clock ();

  seconds = ((double) (after - before)) / ((double) CLOCKS_PER_SEC);
  printf ("%s time is %g\n", str, seconds);
}
/*
 * Run the tests.
 *
 * Times the initialization and the generic kernel unconditionally.  The SSE2
 * and SSE5 kernels are timed only when the matching CPUID check reports that
 * the host supports them; otherwise a message naming the missing instruction
 * set is printed.  Always returns 0.
 *
 * argv is declared as char *[] rather than const char *[]: that is the form
 * of main specified by ISO C, and the const-qualified variant draws a
 * diagnostic from GCC.  Neither parameter is used.
 */
int
main (int argc UNUSED, char *argv[] UNUSED)
{
  time_test (initialize, "Initialize");
  time_test (fma_generic, "Generic");

  if (have_sse2_instructions ())
    time_test (fma_sse2, "Sse2");
  else
    printf ("\nMachine does not have sse2 instructions.\n");

  if (have_sse5_instructions ())
    time_test (fma_sse5, "Sse5");
  else
    printf ("\nMachine does not have sse5 instructions.\n");

  return 0;
}
--
Michael Meissner, AMD
90 Central Street, MS 83-29, Boxborough, MA, 01719, USA
michael.meissner@amd.com
More information about the Gcc-patches
mailing list