//
// This code represents the fundamental 'convolution' algorithm which
// is central to image processing and is used by GraphicsMagick.  On
// x86 and AMD_64 CPUs, it is observed that -mfpmath=387 produces 2X
// the performance of -mfpmath=sse on a wide variety of CPUs.
// Unfortunately, -mfpmath=sse is the default for -m64 builds.
//
// Note that even with -mfpmath=387 it is observed that other
// compilers (LLVM, Open64, Oracle Studio) produce considerably faster
// code on AMD_64.  Whole program performance differences of up to 3X
// have been observed with other compilers as compared with GCC's -m64
// -march=native defaults.
//
// The input is a rectangular stripe of input pixels, a per-pixel
// stride (to process a rectangular region of the stripe around the
// current pixel), a square convolution kernel, and the dimension of
// the convolution kernel.

#include <stddef.h> // ptrdiff_t

// INTYPE can be 'unsigned char', 'unsigned short', or 'unsigned int'
#if !defined(INTYPE)
#  define INTYPE unsigned short
#endif

// OUTTYPE can be 'float' or 'double'
#if !defined(OUTTYPE)
#  define OUTTYPE float
#endif

// KERNEL type can be 'float' or 'double'
#if !defined(KERNEL)
#  define KERNEL float
#endif

typedef INTYPE intype;
typedef OUTTYPE outtype;
typedef KERNEL kernel;

// One input pixel: three integer channels.
typedef struct {
  intype red;
  intype green;
  intype blue;
} inpix;

// One output pixel: three floating-point channel accumulators.
typedef struct {
  outtype red;
  outtype green;
  outtype blue;
} outpix;

// Compute the weighted sum of a kw x kw window of pixels.
//
//   in      first (top-left) pixel of the window
//   stride  distance, in pixels, from the start of one window row to
//           the start of the next
//   k       kernel weights; kw * kw values are consumed
//   kw      width and height of the square kernel
//
// Returns the per-channel sums.  All channels are zero when kw <= 0,
// because neither loop body executes.
//
// NOTE(review): the loop headers and the read of the kernel weight
// were restored from a damaged copy of this file.  The row-major,
// sequential walk of 'k' is the natural reading of the surviving
// statements -- confirm against the upstream source.
outpix func(inpix *in, ptrdiff_t stride, kernel *k, ptrdiff_t kw)
{
  ptrdiff_t u, v;
  outpix p;

  p.red = p.green = p.blue = 0;
  for (v=0; v<kw; v++) {
    for (u=0; u<kw; u++) {
      kernel kv = *k++;
      p.red += kv * in->red;
      p.green += kv * in->green;
      p.blue += kv * in->blue;
      ++in;
    }
    // The inner loop advanced 'in' by kw pixels; skip the remainder of
    // the row so 'in' lands on the first window pixel of the next row.
    in += stride - kw;
  }
  return p;
}