#include <stdint.h>

/**
 * Table of DSP routines; filled in by dsputil_init_mmx().
 */
typedef struct DSPContext {
    /* Loop filter entry point taking a pixel pointer, a line stride and a
     * quantiser scale. */
    void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
} DSPContext;

/**
 * Transpose a 4x4 block of bytes using MMX.
 *
 * Four 32-bit rows are loaded from src, src + src_stride, src + 2*src_stride
 * and src + 3*src_stride; the byte at (row r, column c) of the input is
 * stored at (row c, column r) of the output, whose rows are dst_stride bytes
 * apart.
 *
 * mm0..mm3 are overwritten and no emms is executed here.
 *
 * @param dst        destination block (4 rows of 4 bytes)
 * @param src        source block (4 rows of 4 bytes)
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                int dst_stride, int src_stride)
{
    /* FIXME could save 1 instruction if done as 8x4 ... */
    __asm__ volatile(
        "movd %4, %%mm0 \n\t"          /* mm0 = row 0 */
        "movd %5, %%mm1 \n\t"          /* mm1 = row 1 */
        "movd %6, %%mm2 \n\t"          /* mm2 = row 2 */
        "movd %7, %%mm3 \n\t"          /* mm3 = row 3 */
        "punpcklbw %%mm1, %%mm0 \n\t"  /* interleave bytes of rows 0/1 */
        "punpcklbw %%mm3, %%mm2 \n\t"  /* interleave bytes of rows 2/3 */
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"  /* mm0 = columns 0 and 1 */
        "punpckhwd %%mm2, %%mm1 \n\t"  /* mm1 = columns 2 and 3 */
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"  /* bring the high dword down */
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"
        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        : "m" (*(uint32_t*)(src + 0*src_stride)),
          "m" (*(uint32_t*)(src + 1*src_stride)),
          "m" (*(uint32_t*)(src + 2*src_stride)),
          "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

/**
 * MMX routine installed as DSPContext.h263_h_loop_filter.
 *
 * As visible here it (1) moves src back by two bytes, (2) transposes the 4x4
 * byte block at rows 0..3 into a local buffer with an 8-byte row pitch, and
 * (3) stores the contents of mm5, mm3, mm4 and mm6 as an 8x4 transposed block:
 * output row k (k = 0..7) receives byte k of mm5, mm3, mm4 and mm6, in that
 * order, at src + k*stride.
 *
 * NOTE(review): nothing in this function loads mm4..mm6 (and mm3 only holds
 * what transpose4x4() left behind), and `temp` is never read back.  It looks
 * like the stage that is supposed to produce those registers is missing from
 * this excerpt -- confirm against the full implementation before relying on
 * the output.
 *
 * No emms is executed here.
 *
 * @param src    pointer into the picture; 4-byte rows starting at src - 2
 *               are read (rows 0..3) and written (rows 0..7)
 * @param stride distance in bytes between picture rows
 * @param qscale not used by the code visible here
 */
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp = (uint8_t*)temp;

    (void)qscale; /* unused in this routine as written */

    /* Each 4-byte row then spans two bytes on either side of the original
     * pointer. */
    src -= 2;

    transpose4x4(btemp, src, 8, stride);

    __asm__ volatile(
        "movq %%mm5, %%mm1 \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm5 \n\t"  /* low bytes of mm5/mm3 interleaved  */
        "punpcklbw %%mm6, %%mm4 \n\t"  /* low bytes of mm4/mm6 interleaved  */
        "punpckhbw %%mm3, %%mm1 \n\t"  /* high bytes of mm5/mm3 interleaved */
        "punpckhbw %%mm6, %%mm0 \n\t"  /* high bytes of mm4/mm6 interleaved */
        "movq %%mm5, %%mm3 \n\t"
        "movq %%mm1, %%mm6 \n\t"
        "punpcklwd %%mm4, %%mm5 \n\t"  /* mm5 = output rows 0 and 1 */
        "punpcklwd %%mm0, %%mm1 \n\t"  /* mm1 = output rows 4 and 5 */
        "punpckhwd %%mm4, %%mm3 \n\t"  /* mm3 = output rows 2 and 3 */
        "punpckhwd %%mm0, %%mm6 \n\t"  /* mm6 = output rows 6 and 7 */
        "movd %%mm5, %0 \n\t"
        "punpckhdq %%mm5, %%mm5 \n\t"
        "movd %%mm5, %1 \n\t"
        "movd %%mm3, %2 \n\t"
        "punpckhdq %%mm3, %%mm3 \n\t"
        "movd %%mm3, %3 \n\t"
        "movd %%mm1, %4 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %5 \n\t"
        "movd %%mm6, %6 \n\t"
        "punpckhdq %%mm6, %%mm6 \n\t"
        "movd %%mm6, %7 \n\t"
        : "=m" (*(uint32_t*)(src + 0*stride)),
          "=m" (*(uint32_t*)(src + 1*stride)),
          "=m" (*(uint32_t*)(src + 2*stride)),
          "=m" (*(uint32_t*)(src + 3*stride)),
          "=m" (*(uint32_t*)(src + 4*stride)),
          "=m" (*(uint32_t*)(src + 5*stride)),
          "=m" (*(uint32_t*)(src + 6*stride)),
          "=m" (*(uint32_t*)(src + 7*stride))
    );
}

/**
 * Install the MMX implementation into the DSP function table.
 *
 * @param c context whose h263_h_loop_filter pointer is set to
 *          h263_h_loop_filter_mmx
 */
void dsputil_init_mmx(DSPContext* c)
{
    c->h263_h_loop_filter = h263_h_loop_filter_mmx;
}