This is the mail archive of the
gcc-help@gcc.gnu.org
mailing list for the GCC project.
problems with gcc inline assembly using xmm registers
- From: David Palao <david dot palao at uv dot es>
- To: gcc-help at gcc dot gnu dot org
- Date: Fri, 3 Dec 2004 16:28:53 +0100
- Subject: problems with gcc inline assembly using xmm registers
- Organization: Universidad de Valencia
Hi!
I'm a newbie at assembly, but I work in computational physics and we
need to design a very fast function for an operation that is repeated
billions of times in our calculations.
In order to gain performance we decided to use some of the SSE features
(properly used, xmm registers should provide a very nice increase in
performance).
However, I'm having serious trouble using the xmm[0...7] registers with gcc
inline assembly.
One example of the code in which I got these troubles is at the end of the
message.
The error I get is:
../Libraries/fermiqcd_fermi_actions_sse2.h:818: can't find a register in class
`GENERAL_REGS' while reloading `asm'
What I understand is that the function I'm trying to compile uses too many
`GENERAL_REGS' registers, but...
...Do the xmm registers belong to this group? Aren't they a special set of
registers in the SSE area?
...Could it be that the inline asm statements use too many intermediate
registers from the general registers group? If so, how can I avoid that?
The problem occurs whenever I try to use the xmm registers intensively; if I
don't use SSE extensions everything goes OK, but I NEED the xmm registers!
Thanks in advance!!!
Best regards
David
PS gcc -v:
gcc version 3.2.3 20030502 (Red Hat Linux 3.2.3-34)
CODE:
/* Real-part pass (GCC extended asm, AT&T operand order: op src, dst).
 *
 * Each scalar input is loaded with movsd and duplicated into both lanes
 * with unpcklpd reg,reg, then multiplied (mulpd) by xmm0, xmm1 or xmm2 and
 * accumulated (addpd), so that on exit:
 *   xmm3 = c11.real()*xmm0 + c12.real()*xmm1 + c13.real()*xmm2
 *   xmm4 = c21.real()*xmm0 + c22.real()*xmm1 + c23.real()*xmm2
 *   xmm5 = c31.real()*xmm0 + c32.real()*xmm1 + c33.real()*xmm2
 * xmm6 and xmm7 are scratch.
 *
 * xmm0-xmm2 are read but never loaded in this statement, so they must
 * already hold the multiplicands when it executes.
 * The statement has no output operands and no clobber list, so the compiler
 * is not told that xmm3-xmm7 are overwritten.
 * All nine inputs are separate "m" (memory) operands.
 * NOTE(review): presumably each "m" operand needs its own address register
 * on 32-bit x86, which may be what exhausts GENERAL_REGS - confirm.
 */ __asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" /* xmm3.lo = (u).c11.real() */ \
"movsd %1, %%xmm6 \n\t" /* xmm6.lo = (u).c12.real() */ \
"movsd %2, %%xmm4 \n\t" /* xmm4.lo = (u).c21.real() */ \
"movsd %3, %%xmm7 \n\t" /* xmm7.lo = (u).c23.real() */ \
"movsd %4, %%xmm5 \n\t" /* xmm5.lo = (u).c31.real() */ \
"unpcklpd %%xmm3, %%xmm3 \n\t" /* copy low double into the high lane */ \
"unpcklpd %%xmm6, %%xmm6 \n\t" \
"unpcklpd %%xmm4, %%xmm4 \n\t" \
"mulpd %%xmm0, %%xmm3 \n\t" \
"unpcklpd %%xmm7, %%xmm7 \n\t" \
"mulpd %%xmm1, %%xmm6 \n\t" \
"unpcklpd %%xmm5, %%xmm5 \n\t" \
"mulpd %%xmm0, %%xmm4 \n\t" \
"addpd %%xmm6, %%xmm3 \n\t" \
"mulpd %%xmm2, %%xmm7 \n\t" \
"mulpd %%xmm0, %%xmm5 \n\t" \
"addpd %%xmm7, %%xmm4 \n\t" \
"movsd %5, %%xmm6 \n\t" /* xmm6.lo = (u).c32.real() */ \
"movsd %6, %%xmm7 \n\t" /* xmm7.lo = (u).c13.real() */ \
"unpcklpd %%xmm6, %%xmm6 \n\t" \
"unpcklpd %%xmm7, %%xmm7 \n\t" \
"mulpd %%xmm1, %%xmm6 \n\t" \
"mulpd %%xmm2, %%xmm7 \n\t" \
"addpd %%xmm6, %%xmm5 \n\t" \
"addpd %%xmm7, %%xmm3 \n\t" \
"movsd %7, %%xmm6 \n\t" /* xmm6.lo = (u).c22.real() */ \
"movsd %8, %%xmm7 \n\t" /* xmm7.lo = (u).c33.real() */ \
"unpcklpd %%xmm6, %%xmm6 \n\t" \
"unpcklpd %%xmm7, %%xmm7 \n\t" \
"mulpd %%xmm1, %%xmm6 \n\t" \
"mulpd %%xmm2, %%xmm7 \n\t" \
"addpd %%xmm6, %%xmm4 \n\t" \
"addpd %%xmm7, %%xmm5" \
: /* no output operands */ \
: /* inputs %0..%8; note the order is not row-major */ \
"m" ((u).c11.real()), \
"m" ((u).c12.real()), \
"m" ((u).c21.real()), \
"m" ((u).c23.real()), \
"m" ((u).c31.real()), \
"m" ((u).c32.real()), \
"m" ((u).c13.real()), \
"m" ((u).c22.real()), \
"m" ((u).c33.real())); \
/* Imaginary-part pass (GCC extended asm, AT&T operand order: op src, dst).
 *
 * First rewrites xmm0-xmm2 in place: shufpd $0x1 swaps the two doubles of
 * each register, then xorpd applies the mask at %9 (_sse_double_sgn).
 * NOTE(review): presumably the mask flips the sign of one lane, giving the
 * "multiply by i" form of each value - confirm against its definition.
 * Calling the rewritten registers x0', x1', x2', the statement then adds to
 * the accumulators it finds in xmm3-xmm5:
 *   xmm3 += c11.imag()*x0' + c12.imag()*x1' + c13.imag()*x2'
 *   xmm4 += c21.imag()*x0' + c22.imag()*x1' + c23.imag()*x2'
 *   xmm5 += c31.imag()*x0' + c32.imag()*x1' + c33.imag()*x2'
 * xmm6 and xmm7 are scratch; xmm0 is also reused as scratch near the end.
 *
 * xmm0-xmm5 are read without being loaded here, so this statement depends
 * on register contents set up before it runs; the compiler is not told.
 * There are no output operands and no clobber list.
 * All ten inputs are separate "m" (memory) operands.
 * NOTE(review): presumably each "m" operand needs its own address register
 * on 32-bit x86, which may be what exhausts GENERAL_REGS - confirm.
 */ __asm__ __volatile__ ("movsd %0, %%xmm6 \n\t" /* xmm6.lo = (u).c11.imag() */ \
"movsd %1, %%xmm7 \n\t" /* xmm7.lo = (u).c22.imag() */ \
"shufpd $0x1, %%xmm0, %%xmm0 \n\t" /* swap low and high doubles */ \
"shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
"shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
"unpcklpd %%xmm6, %%xmm6 \n\t" /* copy low double into the high lane */ \
"unpcklpd %%xmm7, %%xmm7 \n\t" \
"xorpd %9, %%xmm0 \n\t" /* xorpd with a memory operand needs 16-byte alignment */ \
"xorpd %9, %%xmm1 \n\t" \
"xorpd %9, %%xmm2 \n\t" \
"mulpd %%xmm0, %%xmm6 \n\t" \
"mulpd %%xmm1, %%xmm7 \n\t" \
"addpd %%xmm6, %%xmm3 \n\t" \
"addpd %%xmm7, %%xmm4 \n\t" \
"movsd %2, %%xmm6 \n\t" /* xmm6.lo = (u).c33.imag() */ \
"movsd %3, %%xmm7 \n\t" /* xmm7.lo = (u).c21.imag() */ \
"unpcklpd %%xmm6, %%xmm6 \n\t" \
"unpcklpd %%xmm7, %%xmm7 \n\t" \
"mulpd %%xmm2, %%xmm6 \n\t" \
"mulpd %%xmm0, %%xmm7 \n\t" \
"addpd %%xmm6, %%xmm5 \n\t" \
"addpd %%xmm7, %%xmm4 \n\t" \
"movsd %4, %%xmm6 \n\t" /* xmm6.lo = (u).c12.imag() */ \
"movsd %5, %%xmm7 \n\t" /* xmm7.lo = (u).c31.imag() */ \
"unpcklpd %%xmm6, %%xmm6 \n\t" \
"unpcklpd %%xmm7, %%xmm7 \n\t" \
"mulpd %%xmm1, %%xmm6 \n\t" \
"mulpd %%xmm0, %%xmm7 \n\t" /* last read of x0' before xmm0 is reused */ \
"addpd %%xmm6, %%xmm3 \n\t" \
"addpd %%xmm7, %%xmm5 \n\t" \
"movsd %6, %%xmm0 \n\t" /* xmm0 reused as scratch: (u).c13.imag() */ \
"movsd %7, %%xmm6 \n\t" /* xmm6.lo = (u).c32.imag() */ \
"movsd %8, %%xmm7 \n\t" /* xmm7.lo = (u).c23.imag() */ \
"unpcklpd %%xmm0, %%xmm0 \n\t" \
"unpcklpd %%xmm6, %%xmm6 \n\t" \
"unpcklpd %%xmm7, %%xmm7 \n\t" \
"mulpd %%xmm2, %%xmm0 \n\t" \
"mulpd %%xmm1, %%xmm6 \n\t" \
"mulpd %%xmm2, %%xmm7 \n\t" \
"addpd %%xmm0, %%xmm3 \n\t" \
"addpd %%xmm6, %%xmm5 \n\t" \
"addpd %%xmm7, %%xmm4" \
: /* no output operands */ \
: /* inputs %0..%9; %9 is the xorpd mask */ \
"m" ((u).c11.imag()), \
"m" ((u).c22.imag()), \
"m" ((u).c33.imag()), \
"m" ((u).c21.imag()), \
"m" ((u).c12.imag()), \
"m" ((u).c31.imag()), \
"m" ((u).c13.imag()), \
"m" ((u).c32.imag()), \
"m" ((u).c23.imag()), \
"m" (_sse_double_sgn));