Bug 11628

Summary:	movd support for _mm_cvtsi32_si64 and _mm_cvtsi64_si32
Product:	gcc	Reporter:	dean
Component:	target	Assignee:	Not yet assigned to anyone <unassigned>
Status:	RESOLVED FIXED
Severity:	enhancement	CC:	gcc-bugs
Priority:	P2	Keywords:	missed-optimization, ssemmx
Version:	3.4.0
Target Milestone:	4.0.0
Host:	i686-pc-linux-gnu	Target:	i686-pc-linux-gnu
Build:	i686-pc-linux-gnu	Known to work:
Known to fail:		Last reconfirmed:	2004-08-20 06:58:11

Description dean 2003-07-22 04:17:08 UTC

the x86 MOVD opcode is a way more efficient method of handling 32-bit movements 
to/from mmx registers than the 64-bit casts which are currently used... the 
patch below enables the use of MOVD through the _mm_cvtsi32_si64 and 
_mm_cvtsi64_si32 intrinsics.

a more ideal solution would be to handle the 64-bit casting using movd, but 
that's way beyond what i would know how to do to gcc :)

this is essentially a clone of the similar MOVD support for xmm registers.

-dean

here's an example comparing the old method (using casts) to the new method 
using the intrinsics i've added:

$ cat t32.c
#include <stdint.h>
#ifdef __INTEL_COMPILER
#include <emmintrin.h>
#else
#include <xmmintrin.h>
#endif

/* "Old method" from the report: extract a 32-bit value from an __m64 using
 * a plain cast instead of an intrinsic.  The __m64 is cast to unsigned long
 * long and the result is implicitly narrowed to the uint32_t return type. */
uint32_t old_64_to_32(__m64 m)
{
        return (unsigned long long)m;
}

/* "New method" from the report: same conversion as old_64_to_32, but done
 * through the _mm_cvtsi64_si32 intrinsic, which the attached patch reworks
 * so that the compiler can emit MOVD. */
uint32_t new_64_to_32(__m64 m)
{
        return _mm_cvtsi64_si32(m);
}

/* "Old method" from the report: build an __m64 from a 32-bit value using
 * casts only.  The value is first widened into a 64-bit integer temporary
 * and that temporary is then cast to __m64. */
__m64 old_32_to_64(uint32_t a)
{
        /* Going through unsigned int keeps the value non-negative, so the
         * upper 32 bits of tmp are zero. */
        long long tmp = (unsigned int)a;
        return (__m64)tmp;
}

/* "New method" from the report: same conversion as old_32_to_64, but done
 * through the _mm_cvtsi32_si64 intrinsic, which the attached patch reworks
 * so that the compiler can emit MOVD. */
__m64 new_32_to_64(uint32_t a)
{
        return _mm_cvtsi32_si64(a);
}

$ ~/gcc/bin/gcc -O -march=pentium4 -fomit-frame-pointer -c t32.c
$ objdump -dr t32.o

t32.o:     file format elf32-i386

Disassembly of section .text:

00000000 <old_64_to_32>:
   0:   83 ec 0c                sub    $0xc,%esp
   3:   0f 6f 44 24 10          movq   0x10(%esp,1),%mm0
   8:   0f 7f 04 24             movq   %mm0,(%esp,1)
   c:   8b 04 24                mov    (%esp,1),%eax
   f:   83 c4 0c                add    $0xc,%esp
  12:   c3                      ret

00000013 <new_64_to_32>:
  13:   0f 6f 44 24 04          movq   0x4(%esp,1),%mm0
  18:   0f 7e c0                movd   %mm0,%eax
  1b:   c3                      ret

0000001c <old_32_to_64>:
  1c:   83 ec 0c                sub    $0xc,%esp
  1f:   8b 44 24 10             mov    0x10(%esp,1),%eax
  23:   8b 54 24 14             mov    0x14(%esp,1),%edx
  27:   b9 00 00 00 00          mov    $0x0,%ecx
  2c:   89 14 24                mov    %edx,(%esp,1)
  2f:   89 4c 24 04             mov    %ecx,0x4(%esp,1)
  33:   0f 6f 04 24             movq   (%esp,1),%mm0
  37:   0f 7f 00                movq   %mm0,(%eax)
  3a:   83 c4 0c                add    $0xc,%esp
  3d:   c2 04 00                ret    $0x4

00000040 <new_32_to_64>:
  40:   8b 44 24 04             mov    0x4(%esp,1),%eax
  44:   0f 6e 44 24 08          movd   0x8(%esp,1),%mm0
  49:   0f 7f 00                movq   %mm0,(%eax)
  4c:   c2 04 00                ret    $0x4

begin 644 movd.patch
M26YD97@Z(&=C8R]C;VYF:6<O:3,X-B]I,S@V+F,*/3T]/3T]/3T]/3T]/3T]
M/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]
M/3T]/3T]/0I20U,@9FEL93H@+V-V<W)O;W0O9V-C+V=C8R]G8V,O8V]N9FEG
M+VDS.#8O:3,X-BYC+'8*<F5T<FEE=FEN9R!R979I<VEO;B`Q+C4X.0ID:69F
M("UU("UR,2XU.#D@:3,X-BYC"BTM+2!G8V,O8V]N9FEG+VDS.#8O:3,X-BYC
M"3$U($IU;"`R,#`S(#$S.C0R.C,Y("TP,#`P"3$N-3@Y"BLK*R!G8V,O8V]N
M9FEG+VDS.#8O:3,X-BYC"3(R($IU;"`R,#`S(#`T.C`R.C`T("TP,#`P"D!`
M("TQ,S$R-"PV("LQ,S$R-"PQ,2!`0`H@("!T<F5E('9O:61?9G1Y<&5?<&-H
M87)?=C$V<6D*("`@("`](&)U:6QD7V9U;F-T:6]N7W1Y<&5?;&ES="`H=F]I
M9%]T>7!E7VYO9&4L"B`)"0D@("`@("`@('!C:&%R7W1Y<&5?;F]D92P@5C$V
M44E?='EP95]N;V1E+"!.54Q,7U12144I.PHK("!T<F5E('8R<VE?9G1Y<&5?
M<&-I;G0**R`@("`](&)U:6QD7V9U;F-T:6]N7W1Y<&5?;&ES="`H5C)325]T
M>7!E7VYO9&4L('!C:6YT7W1Y<&5?;F]D92P@3E5,3%]44D5%*3L**R`@=')E
M92!V;VED7V9T>7!E7W!C:6YT7W8R<VD**R`@("`](&)U:6QD7V9U;F-T:6]N
M7W1Y<&5?;&ES="`H=F]I9%]T>7!E7VYO9&4L"BL)"0D@("`@("`@('!C:6YT
M7W1Y<&5?;F]D92P@5C)325]T>7!E7VYO9&4L($Y53$Q?5%)%12D["B`@('1R
M964@=C1S:5]F='EP95]P8VEN=`H@("`@(#T@8G5I;&1?9G5N8W1I;VY?='EP
M95]L:7-T("A6-%-)7W1Y<&5?;F]D92P@<&-I;G1?='EP95]N;V1E+"!.54Q,
M7U12144I.PH@("!T<F5E('9O:61?9G1Y<&5?<&-I;G1?=C1S:0I`0"`M,3,R
M,CDL-B`K,3,R,S0L.2!`0`H@("!D969?8G5I;'1I;B`H34%32U]-35@L(")?
M7V)U:6QT:6Y?:6$S,E]P86-K<W-D=R(L('8T:&E?9G1Y<&5?=C)S:5]V,G-I
M+"!)6#@V7T)524Q424Y?4$%#2U-31%<I.PH@("!D969?8G5I;'1I;B`H34%3
M2U]-35@L(")?7V)U:6QT:6Y?:6$S,E]P86-K=7-W8B(L('8X<6E?9G1Y<&5?
M=C1H:5]V-&AI+"!)6#@V7T)524Q424Y?4$%#2U535T(I.PH@"BL@(&1E9E]B
M=6EL=&EN("A-05-+7TU-6"P@(E]?8G5I;'1I;E]I83,R7VQO861D7W-I-C0B
M+"!V,G-I7V9T>7!E7W!C:6YT+"!)6#@V7T)524Q424Y?3$]!1$1?4TDV-"D[
M"BL@(&1E9E]B=6EL=&EN("A-05-+7TU-6"P@(E]?8G5I;'1I;E]I83,R7W-I
M-C1?<W1O<F5D(BP@=F]I9%]F='EP95]P8VEN=%]V,G-I+"!)6#@V7T)524Q4
M24Y?4TDV-%]35$]2140I.PHK"B`@(&1E9E]B=6EL=&EN("A-05-+7U-312P@
M(E]?8G5I;'1I;E]I83,R7VQD;7AC<W(B+"!V;VED7V9T>7!E7W5N<VEG;F5D
M+"!)6#@V7T)524Q424Y?3$1-6$-34BD["B`@(&1E9E]B=6EL=&EN("A-05-+
M7U-312P@(E]?8G5I;'1I;E]I83,R7W-T;7AC<W(B+"!U;G-I9VYE9%]F='EP
M95]V;VED+"!)6#@V7T)524Q424Y?4U1-6$-34BD["B`@(&1E9E]B=6EL=&EN
M("A-05-+7U-312P@(E]?8G5I;'1I;E]I83,R7V-V='!I,G!S(BP@=C1S9E]F
M='EP95]V-'-F7W8R<VDL($E8.#9?0E5)3%1)3E]#5E1023)04RD["D!`("TQ
M-#(R-"PV("LQ-#(S,BPQ,2!`0`H@("`@("`@<F5T=7)N(&EX.#9?97AP86YD
M7W-T;W)E7V)U:6QT:6X@*$-/1$5?1D]27W-S93)?;6]V9'%U+"!A<F=L:7-T
M*3L*("`@("!C87-E($E8.#9?0E5)3%1)3E]35$]2140Z"B`@("`@("!R971U
M<FX@:7@X-E]E>'!A;F1?<W1O<F5?8G5I;'1I;B`H0T]$15]&3U)?<W-E,E]S
M=&]R960L(&%R9VQI<W0I.PHK"BL@("`@8V%S92!)6#@V7T)524Q424Y?3$]!
M1$1?4TDV-#H**R`@("`@(')E='5R;B!I>#@V7V5X<&%N9%]U;F]P7V)U:6QT
M:6X@*$-/1$5?1D]27VUM>%]L;V%D9"P@87)G;&ES="P@=&%R9V5T+"`Q*3L*
M*R`@("!C87-E($E8.#9?0E5)3%1)3E]3238T7U-43U)%1#H**R`@("`@(')E
M='5R;B!I>#@V7V5X<&%N9%]S=&]R95]B=6EL=&EN("A#3T1%7T9/4E]M;7A?
M<W1O<F5D+"!A<F=L:7-T*3L*(`H@("`@(&-A<V4@25@X-E]"54E,5$E.7TU/
M3DE43U(Z"B`@("`@("!A<F<P(#T@5%)%15]604Q512`H87)G;&ES="D["DEN
M9&5X.B!G8V,O8V]N9FEG+VDS.#8O:3,X-BYH"CT]/3T]/3T]/3T]/3T]/3T]
M/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]
M/3T]/3T*4D-3(&9I;&4Z("]C=G-R;V]T+V=C8R]G8V,O9V-C+V-O;F9I9R]I
M,S@V+VDS.#8N:"QV"G)E=')I979I;F<@<F5V:7-I;VX@,2XS-3`*9&EF9B`M
M=2`M<C$N,S4P(&DS.#8N:`HM+2T@9V-C+V-O;F9I9R]I,S@V+VDS.#8N:`DQ
M,2!*=6P@,C`P,R`Q-#HP.3HR.2`M,#`P,`DQ+C,U,`HK*RL@9V-C+V-O;F9I
M9R]I,S@V+VDS.#8N:`DR,B!*=6P@,C`P,R`P-#HP,CHP-"`M,#`P,`I`0"`M
M,C$X,BPV("LR,3@R+#@@0$`*("`@25@X-E]"54E,5$E.7TU/5E$L"B`@($E8
M.#9?0E5)3%1)3E],3T%$1"P*("`@25@X-E]"54E,5$E.7U-43U)%1"P**R`@
M25@X-E]"54E,5$E.7TQ/041$7U-)-C0L"BL@($E8.#9?0E5)3%1)3E]3238T
M7U-43U)%1"P*(`H@("!)6#@V7T)524Q424Y?0TQ25$DL"B`*26YD97@Z(&=C
M8R]C;VYF:6<O:3,X-B]I,S@V+FUD"CT]/3T]/3T]/3T]/3T]/3T]/3T]/3T]
M/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T*
M4D-3(&9I;&4Z("]C=G-R;V]T+V=C8R]G8V,O9V-C+V-O;F9I9R]I,S@V+VDS
M.#8N;60L=@IR971R:65V:6YG(')E=FES:6]N(#$N-#<Y"F1I9F8@+74@+7(Q
M+C0W.2!I,S@V+FUD"BTM+2!G8V,O8V]N9FEG+VDS.#8O:3,X-BYM9`DR,B!*
M=6P@,C`P,R`P,#HS-CHU,B`M,#`P,`DQ+C0W.0HK*RL@9V-C+V-O;F9I9R]I
M,S@V+VDS.#8N;60),C(@2G5L(#(P,#,@,#0Z,#(Z,#@@+3`P,#`*0$`@+3$Y
M-#,P+#8@*S$Y-#,P+#(X($!`"B`@(&]P97)A;F1S6S)=(#T@0T].4U0P7U)4
M6"`H5C)$1FUO9&4I.PH@?2D*(`HK*&1E9FEN95]I;G-N(")M;7A?;&]A9&0B
M"BL@(%LH<V5T("AM871C:%]O<&5R86YD.E8R4TD@,"`B<F5G:7-T97)?;W!E
M<F%N9"(@(CUY(BD**PDH=F5C7VUE<F=E.E8R4TD**PD@*'9E8U]D=7!L:6-A
M=&4Z5C)322`H;6%T8VA?;W!E<F%N9#I322`Q(")N;VYI;6UE9&EA=&5?;W!E
M<F%N9"(@(FUR(BDI"BL)("AC;VYS=%]V96-T;W(Z5C)322!;*&-O;G-T7VEN
M="`P*0HK"0D)("`@("`H8V]N<W1?:6YT(#`I72D**PD@*&-O;G-T7VEN="`Q
M*2DI70HK("`B5$%21T547TU-6"(**R`@(FUO=F1<='LE,2P@)3!\)3`L("4Q
M?2(**R`@6RAS971?871T<B`B='EP92(@(FUM>&UO=B(I"BL@("`H<V5T7V%T
M='(@(FUO9&4B(")$22(I72D**PHK*&1E9FEN95]I;G-N(")M;7A?<W1O<F5D
M(@HK("!;*'-E="`H;6%T8VA?;W!E<F%N9#I322`P(")N;VYI;6UE9&EA=&5?
M;W!E<F%N9"(@(CUM<B(I"BL)*'9E8U]S96QE8W0Z4TD**PD@*&UA=&-H7V]P
M97)A;F0Z5C)322`Q(")R96=I<W1E<E]O<&5R86YD(B`B>2(I"BL)("AP87)A
M;&QE;"!;*&-O;G-T7VEN="`P*5TI*2E="BL@(")405)'151?34U8(@HK("`B
M;6]V9%QT>R4Q+"`E,'PE,"P@)3%](@HK("!;*'-E=%]A='1R(")T>7!E(B`B
M;6UX;6]V(BD**R`@("AS971?871T<B`B;6]D92(@(D1)(BE=*0HK"B`H9&5F
M:6YE7VEN<VX@(FUO=G8X<6E?:6YT97)N86PB"B`@(%LH<V5T("AM871C:%]O
M<&5R86YD.E8X44D@,"`B;F]N:6UM961I871E7V]P97)A;F0B("(]>2QY+&TB
M*0H@"2AM871C:%]O<&5R86YD.E8X44D@,2`B=F5C=&]R7VUO=F5?;W!E<F%N
M9"(@(D,L>6TL>2(I*5T*26YD97@Z(&=C8R]C;VYF:6<O:3,X-B]M;6EN=')I
M;BYH"CT]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]
M/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T*4D-3(&9I;&4Z("]C=G-R;V]T
M+V=C8R]G8V,O9V-C+V-O;F9I9R]I,S@V+VUM:6YT<FEN+F@L=@IR971R:65V
M:6YG(')E=FES:6]N(#$N.`ID:69F("UU("UR,2XX(&UM:6YT<FEN+F@*+2TM
M(&=C8R]C;VYF:6<O:3,X-B]M;6EN=')I;BYH"38@2G5N(#(P,#,@,30Z,#8Z
M-#$@+3`P,#`),2XX"BLK*R!G8V,O8V]N9FEG+VDS.#8O;6UI;G1R:6XN:`DR
M,B!*=6P@,C`P,R`P-#HP,CHP."`M,#`P,`I`0"`M-34L,3$@*S4U+#$P($!`
M"B!]"B`*("\J($-O;G9E<G0@22!T;R!A(%]?;38T(&]B:F5C="X@(%1H92!I
M;G1E9V5R(&ES('IE<F\M97AT96YD960@=&\@-C0M8FET<RX@("HO"BUS=&%T
M:6,@7U]I;FQI;F4@7U]M-C0@"BU?;6U?8W9T<VDS,E]S:38T("AI;G0@7U]I
M*0HK<W1A=&EC(%]?:6YL:6YE(%]?;38T"BM?;6U?8W9T<VDS,E]S:38T("AI
M;G0@7U]!*0H@>PHM("!L;VYG(&QO;F<@7U]T;7`@/2`H=6YS:6=N960@:6YT
M*5]?:3L*+2`@<F5T=7)N("A?7VTV-"D@7U]T;7`["BL@(')E='5R;B`H7U]M
M-C0I(%]?8G5I;'1I;E]I83,R7VQO861D7W-I-C0@*"9?7T$I.PH@?0H@"B!S
M=&%T:6,@7U]I;FQI;F4@7U]M-C0@"D!`("TX."PW("LX-RPX($!`"B!S=&%T
M:6,@7U]I;FQI;F4@:6YT"B!?;6U?8W9T<VDV-%]S:3,R("A?7VTV-"!?7VDI
M"B!["BT@(&QO;F<@;&]N9R!?7W1M<"`]("AL;VYG(&QO;F<I7U]I.PHK("!I
M;G0@7U]T;7`["BL@(%]?8G5I;'1I;E]I83,R7W-I-C1?<W1O<F5D("@F7U]T
H;7`L("A?7W8R<VDI7U]I*3L*("`@<F5T=7)N(%]?=&UP.PH@?0H@"@``
`
end

Comment 1 Andrew Pinski 2003-07-22 13:04:49 UTC

Could you send your patch to gcc-patches@gcc.gnu.org after reading 
<http://gcc.gnu.org/contribute.html>.

I can confirm his way is much faster.

Comment 2 dean 2003-07-22 16:45:42 UTC

Subject: Re:  movd support for _mm_cvtsi32_si64 and
 _mm_cvtsi64_si32

On Tue, 22 Jul 2003, pinskia at physics dot uc dot edu wrote:

> Could you send your patch to gcc-patches@gcc.gnu.org after reading
> <http://gcc.gnu.org/contribute.html>.

i sent the patch there months ago and got no response.

-dean

Comment 3 Andrew Pinski 2003-07-23 20:19:08 UTC

Might be related to bug 8871.

Comment 4 otaylor 2003-07-23 20:34:03 UTC

Indeed 8871 is about fixing the 64-bit casting to use movd. I think this bug 
should be marked as a duplicate.

(Adding extra builtins to work around the broken patterns for
casting, which is what it sounds like this patch does, is, IMO,
not a good idea.)

Comment 5 Andrew Pinski 2003-07-23 20:38:42 UTC

A dup of bug 8871.

*** This bug has been marked as a duplicate of 8871 ***

Comment 6 dean 2003-07-23 21:46:02 UTC

Subject: Re:  movd support for _mm_cvtsi32_si64 and
 _mm_cvtsi64_si32

On Wed, 23 Jul 2003, otaylor at redhat dot com wrote:

> (Adding extra builtins to work around the broken patterns for
> casting, which it sounds like what this patch does, is, IMO,
> not a good idea.)

i agree!  thanks :)  i just took the wussy approach 'cause i don't know
enough gcc to do the right approach.

-dean

Comment 7 Andrew Pinski 2003-08-23 23:15:27 UTC

The patch which fixed PR 8871 did not fix old_64_to_32. So reopening bug.

Comment 8 Uroš Bizjak 2004-11-30 14:58:33 UTC

'gcc -O -march=pentium4 -fomit-frame-pointer -S t32.c' now produces:

        .file   "t32.c"
        .text
# old_64_to_32 (cast-based variant), AT&T syntax, i386.
# The 64-bit value is read from %mm0, written to a stack scratch area, and
# its low 32 bits are loaded back into %eax.  The transfer goes through
# memory; no register-to-register movd is used.
.globl old_64_to_32
        .type   old_64_to_32, @function
old_64_to_32:
        # Reserve 12 bytes of stack scratch space.
        subl $12, %esp
        # Store all 64 bits of %mm0 at the bottom of the scratch area.
        movq %mm0, (%esp)
        # Load the low 32 bits of the stored value as the return value.
        movl (%esp), %eax
        # Release the scratch space.
        addl $12, %esp
        ret
        .size   old_64_to_32, .-old_64_to_32
# new_64_to_32 (intrinsic-based variant), AT&T syntax, i386.
# Instruction-for-instruction identical to old_64_to_32: %mm0 is stored to a
# stack scratch area and the low 32 bits are loaded back into %eax.
.globl new_64_to_32
        .type   new_64_to_32, @function
new_64_to_32:
        # Reserve 12 bytes of stack scratch space.
        subl $12, %esp
        # Store all 64 bits of %mm0 at the bottom of the scratch area.
        movq %mm0, (%esp)
        # Load the low 32 bits of the stored value as the return value.
        movl (%esp), %eax
        # Release the scratch space.
        addl $12, %esp
        ret
        .size   new_64_to_32, .-new_64_to_32
# old_32_to_64 (cast-based variant), AT&T syntax, i386.
# Loads a 32-bit stack argument into %mm0 with movd and stores the 64-bit
# result through a pointer taken from the first stack slot.
.globl old_32_to_64
        .type   old_32_to_64, @function
old_32_to_64:
        # %eax = pointer from the first stack slot.
        # NOTE(review): presumably the hidden return-slot pointer for the
        # __m64 result -- confirm against the i386 ABI in use.
        movl 4(%esp), %eax
        # Load the 32-bit argument into %mm0; movd zero-fills the upper half.
        movd 8(%esp), %mm0
        # Store the 64-bit value through the pointer in %eax.
        movq %mm0, (%eax)
        # Return and pop 4 additional bytes of arguments from the stack.
        ret $4
        .size   old_32_to_64, .-old_32_to_64
# new_32_to_64 (intrinsic-based variant), AT&T syntax, i386.
# Instruction-for-instruction identical to old_32_to_64: a movd load of the
# 32-bit stack argument followed by a 64-bit store through a pointer.
.globl new_32_to_64
        .type   new_32_to_64, @function
new_32_to_64:
        # %eax = pointer from the first stack slot.
        # NOTE(review): presumably the hidden return-slot pointer for the
        # __m64 result -- confirm against the i386 ABI in use.
        movl 4(%esp), %eax
        # Load the 32-bit argument into %mm0; movd zero-fills the upper half.
        movd 8(%esp), %mm0
        # Store the 64-bit value through the pointer in %eax.
        movq %mm0, (%eax)
        # Return and pop 4 additional bytes of arguments from the stack.
        ret $4
        .size   new_32_to_64, .-new_32_to_64
        .ident  "GCC: (GNU) 4.0.0 20041130 (experimental)"
        .section .note.GNU-stack,"",@progbits

new_* and old_* functions produce the same code.

Comment 9 Uroš Bizjak 2005-01-13 09:26:16 UTC

Fixed on mainline.