[PATCH 24/46] i386: Emulate MMX maskmovq with SSE2 maskmovdqu
H.J. Lu
hjl.tools@gmail.com
Mon Feb 4 14:23:00 GMT 2019
On Mon, Feb 4, 2019 at 5:59 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Mon, Feb 04, 2019 at 05:36:12AM -0800, H.J. Lu wrote:
> > + /* Shift __A128 and __N128 to the left by the adjustment. */
> > + switch (offset)
>
> Ah, no, sorry, that is a bad suggestion then. On the other side,
The generated code isn't too bad:
(gdb) disass test_maskmovq
Dump of assembler code for function test_maskmovq:
0x00000000004011b0 <+0>: mov %rdx,%rax
0x00000000004011b3 <+3>: movq (%rdi),%xmm0
0x00000000004011b7 <+7>: movq (%rsi),%xmm1
0x00000000004011bb <+11>: and $0xf,%eax
0x00000000004011be <+14>: je 0x4011d4 <test_maskmovq+36>
0x00000000004011c0 <+16>: cmp $0x8,%rax
0x00000000004011c4 <+20>: jbe 0x4011e0 <test_maskmovq+48>
0x00000000004011c6 <+22>: sub $0x8,%rdx
0x00000000004011ca <+26>: pslldq $0x8,%xmm0
0x00000000004011cf <+31>: pslldq $0x8,%xmm1
0x00000000004011d4 <+36>: mov %rdx,%rdi
0x00000000004011d7 <+39>: maskmovdqu %xmm1,%xmm0
0x00000000004011db <+43>: retq
0x00000000004011dc <+44>: nopl 0x0(%rax)
0x00000000004011e0 <+48>: sub %rax,%rdx
0x00000000004011e3 <+51>: jmpq *0x402008(,%rax,8)
0x00000000004011ea <+58>: nopw 0x0(%rax,%rax,1)
0x00000000004011f0 <+64>: pslldq $0x7,%xmm0
0x00000000004011f5 <+69>: pslldq $0x7,%xmm1
0x00000000004011fa <+74>: jmp 0x4011d4 <test_maskmovq+36>
0x00000000004011fc <+76>: nopl 0x0(%rax)
0x0000000000401200 <+80>: pslldq $0x2,%xmm0
0x0000000000401205 <+85>: pslldq $0x2,%xmm1
0x000000000040120a <+90>: jmp 0x4011d4 <test_maskmovq+36>
0x000000000040120c <+92>: nopl 0x0(%rax)
0x0000000000401210 <+96>: pslldq $0x3,%xmm0
0x0000000000401215 <+101>: pslldq $0x3,%xmm1
0x000000000040121a <+106>: jmp 0x4011d4 <test_maskmovq+36>
0x000000000040121c <+108>: nopl 0x0(%rax)
0x0000000000401220 <+112>: pslldq $0x4,%xmm0
0x0000000000401225 <+117>: pslldq $0x4,%xmm1
0x000000000040122a <+122>: jmp 0x4011d4 <test_maskmovq+36>
0x000000000040122c <+124>: nopl 0x0(%rax)
0x0000000000401230 <+128>: pslldq $0x5,%xmm0
0x0000000000401235 <+133>: pslldq $0x5,%xmm1
0x000000000040123a <+138>: jmp 0x4011d4 <test_maskmovq+36>
0x000000000040123c <+140>: nopl 0x0(%rax)
0x0000000000401240 <+144>: pslldq $0x6,%xmm0
0x0000000000401245 <+149>: pslldq $0x6,%xmm1
0x000000000040124a <+154>: jmp 0x4011d4 <test_maskmovq+36>
0x000000000040124c <+156>: pslldq $0x1,%xmm0
0x0000000000401251 <+161>: pslldq $0x1,%xmm1
0x0000000000401256 <+166>: jmpq 0x4011d4 <test_maskmovq+36>
End of assembler dump.
__int128 isn't much better:
(gdb) disass test_maskmovq
Dump of assembler code for function test_maskmovq:
0x00000000004011b0 <+0>: mov %rdx,%rcx
0x00000000004011b3 <+3>: mov (%rdi),%rax
0x00000000004011b6 <+6>: mov (%rsi),%rdi
0x00000000004011b9 <+9>: and $0xf,%ecx
0x00000000004011bc <+12>: je 0x401240 <test_maskmovq+144>
0x00000000004011c2 <+18>: cmp $0x8,%rcx
0x00000000004011c6 <+22>: mov $0x8,%esi
0x00000000004011cb <+27>: mov %rax,%r8
0x00000000004011ce <+30>: push %rbx
0x00000000004011cf <+31>: cmova %rsi,%rcx
0x00000000004011d3 <+35>: sar $0x3f,%rax
0x00000000004011d7 <+39>: mov %r8,%r10
0x00000000004011da <+42>: mov %rdi,%rbx
0x00000000004011dd <+45>: mov %rax,%r11
0x00000000004011e0 <+48>: sar $0x3f,%rdi
0x00000000004011e4 <+52>: xor %eax,%eax
0x00000000004011e6 <+54>: sub %rcx,%rdx
0x00000000004011e9 <+57>: shl $0x3,%ecx
0x00000000004011ec <+60>: mov %rdi,%rsi
0x00000000004011ef <+63>: shl %cl,%r10
0x00000000004011f2 <+66>: shld %cl,%r8,%r11
0x00000000004011f6 <+70>: test $0x40,%cl
0x00000000004011f9 <+73>: cmovne %r10,%r11
0x00000000004011fd <+77>: cmovne %rax,%r10
0x0000000000401201 <+81>: shld %cl,%rbx,%rsi
0x0000000000401205 <+85>: xor %edi,%edi
0x0000000000401207 <+87>: shl %cl,%rbx
0x000000000040120a <+90>: test $0x40,%cl
0x000000000040120d <+93>: mov %r11,-0x8(%rsp)
0x0000000000401212 <+98>: cmovne %rbx,%rsi
0x0000000000401216 <+102>: movq %r10,%xmm0
0x000000000040121b <+107>: cmovne %rdi,%rbx
0x000000000040121f <+111>: mov %rdx,%rdi
0x0000000000401222 <+114>: movq %rbx,%xmm1
0x0000000000401227 <+119>: movhps -0x8(%rsp),%xmm0
0x000000000040122c <+124>: mov %rsi,-0x8(%rsp)
0x0000000000401231 <+129>: movhps -0x8(%rsp),%xmm1
0x0000000000401236 <+134>: maskmovdqu %xmm1,%xmm0
0x000000000040123a <+138>: pop %rbx
0x000000000040123b <+139>: retq
0x000000000040123c <+140>: nopl 0x0(%rax)
0x0000000000401240 <+144>: movq %rdi,%xmm1
0x0000000000401245 <+149>: movq %rax,%xmm0
0x000000000040124a <+154>: mov %rdx,%rdi
0x000000000040124d <+157>: maskmovdqu %xmm1,%xmm0
0x0000000000401251 <+161>: retq
End of assembler dump.
> (zext (word_var)) << shift
> where zext is from "word" to double-word and shift is 1 to word bitsize - 1
> can be done as (word_var << shift) | ((word_var >> (word_bitsize - shift)) << word_bitsize)
> so you could avoid the int128 shifts anyway and just shift left and right
> and construct v2di from that.
>
This requires 2 64-bit variables for one 128-bit variable. There isn't much
of a difference for x86-64. I don't think we can emulate MMX with SSE in
32-bit mode since __m64 is passed and returned in MMX registers.
--
H.J.
More information about the Gcc-patches
mailing list