[PATCH 24/46] i386: Emulate MMX maskmovq with SSE2 maskmovdqu

H.J. Lu hjl.tools@gmail.com
Mon Feb 4 14:23:00 GMT 2019


On Mon, Feb 4, 2019 at 5:59 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Mon, Feb 04, 2019 at 05:36:12AM -0800, H.J. Lu wrote:
> > +      /* Shift __A128 and __N128 to the left by the adjustment.  */
> > +      switch (offset)
>
> Ah, no, sorry, that is a bad suggestion then.  On the other side,

The generated code isn't too bad:

(gdb) disass test_maskmovq
Dump of assembler code for function test_maskmovq:
   0x00000000004011b0 <+0>: mov    %rdx,%rax
   0x00000000004011b3 <+3>: movq   (%rdi),%xmm0
   0x00000000004011b7 <+7>: movq   (%rsi),%xmm1
   0x00000000004011bb <+11>: and    $0xf,%eax
   0x00000000004011be <+14>: je     0x4011d4 <test_maskmovq+36>
   0x00000000004011c0 <+16>: cmp    $0x8,%rax
   0x00000000004011c4 <+20>: jbe    0x4011e0 <test_maskmovq+48>
   0x00000000004011c6 <+22>: sub    $0x8,%rdx
   0x00000000004011ca <+26>: pslldq $0x8,%xmm0
   0x00000000004011cf <+31>: pslldq $0x8,%xmm1
   0x00000000004011d4 <+36>: mov    %rdx,%rdi
   0x00000000004011d7 <+39>: maskmovdqu %xmm1,%xmm0
   0x00000000004011db <+43>: retq
   0x00000000004011dc <+44>: nopl   0x0(%rax)
   0x00000000004011e0 <+48>: sub    %rax,%rdx
   0x00000000004011e3 <+51>: jmpq   *0x402008(,%rax,8)
   0x00000000004011ea <+58>: nopw   0x0(%rax,%rax,1)
   0x00000000004011f0 <+64>: pslldq $0x7,%xmm0
   0x00000000004011f5 <+69>: pslldq $0x7,%xmm1
   0x00000000004011fa <+74>: jmp    0x4011d4 <test_maskmovq+36>
   0x00000000004011fc <+76>: nopl   0x0(%rax)
   0x0000000000401200 <+80>: pslldq $0x2,%xmm0
   0x0000000000401205 <+85>: pslldq $0x2,%xmm1
   0x000000000040120a <+90>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040120c <+92>: nopl   0x0(%rax)
   0x0000000000401210 <+96>: pslldq $0x3,%xmm0
   0x0000000000401215 <+101>: pslldq $0x3,%xmm1
   0x000000000040121a <+106>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040121c <+108>: nopl   0x0(%rax)
   0x0000000000401220 <+112>: pslldq $0x4,%xmm0
   0x0000000000401225 <+117>: pslldq $0x4,%xmm1
   0x000000000040122a <+122>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040122c <+124>: nopl   0x0(%rax)
   0x0000000000401230 <+128>: pslldq $0x5,%xmm0
   0x0000000000401235 <+133>: pslldq $0x5,%xmm1
   0x000000000040123a <+138>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040123c <+140>: nopl   0x0(%rax)
   0x0000000000401240 <+144>: pslldq $0x6,%xmm0
   0x0000000000401245 <+149>: pslldq $0x6,%xmm1
   0x000000000040124a <+154>: jmp    0x4011d4 <test_maskmovq+36>
   0x000000000040124c <+156>: pslldq $0x1,%xmm0
   0x0000000000401251 <+161>: pslldq $0x1,%xmm1
   0x0000000000401256 <+166>: jmpq   0x4011d4 <test_maskmovq+36>
End of assembler dump.
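
For reference, the emulation producing this has roughly the following
shape (a sketch using the <emmintrin.h> wrappers; parameter types and
names are inferred from the disassembly above, and the patch itself
uses the __builtin_ia32_* forms directly):

#include <emmintrin.h>

void
test_maskmovq (const long long *a, const long long *n, char *p)
{
  /* Zero-extend the 64-bit data and byte mask to 128 bits.  */
  __m128i d = _mm_loadl_epi64 ((const __m128i *) a);
  __m128i m = _mm_loadl_epi64 ((const __m128i *) n);

  /* maskmovdqu accesses 16 bytes where maskmovq accesses only 8, so
     move the address down and the data/mask up by the same amount:
     every stored byte keeps its original address, and the 16-byte
     access stays within memory the 8 live bytes already touch.  */
  unsigned long off = (unsigned long) p & 0xf;
  if (off)
    {
      if (off > 8)
        off = 8;                /* Never adjust by more than 8 bytes.  */
      p -= off;
      /* The shift count of pslldq must be an immediate, hence the
         switch the quoted hunk refers to.  */
      switch (off)
        {
        case 1: d = _mm_slli_si128 (d, 1); m = _mm_slli_si128 (m, 1); break;
        case 2: d = _mm_slli_si128 (d, 2); m = _mm_slli_si128 (m, 2); break;
        case 3: d = _mm_slli_si128 (d, 3); m = _mm_slli_si128 (m, 3); break;
        case 4: d = _mm_slli_si128 (d, 4); m = _mm_slli_si128 (m, 4); break;
        case 5: d = _mm_slli_si128 (d, 5); m = _mm_slli_si128 (m, 5); break;
        case 6: d = _mm_slli_si128 (d, 6); m = _mm_slli_si128 (m, 6); break;
        case 7: d = _mm_slli_si128 (d, 7); m = _mm_slli_si128 (m, 7); break;
        default: d = _mm_slli_si128 (d, 8); m = _mm_slli_si128 (m, 8); break;
        }
    }
  _mm_maskmoveu_si128 (d, m, p);
}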

__int128 isn't much better:

(gdb) disass test_maskmovq
Dump of assembler code for function test_maskmovq:
   0x00000000004011b0 <+0>: mov    %rdx,%rcx
   0x00000000004011b3 <+3>: mov    (%rdi),%rax
   0x00000000004011b6 <+6>: mov    (%rsi),%rdi
   0x00000000004011b9 <+9>: and    $0xf,%ecx
   0x00000000004011bc <+12>: je     0x401240 <test_maskmovq+144>
   0x00000000004011c2 <+18>: cmp    $0x8,%rcx
   0x00000000004011c6 <+22>: mov    $0x8,%esi
   0x00000000004011cb <+27>: mov    %rax,%r8
   0x00000000004011ce <+30>: push   %rbx
   0x00000000004011cf <+31>: cmova  %rsi,%rcx
   0x00000000004011d3 <+35>: sar    $0x3f,%rax
   0x00000000004011d7 <+39>: mov    %r8,%r10
   0x00000000004011da <+42>: mov    %rdi,%rbx
   0x00000000004011dd <+45>: mov    %rax,%r11
   0x00000000004011e0 <+48>: sar    $0x3f,%rdi
   0x00000000004011e4 <+52>: xor    %eax,%eax
   0x00000000004011e6 <+54>: sub    %rcx,%rdx
   0x00000000004011e9 <+57>: shl    $0x3,%ecx
   0x00000000004011ec <+60>: mov    %rdi,%rsi
   0x00000000004011ef <+63>: shl    %cl,%r10
   0x00000000004011f2 <+66>: shld   %cl,%r8,%r11
   0x00000000004011f6 <+70>: test   $0x40,%cl
   0x00000000004011f9 <+73>: cmovne %r10,%r11
   0x00000000004011fd <+77>: cmovne %rax,%r10
   0x0000000000401201 <+81>: shld   %cl,%rbx,%rsi
   0x0000000000401205 <+85>: xor    %edi,%edi
   0x0000000000401207 <+87>: shl    %cl,%rbx
   0x000000000040120a <+90>: test   $0x40,%cl
   0x000000000040120d <+93>: mov    %r11,-0x8(%rsp)
   0x0000000000401212 <+98>: cmovne %rbx,%rsi
   0x0000000000401216 <+102>: movq   %r10,%xmm0
   0x000000000040121b <+107>: cmovne %rdi,%rbx
   0x000000000040121f <+111>: mov    %rdx,%rdi
   0x0000000000401222 <+114>: movq   %rbx,%xmm1
   0x0000000000401227 <+119>: movhps -0x8(%rsp),%xmm0
   0x000000000040122c <+124>: mov    %rsi,-0x8(%rsp)
   0x0000000000401231 <+129>: movhps -0x8(%rsp),%xmm1
   0x0000000000401236 <+134>: maskmovdqu %xmm1,%xmm0
   0x000000000040123a <+138>: pop    %rbx
   0x000000000040123b <+139>: retq
   0x000000000040123c <+140>: nopl   0x0(%rax)
   0x0000000000401240 <+144>: movq   %rdi,%xmm1
   0x0000000000401245 <+149>: movq   %rax,%xmm0
   0x000000000040124a <+154>: mov    %rdx,%rdi
   0x000000000040124d <+157>: maskmovdqu %xmm1,%xmm0
   0x0000000000401251 <+161>: retq
End of assembler dump.
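
That variant is roughly the following (a hypothetical reconstruction;
the variable 128-bit shift is what gets lowered to the shl/shld/cmov
sequence in the dump above):

#include <emmintrin.h>

void
test_maskmovq (const long long *a, const long long *n, char *p)
{
  unsigned __int128 d = (unsigned long long) *a;
  unsigned __int128 m = (unsigned long long) *n;
  unsigned long off = (unsigned long) p & 0xf;

  if (off)
    {
      if (off > 8)
        off = 8;
      p -= off;
      d <<= off * 8;    /* Variable shift count: no immediate form,  */
      m <<= off * 8;    /* so the compiler emits shl/shld plus cmovs.  */
    }
  _mm_maskmoveu_si128 (_mm_set_epi64x ((long long) (d >> 64), (long long) d),
                       _mm_set_epi64x ((long long) (m >> 64), (long long) m),
                       p);
}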

> (zext (word_var)) << shift
> where zext is from "word" to double-word and shift is 1 to word bitsize - 1
> can be done as (word_var << shift) | ((word_var >> (word_bitsize - shift)) << word_bitsize)
> so you could avoid the int128 shifts anyway and just shift left and right
> and construct v2di from that.
>

This requires two 64-bit variables for each 128-bit variable; there
isn't much of a difference for x86-64.  I don't think we can emulate
MMX with SSE in 32-bit mode, since __m64 is passed and returned in
MMX registers.


--
H.J.


