[PATCH v2, libcpp] Faster line lexer.

Ondřej Bílka neleai@seznam.cz
Fri Jul 10 13:26:00 GMT 2015


On Fri, Jul 10, 2015 at 12:43:48PM +0200, Jakub Jelinek wrote:
> On Fri, Jul 10, 2015 at 11:37:18AM +0200, Uros Bizjak wrote:
> > Have you tried new SSE4.2 implementation (the one with asm flags) with
> > unrolled loop?
> 
> Also, the SSE4.2 implementation looks shorter, so more I-cache friendly,
> so I wouldn't really say it is redundant if they are roughly same speed.
> 
OK, I also tried to optimize the SSE4.2 version and found that the main
problem was that checking whether index == 16 caused high latency.

The trick is to check the first 64 bytes in an unrolled header using the
flags. Entering the loop then becomes relatively unlikely, as lines longer
than 64 bytes are relatively rare.
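
For illustration, here is a rough standalone sketch of the resulting
structure, using <emmintrin.h> intrinsics instead of the __builtin_ia32_*
calls in the actual patch; the SSE4.2 variant uses pcmpestri and the carry
flag instead of a byte mask, and like search_line_sse2 this assumes the
buffer is terminated so a match is guaranteed:

#include <emmintrin.h>

/* Return a bitmask of the bytes in the 16-byte chunk at P that are
   '\n', '\r', '\\' or '?'.  */
static inline unsigned int
chunk_mask (const unsigned char *p)
{
  __m128i data = _mm_loadu_si128 ((const __m128i *) p);
  __m128i t = _mm_cmpeq_epi8 (data, _mm_set1_epi8 ('\n'));
  t = _mm_or_si128 (t, _mm_cmpeq_epi8 (data, _mm_set1_epi8 ('\r')));
  t = _mm_or_si128 (t, _mm_cmpeq_epi8 (data, _mm_set1_epi8 ('\\')));
  t = _mm_or_si128 (t, _mm_cmpeq_epi8 (data, _mm_set1_epi8 ('?')));
  return _mm_movemask_epi8 (t);
}

static const unsigned char *
search_line_sketch (const unsigned char *s, const unsigned char *end)
{
  /* Header: unrolled unaligned checks of the first chunks.  The SSE2
     version in the patch does five of these (80 bytes) before falling
     through.  */
  if (s + 80 < end)
    {
      for (int i = 0; i < 80; i += 16)
        {
          unsigned int found = chunk_mask (s + i);
          if (found)
            return s + i + __builtin_ctz (found);
        }
      s += 80;
    }

  /* Tail: a 16-bytes-at-a-time loop, now entered only for long lines.
     The real search_line_sse2 aligns the pointer and masks off the
     misaligned prefix bits on the first iteration.  */
  for (;; s += 16)
    {
      unsigned int found = chunk_mask (s);
      if (found)
        return s + __builtin_ctz (found);
    }
}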

I tested this on more machines. On Haswell SSE4.2 is noticeably faster, on
Nehalem SSE2 is still a bit faster, and on AMD fx10 it is a lot slower. How
should I check the processor so that SSE2 is selected on AMD processors,
where SSE4.2 is considerably slower?
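
Something like a vendor check next to the existing __get_cpuid call in
init_vectorized_lexer might do; a hypothetical sketch, not part of this
patch:

#include <cpuid.h>

/* Hypothetical helper: return nonzero on AMD CPUs, so the caller can
   keep search_line_sse2 even when SSE4.2 is reported.  */
static int
cpu_vendor_is_amd (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid (0, &eax, &ebx, &ecx, &edx))
    return 0;
  /* Vendor string "AuthenticAMD" is returned in EBX:EDX:ECX.  */
  return ebx == 0x68747541 && edx == 0x69746e65 && ecx == 0x444d4163;
}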

nehalem

real	0m12.091s
user	0m12.088s
sys	0m0.000s

real	0m11.350s
user	0m11.347s
sys	0m0.000s

real	0m8.521s
user	0m8.519s
sys	0m0.000s

real	0m10.407s
user	0m10.402s
sys	0m0.004s

real	0m8.325s
user	0m8.323s
sys	0m0.000s

ivy bridge

real	0m5.005s
user	0m5.004s
sys	0m0.001s

real	0m4.149s
user	0m4.150s
sys	0m0.000s

real	0m3.836s
user	0m3.837s
sys	0m0.000s

real	0m4.097s
user	0m4.098s
sys	0m0.000s

real	0m3.928s
user	0m3.930s
sys	0m0.000s

fx10

real	0m10.456s
user	0m10.469s
sys	0m0.000s

real	0m9.356s
user	0m9.371s
sys	0m0.000s

real	0m9.418s
user	0m9.433s
sys	0m0.000s

real	0m9.039s
user	0m9.050s
sys	0m0.003s

real	0m7.983s
user	0m7.992s
sys	0m0.003s

	* libcpp/lex.c (search_line_sse2): Check the first 80 bytes with
	unrolled unaligned loads before entering the aligned loop.
	(search_line_sse42): Likewise for the first 64 bytes.

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 0ad9660..3bf9eae 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -374,35 +374,107 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 
   unsigned int misalign, found, mask;
   const v16qi *p;
-  v16qi data, t;
+  v16qi data, t, tx;
+
+  if (s + 80 < end)
+    {
+      v16qi x0 = __builtin_ia32_loaddqu ((char const *) s);
+      tx =  __builtin_ia32_pcmpeqb128 (x0, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x0, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x0, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x0, repl_qm);
+
+      found =  __builtin_ia32_pmovmskb128 (tx);
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + found;
+        }
+      v16qi x1 = __builtin_ia32_loaddqu ((char const *) (s + 16));
+      v16qi x2 = __builtin_ia32_loaddqu ((char const *) (s + 32));
+      v16qi x3 = __builtin_ia32_loaddqu ((char const *) (s + 48));
+      v16qi x4 = __builtin_ia32_loaddqu ((char const *) (s + 64));
+
+      tx =  __builtin_ia32_pcmpeqb128 (x1, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x1, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x1, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x1, repl_qm);
+
+      found =  __builtin_ia32_pmovmskb128 (tx);
+
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + 16 + found;
+        }
+
+      tx =  __builtin_ia32_pcmpeqb128 (x2, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x2, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x2, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x2, repl_qm);
+
+      found = __builtin_ia32_pmovmskb128 (tx);
+
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + 32 + found;
+        }
+
+
+      tx =  __builtin_ia32_pcmpeqb128 (x3, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x3, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x3, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x3, repl_qm);
+
+      found =  __builtin_ia32_pmovmskb128 (tx);
+
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + 48 + found;
+        }
+
+      tx =  __builtin_ia32_pcmpeqb128 (x4, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x4, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x4, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x4, repl_qm);
+
+      found =  __builtin_ia32_pmovmskb128 (tx);
+
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + 64 + found;
+        }
+
+      s += 80;
+    }
 
   /* Align the source pointer.  */
   misalign = (uintptr_t)s & 15;
   p = (const v16qi *)((uintptr_t)s & -16);
   data = *p;
 
-  /* Create a mask for the bytes that are valid within the first
-     16-byte block.  The Idea here is that the AND with the mask
-     within the loop is "free", since we need some AND or TEST
-     insn in order to set the flags for the branch anyway.  */
   mask = -1u << misalign;
 
-  /* Main loop processing 16 bytes at a time.  */
-  goto start;
-  do
+  t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
+  t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
+  t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
+  t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
+  found = __builtin_ia32_pmovmskb128 (t);
+  found &= mask;
+
+  while (!found)
     {
       data = *++p;
-      mask = -1;
 
-    start:
       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
       found = __builtin_ia32_pmovmskb128 (t);
-      found &= mask;
     }
-  while (!found);
 
   /* FOUND contains 1 in bits for which we matched a relevant
      character.  Conversion to the byte index is trivial.  */
@@ -421,48 +493,38 @@ search_line_sse42 (const uchar *s, const uchar *end)
 {
   typedef char v16qi __attribute__ ((__vector_size__ (16)));
   static const v16qi search = { '\n', '\r', '?', '\\' };
-
+  const uchar *ret;
   uintptr_t si = (uintptr_t)s;
-  uintptr_t index;
 
-  /* Check for unaligned input.  */
-  if (si & 15)
+  if (__builtin_expect (end - s < 64, 0)
+      && __builtin_expect ((si & 4095) > 4096 - 64, 0))
     {
-      if (__builtin_expect (end - s < 16, 0)
-	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
-	{
-	  /* There are less than 16 bytes left in the buffer, and less
-	     than 16 bytes left on the page.  Reading 16 bytes at this
+	  /* There are less than 64 bytes left in the buffer, and less
+	     than 64 bytes left on the page.  Reading 64 bytes at this
 	     point might generate a spurious page fault.  Defer to the
 	     SSE2 implementation, which already handles alignment.  */
-	  return search_line_sse2 (s, end);
-	}
+       return search_line_sse2 (s, end);
+     }
+__asm __volatile__ ("\
+			pcmpestri $0, (%1), %2; \
+			jnc 1f; lea (%1, %%rcx), %%rax; jmp found; 1: \
+			pcmpestri $0, 16(%1), %2; \
+			jnc 1f; lea 16(%1, %%rcx), %%rax; jmp found; 1: \
+			pcmpestri $0, 32(%1), %2; \
+			jnc 1f; lea 32(%1, %%rcx), %%rax; jmp found; 1: \
+			pcmpestri $0, 48(%1), %2; \
+			jnc 1f; lea 48(%1, %%rcx), %%rax; jmp found; 1: \
+			add $48, %1; \
+			and $-16, %1; \
+			loop:  ;   \
+			addq $16, %1; \
+			pcmpestri $0, (%1), %2; \
+			jnc loop; lea (%1, %%rcx), %%rax; \
+			found: " \
+                      : "=a"(ret) : "D"(s), "x"(search), "a"(4), "d"(16));
 
-      /* ??? The builtin doesn't understand that the PCMPESTRI read from
-	 memory need not be aligned.  */
-      __asm ("%vpcmpestri $0, (%1), %2"
-	     : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
-      if (__builtin_expect (index < 16, 0))
-	goto found;
-
-      /* Advance the pointer to an aligned address.  We will re-scan a
-	 few bytes, but we no longer need care for reading past the
-	 end of a page, since we're guaranteed a match.  */
-      s = (const uchar *)((si + 16) & -16);
-    }
 
-  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
-     in inline assembly, we can make proper use of the flags set.  */
-  __asm (      "sub $16, %1\n"
-	"	.balign 16\n"
-	"0:	add $16, %1\n"
-	"	%vpcmpestri $0, (%1), %2\n"
-	"	jnc 0b"
-	: "=&c"(index), "+r"(s)
-	: "x"(search), "a"(4), "d"(16));
-
- found:
-  return s + index;
+  return ret;
 }
 
 #else


