This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
Hi, As I wrote at [PATCH, libcpp]: Use asm flag outputs in search_line_sse42 main loop https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg113610.html I won't repeat the reasons; the summary is that the current sse4.2 code is redundant, as it has the same performance as the sse2 one. This improves sse2 performance by around 10% versus the sse4.2 code by using a better header. An updated benchmark is attached. It counts the number of lines of a given C source; I selected the benchmark source itself for reproducible results. On Sandy Bridge the runtimes are the following; fx10 and Nehalem are similar. time ./a.out line.c 1 100000; time ./a.out line.c 2 100000; time ./a.out line.c 3 100000; time ./a.out line.c 4 100000; time ./a.out line.c 5 100000 # strpbrk real 0m0.507s user 0m0.505s sys 0m0.000s # current sse2 real 0m0.490s user 0m0.490s sys 0m0.000s # current sse4.2 real 0m0.423s user 0m0.420s sys 0m0.003s # improved header real 0m0.450s user 0m0.451s sys 0m0.000s # proposed version real 0m0.426s user 0m0.426s sys 0m0.000s * lex.c (search_line_sse2): Improve performance by using proper header. (search_line_sse42): Delete. 
diff --git a/libcpp/lex.c b/libcpp/lex.c index 0ad9660..8032e6e 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -373,36 +373,110 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) const v16qi repl_qm = *(const v16qi *)repl_chars[3]; unsigned int misalign, found, mask; + const v16qi *p; - v16qi data, t; + v16qi data, t, tx; + + if (s + 80 < end) + { + v16qi x0 = __builtin_ia32_loaddqu ((char const *) s); + tx = __builtin_ia32_pcmpeqb128 (x0, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x0, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x0, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x0, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + if (found) + { + found = __builtin_ctz (found); + return (const uchar *) s + found; + } + v16qi x1 = __builtin_ia32_loaddqu ((char const *) (s + 16)); + v16qi x2 = __builtin_ia32_loaddqu ((char const *) (s + 32)); + v16qi x3 = __builtin_ia32_loaddqu ((char const *) (s + 48)); + v16qi x4 = __builtin_ia32_loaddqu ((char const *) (s + 64)); + + tx = __builtin_ia32_pcmpeqb128 (x1, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x1, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x1, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x1, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + + if (found) + { + found = __builtin_ctz (found); + return (const uchar *) s + 16 + found; + } + + tx = __builtin_ia32_pcmpeqb128 (x2, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x2, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x2, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x2, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + + if (found) + { + found = __builtin_ctz (found); + return (const uchar *) s + 32 + found; + } + + + tx = __builtin_ia32_pcmpeqb128 (x3, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x3, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x3, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x3, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + + if (found) + { + found = __builtin_ctz (found); + return (const 
uchar *) s + 48 + found; + } + + tx = __builtin_ia32_pcmpeqb128 (x4, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x4, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x4, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x4, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + + if (found) + { + found = __builtin_ctz (found); + return (const uchar *) s + 64 + found; + } + + s += 80; + } /* Align the source pointer. */ misalign = (uintptr_t)s & 15; p = (const v16qi *)((uintptr_t)s & -16); data = *p; - /* Create a mask for the bytes that are valid within the first - 16-byte block. The Idea here is that the AND with the mask - within the loop is "free", since we need some AND or TEST - insn in order to set the flags for the branch anyway. */ mask = -1u << misalign; - /* Main loop processing 16 bytes at a time. */ - goto start; - do + t = __builtin_ia32_pcmpeqb128(data, repl_nl); + t |= __builtin_ia32_pcmpeqb128(data, repl_cr); + t |= __builtin_ia32_pcmpeqb128(data, repl_bs); + t |= __builtin_ia32_pcmpeqb128(data, repl_qm); + found = __builtin_ia32_pmovmskb128 (t); + found &= mask; + + while (!found) { data = *++p; - mask = -1; - start: t = __builtin_ia32_pcmpeqb128(data, repl_nl); t |= __builtin_ia32_pcmpeqb128(data, repl_cr); t |= __builtin_ia32_pcmpeqb128(data, repl_bs); t |= __builtin_ia32_pcmpeqb128(data, repl_qm); found = __builtin_ia32_pmovmskb128 (t); - found &= mask; } - while (!found); + /* FOUND contains 1 in bits for which we matched a relevant character. Conversion to the byte index is trivial. */ @@ -410,65 +484,7 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) return (const uchar *)p + found; } -#ifdef HAVE_SSE4 -/* A version of the fast scanner using SSE 4.2 vectorized string insns. 
*/ - -static const uchar * -#ifndef __SSE4_2__ -__attribute__((__target__("sse4.2"))) -#endif -search_line_sse42 (const uchar *s, const uchar *end) -{ - typedef char v16qi __attribute__ ((__vector_size__ (16))); - static const v16qi search = { '\n', '\r', '?', '\\' }; - - uintptr_t si = (uintptr_t)s; - uintptr_t index; - - /* Check for unaligned input. */ - if (si & 15) - { - if (__builtin_expect (end - s < 16, 0) - && __builtin_expect ((si & 0xfff) > 0xff0, 0)) - { - /* There are less than 16 bytes left in the buffer, and less - than 16 bytes left on the page. Reading 16 bytes at this - point might generate a spurious page fault. Defer to the - SSE2 implementation, which already handles alignment. */ - return search_line_sse2 (s, end); - } - - /* ??? The builtin doesn't understand that the PCMPESTRI read from - memory need not be aligned. */ - __asm ("%vpcmpestri $0, (%1), %2" - : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16)); - if (__builtin_expect (index < 16, 0)) - goto found; - - /* Advance the pointer to an aligned address. We will re-scan a - few bytes, but we no longer need care for reading past the - end of a page, since we're guaranteed a match. */ - s = (const uchar *)((si + 16) & -16); - } - /* Main loop, processing 16 bytes at a time. By doing the whole loop - in inline assembly, we can make proper use of the flags set. */ - __asm ( "sub $16, %1\n" - " .balign 16\n" - "0: add $16, %1\n" - " %vpcmpestri $0, (%1), %2\n" - " jnc 0b" - : "=&c"(index), "+r"(s) - : "x"(search), "a"(4), "d"(16)); - - found: - return s + index; -} - -#else -/* Work around out-dated assemblers without sse4 support. */ -#define search_line_sse42 search_line_sse2 -#endif /* Check the CPU capabilities. 
*/ @@ -485,21 +501,15 @@ init_vectorized_lexer (void) search_line_fast_type impl = search_line_acc_char; int minimum = 0; -#if defined(__SSE4_2__) - minimum = 3; -#elif defined(__SSE2__) +#if defined(__SSE2__) minimum = 2; #elif defined(__SSE__) minimum = 1; #endif - if (minimum == 3) - impl = search_line_sse42; - else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) + if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) { - if (minimum == 3 || (ecx & bit_SSE4_2)) - impl = search_line_sse42; - else if (minimum == 2 || (edx & bit_SSE2)) + if (minimum == 2 || (edx & bit_SSE2)) impl = search_line_sse2; else if (minimum == 1 || (edx & bit_SSE)) impl = search_line_mmx;
Attachment:
line.c
Description: Text document
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |