This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: The speed of the compiler, was: Re: Combine four insns
Chris Lattner <clattner@apple.com> writes:
>
> e. General speedups: Clang's preprocessor is roughly 2x faster than GCC's and the frontend is generally much faster. For example, it uses hash tables instead of lists where appropriate, so it doesn't get N^2 cases in silly situations as often. I don't know what else GCC is doing wrong, I haven't looked at its frontends much.
I looked at this a weekend or two ago. The two hot functions in the
preprocessor are cpp_clean_line and the lexer.
At least cpp_clean_line was pretty easy to speed up using SSE 4.2
string instructions and vectorizing it.
That change made it drop down from top 10 in an unoptimized build to
lower top 40 or so. I suspect with that change the clang advantage
is much less than 2x.
Drawback: the patch broke some of the PCH test cases in the test
suite and I never quite figured out why (that's why I didn't post
the patch)
Other drawback: the optimization only helps on x86 systems
that support SSE 4.2 (but presumably that's a popular build system)
Here's the patch if anyone is interested.
Vectorizing the lexer might be possible too, but it's somewhat
harder.
The other problem I found is that cpplib is not using profile
feedback; that is likely giving some performance away too.
-Andi
diff --git a/libcpp/init.c b/libcpp/init.c
index c5b8c28..769aa50 100644
--- a/libcpp/init.c
+++ b/libcpp/init.c
@@ -137,6 +137,8 @@ init_library (void)
#ifdef ENABLE_NLS
(void) bindtextdomain (PACKAGE, LOCALEDIR);
#endif
+
+ init_vectorized_lexer ();
}
}
diff --git a/libcpp/internal.h b/libcpp/internal.h
index 9209b55..10ed033 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -725,6 +725,8 @@ ufputs (const unsigned char *s, FILE *f)
return fputs ((const char *)s, f);
}
+extern void init_vectorized_lexer (void);
+
#ifdef __cplusplus
}
#endif
diff --git a/libcpp/lex.c b/libcpp/lex.c
index f628272..589fa64 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -96,6 +96,82 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
buffer->notes_used++;
}
+#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
+
+#define HAVE_SSE42 1
+
+#include <stdint.h>
+#include "../gcc/config/i386/cpuid.h"
+
+bool cpu_has_sse42; /* Set by init_vectorized_lexer from CPUID. */
+
+/* Check for SSE 4.2 string instructions: CPUID leaf 1, ECX bit 20. */
+
+void
+init_vectorized_lexer (void)
+{
+ unsigned dummy, ecx; /* Only ECX is examined; the rest go to dummy. */
+
+ if (__get_cpuid (1, &dummy, &dummy, &ecx, &dummy))
+ cpu_has_sse42 = !!(ecx & (1 << 20)); /* Skipped if __get_cpuid returns 0. */
+}
+
+/* Fast path to find line special characters using SSE 4.2 vectorized string
+   instructions. Anything complicated falls back to the slow path below.
+   Since this loop is very hot it's worth doing these kinds of
+   optimizations. Returns true with *OUT at the first \n, \r, ? or
+   backslash found; else false with *OUT where scanning stopped.
+   We should be using the _mm intrinsics, but the xxxintr headers do things
+   not allowed in gcc. So instead use direct builtins. */
+
+static bool __attribute__((__target__("sse4.2")))
+search_line_sse42 (const uchar *s, const uchar *end, const uchar **out)
+{
+ typedef char m128i __attribute__ ((__vector_size__ (16)));
+ int left; /* Bytes remaining before END. */
+ int index;
+ static char searchstr[16] __attribute__ ((aligned(16))) = "\n\r?\\";
+ m128i search = *(m128i *)searchstr;
+ m128i data;
+
+ for (left = end - (uchar *)s; left > 0; left -= 16)
+ {
+ if (((uintptr_t)s & 0xfff) > 0xff0)
+ {
+ /* Within 15 bytes of a 4K page boundary, so the 16-byte load
+    below could touch the next page. Use slow path. Padding or
+    aligning the input buffer would avoid this. */
+ *out = s;
+ return false;
+ }
+
+ /* Compare 16 bytes against the 4 stoppers; index < 16 means a hit. */
+ data = (m128i) __builtin_ia32_loaddqu((const char *)s);
+ index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0);
+ if (index < 16)
+ {
+ *out = s + index;
+ return true;
+ }
+ s += 16;
+ }
+
+ /* No stopper before END: ran out of buffer. Should not happen? */
+ *out = end;
+ return false;
+}
+
+#else
+
+/* Dummy for builds without the SSE 4.2 fast path: nothing to detect. */
+
+void
+init_vectorized_lexer (void)
+{
+}
+
+#endif
+
/* Returns with a logical line that contains no escaped newlines or
trigraphs. This is a time-critical inner loop. */
void
@@ -109,12 +185,41 @@ _cpp_clean_line (cpp_reader *pfile)
buffer->cur_note = buffer->notes_used = 0;
buffer->cur = buffer->line_base = buffer->next_line;
buffer->need_line = false;
- s = buffer->next_line - 1;
+ s = buffer->next_line;
if (!buffer->from_stage3)
{
const uchar *pbackslash = NULL;
+#ifdef HAVE_SSE42
+ if (cpu_has_sse42)
+ {
+ for (;;)
+ {
+ /* Drop into slow path if ? or nothing is found. */
+ if (search_line_sse42 (s, buffer->rlimit, &s) == false
+ || *s == '?')
+ break;
+
+ c = *s;
+
+ /* Special case for backslash which is reasonably common.
+ Continue searching using the fast path */
+ if (c == '\\')
+ {
+ pbackslash = s;
+ s++;
+ continue;
+ }
+
+ /* \n or \r here. Process it below. */
+ goto found;
+ }
+ }
+#endif
+
+ s--;
+
/* Short circuit for the common case of an un-escaped line with
no trigraphs. The primary win here is by not writing any
data back to memory until we have to. */
@@ -124,6 +229,9 @@ _cpp_clean_line (cpp_reader *pfile)
if (__builtin_expect (c == '\n', false)
|| __builtin_expect (c == '\r', false))
{
+#ifdef HAVE_SSE42
+ found:
+#endif
d = (uchar *) s;
if (__builtin_expect (s == buffer->rlimit, false))
--
ak@linux.intel.com -- Speaking for myself only.