The speed of the compiler, was: Re: Combine four insns

Richard Guenther richard.guenther@gmail.com
Tue Aug 10 15:03:00 GMT 2010


On Tue, Aug 10, 2010 at 4:48 PM, Andi Kleen <andi@firstfloor.org> wrote:
> Chris Lattner <clattner@apple.com> writes:
>>
>>   e. General speedups: Clang's preprocessor is roughly 2x faster than GCC's and the frontend is generally much faster.  For example, it uses hash tables instead of lists where appropriate, so it doesn't get N^2 cases in silly situations as often.  I don't what what else GCC is doing wrong, I haven't looked at its frontends much.
>
> I looked at this a weekend or two ago. The two hot functions in the
> preprocessor are cpp_clean_line and the lexer.
>
> At least cpp_clean_line was pretty easy to speed up using SSE 4.2
> string instructions and vectorizing it.
>
> That change made it drop down from top 10 in a unoptimized build to
> lower top 40 or so. I suspect with that change the clang advantage
> is much less than 2x.
>
> Drawback: the patch broke some of the PCH test cases in the test
> suite and I never quite figured out why (that's why I didn't post
> the patch)
>
> Other drawback: the optimization only helps on x86 systems
> that support SSE 4.2 (but presumably that's a popular build system)
>
> Here's the patch if anyone is interested.
>
> Vectorizing the lexer might be possible too, but it's somewhat
> harder.
>
> The other problem I found is that cpplib is not using profile
> feedback, that is likely giving some performance away too.

I'm sure there is a way to open-code this using integer math.
Likely the performance issue is both that we use byte loads
and 4 comparisons per char.  Maybe 4 parallel strchr optimized
searches are comparable fast?

Richard.

> -Andi
>
>
> diff --git a/libcpp/init.c b/libcpp/init.c
> index c5b8c28..769aa50 100644
> --- a/libcpp/init.c
> +++ b/libcpp/init.c
> @@ -137,6 +137,8 @@ init_library (void)
>  #ifdef ENABLE_NLS
>        (void) bindtextdomain (PACKAGE, LOCALEDIR);
>  #endif
> +
> +       init_vectorized_lexer ();
>     }
>  }
>
> diff --git a/libcpp/internal.h b/libcpp/internal.h
> index 9209b55..10ed033 100644
> --- a/libcpp/internal.h
> +++ b/libcpp/internal.h
> @@ -725,6 +725,8 @@ ufputs (const unsigned char *s, FILE *f)
>   return fputs ((const char *)s, f);
>  }
>
> +extern void init_vectorized_lexer (void);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/libcpp/lex.c b/libcpp/lex.c
> index f628272..589fa64 100644
> --- a/libcpp/lex.c
> +++ b/libcpp/lex.c
> @@ -96,6 +96,82 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
>   buffer->notes_used++;
>  }
>
> +#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
> +
> +#define HAVE_SSE42 1
> +
> +#include <stdint.h>
> +#include "../gcc/config/i386/cpuid.h"
> +
> +bool cpu_has_sse42;
> +
> +/* Check if CPU supports vectorized string instructions. */
> +
> +void
> +init_vectorized_lexer (void)
> +{
> +  unsigned dummy, ecx;
> +
> +  if (__get_cpuid (1, &dummy, &dummy, &ecx, &dummy))
> +      cpu_has_sse42 = !!(ecx & (1 << 20));
> +}
> +
> +/* Fast path to find line special characters using SSE 4.2 vectorized string
> +   instructions. Anything complicated falls back to the slow path below.
> +   Since this loop is very hot it's worth doing these kinds of
> +   optimizations. Returns true if stopper character found.
> +
> +   We should be using the _mm intrinsics, but the xxxintr headers do things
> +   not allowed in gcc. So instead use direct builtins. */
> +
> +static bool __attribute__((__target__("sse4.2")))
> +search_line_sse42 (const uchar *s, const uchar *end, const uchar **out)
> +{
> +  typedef char m128i __attribute__ ((__vector_size__ (16)));
> +  int left;
> +  int index;
> +  static char searchstr[16] __attribute__ ((aligned(16))) = "\n\r?\\";
> +  m128i search = *(m128i *)searchstr;
> +  m128i data;
> +
> +  for (left = end - (uchar *)s; left > 0; left -= 16)
> +    {
> +      if (((uintptr_t)s & 0xfff) > 0xff0)
> +       {
> +         /* Too near page boundary. Use slow path. This could be
> +            avoided if we ensure suitable padding or alignment in
> +            the input buffer. */
> +         *out = s;
> +         return false;
> +       }
> +
> +      /* Use vectorized string comparison, looking for the 4 stoppers. */
> +      data = (m128i) __builtin_ia32_loaddqu((const char *)s);
> +      index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0);
> +      if (index < 16)
> +       {
> +         *out = s + index;
> +         return true;
> +       }
> +      s += 16;
> +    }
> +
> +  /* Ran out of buffer. Should not happen? */
> +  *out = end;
> +  return false;
> +}
> +
> +#else
> +
> +/* Dummy */
> +
> +void
> +init_vectorized_lexer (void)
> +{
> +}
> +
> +#endif
> +
>  /* Returns with a logical line that contains no escaped newlines or
>    trigraphs.  This is a time-critical inner loop.  */
>  void
> @@ -109,12 +185,41 @@ _cpp_clean_line (cpp_reader *pfile)
>   buffer->cur_note = buffer->notes_used = 0;
>   buffer->cur = buffer->line_base = buffer->next_line;
>   buffer->need_line = false;
> -  s = buffer->next_line - 1;
> +  s = buffer->next_line;
>
>   if (!buffer->from_stage3)
>     {
>       const uchar *pbackslash = NULL;
>
> +#ifdef HAVE_SSE42
> +      if (cpu_has_sse42)
> +       {
> +         for (;;)
> +           {
> +             /* Drop into slow path if ? or nothing is found. */
> +             if (search_line_sse42 (s, buffer->rlimit, &s) == false
> +                 || *s == '?')
> +               break;
> +
> +             c = *s;
> +
> +             /* Special case for backslash which is reasonably common.
> +                Continue searching using the fast path */
> +             if (c == '\\')
> +               {
> +                 pbackslash = s;
> +                 s++;
> +                 continue;
> +               }
> +
> +             /* \n or \r here. Process it below. */
> +             goto found;
> +           }
> +       }
> +#endif
> +
> +      s--;
> +
>       /* Short circuit for the common case of an un-escaped line with
>         no trigraphs.  The primary win here is by not writing any
>         data back to memory until we have to.  */
> @@ -124,6 +229,9 @@ _cpp_clean_line (cpp_reader *pfile)
>          if (__builtin_expect (c == '\n', false)
>              || __builtin_expect (c == '\r', false))
>            {
> +#ifdef HAVE_SSE42
> +           found:
> +#endif
>              d = (uchar *) s;
>
>              if (__builtin_expect (s == buffer->rlimit, false))
>
>
>
> --
> ak@linux.intel.com -- Speaking for myself only.
>



More information about the Gcc-patches mailing list