This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: The speed of the compiler, was: Re: Combine four insns


On Tue, Aug 10, 2010 at 4:48 PM, Andi Kleen <andi@firstfloor.org> wrote:
> Chris Lattner <clattner@apple.com> writes:
>>
>> e. General speedups: Clang's preprocessor is roughly 2x faster than GCC's and the frontend is generally much faster. For example, it uses hash tables instead of lists where appropriate, so it doesn't get N^2 cases in silly situations as often. I don't know what else GCC is doing wrong, I haven't looked at its frontends much.
>
> I looked at this a weekend or two ago. The two hot functions in the
> preprocessor are cpp_clean_line and the lexer.
>
> At least cpp_clean_line was pretty easy to speed up using SSE 4.2
> string instructions and vectorizing it.
>
> That change made it drop down from top 10 in an unoptimized build to
> lower top 40 or so. I suspect with that change the clang advantage
> is much less than 2x.
>
> Drawback: the patch broke some of the PCH test cases in the test
> suite and I never quite figured out why (that's why I didn't post
> the patch)
>
> Other drawback: the optimization only helps on x86 systems
> that support SSE 4.2 (but presumably that's a popular build system)
>
> Here's the patch if anyone is interested.
>
> Vectorizing the lexer might be possible too, but it's somewhat
> harder.
>
> The other problem I found is that cpplib is not using profile
> feedback, which is likely giving some performance away too.

I'm sure there is a way to open-code this using integer math.
Likely the performance issue is both that we use byte loads
and 4 comparisons per char.  Maybe 4 parallel strchr optimized
searches are comparably fast?

Richard.

> -Andi
>
>
> diff --git a/libcpp/init.c b/libcpp/init.c
> index c5b8c28..769aa50 100644
> --- a/libcpp/init.c
> +++ b/libcpp/init.c
> @@ -137,6 +137,8 @@ init_library (void)
> ?#ifdef ENABLE_NLS
> ? ? ? ?(void) bindtextdomain (PACKAGE, LOCALEDIR);
> ?#endif
> +
> + ? ? ? init_vectorized_lexer ();
> ? ? }
> ?}
>
> diff --git a/libcpp/internal.h b/libcpp/internal.h
> index 9209b55..10ed033 100644
> --- a/libcpp/internal.h
> +++ b/libcpp/internal.h
> @@ -725,6 +725,8 @@ ufputs (const unsigned char *s, FILE *f)
> ? return fputs ((const char *)s, f);
> ?}
>
> +extern void init_vectorized_lexer (void);
> +
> ?#ifdef __cplusplus
> ?}
> ?#endif
> diff --git a/libcpp/lex.c b/libcpp/lex.c
> index f628272..589fa64 100644
> --- a/libcpp/lex.c
> +++ b/libcpp/lex.c
> @@ -96,6 +96,82 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
> ? buffer->notes_used++;
> ?}
>
> +#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
> +
> +#define HAVE_SSE42 1
> +
> +#include <stdint.h>
> +#include "../gcc/config/i386/cpuid.h"
> +
> +bool cpu_has_sse42;
> +
> +/* Check if CPU supports vectorized string instructions. */
> +
> +void
> +init_vectorized_lexer (void)
> +{
> + ?unsigned dummy, ecx;
> +
> + ?if (__get_cpuid (1, &dummy, &dummy, &ecx, &dummy))
> + ? ? ?cpu_has_sse42 = !!(ecx & (1 << 20));
> +}
> +
> +/* Fast path to find line special characters using SSE 4.2 vectorized string
> + ? instructions. Anything complicated falls back to the slow path below.
> + ? Since this loop is very hot it's worth doing these kinds of
> + ? optimizations. Returns true if stopper character found.
> +
> + ? We should be using the _mm intrinsics, but the xxxintr headers do things
> + ? not allowed in gcc. So instead use direct builtins. */
> +
> +static bool __attribute__((__target__("sse4.2")))
> +search_line_sse42 (const uchar *s, const uchar *end, const uchar **out)
> +{
> + ?typedef char m128i __attribute__ ((__vector_size__ (16)));
> + ?int left;
> + ?int index;
> + ?static char searchstr[16] __attribute__ ((aligned(16))) = "\n\r?\\";
> + ?m128i search = *(m128i *)searchstr;
> + ?m128i data;
> +
> + ?for (left = end - (uchar *)s; left > 0; left -= 16)
> + ? ?{
> + ? ? ?if (((uintptr_t)s & 0xfff) > 0xff0)
> + ? ? ? {
> + ? ? ? ? /* Too near page boundary. Use slow path. This could be
> + ? ? ? ? ? ?avoided if we ensure suitable padding or alignment in
> + ? ? ? ? ? ?the input buffer. */
> + ? ? ? ? *out = s;
> + ? ? ? ? return false;
> + ? ? ? }
> +
> + ? ? ?/* Use vectorized string comparison, looking for the 4 stoppers. */
> + ? ? ?data = (m128i) __builtin_ia32_loaddqu((const char *)s);
> + ? ? ?index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0);
> + ? ? ?if (index < 16)
> + ? ? ? {
> + ? ? ? ? *out = s + index;
> + ? ? ? ? return true;
> + ? ? ? }
> + ? ? ?s += 16;
> + ? ?}
> +
> + ?/* Ran out of buffer. Should not happen? */
> + ?*out = end;
> + ?return false;
> +}
> +
> +#else
> +
> +/* Dummy */
> +
> +void
> +init_vectorized_lexer (void)
> +{
> +}
> +
> +#endif
> +
> ?/* Returns with a logical line that contains no escaped newlines or
> ? ?trigraphs. ?This is a time-critical inner loop. ?*/
> ?void
> @@ -109,12 +185,41 @@ _cpp_clean_line (cpp_reader *pfile)
> ? buffer->cur_note = buffer->notes_used = 0;
> ? buffer->cur = buffer->line_base = buffer->next_line;
> ? buffer->need_line = false;
> - ?s = buffer->next_line - 1;
> + ?s = buffer->next_line;
>
> ? if (!buffer->from_stage3)
> ? ? {
> ? ? ? const uchar *pbackslash = NULL;
>
> +#ifdef HAVE_SSE42
> + ? ? ?if (cpu_has_sse42)
> + ? ? ? {
> + ? ? ? ? for (;;)
> + ? ? ? ? ? {
> + ? ? ? ? ? ? /* Drop into slow path if ? or nothing is found. */
> + ? ? ? ? ? ? if (search_line_sse42 (s, buffer->rlimit, &s) == false
> + ? ? ? ? ? ? ? ? || *s == '?')
> + ? ? ? ? ? ? ? break;
> +
> + ? ? ? ? ? ? c = *s;
> +
> + ? ? ? ? ? ? /* Special case for backslash which is reasonably common.
> + ? ? ? ? ? ? ? ?Continue searching using the fast path */
> + ? ? ? ? ? ? if (c == '\\')
> + ? ? ? ? ? ? ? {
> + ? ? ? ? ? ? ? ? pbackslash = s;
> + ? ? ? ? ? ? ? ? s++;
> + ? ? ? ? ? ? ? ? continue;
> + ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? /* \n or \r here. Process it below. */
> + ? ? ? ? ? ? goto found;
> + ? ? ? ? ? }
> + ? ? ? }
> +#endif
> +
> + ? ? ?s--;
> +
> ? ? ? /* Short circuit for the common case of an un-escaped line with
> ? ? ? ? no trigraphs. ?The primary win here is by not writing any
> ? ? ? ? data back to memory until we have to. ?*/
> @@ -124,6 +229,9 @@ _cpp_clean_line (cpp_reader *pfile)
> ? ? ? ? ?if (__builtin_expect (c == '\n', false)
> ? ? ? ? ? ? ?|| __builtin_expect (c == '\r', false))
> ? ? ? ? ? ?{
> +#ifdef HAVE_SSE42
> + ? ? ? ? ? found:
> +#endif
> ? ? ? ? ? ? ?d = (uchar *) s;
>
> ? ? ? ? ? ? ?if (__builtin_expect (s == buffer->rlimit, false))
>
>
>
> --
> ak@linux.intel.com -- Speaking for myself only.
>


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]