This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
Hi, As I wrote at [PATCH, libcpp]: Use asm flag outputs in search_line_sse42 main loop https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg113610.html I won't repeat the reasons; the summary is that the current sse4.2 code is redundant, as it has the same performance as the sse2 one. This improves sse2 performance by around 10% versus the sse4.2 code by using a better header. An updated benchmark is attached. It counts the number of lines of a given C source; I selected the benchmark source itself for reproducible results. On Sandy Bridge the runtimes are the following; fx10 and Nehalem are similar. time ./a.out line.c 1 100000; time ./a.out line.c 2 100000; time ./a.out line.c 3 100000; time ./a.out line.c 4 100000; time ./a.out line.c 5 100000 # strpbrk real 0m0.507s user 0m0.505s sys 0m0.000s # current sse2 real 0m0.490s user 0m0.490s sys 0m0.000s # current sse4.2 real 0m0.423s user 0m0.420s sys 0m0.003s # improved header real 0m0.450s user 0m0.451s sys 0m0.000s # proposed version real 0m0.426s user 0m0.426s sys 0m0.000s * lex.c (search_line_sse2): Improve performance by using proper header. (search_line_sse42): Delete. 
diff --git a/libcpp/lex.c b/libcpp/lex.c index 0ad9660..8032e6e 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -373,36 +373,110 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) const v16qi repl_qm = *(const v16qi *)repl_chars[3]; unsigned int misalign, found, mask; + const v16qi *p; - v16qi data, t; + v16qi data, t, tx; + + if (s + 80 < end) + { + v16qi x0 = __builtin_ia32_loaddqu ((char const *) s); + tx = __builtin_ia32_pcmpeqb128 (x0, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x0, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x0, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x0, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + if (found) + { + found = __builtin_ctz (found); + return (const uchar *) s + found; + } + v16qi x1 = __builtin_ia32_loaddqu ((char const *) (s + 16)); + v16qi x2 = __builtin_ia32_loaddqu ((char const *) (s + 32)); + v16qi x3 = __builtin_ia32_loaddqu ((char const *) (s + 48)); + v16qi x4 = __builtin_ia32_loaddqu ((char const *) (s + 64)); + + tx = __builtin_ia32_pcmpeqb128 (x1, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x1, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x1, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x1, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + + if (found) + { + found = __builtin_ctz (found); + return (const uchar *) s + 16 + found; + } + + tx = __builtin_ia32_pcmpeqb128 (x2, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x2, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x2, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x2, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + + if (found) + { + found = __builtin_ctz (found); + return (const uchar *) s + 32 + found; + } + + + tx = __builtin_ia32_pcmpeqb128 (x3, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x3, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x3, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x3, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + + if (found) + { + found = __builtin_ctz (found); + return (const 
uchar *) s + 48 + found; + } + + tx = __builtin_ia32_pcmpeqb128 (x4, repl_nl); + tx |= __builtin_ia32_pcmpeqb128 (x4, repl_cr); + tx |= __builtin_ia32_pcmpeqb128 (x4, repl_bs); + tx |= __builtin_ia32_pcmpeqb128 (x4, repl_qm); + + found = __builtin_ia32_pmovmskb128 (tx); + + if (found) + { + found = __builtin_ctz (found); + return (const uchar *) s + 64 + found; + } + + s += 80; + } /* Align the source pointer. */ misalign = (uintptr_t)s & 15; p = (const v16qi *)((uintptr_t)s & -16); data = *p; - /* Create a mask for the bytes that are valid within the first - 16-byte block. The Idea here is that the AND with the mask - within the loop is "free", since we need some AND or TEST - insn in order to set the flags for the branch anyway. */ mask = -1u << misalign; - /* Main loop processing 16 bytes at a time. */ - goto start; - do + t = __builtin_ia32_pcmpeqb128(data, repl_nl); + t |= __builtin_ia32_pcmpeqb128(data, repl_cr); + t |= __builtin_ia32_pcmpeqb128(data, repl_bs); + t |= __builtin_ia32_pcmpeqb128(data, repl_qm); + found = __builtin_ia32_pmovmskb128 (t); + found &= mask; + + while (!found) { data = *++p; - mask = -1; - start: t = __builtin_ia32_pcmpeqb128(data, repl_nl); t |= __builtin_ia32_pcmpeqb128(data, repl_cr); t |= __builtin_ia32_pcmpeqb128(data, repl_bs); t |= __builtin_ia32_pcmpeqb128(data, repl_qm); found = __builtin_ia32_pmovmskb128 (t); - found &= mask; } - while (!found); + /* FOUND contains 1 in bits for which we matched a relevant character. Conversion to the byte index is trivial. */ @@ -410,65 +484,7 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) return (const uchar *)p + found; } -#ifdef HAVE_SSE4 -/* A version of the fast scanner using SSE 4.2 vectorized string insns. 
*/ - -static const uchar * -#ifndef __SSE4_2__ -__attribute__((__target__("sse4.2"))) -#endif -search_line_sse42 (const uchar *s, const uchar *end) -{ - typedef char v16qi __attribute__ ((__vector_size__ (16))); - static const v16qi search = { '\n', '\r', '?', '\\' }; - - uintptr_t si = (uintptr_t)s; - uintptr_t index; - - /* Check for unaligned input. */ - if (si & 15) - { - if (__builtin_expect (end - s < 16, 0) - && __builtin_expect ((si & 0xfff) > 0xff0, 0)) - { - /* There are less than 16 bytes left in the buffer, and less - than 16 bytes left on the page. Reading 16 bytes at this - point might generate a spurious page fault. Defer to the - SSE2 implementation, which already handles alignment. */ - return search_line_sse2 (s, end); - } - - /* ??? The builtin doesn't understand that the PCMPESTRI read from - memory need not be aligned. */ - __asm ("%vpcmpestri $0, (%1), %2" - : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16)); - if (__builtin_expect (index < 16, 0)) - goto found; - - /* Advance the pointer to an aligned address. We will re-scan a - few bytes, but we no longer need care for reading past the - end of a page, since we're guaranteed a match. */ - s = (const uchar *)((si + 16) & -16); - } - /* Main loop, processing 16 bytes at a time. By doing the whole loop - in inline assembly, we can make proper use of the flags set. */ - __asm ( "sub $16, %1\n" - " .balign 16\n" - "0: add $16, %1\n" - " %vpcmpestri $0, (%1), %2\n" - " jnc 0b" - : "=&c"(index), "+r"(s) - : "x"(search), "a"(4), "d"(16)); - - found: - return s + index; -} - -#else -/* Work around out-dated assemblers without sse4 support. */ -#define search_line_sse42 search_line_sse2 -#endif /* Check the CPU capabilities. 
*/ @@ -485,21 +501,15 @@ init_vectorized_lexer (void) search_line_fast_type impl = search_line_acc_char; int minimum = 0; -#if defined(__SSE4_2__) - minimum = 3; -#elif defined(__SSE2__) +#if defined(__SSE2__) minimum = 2; #elif defined(__SSE__) minimum = 1; #endif - if (minimum == 3) - impl = search_line_sse42; - else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) + if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) { - if (minimum == 3 || (ecx & bit_SSE4_2)) - impl = search_line_sse42; - else if (minimum == 2 || (edx & bit_SSE2)) + if (minimum == 2 || (edx & bit_SSE2)) impl = search_line_sse2; else if (minimum == 1 || (edx & bit_SSE)) impl = search_line_mmx;
Attachment:
line.c
Description: Text document
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |