This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [PATCH] Re: Rewrite i386 string operation expansion

From: Jan Hubicka <jh at suse dot cz>
To: Uros Bizjak <ubizjak at gmail dot com>
Cc: Jan Hubicka <jh at suse dot cz>, GCC Patches <gcc-patches at gcc dot gnu dot org>
Date: Thu, 30 Nov 2006 15:36:30 +0100
Subject: Re: [PATCH] Re: Rewrite i386 string operation expansion
References: <5787cf470611280249i2bc38423gb26d5d50b9ed3c3e@mail.gmail.com> <5787cf470611290604k51f403ebs797add6a5b3f7825@mail.gmail.com> <84fc9c000611290612p28816a2fi94523dea73a0befd@mail.gmail.com> <5787cf470611290625o41cc11c9v5ec7459731533c4d@mail.gmail.com> <20061130011604.GC536@kam.mff.cuni.cz> <5787cf470611292312l3ca7caa8n999179388d6a6953@mail.gmail.com> <20061130091504.GF26260@kam.mff.cuni.cz> <20061130091855.GG26260@kam.mff.cuni.cz> <5787cf470611300610j6fc8d7efh6fddf0397b9da5e7@mail.gmail.com>

> On 11/30/06, Jan Hubicka <jh@suse.cz> wrote:
> >> Uros,
> >> It would be probably good idea to fill in the pentium4 code generation
> >> descriptor with correct values, since the code is probably just picking
> >> up the wrong alternatives.  If you really do have pentium4 core (and nod
> >> nocona or prescott), could you please run the attached benchmark with
> >> ./test_stringop 32 640000000 ./xgcc -B ./ -march=pentium4
> >> With properlly filed descriptor, GCC ought to do best withtout
> >> -minline-all-stringops (or with -minline-stringops-dynamically) on the
> >> benchmark.  Is povray memcpy/memset bound? Where I find the sources?
> 
> Jan,
> 
> povray source can be downloaded from http://www.povray.org/download/
> 
> Attached to this message, please find the results of your script for
> "Intel(R) Pentium(R) 4 CPU 3.20GHz" and "Intel(R) Xeon(TM) CPU
> 3.60GHz".
> 
> Uros.

Hi,
thanks for the results!  I've filled in the tables (and noticed that
nocona doesn't have its 32bit codegen filled in either), so I would be
curious whether you can measure any speedups now.

Your memcpy runtime has the amusing property of being slower than a simple
"rep; movsl" for large blocks.  I see this on K8 too.  Hopefully
glibc will be fixed soonish and we can update the descriptors to use the
out-of-line libcall for large blocks.  I am still keeping unknown-sized
memcpy as a libcall, even though "rep movsl" might be a very sane strategy
given this ugly library implementation, because the inline expansion is
still noticeably longer (for known sizes, at least a large part of the
epilogue goes away).

Concerning the ICE, I've changed the decide_alg loop to simply fall back
to the logic that picks a proper replacement for unknown-sized blocks.
This should pick "rep movsl" on all the current architectures and behave
pseudo-sanely in the future.

I am testing the attached patch.

Honza

2006-11-30  Jan Hubicka  <jh@suse.cz>
	    Uros Bizjak  <uros@kss-loka.si>

	* config/i386/i386.c (pentium4_cost, nocona_cost): Update memcpy/memset descriptors.
	(decide_alg): With -minline-all-stringops and sizes that are best copied via
	libcall, still work hard enough to pick a non-libcall strategy.

Index: config/i386/i386.c
===================================================================
*** config/i386/i386.c	(revision 119360)
--- config/i386/i386.c	(working copy)
*************** struct processor_costs pentium4_cost = {
*** 655,664 ****
    COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
    COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
    COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
!   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
!    {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}},
!   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
!    {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}}
  };
  
  static const
--- 655,665 ----
    COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
    COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
    COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
!   {{libcall, {{12, loop_1_byte}, {64, loop}, {-1, rep_prefix_4_byte}}},
!    DUMMY_STRINGOP_ALGS},
!   {{libcall, {{6, loop_1_byte}, {64, loop}, {20480, rep_prefix_4_byte},
!    {-1, libcall}}},
!    DUMMY_STRINGOP_ALGS},
  };
  
  static const
*************** struct processor_costs nocona_cost = {
*** 712,721 ****
    COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
    COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
    COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
!   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
     {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
  	      {100000, unrolled_loop}, {-1, libcall}}}},
!   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
     {libcall, {{24, loop}, {64, unrolled_loop},
  	      {8192, rep_prefix_8_byte}, {-1, libcall}}}}
  };
--- 713,723 ----
    COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
    COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
    COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
!   {{libcall, {{12, loop_1_byte}, {64, loop}, {-1, rep_prefix_4_byte}}},
     {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
  	      {100000, unrolled_loop}, {-1, libcall}}}},
!   {{libcall, {{6, loop_1_byte}, {64, loop}, {20480, rep_prefix_4_byte},
!    {-1, libcall}}},
     {libcall, {{24, loop}, {64, unrolled_loop},
  	      {8192, rep_prefix_8_byte}, {-1, libcall}}}}
  };
*************** decide_alg (HOST_WIDE_INT count, HOST_WI
*** 13507,13520 ****
  	         last non-libcall inline algorithm.  */
  	      if (TARGET_INLINE_ALL_STRINGOPS)
  		{
! 		  gcc_assert (alg != libcall);
! 		  return alg;
  		}
  	      else
  		return algs->size[i].alg;
  	    }
  	}
!       gcc_unreachable ();
      }
    /* When asked to inline the call anyway, try to pick meaningful choice.
       We look for maximal size of block that is faster to copy by hand and
--- 13509,13526 ----
  	         last non-libcall inline algorithm.  */
  	      if (TARGET_INLINE_ALL_STRINGOPS)
  		{
! 		  /* When the current size is best to be copied by a libcall,
! 		     but we are still forced to inline, run the heuristic bellow
! 		     that will pick code for medium sized blocks.  */
! 		  if (alg != libcall)
! 		    return alg;
! 		  break;
  		}
  	      else
  		return algs->size[i].alg;
  	    }
  	}
!       gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
      }
    /* When asked to inline the call anyway, try to pick meaningful choice.
       We look for maximal size of block that is faster to copy by hand and

References:
- Re: Rewrite i386 string operation expansion
  - From: Uros Bizjak
- [PATCH] Re: Rewrite i386 string operation expansion
  - From: Uros Bizjak
- Re: [PATCH] Re: Rewrite i386 string operation expansion
  - From: Richard Guenther
- Re: [PATCH] Re: Rewrite i386 string operation expansion
  - From: Uros Bizjak
- Re: [PATCH] Re: Rewrite i386 string operation expansion
  - From: Jan Hubicka
- Re: [PATCH] Re: Rewrite i386 string operation expansion
  - From: Uros Bizjak
- Re: [PATCH] Re: Rewrite i386 string operation expansion
  - From: Jan Hubicka
- Re: [PATCH] Re: Rewrite i386 string operation expansion
  - From: Jan Hubicka
- Re: [PATCH] Re: Rewrite i386 string operation expansion
  - From: Uros Bizjak

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]