[PATCH] Re: Rewrite i386 string operation expansion

Jan Hubicka jh@suse.cz
Thu Nov 30 14:53:00 GMT 2006


> On 11/30/06, Jan Hubicka <jh@suse.cz> wrote:
> >> Uros,
> >> It would probably be a good idea to fill in the pentium4 code generation
> >> descriptor with correct values, since the code is probably just picking
> >> up the wrong alternatives.  If you really do have a pentium4 core (and not
> >> nocona or prescott), could you please run the attached benchmark with
> >> ./test_stringop 32 640000000 ./xgcc -B ./ -march=pentium4
> >> With a properly filled descriptor, GCC ought to do best without
> >> -minline-all-stringops (or with -minline-stringops-dynamically) on the
> >> benchmark.  Is povray memcpy/memset bound?  Where can I find the sources?
> 
> Jan,
> 
> povray source can be downloaded from http://www.povray.org/download/
> 
> Attached to this message, please find the results of your script for
> "Intel(R) Pentium(R) 4 CPU 3.20GHz" and "Intel(R) Xeon(TM) CPU
> 3.60GHz".
> 
> Uros.

Hi,
thanks for the results!  I've filled in the tables (and noticed that
nocona doesn't have the 32-bit codegen filled in either), so I would be
curious whether you can benchmark any speedups now.

Your memcpy runtime has the amusing property of being slower than a simple
"rep; movsl" for large blocks.  I see this on K8 too.  Hopefully
glibc will be fixed soonish and we can update the descriptors to use the
out-of-line (libcall) version for large blocks.  I am still keeping the
unknown-sized memcpy as a libcall, even though "rep movsl" might be a very
sane strategy given this ugly library implementation, because the inline
code is still noticeably longer (for known sizes at least a large part of
the epilogue goes away).

Concerning the ICE, I've changed the decide_alg loop to simply fall back
to the logic that picks a proper replacement for unknown-sized blocks.
This should pick "rep movsl" on all the current architectures and behave
pseudo-sanely in the future.

I am testing the attached patch.

Honza

2006-11-30  Jan Hubicka  <jh@suse.cz>
	    Uros Bizjak  <uros@kss-loka.si>

	* config/i386/i386.c (pentium4_cost, nocona_cost): Update memcpy/memset descriptors.
	(decide_alg): With -minline-all-stringops and sizes that are best copied via a
	libcall, still work hard enough to pick a non-libcall strategy.

Index: config/i386/i386.c
===================================================================
*** config/i386/i386.c	(revision 119360)
--- config/i386/i386.c	(working copy)
*************** struct processor_costs pentium4_cost = {
*** 655,664 ****
    COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
    COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
    COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
!   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
!    {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}},
!   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
!    {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}}
  };
  
  static const
--- 655,665 ----
    COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
    COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
    COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
!   {{libcall, {{12, loop_1_byte}, {64, loop}, {-1, rep_prefix_4_byte}}},
!    DUMMY_STRINGOP_ALGS},
!   {{libcall, {{6, loop_1_byte}, {64, loop}, {20480, rep_prefix_4_byte},
!    {-1, libcall}}},
!    DUMMY_STRINGOP_ALGS},
  };
  
  static const
*************** struct processor_costs nocona_cost = {
*** 712,721 ****
    COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
    COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
    COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
!   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
     {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
  	      {100000, unrolled_loop}, {-1, libcall}}}},
!   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
     {libcall, {{24, loop}, {64, unrolled_loop},
  	      {8192, rep_prefix_8_byte}, {-1, libcall}}}}
  };
--- 713,723 ----
    COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
    COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
    COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
!   {{libcall, {{12, loop_1_byte}, {64, loop}, {-1, rep_prefix_4_byte}}},
     {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
  	      {100000, unrolled_loop}, {-1, libcall}}}},
!   {{libcall, {{6, loop_1_byte}, {64, loop}, {20480, rep_prefix_4_byte},
!    {-1, libcall}}},
     {libcall, {{24, loop}, {64, unrolled_loop},
  	      {8192, rep_prefix_8_byte}, {-1, libcall}}}}
  };
*************** decide_alg (HOST_WIDE_INT count, HOST_WI
*** 13507,13520 ****
  	         last non-libcall inline algorithm.  */
  	      if (TARGET_INLINE_ALL_STRINGOPS)
  		{
! 		  gcc_assert (alg != libcall);
! 		  return alg;
  		}
  	      else
  		return algs->size[i].alg;
  	    }
  	}
!       gcc_unreachable ();
      }
    /* When asked to inline the call anyway, try to pick meaningful choice.
       We look for maximal size of block that is faster to copy by hand and
--- 13509,13526 ----
  	         last non-libcall inline algorithm.  */
  	      if (TARGET_INLINE_ALL_STRINGOPS)
  		{
! 		  /* When the current size is best to be copied by a libcall,
! 		     but we are still forced to inline, run the heuristic below
! 		     that will pick code for medium sized blocks.  */
! 		  if (alg != libcall)
! 		    return alg;
! 		  break;
  		}
  	      else
  		return algs->size[i].alg;
  	    }
  	}
!       gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
      }
    /* When asked to inline the call anyway, try to pick meaningful choice.
       We look for maximal size of block that is faster to copy by hand and



More information about the Gcc-patches mailing list