expand_strlen_unroll/i386 fix+rewrite

Jan Hubicka hubicka@atrey.karlin.mff.cuni.cz
Mon Oct 4 03:45:00 GMT 1999


Hi
The unroll_strlen code enabled by my previous fix contains a bug near
the end of the code.
After looking at it for a while, I've redesigned the main loop to avoid
3 branches and many cycles. It now takes 4 cycles per 4 bytes on Pentium
and 3 cycles per 4 bytes on PPro.

It could be better on Pentium if gcc were able to split leas in case of an
AGI stall.
It is faster than the previous strlen code for strings longer than 4 bytes
(60% speedup asymptotically), and it is three times as fast as rep cmpsb.

If I am counting right, it ought to be faster on 486, Athlon, K6 and Pentium
Pro too.
If you have any of these CPUs, I would be happy to hear about the results.

Honza
Mon Oct  4 12:42:28 MET DST 1999  Jan Hubicka  <hubicka@freesoft.cz>
	* i386.c (x86_unroll_strlen): Enable for Athlon, PPro and K6 too.
	(x86_expand_strlensi_1): Rewrite the main loop.
	* i386.md (strlen expander): Do not unroll when optimizing for size.
	(anonymous pattern): Rename to ashlcqi3.

*** i386.c.orig	Mon Oct  4 10:10:28 1999
--- i386.c	Mon Oct  4 12:12:59 1999
*************** const int x86_zero_extend_with_and = m_4
*** 137,143 ****
  const int x86_movx = m_ATHLON /* m_386 | m_PPRO | m_K6 */;
  const int x86_double_with_add = ~m_386;
  const int x86_use_bit_test = m_386;
! const int x86_unroll_strlen = m_486 | m_PENT;
  const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
  const int x86_use_any_reg = m_486;
  const int x86_cmove = m_PPRO | m_ATHLON;
--- 137,143 ----
  const int x86_movx = m_ATHLON /* m_386 | m_PPRO | m_K6 */;
  const int x86_double_with_add = ~m_386;
  const int x86_use_bit_test = m_386;
! const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6;
  const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
  const int x86_use_any_reg = m_486;
  const int x86_cmove = m_PPRO | m_ATHLON;
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 4871,4880 ****
    rtx align_3_label = NULL_RTX;
    rtx align_4_label = gen_label_rtx ();
    rtx end_0_label = gen_label_rtx ();
-   rtx end_2_label = gen_label_rtx ();
-   rtx end_3_label = gen_label_rtx ();
    rtx mem;
    rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  
    align = 0;
    if (GET_CODE (align_rtx) == CONST_INT)
--- 4871,4879 ----
    rtx align_3_label = NULL_RTX;
    rtx align_4_label = gen_label_rtx ();
    rtx end_0_label = gen_label_rtx ();
    rtx mem;
    rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+   rtx tmpreg = gen_reg_rtx (SImode);
  
    align = 0;
    if (GET_CODE (align_rtx) == CONST_INT)
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 4991,5038 ****
  
    mem = gen_rtx_MEM (SImode, out);
    emit_move_insn (scratch, mem);
  
!   /* Check first byte. */
!   emit_insn (gen_cmpqi_0 (gen_lowpart (QImode, scratch), const0_rtx));
!   tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
!   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, 
! 			      gen_rtx_LABEL_REF (VOIDmode, end_0_label),
! 			      pc_rtx);
!   emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
! 
!   /* Check second byte. */
!   emit_insn (gen_cmpqi_ext_3 (scratch, const0_rtx));
!   tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
!   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, 
! 			      gen_rtx_LABEL_REF (VOIDmode, end_3_label),
! 			      pc_rtx);
!   emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
! 
!   /* Check third byte. */
!   emit_insn (gen_testsi_1 (scratch, GEN_INT (0x00ff0000)));
!   tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
!   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, 
! 			      gen_rtx_LABEL_REF (VOIDmode, end_2_label),
! 			      pc_rtx);
!   emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
  
!   /* Check fourth byte and increment address. */
!   emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
!   emit_insn (gen_testsi_1 (scratch, GEN_INT (0xff000000)));
!   tmp = gen_rtx_NE (VOIDmode, flags, const0_rtx);
!   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, 
! 			      gen_rtx_LABEL_REF (VOIDmode, align_4_label),
! 			      pc_rtx);
!   emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
  
!   /* Now generate fixups when the compare stops within a 4-byte word. */
!   emit_insn (gen_subsi3 (out, out, GEN_INT (3)));
!   
!   emit_label (end_2_label);
!   emit_insn (gen_addsi3 (out, out, const1_rtx));
  
!   emit_label (end_3_label);
!   emit_insn (gen_addsi3 (out, out, const1_rtx));
  
    emit_label (end_0_label);
  }
--- 4990,5058 ----
  
    mem = gen_rtx_MEM (SImode, out);
    emit_move_insn (scratch, mem);
+   emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
  
!   /* Use formula that gives nonzero result ifif one of the bytes is zero.
!      This saves three branches inside loop and many cycles.  */
  
!   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
!   emit_insn (gen_one_cmplsi2 (scratch, scratch));
!   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
!   emit_insn (gen_andsi3 (tmpreg, tmpreg, GEN_INT (0x80808080)));
!   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, 0, align_4_label);
! 
!   if (TARGET_CMOVE)
!     {
!        rtx reg = gen_reg_rtx (SImode);
!        emit_move_insn (reg, tmpreg);
!        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
! 
!        /* If zero is not in the first two bytes, move two bytes forward. */
!        emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
!        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
!        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
!        emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
! 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
! 				       		     reg, 
! 				       		     tmpreg)));
!        /* Emit lea manually to avoid clobbering of flags.  */
!        emit_insn (gen_rtx_SET (SImode, reg,
! 			       gen_rtx_PLUS (SImode, out, GEN_INT (2))));
! 
!        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
!        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
!        emit_insn (gen_rtx_SET (VOIDmode, out,
! 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
! 				       		     reg,
! 				       		     out)));
  
!     }
!   else
!     {
!        rtx end_2_label = gen_label_rtx ();
!        /* Is zero in the first two bytes? */
! 
!        emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
!        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
!        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
!        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
!                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
!                             pc_rtx);
!        tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
!        JUMP_LABEL (tmp) = end_2_label;
! 
!        /* Not in the first two.  Move two bytes forward. */
!        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
!        emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
! 
!        emit_label (end_2_label);
! 
!     }
  
!   /* Avoid branch in fixing the byte. */
!   emit_insn (gen_ashlcqi3 (gen_rtx_SUBREG (QImode, tmpreg, 0),
!       		           gen_rtx_SUBREG (QImode, tmpreg, 0), const1_rtx));
!   emit_insn (gen_subxsi3 (out, out, GEN_INT (3)));
  
    emit_label (end_0_label);
  }
*** i386.md.orig	Mon Oct  4 10:10:33 1999
--- i386.md	Mon Oct  4 12:13:29 1999
***************
*** 5478,5484 ****
  	   ]
  	   (const_string "ishift")))])
  
! (define_insn ""
    [(set (reg:CCNO 17)
  	(compare:CCNO
  	  (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0")
--- 5478,5484 ----
  	   ]
  	   (const_string "ishift")))])
  
! (define_insn "ashlcqi3"
    [(set (reg:CCNO 17)
  	(compare:CCNO
  	  (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0")
***************
*** 7679,7685 ****
    align = operands[3];
    scratch1 = gen_reg_rtx (SImode);
  
!   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1)
      {
        /* Well it seems that some optimizer does not combine a call like
  	     foo(strlen(bar), strlen(bar));
--- 7679,7686 ----
    align = operands[3];
    scratch1 = gen_reg_rtx (SImode);
  
!   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
!       && !optimize_size)
      {
        /* Well it seems that some optimizer does not combine a call like
  	     foo(strlen(bar), strlen(bar));


More information about the Gcc-patches mailing list