More i386 string ops changes
Jan Hubicka
hubicka@atrey.karlin.mff.cuni.cz
Thu Jan 13 05:09:00 GMT 2000
Hi
This patch rewrites the rest of the string operations in the same spirit
as my earlier memcpy rewrite.
It also adds MOVE_RATIO to the processor_costs structure so it can be set
on a per-CPU basis, and it includes my rewrite of the strlen expander.
That rewrite changes the internal loop to emit code similar to glibc's
strlen, which is approximately 60% faster than our original implementation.
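The trick, for the curious: for a 32-bit word v, the expression
(v - 0x01010101) & ~v & 0x80808080 is nonzero exactly when some byte of v
is zero, so the loop needs only one test and branch per word instead of
four byte compares.  A small C sketch (illustrative only; the helper name
is mine, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Nonzero iff at least one byte of V is zero.  The subtraction borrows
   into bit 7 of any zero byte; the ~V term filters out false positives
   from bytes that already had the high bit set.  */
static int
has_zero_byte (uint32_t v)
{
  return ((v - 0x01010101u) & ~v & 0x80808080u) != 0;
}

int
main (void)
{
  assert (has_zero_byte (0x41004242u));  /* zero in the second byte */
  assert (!has_zero_byte (0x41424344u)); /* no zero byte anywhere */
  return 0;
}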
Thu Jan 13 13:51:28 MET 2000 Jan Hubicka <jh@suse.cz>
* i386.md (memstr): Do not use rep movsb for counts divisible by 4
when optimize_size.
(clrstrsi): Rewrite.
(strsethi, strsetqi): New expanders.
(strsethi_1, strsetqi_1, rep_stossi, rep_stosqi): New insn patterns.
(cmpstrsi): Emit compare insn before cmpstrsi_1.
(cmpstrsi_nz): Use flags, set type to str, length_prefix to 1.
(strlensi_1): Likewise.
(cmpstrsi_1): Likewise; do not output compare.
(strlen expander): Do not unroll when optimizing for size.
(*subsi3_carry): Rename to subsi3_carry.
(*ashlqi3_cmpno): Likewise.
* i386.h (processor_costs): Add move_ratio field.
(MOVE_RATIO): Use move_ratio field, set to 3 for optimize_size.
* i386.c (*_cost): Set move_ratio.
(x86_unroll_strlen): Enable for Athlon, PPro and K6 too.
(ix86_expand_strlensi_unroll_1): Rewrite the main loop.
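To clarify the ix86_expand_strlensi_unroll_1 change: once the loop exits
on a word containing a zero byte, the mask computed by the formula above
is used to find that byte's offset almost branch-free (a conditional move
or one short branch for the two-byte step, then a shift/sbb pair).  A
rough C equivalent of that fixup, with names of my own choosing:

#include <stdint.h>

/* MASK is the 0x80808080-style result of the zero-byte test for the
   word that ended the loop.  Return the offset of the first zero byte;
   mask bits above the first zero byte may be garbage, which is harmless
   since we scan from the bottom.  Written with plain ifs for clarity.  */
static int
first_zero_byte_offset (uint32_t mask)
{
  int offset = 0;
  if (!(mask & 0x8080u))   /* zero is not in the low two bytes */
    {
      mask >>= 16;
      offset += 2;
    }
  if (!(mask & 0x80u))     /* not the first byte of the remaining pair */
    offset += 1;
  return offset;
}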
Index: egcs/gcc/config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.c,v
retrieving revision 1.122
diff -c -3 -p -r1.122 i386.c
*** i386.c 2000/01/11 23:52:07 1.122
--- i386.c 2000/01/13 12:50:48
*************** struct processor_costs i386_cost = { /*
*** 64,69 ****
--- 64,70 ----
1, /* cost of multiply per each bit set */
23, /* cost of a divide/mod */
15, /* "large" insn */
+ 3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
*************** struct processor_costs i486_cost = { /*
*** 84,89 ****
--- 85,91 ----
1, /* cost of multiply per each bit set */
40, /* cost of a divide/mod */
15, /* "large" insn */
+ 3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
*************** struct processor_costs pentium_cost = {
*** 104,109 ****
--- 106,112 ----
0, /* cost of multiply per each bit set */
25, /* cost of a divide/mod */
8, /* "large" insn */
+ 6, /* MOVE_RATIO */
6, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
*************** struct processor_costs pentiumpro_cost =
*** 124,129 ****
--- 127,133 ----
0, /* cost of multiply per each bit set */
17, /* cost of a divide/mod */
8, /* "large" insn */
+ 6, /* MOVE_RATIO */
2, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
*************** struct processor_costs k6_cost = {
*** 144,149 ****
--- 148,154 ----
0, /* cost of multiply per each bit set */
18, /* cost of a divide/mod */
8, /* "large" insn */
+ 4, /* MOVE_RATIO */
3, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
*************** struct processor_costs athlon_cost = {
*** 164,169 ****
--- 169,175 ----
0, /* cost of multiply per each bit set */
19, /* cost of a divide/mod */
8, /* "large" insn */
+ 9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
*************** const int x86_zero_extend_with_and = m_4
*** 191,197 ****
const int x86_movx = m_ATHLON /* m_386 | m_PPRO | m_K6 */;
const int x86_double_with_add = ~m_386;
const int x86_use_bit_test = m_386;
! const int x86_unroll_strlen = m_486 | m_PENT;
const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
const int x86_use_any_reg = m_486;
const int x86_cmove = m_PPRO | m_ATHLON;
--- 197,203 ----
const int x86_movx = m_ATHLON /* m_386 | m_PPRO | m_K6 */;
const int x86_double_with_add = ~m_386;
const int x86_use_bit_test = m_386;
! const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6;
const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
const int x86_use_any_reg = m_486;
const int x86_cmove = m_PPRO | m_ATHLON;
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 5149,5158 ****
rtx align_3_label = NULL_RTX;
rtx align_4_label = gen_label_rtx ();
rtx end_0_label = gen_label_rtx ();
- rtx end_2_label = gen_label_rtx ();
- rtx end_3_label = gen_label_rtx ();
rtx mem;
rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
align = 0;
if (GET_CODE (align_rtx) == CONST_INT)
--- 5155,5163 ----
rtx align_3_label = NULL_RTX;
rtx align_4_label = gen_label_rtx ();
rtx end_0_label = gen_label_rtx ();
rtx mem;
rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ rtx tmpreg = gen_reg_rtx (SImode);
align = 0;
if (GET_CODE (align_rtx) == CONST_INT)
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 5269,5316 ****
mem = gen_rtx_MEM (SImode, out);
emit_move_insn (scratch, mem);
! /* Check first byte. */
! emit_insn (gen_cmpqi_0 (gen_lowpart (QImode, scratch), const0_rtx));
! tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
! tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
! gen_rtx_LABEL_REF (VOIDmode, end_0_label),
! pc_rtx);
! emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
!
! /* Check second byte. */
! emit_insn (gen_cmpqi_ext_3 (scratch, const0_rtx));
! tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
! tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
! gen_rtx_LABEL_REF (VOIDmode, end_3_label),
! pc_rtx);
! emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
!
! /* Check third byte. */
! emit_insn (gen_testsi_1 (scratch, GEN_INT (0x00ff0000)));
! tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
! tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
! gen_rtx_LABEL_REF (VOIDmode, end_2_label),
! pc_rtx);
! emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
! /* Check fourth byte and increment address. */
! emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
! emit_insn (gen_testsi_1 (scratch, GEN_INT (0xff000000)));
! tmp = gen_rtx_NE (VOIDmode, flags, const0_rtx);
! tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
! gen_rtx_LABEL_REF (VOIDmode, align_4_label),
! pc_rtx);
! emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
! /* Now generate fixups when the compare stops within a 4-byte word. */
! emit_insn (gen_subsi3 (out, out, GEN_INT (3)));
!
! emit_label (end_2_label);
! emit_insn (gen_addsi3 (out, out, const1_rtx));
! emit_label (end_3_label);
! emit_insn (gen_addsi3 (out, out, const1_rtx));
emit_label (end_0_label);
}
--- 5274,5343 ----
mem = gen_rtx_MEM (SImode, out);
emit_move_insn (scratch, mem);
+ emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
! /* Use formula that gives nonzero result if one of the bytes is zero.
! This saves three branches inside the loop and many cycles. */
! emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
! emit_insn (gen_one_cmplsi2 (scratch, scratch));
! emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
! emit_insn (gen_andsi3 (tmpreg, tmpreg, GEN_INT (0x80808080)));
! emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, 0, align_4_label);
!
! if (TARGET_CMOVE)
! {
! rtx reg = gen_reg_rtx (SImode);
! emit_move_insn (reg, tmpreg);
! emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
!
! /* If zero is not in the first two bytes, move two bytes forward. */
! emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
! tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
! tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
! emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
! gen_rtx_IF_THEN_ELSE (SImode, tmp,
! reg,
! tmpreg)));
! /* Emit lea manually to avoid clobbering of flags. */
! emit_insn (gen_rtx_SET (SImode, reg,
! gen_rtx_PLUS (SImode, out, GEN_INT (2))));
!
! tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
! tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
! emit_insn (gen_rtx_SET (VOIDmode, out,
! gen_rtx_IF_THEN_ELSE (SImode, tmp,
! reg,
! out)));
! }
! else
! {
! rtx end_2_label = gen_label_rtx ();
! /* Is zero in the first two bytes? */
!
! emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
! tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
! tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
! tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
! gen_rtx_LABEL_REF (VOIDmode, end_2_label),
! pc_rtx);
! tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
! JUMP_LABEL (tmp) = end_2_label;
!
! /* Not in the first two. Move two bytes forward. */
! emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
! emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
!
! emit_label (end_2_label);
!
! }
! /* Avoid branch in fixing the byte. */
! emit_insn (gen_ashlqi3_cmpno (gen_rtx_SUBREG (QImode, tmpreg, 0),
! gen_rtx_SUBREG (QImode, tmpreg, 0),
! const1_rtx));
! emit_insn (gen_subsi3_carry (out, out, GEN_INT (3)));
emit_label (end_0_label);
}
Index: egcs/gcc/config/i386/i386.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.h,v
retrieving revision 1.89
diff -c -3 -p -r1.89 i386.h
*** i386.h 2000/01/11 18:01:35 1.89
--- i386.h 2000/01/13 12:50:49
*************** struct processor_costs {
*** 62,67 ****
--- 62,69 ----
int mult_bit; /* cost of multiply per each bit set */
int divide; /* cost of a divide/mod */
int large_insn; /* insns larger than this cost more */
+ int move_ratio; /* The threshold of number of scalar memory-to-memory
+ move insns. */
int movzbl_load; /* cost of loading using movzbl */
int int_load[3]; /* cost of loading integer registers
in QImode, HImode and SImode relative
*************** while (0)
*** 1709,1721 ****
Increasing the value will always make code faster, but eventually
incurs high cost in increased code size.
! If you don't define this, a reasonable default is used.
! Make this large on i386, since the block move is very inefficient with small
! blocks, and the hard register needs of the block move require much reload
! work. */
!
! #define MOVE_RATIO 5
/* Define if shifts truncate the shift count
which implies one can omit a sign-extension or zero-extension
--- 1711,1719 ----
Increasing the value will always make code faster, but eventually
incurs high cost in increased code size.
! If you don't define this, a reasonable default is used. */
! #define MOVE_RATIO (optimize_size ? 3 : ix86_cost->move_ratio)
/* Define if shifts truncate the shift count
which implies one can omit a sign-extension or zero-extension
Index: egcs/gcc/config/i386/i386.md
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.md,v
retrieving revision 1.128
diff -c -3 -p -r1.128 i386.md
*** i386.md 2000/01/11 18:01:35 1.128
--- i386.md 2000/01/13 12:50:51
***************
*** 3736,3742 ****
"sub{l}\\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")])
! (define_insn "*subsi3_carry"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
(minus:SI (match_operand:SI 1 "nonimmediate_operand" "0,0")
(plus:SI (match_operand:SI 2 "general_operand" "ri,rm")
--- 3736,3742 ----
"sub{l}\\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")])
! (define_insn "subsi3_carry"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
(minus:SI (match_operand:SI 1 "nonimmediate_operand" "0,0")
(plus:SI (match_operand:SI 2 "general_operand" "ri,rm")
***************
*** 5861,5867 ****
;; This pattern can't accept a variable shift count, since shifts by
;; zero don't affect the flags. We assume that shifts by constant
;; zero are optimized away.
! (define_insn "*ashlqi3_cmpno"
[(set (reg:CCNO 17)
(compare:CCNO
(ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0")
--- 5861,5867 ----
;; This pattern can't accept a variable shift count, since shifts by
;; zero don't affect the flags. We assume that shifts by constant
;; zero are optimized away.
! (define_insn "ashlqi3_cmpno"
[(set (reg:CCNO 17)
(compare:CCNO
(ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0")
***************
*** 7841,7848 ****
srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
emit_insn (gen_cld());
! /* When optimizing for size emit simple rep ; movsb instruction. */
! if (!optimize || optimize_size)
{
countreg = copy_to_mode_reg (SImode, operands[2]);
emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
--- 7841,7849 ----
srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
emit_insn (gen_cld());
! /* When optimizing for size emit simple rep ; movsb instruction for
! counts not divisible by 4. */
! if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03))
{
countreg = copy_to_mode_reg (SImode, operands[2]);
emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
***************
*** 7983,8066 ****
(set_attr "memory" "both")])
(define_expand "clrstrsi"
! [(set (reg:SI 19) (const_int 0))
! (set (match_dup 3) (const_int 0))
! (parallel [(set (match_operand:BLK 0 "memory_operand" "")
! (const_int 0))
! (use (match_operand:SI 1 "const_int_operand" ""))
! (use (match_operand:SI 2 "const_int_operand" ""))
! (use (match_dup 3))
! (use (reg:SI 19))
! (clobber (match_scratch:SI 4 ""))
! (clobber (match_dup 5))])]
""
"
{
! rtx addr0;
if (GET_CODE (operands[1]) != CONST_INT)
FAIL;
! addr0 = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
! operands[3] = gen_reg_rtx (SImode);
! operands[5] = addr0;
! operands[0] = gen_rtx_MEM (BLKmode, addr0);
}")
;; It might seem that operand 0 could use predicate register_operand.
;; But strength reduction might offset the MEM expression. So we let
;; reload put the address into %edi.
! (define_insn "*clrstrsi_1"
! [(set (mem:BLK (match_operand:SI 0 "address_operand" "D"))
(const_int 0))
! (use (match_operand:SI 1 "const_int_operand" "n"))
! (use (match_operand:SI 2 "immediate_operand" "i"))
! (use (match_operand:SI 3 "register_operand" "a"))
! (use (reg:SI 19))
! (clobber (match_scratch:SI 4 "=&c"))
! (clobber (match_dup 0))]
""
! "*
! {
! rtx xops[2];
!
! if (GET_CODE (operands[1]) == CONST_INT)
! {
! unsigned int count = INTVAL (operands[1]) & 0xffffffff;
! if (count & ~0x03)
! {
! xops[0] = GEN_INT (count / 4);
! xops[1] = operands[4];
! /* K6: stos takes 1 cycle, rep stos takes 8 + %ecx cycles.
! 80386: 4/5+5n (+2 for set of ecx)
! 80486: 5/7+5n (+1 for set of ecx)
! */
! if (count / 4 < ((int) ix86_cpu < (int)PROCESSOR_PENTIUM ? 4 : 6))
! {
! do
! output_asm_insn (\"{stosl|stosd}\", xops);
! while ((count -= 4) > 3);
! }
! else
! {
! output_asm_insn (\"mov{l}\\t{%0, %1|%1, %0}\", xops);
! output_asm_insn (\"{rep\;stosl|rep stosd}\", xops);
! }
! }
! if (INTVAL (operands[1]) & 0x02)
! output_asm_insn (\"stosw\", operands);
! if (INTVAL (operands[1]) & 0x01)
! output_asm_insn (\"stosb\", operands);
! }
! else
! abort ();
! RET;
! }"
! [(set_attr "type" "multi")])
(define_expand "cmpstrsi"
[(set (match_operand:SI 0 "register_operand" "")
--- 7984,8126 ----
(set_attr "memory" "both")])
(define_expand "clrstrsi"
! [(use (match_operand:BLK 0 "memory_operand" ""))
! (use (match_operand:SI 1 "const_int_operand" ""))
! (use (match_operand:SI 2 "const_int_operand" ""))]
""
"
{
! rtx destreg, zeroreg, countreg;
if (GET_CODE (operands[1]) != CONST_INT)
FAIL;
+
+ destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
+
+ emit_insn (gen_cld());
+
+ /* When optimizing for size emit simple rep ; stosb instruction for
+ counts not divisible by 4. */
+ if ((!optimize || optimize_size) && (INTVAL (operands[1]) & 0x03))
+ {
+ countreg = copy_to_mode_reg (SImode, operands[1]);
+ zeroreg = copy_to_mode_reg (QImode, const0_rtx);
+ emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
+ destreg, countreg));
+ }
+ else
+ {
+ zeroreg = copy_to_mode_reg (SImode, const0_rtx);
+ if (INTVAL (operands[1]) & ~0x03)
+ {
+ countreg = copy_to_mode_reg (SImode,
+ GEN_INT ((INTVAL (operands[1]) >> 2)
+ & 0x3fffffff));
+ emit_insn (gen_rep_stossi (destreg, countreg, zeroreg,
+ destreg, countreg));
+ }
+ if (INTVAL (operands[1]) & 0x02)
+ emit_insn (gen_strsethi (destreg,
+ gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ if (INTVAL (operands[1]) & 0x01)
+ emit_insn (gen_strsetqi (destreg,
+ gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ }
+ DONE;
+ }")
! ;; Most CPUs don't like single string operations.
! ;; Handle this case here to simplify the previous expander.
! (define_expand "strsethi"
! [(set (mem:HI (match_operand:SI 0 "register_operand" ""))
! (match_operand:HI 1 "register_operand" ""))
! (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 2)))
! (clobber (reg:CC 17))])]
! ""
! "
! {
! if (TARGET_SINGLE_STRINGOP || optimize_size)
! {
! emit_insn (gen_strsethi_1 (operands[0], operands[0], operands[1]));
! DONE;
! }
! }")
! (define_expand "strsetqi"
! [(set (mem:QI (match_operand:SI 0 "register_operand" ""))
! (match_operand:QI 1 "register_operand" ""))
! (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
! (clobber (reg:CC 17))])]
! ""
! "
! {
! if (TARGET_SINGLE_STRINGOP || optimize_size)
! {
! emit_insn (gen_strsetqi_1 (operands[0], operands[0], operands[1]));
! DONE;
! }
}")
+ (define_insn "strsethi_1"
+ [(set (mem:HI (match_operand:SI 1 "register_operand" "0"))
+ (match_operand:HI 2 "register_operand" "a"))
+ (set (match_operand:SI 0 "register_operand" "=D")
+ (plus:SI (match_dup 0)
+ (const_int 2)))
+ (use (reg:SI 19))]
+ "TARGET_SINGLE_STRINGOP || optimize_size"
+ "stosw"
+ [(set_attr "type" "str")
+ (set_attr "memory" "store")
+ (set_attr "length_prefix" "1")])
+
+ (define_insn "strsetqi_1"
+ [(set (mem:QI (match_operand:SI 1 "register_operand" "0"))
+ (match_operand:QI 2 "register_operand" "a"))
+ (set (match_operand:SI 0 "register_operand" "=D")
+ (plus:SI (match_dup 0)
+ (const_int 1)))
+ (use (reg:SI 19))]
+ "TARGET_SINGLE_STRINGOP || optimize_size"
+ "stosb"
+ [(set_attr "type" "str")
+ (set_attr "memory" "store")])
+
;; It might seem that operand 0 could use predicate register_operand.
;; But strength reduction might offset the MEM expression. So we let
;; reload put the address into %edi.
! (define_insn "rep_stossi"
! [(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
! (use (match_operand:SI 2 "register_operand" "a"))
! (use (match_operand:SI 4 "register_operand" "1"))
! (set (match_operand:SI 0 "register_operand" "=D")
! (plus:SI (match_operand:SI 3 "address_operand" "0")
! (ashift:SI (match_dup 3) (const_int 2))))
! (set (mem:BLK (match_dup 3))
(const_int 0))
! (use (reg:SI 19))]
""
! "rep\;stosl|rep stosd"
! [(set_attr "type" "str")
! (set_attr "length_prefix" "1")
! (set_attr "memory" "store")])
! (define_insn "rep_stosqi"
! [(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
! (use (match_operand:QI 2 "register_operand" "a"))
! (use (match_operand:SI 4 "register_operand" "1"))
! (set (match_operand:SI 0 "register_operand" "=D")
! (plus:SI (match_operand:SI 3 "address_operand" "0") (match_dup 3)))
! (set (mem:BLK (match_dup 3))
! (const_int 0))
! (use (reg:SI 19))]
! ""
! "rep\;stosb|rep stosb"
! [(set_attr "type" "str")
! (set_attr "length_prefix" "1")
! (set_attr "memory" "store")])
(define_expand "cmpstrsi"
[(set (match_operand:SI 0 "register_operand" "")
***************
*** 8099,8105 ****
emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align));
}
else
! emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align));
outlow = gen_lowpart (QImode, out);
emit_insn (gen_cmpintqi (outlow));
--- 8159,8168 ----
emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align));
}
else
! {
! emit_insn (gen_cmpsi_1 (countreg, countreg));
! emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align));
! }
outlow = gen_lowpart (QImode, out);
emit_insn (gen_cmpintqi (outlow));
***************
*** 8145,8152 ****
(clobber (match_dup 2))]
""
"repz{\;| }cmpsb"
! [(set_attr "type" "multi")
! (set_attr "length" "3")])
;; The same, but the count is not known to not be zero.
--- 8208,8215 ----
(clobber (match_dup 2))]
""
"repz{\;| }cmpsb"
! [(set_attr "type" "str")
! (set_attr "length_prefix" "1")])
;; The same, but the count is not known to not be zero.
***************
*** 8158,8172 ****
(mem:BLK (match_operand:SI 1 "address_operand" "D")))
(const_int 0)))
(use (match_operand:SI 3 "immediate_operand" "i"))
(use (reg:SI 19))
(clobber (match_dup 0))
(clobber (match_dup 1))
(clobber (match_dup 2))]
""
! ;; The initial compare sets the zero flag.
! "cmp{l}\\t%2, %2\;repz{\;| }cmpsb"
! [(set_attr "type" "multi")
! (set_attr "length" "5")])
(define_expand "strlensi"
[(set (match_operand:SI 0 "register_operand" "")
--- 8221,8235 ----
(mem:BLK (match_operand:SI 1 "address_operand" "D")))
(const_int 0)))
(use (match_operand:SI 3 "immediate_operand" "i"))
+ (use (reg:CC 17))
(use (reg:SI 19))
(clobber (match_dup 0))
(clobber (match_dup 1))
(clobber (match_dup 2))]
""
! "repz{\;| }cmpsb"
! [(set_attr "type" "str")
! (set_attr "length_prefix" "1")])
(define_expand "strlensi"
[(set (match_operand:SI 0 "register_operand" "")
***************
*** 8184,8190 ****
align = operands[3];
scratch1 = gen_reg_rtx (SImode);
! if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1)
{
/* Well it seems that some optimizer does not combine a call like
foo(strlen(bar), strlen(bar));
--- 8247,8254 ----
align = operands[3];
scratch1 = gen_reg_rtx (SImode);
! if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
! && !optimize_size)
{
/* Well it seems that some optimizer does not combine a call like
foo(strlen(bar), strlen(bar));
***************
*** 8236,8243 ****
(clobber (reg:CC 17))]
""
"repnz{\;| }scasb"
! [(set_attr "type" "multi")
! (set_attr "length" "3")])
;; Conditional move instructions.
--- 8300,8307 ----
(clobber (reg:CC 17))]
""
"repnz{\;| }scasb"
! [(set_attr "type" "str")
! (set_attr "length_prefix" "1")])
;; Conditional move instructions.