GCC 4.1 selects a strange instruction to put in the delay slot of a bl,a instruction: in the non-taken case the same instruction will be executed anyway.

-O2 code for 4.1:

PointToRowCol:
        save    %sp, -112, %sp
        sethi   %hi(term), %g1
        ld      [%g1+%lo(term)], %l2
        add     %l2, 136, %l1
        ld      [%l1+572], %l3
        ld      [%l1+772], %l0
        sub     %i0, %l3, %o0
        call    .div, 0
        ld      [%l0+20], %o1
        sethi   %hi(firstValidRow), %g1
        ld      [%g1+%lo(firstValidRow)], %i0
        cmp     %o0, %i0
        bl,a    .LL118
        ldsb    [%l2+1823], %g1         ;; this instruction
        sethi   %hi(lastValidRow), %g1
        ld      [%g1+%lo(lastValidRow)], %g1
        cmp     %o0, %g1
        bg      .LL116
        mov     %o0, %i0
.LL105:
        ldsb    [%l2+1823], %g1         ;; this will be executed on the
                                        ;; non-taken path
.LL118:
        cmp     %g1, 0
        bne     .LL110
        mov     0, %o0
        ld      [%l0+32], %o0
.LL110:
        add     %o0, %l3, %o0
        ld      [%l0+16], %o1
        call    .div, 0
        sub     %i1, %o0, %o0
        cmp     %o0, 0
        bl      .LL113
        mov     0, %g2
        ld      [%l1+888], %g1
        add     %g1, 1, %g1
        cmp     %o0, %g1
        bg      .LL117
        mov     %o0, %g2
.LL113:
        st      %i0, [%i2]
        st      %g2, [%i3]
        jmp     %i7+8
        restore
.LL117:
        st      %i0, [%i2]
        mov     %g1, %g2
        st      %g2, [%i3]
        jmp     %i7+8
        restore
.LL116:
        b       .LL105
        mov     %g1, %i0

The 4.0 code is:

PointToRowCol:
        save    %sp, -112, %sp
        sethi   %hi(term), %g1
        ld      [%g1+%lo(term)], %l2
        add     %l2, 136, %l1
        ld      [%l1+572], %l3
        sub     %i0, %l3, %o0
        ld      [%l1+772], %i0
        call    .div, 0
        ld      [%i0+20], %o1
        sethi   %hi(firstValidRow), %g1
        ld      [%g1+%lo(firstValidRow)], %g1
        cmp     %o0, %g1
        bl      .LL42
        mov     %o0, %l0
        sethi   %hi(lastValidRow), %g1
        ld      [%g1+%lo(lastValidRow)], %g1
        cmp     %o0, %g1
        bg,a    .LL32
        mov     %g1, %l0
.LL32:
        ldsb    [%l2+1823], %g1
        cmp     %g1, 0
        bne     .LL36
        mov     0, %o0
        ld      [%i0+32], %o0
.LL36:
        add     %o0, %l3, %o0
        ld      [%i0+16], %o1
        call    .div, 0
        sub     %i1, %o0, %o0
        cmp     %o0, 0
        bl,a    .LL43
        st      %l0, [%i2]
        ld      [%l1+888], %g1
        add     %g1, 1, %g1
        cmp     %o0, %g1
        bg,a    .LL39
        mov     %g1, %o0
.LL39:
        st      %l0, [%i2]
        st      %o0, [%i3]
        jmp     %i7+8
        restore
.LL42:
        b       .LL32
        mov     %g1, %l0
.LL43:
        mov     0, %o0
        st      %o0, [%i3]
        jmp     %i7+8

(The 4.0 code is a few bytes smaller.)

I'll attach the preprocessed code.
Created attachment 9889 [details] preprocessed code for this bug
This does indeed look weird.
Current trunk still picks that ldsb insn for the delay slot. Here's what it produces:

        .file   "t.c"
        .section        ".text"
        .align 4
        .global PointToRowCol
        .type   PointToRowCol, #function
        .proc   020
PointToRowCol:
        sethi   %hi(term+4), %g1
        sethi   %hi(firstValidRow), %g2
        ld      [%g1+%lo(term+4)], %g1
        ld      [%g2+%lo(firstValidRow)], %g2
        ld      [%g1], %g4
        ld      [%g1+12], %g3
        sub     %o0, %g4, %o0
        sra     %o0, 31, %o5
        wr      %o5, 0, %y
        ld      [%g3+4], %o5
        nop
        nop
        sdiv    %o0, %o5, %o0
        cmp     %o0, %g2
        bl,a    .L12
        ldsb    [%g1+16], %o4
        sethi   %hi(lastValidRow), %o5
        ld      [%o5+%lo(lastValidRow)], %o5
        cmp     %o0, %o5
        bg      .L10
        mov     %o0, %g2
.L2:
        ldsb    [%g1+16], %o4
.L12:
        cmp     %o4, 0
        bne     .L4
        mov     0, %o5
        ld      [%g3+8], %o5
.L4:
        add     %o5, %g4, %g4
        sub     %o1, %g4, %o1
        sra     %o1, 31, %g4
        wr      %g4, 0, %y
        ld      [%g3], %g4
        nop
        nop
        sdiv    %o1, %g4, %o1
        cmp     %o1, 0
        bl,a    .L8
        st      %g2, [%o2]
        ld      [%g1+4], %g1
        add     %g1, 1, %g1
        cmp     %o1, %g1
        bg      .L11
        st      %g2, [%o2]
        jmp     %o7+8
        st      %o1, [%o3]
.L11:
        mov     %g1, %o1
        jmp     %o7+8
        st      %o1, [%o3]
.L10:
        b       .L2
        mov     %o5, %g2
.L8:
        mov     0, %o1
        jmp     %o7+8
        st      %o1, [%o3]
        .size   PointToRowCol, .-PointToRowCol
        .ident  "GCC: (GNU) 4.9.0 20130418 (experimental) [trunk revision 198052]"
        .section        .note.GNU-stack,"",@progbits

for this test case:

typedef char Boolean;
typedef struct { int width; } SbInfo;
struct _vtwin { int f_width; int f_height; SbInfo sb_info; };
typedef struct { int border; int max_col; struct _vtwin *whichVwin; } TScreen;
typedef struct _Misc { Boolean useRight; } Misc;
typedef struct _XtermWidgetRec { TScreen screen; Misc misc; } XtermWidgetRec, *XtermWidget;
extern int firstValidRow, lastValidRow;
extern XtermWidget term;

void PointToRowCol(int y, int x, int *r, int *c)
{
  TScreen *screen = &term->screen;
  int row, col;

  row = (y - screen->border) / screen->whichVwin->f_height;
  if (row < firstValidRow)
    row = firstValidRow;
  else if (row > lastValidRow)
    row = lastValidRow;

  col = (x - (((term->misc.useRight) ? 0 : screen->whichVwin->sb_info.width) + screen->border) ) / screen->whichVwin->f_width;
  if (col < 0)
    col = 0;
  else if (col > screen->max_col + 1)
    col = screen->max_col + 1;

  *r = row;
  *c = col;
}

with options: "-mcpu=v8 -m32 -O2".