This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[SH] PR 6526
- From: Oleg Endo <oleg dot endo at t-online dot de>
- To: gcc-patches <gcc-patches at gcc dot gnu dot org>
- Date: Fri, 24 May 2013 12:34:23 +0200
- Subject: [SH] PR 6526
Hello,
I'd like to fix this ancient PR.
The attached patch picks up the suggested changes mentioned in comment
#3 to avoid changing the FPSCR.FR bit in the sdivsi3_i4 and udivsi3_i4
library functions. As mentioned in the PR, this makes integer division
a bit slower when using -mdiv=call-fp, but it allows better utilization
of the SH4 matrix multiplication feature.
I've also added SH4A versions of the library functions which utilize the
fpchg instruction.
Tested on rev 199102 with
make -k check RUNTESTFLAGS="--target_board=sh-sim
\{-m4/-mb/-mdiv=call-fp,-m4-single/-mb/-mdiv=call-fp,-m4a/-mb/-mdiv=call-fp,
-m4a-single/-mb/-mdiv=call-fp,-m4/-ml/-mdiv=call-fp,
-m4-single/-ml/-mdiv=call-fp,-m4a/-ml/-mdiv=call-fp,
-m4a-single/-ml/-mdiv=call-fp}"
and no new failures.
OK for trunk?
Cheers,
Oleg
libgcc/ChangeLog:
PR target/6526
* config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change
bits other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation.
testsuite/ChangeLog:
PR target/6526
* gcc.target/sh/pr6526.c: New.
Index: libgcc/config/sh/lib1funcs.S
===================================================================
--- libgcc/config/sh/lib1funcs.S (revision 199109)
+++ libgcc/config/sh/lib1funcs.S (working copy)
@@ -1003,11 +1003,17 @@
ENDFUNC(GLOBAL(mulsi3))
#endif
#endif /* ! __SH5__ */
+
+/*------------------------------------------------------------------------------
+ 32 bit signed integer division that uses FPU double precision division. */
+
#ifdef L_sdivsi3_i4
.title "SH DIVIDE"
-!! 4 byte integer Divide code for the Renesas SH
+
#if defined (__SH4__) || defined (__SH2A__)
-!! args in r4 and r5, result in fpul, clobber dr0, dr2
+/* This variant is used when FPSCR.PR = 1 (double precision) is the default
+ setting.
+ Args in r4 and r5, result in fpul, clobber dr0, dr2. */
.global GLOBAL(sdivsi3_i4)
HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
@@ -1021,8 +1027,13 @@
ftrc dr0,fpul
ENDFUNC(GLOBAL(sdivsi3_i4))
+
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
-!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
+/* This variant is used when FPSCR.PR = 0 (single precision) is the default
+ setting.
+ Args in r4 and r5, result in fpul, clobber r2, dr0, dr2.
+ For this to work, we must temporarily switch the FPU to double precision,
+ but we'd better not touch FPSCR.FR. See PR 6526. */
#if ! __SH5__ || __SH5__ == 32
#if __SH5__
@@ -1031,24 +1042,43 @@
.global GLOBAL(sdivsi3_i4)
HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4):
- sts.l fpscr,@-r15
- mov #8,r2
- swap.w r2,r2
- lds r2,fpscr
- lds r4,fpul
- float fpul,dr0
- lds r5,fpul
- float fpul,dr2
- fdiv dr2,dr0
- ftrc dr0,fpul
+
+#ifndef __SH4A__
+ mov.l r3,@-r15
+ sts fpscr,r2
+ mov #8,r3
+ swap.w r3,r3 // r3 = 1 << 19 (FPSCR.PR bit)
+ or r2,r3
+ lds r3,fpscr // Set FPSCR.PR = 1.
+ lds r4,fpul
+ float fpul,dr0
+ lds r5,fpul
+ float fpul,dr2
+ fdiv dr2,dr0
+ ftrc dr0,fpul
+ lds r2,fpscr
rts
- lds.l @r15+,fpscr
+ mov.l @r15+,r3
+#else
+/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. */
+ fpchg
+ lds r4,fpul
+ float fpul,dr0
+ lds r5,fpul
+ float fpul,dr2
+ fdiv dr2,dr0
+ ftrc dr0,fpul
+ rts
+ fpchg
+#endif /* __SH4A__ */
+
ENDFUNC(GLOBAL(sdivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
#endif /* ! __SH4__ || __SH2A__ */
-#endif
+#endif /* L_sdivsi3_i4 */
+//------------------------------------------------------------------------------
#ifdef L_sdivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
sh2e/sh3e code. */
@@ -1367,54 +1397,60 @@
mov #0,r0
ENDFUNC(GLOBAL(sdivsi3))
-#endif /* ! __SHMEDIA__ */
-#endif
+#endif /* ! __SHMEDIA__ */
+#endif /* L_sdivsi3 */
+
+/*------------------------------------------------------------------------------
+ 32 bit unsigned integer division that uses FPU double precision division. */
+
#ifdef L_udivsi3_i4
+ .title "SH DIVIDE"
- .title "SH DIVIDE"
-!! 4 byte integer Divide code for the Renesas SH
#if defined (__SH4__) || defined (__SH2A__)
-!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
-!! and t bit
+/* This variant is used when FPSCR.PR = 1 (double precision) is the default
+ setting.
+ Args in r4 and r5, result in fpul,
+ clobber r0, r1, r4, r5, dr0, dr2, dr4, and t bit */
.global GLOBAL(udivsi3_i4)
HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
- mov #1,r1
- cmp/hi r1,r5
- bf trivial
- rotr r1
- xor r1,r4
- lds r4,fpul
- mova L1,r0
+ mov #1,r1
+ cmp/hi r1,r5
+ bf/s trivial
+ rotr r1
+ xor r1,r4
+ lds r4,fpul
+ mova L1,r0
#ifdef FMOVD_WORKS
- fmov.d @r0+,dr4
+ fmov.d @r0+,dr4
#else
- fmov.s @r0+,DR40
- fmov.s @r0,DR41
+ fmov.s @r0+,DR40
+ fmov.s @r0,DR41
#endif
- float fpul,dr0
- xor r1,r5
- lds r5,fpul
- float fpul,dr2
- fadd dr4,dr0
- fadd dr4,dr2
- fdiv dr2,dr0
+ float fpul,dr0
+ xor r1,r5
+ lds r5,fpul
+ float fpul,dr2
+ fadd dr4,dr0
+ fadd dr4,dr2
+ fdiv dr2,dr0
rts
- ftrc dr0,fpul
+ ftrc dr0,fpul
trivial:
rts
- lds r4,fpul
+ lds r4,fpul
.align 2
#ifdef FMOVD_WORKS
- .align 3 ! make double below 8 byte aligned.
+ .align 3 // Make the double below 8 byte aligned.
#endif
L1:
.double 2147483648
ENDFUNC(GLOBAL(udivsi3_i4))
+
#elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__)
#if ! __SH5__ || __SH5__ == 32
!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
@@ -1436,57 +1472,106 @@
ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
+
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
-!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
+/* This variant is used when FPSCR.PR = 0 (single precision) is the default
+ setting.
+ Args in r4 and r5, result in fpul,
+ clobber r0, r1, r4, r5, dr0, dr2, dr4.
+ For this to work, we must temporarily switch the FPU to double precision,
+ but we'd better not touch FPSCR.FR. See PR 6526. */
.global GLOBAL(udivsi3_i4)
HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
- mov #1,r1
- cmp/hi r1,r5
- bf trivial
- sts.l fpscr,@-r15
- mova L1,r0
- lds.l @r0+,fpscr
- rotr r1
- xor r1,r4
- lds r4,fpul
+
+#ifndef __SH4A__
+ mov #1,r1
+ cmp/hi r1,r5
+ bf/s trivial
+ rotr r1 // r1 = 1 << 31
+ sts.l fpscr,@-r15
+ xor r1,r4
+ mov.l @(0,r15),r0
+ xor r1,r5
+ mov.l L2,r1
+ lds r4,fpul
+ or r0,r1
+ mova L1,r0
+ lds r1,fpscr
#ifdef FMOVD_WORKS
- fmov.d @r0+,dr4
+ fmov.d @r0+,dr4
#else
- fmov.s @r0+,DR40
- fmov.s @r0,DR41
+ fmov.s @r0+,DR40
+ fmov.s @r0,DR41
#endif
- float fpul,dr0
- xor r1,r5
- lds r5,fpul
- float fpul,dr2
- fadd dr4,dr0
- fadd dr4,dr2
- fdiv dr2,dr0
- ftrc dr0,fpul
+ float fpul,dr0
+ lds r5,fpul
+ float fpul,dr2
+ fadd dr4,dr0
+ fadd dr4,dr2
+ fdiv dr2,dr0
+ ftrc dr0,fpul
rts
- lds.l @r15+,fpscr
+ lds.l @r15+,fpscr
#ifdef FMOVD_WORKS
- .align 3 ! make double below 8 byte aligned.
+ .align 3 // Make the double below 8 byte aligned.
#endif
trivial:
rts
- lds r4,fpul
+ lds r4,fpul
.align 2
-L1:
-#ifndef FMOVD_WORKS
- .long 0x80000
+L2:
+#ifdef FMOVD_WORKS
+ .long 0x180000 // FPSCR.PR = 1, FPSCR.SZ = 1
#else
- .long 0x180000
+ .long 0x80000 // FPSCR.PR = 1
#endif
+L1:
.double 2147483648
+#else
+/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit.
+ Although on SH4A fmovd usually works, it would require either additional
+ two fschg instructions or an FPSCR push + pop. It's not worth the effort
+ for loading only one double constant. */
+ mov #1,r1
+ cmp/hi r1,r5
+ bf/s trivial
+ rotr r1 // r1 = 1 << 31
+ fpchg
+ mova L1,r0
+ xor r1,r4
+ fmov.s @r0+,DR40
+ lds r4,fpul
+ fmov.s @r0,DR41
+ xor r1,r5
+ float fpul,dr0
+ lds r5,fpul
+ float fpul,dr2
+ fadd dr4,dr0
+ fadd dr4,dr2
+ fdiv dr2,dr0
+ ftrc dr0,fpul
+ rts
+ fpchg
+
+trivial:
+ rts
+ lds r4,fpul
+
+ .align 2
+L1:
+ .double 2147483648
+
+#endif /* __SH4A__ */
+
+
ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH4__ */
-#endif
+#endif /* L_udivsi3_i4 */
#ifdef L_udivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
Index: gcc/testsuite/gcc.target/sh/pr6526.c
===================================================================
--- gcc/testsuite/gcc.target/sh/pr6526.c (revision 0)
+++ gcc/testsuite/gcc.target/sh/pr6526.c (revision 0)
@@ -0,0 +1,64 @@
+/* Check that the XF registers are not clobbered by an integer division
+ that is done using double precision FPU division. */
+/* { dg-do run { target "sh*-*-*" } } */
+/* { dg-options "-O1 -mdiv=call-fp" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m4*-single" "-m4*-single-only" } } */
+
+#include <assert.h>
+#include <stdlib.h>
+
+extern void __set_fpscr (int);
+
+void
+write_xf0 (float* f)
+{
+ __asm__ __volatile__ ("frchg; fmov.s @%0,fr0; frchg" : : "r" (f) : "memory");
+}
+
+void
+read_xf0 (float* f)
+{
+ __asm__ __volatile__ ("frchg; fmov.s fr0,@%0; frchg" : : "r" (f) : "memory");
+}
+
+int __attribute__ ((noinline))
+test_00 (int a, int b)
+{
+ return a / b;
+}
+
+unsigned int __attribute__ ((noinline))
+test_01 (unsigned a, unsigned b)
+{
+ return a / b;
+}
+
+int __attribute__ ((noinline))
+test_02 (int x)
+{
+ return x & 0;
+}
+
+int
+main (void)
+{
+ float test_value;
+ int r = 0;
+
+ /* Set FPSCR.FR to 1. */
+ __set_fpscr (0x200000);
+
+ test_value = 123;
+ write_xf0 (&test_value);
+ r += test_00 (40, 4);
+ read_xf0 (&test_value);
+ assert (test_value == 123);
+
+ test_value = 321;
+ write_xf0 (&test_value);
+ r += test_01 (50, 5);
+ read_xf0 (&test_value);
+ assert (test_value == 321);
+
+ return test_02 (r);
+}