This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
linpackc.c regression quick analysis
- To: egcs-bugs at cygnus dot com
- Subject: linpackc.c regression quick analysis
- From: Toshiyasu Morita <tm at netcom dot com>
- Date: Sun, 12 Jul 1998 02:51:10 -0700 (PDT)
I think I've found the cause of the linpackc.c regression on the
i386-linux host:
The problem seems to be that egcs is *much* too aggressive about hoisting
computations out of loops: it runs out of registers and starts spilling the
hoisted values into stack slots. Here's the head and tail of the critical
loop in dmxpy() as compiled by gcc-2.7.2.1:
1389:linpackc.c **** for (j = jmin-1; j < n2; j = j + 16) {
3608 .LM456:
3609 1b8e 8D500F leal 15(%eax),%edx
3610 1b91 395510 cmpl %edx,16(%ebp)
3611 1b94 0F8E6701 jle .L296
3611 0000
3612 1b9a 8B5D14 movl 20(%ebp),%ebx
3613 1b9d 0FAFDA imull %edx,%ebx
3614 1ba0 895DD4 movl %ebx,-44(%ebp)
3615 1ba3 90 .align 4
3616 .L328:
1390:linpackc.c **** for (i = 0; i < n1; i++)
3618 .LM457:
3619 1ba4 31C9 xorl %ecx,%ecx
3620 1ba6 394D08 cmpl %ecx,8(%ebp)
3621 1ba9 0F8E3D01 jle .L327
3621 0000
3622 1baf 8D42F1 leal -15(%edx),%eax
3623 1bb2 0FAF4514 imull 20(%ebp),%eax
3624 1bb6 8945D8 movl %eax,-40(%ebp)
3625 1bb9 8B5D0C movl 12(%ebp),%ebx
3626 1bbc 895DD0 movl %ebx,-48(%ebp)
3627 1bbf 90 .align 4
3628 .L332:
1391:linpackc.c **** y[i] = ((((((((((((((( (y[i])
...
3728 .LM459:
3729 1cdc 83C304 addl $4,%ebx
3730 1cdf 895DD0 movl %ebx,-48(%ebp)
3731 1ce2 41 incl %ecx
3732 1ce3 394D08 cmpl %ecx,8(%ebp)
3733 1ce6 0F8FD4FE jg .L332
3733 FFFF
3735 .LM460:
3736 .L327:
3737 1cec 8B4514 movl 20(%ebp),%eax
3738 1cef C1E004 sall $4,%eax
3739 1cf2 0145D4 addl %eax,-44(%ebp)
3740 1cf5 83C210 addl $16,%edx
3741 1cf8 395510 cmpl %edx,16(%ebp)
3742 1cfb 0F8FA3FE jg .L328
Here's the same loop head and tail as compiled by egcs-19980707
- note it's using a *huge* number of stack slots:
1389:/home/tm/linpackc.c **** for (j = jmin-1; j < n2; j = j + 16) {
3969 .LM518:
3970 1d60 8D700F leal 15(%eax),%esi
3971 1d63 3B7510 cmpl 16(%ebp),%esi
3972 1d66 0F8DE801 jge .L297
3972 0000
3973 1d6c 8B4D18 movl 24(%ebp),%ecx
3974 1d6f 8D14B1 leal (%ecx,%esi,4),%edx
3975 1d72 89F1 movl %esi,%ecx
3976 1d74 0FAF4D14 imull 20(%ebp),%ecx
3977 1d78 894DE4 movl %ecx,-28(%ebp)
3978 1d7b 8D480E leal 14(%eax),%ecx
3979 1d7e 0FAF4D14 imull 20(%ebp),%ecx
3980 1d82 894DE0 movl %ecx,-32(%ebp)
3981 1d85 8D480D leal 13(%eax),%ecx
3982 1d88 0FAF4D14 imull 20(%ebp),%ecx
3983 1d8c 894DDC movl %ecx,-36(%ebp)
3984 1d8f 8D480C leal 12(%eax),%ecx
3985 1d92 0FAF4D14 imull 20(%ebp),%ecx
3986 1d96 894DD8 movl %ecx,-40(%ebp)
3987 1d99 8D480B leal 11(%eax),%ecx
3988 1d9c 0FAF4D14 imull 20(%ebp),%ecx
3989 1da0 894DD4 movl %ecx,-44(%ebp)
3990 1da3 8D480A leal 10(%eax),%ecx
3991 1da6 0FAF4D14 imull 20(%ebp),%ecx
3992 1daa 894DD0 movl %ecx,-48(%ebp)
3993 1dad 8D4809 leal 9(%eax),%ecx
3994 1db0 0FAF4D14 imull 20(%ebp),%ecx
3995 1db4 894DCC movl %ecx,-52(%ebp)
3996 1db7 8D4808 leal 8(%eax),%ecx
3997 1dba 0FAF4D14 imull 20(%ebp),%ecx
3998 1dbe 894DC8 movl %ecx,-56(%ebp)
3999 1dc1 8D4807 leal 7(%eax),%ecx
4000 1dc4 0FAF4D14 imull 20(%ebp),%ecx
4001 1dc8 894DC4 movl %ecx,-60(%ebp)
4002 1dcb 8D4806 leal 6(%eax),%ecx
4003 1dce 0FAF4D14 imull 20(%ebp),%ecx
4004 1dd2 894DC0 movl %ecx,-64(%ebp)
4005 1dd5 8D4805 leal 5(%eax),%ecx
4006 1dd8 0FAF4D14 imull 20(%ebp),%ecx
4007 1ddc 894DBC movl %ecx,-68(%ebp)
4008 1ddf 8D4804 leal 4(%eax),%ecx
4009 1de2 0FAF4D14 imull 20(%ebp),%ecx
4010 1de6 894DB8 movl %ecx,-72(%ebp)
4011 1de9 8D4803 leal 3(%eax),%ecx
4012 1dec 0FAF4D14 imull 20(%ebp),%ecx
4013 1df0 894DB4 movl %ecx,-76(%ebp)
4014 1df3 8D4802 leal 2(%eax),%ecx
4015 1df6 0FAF4D14 imull 20(%ebp),%ecx
4016 1dfa 894DB0 movl %ecx,-80(%ebp)
4017 1dfd 8D4801 leal 1(%eax),%ecx
4018 1e00 0FAF4D14 imull 20(%ebp),%ecx
4019 1e04 894DAC movl %ecx,-84(%ebp)
4020 1e07 8B4D14 movl 20(%ebp),%ecx
4021 1e0a 894DA8 movl %ecx,-88(%ebp)
4022 1e0d 0FAFC1 imull %ecx,%eax
4023 1e10 8945A8 movl %eax,-88(%ebp)
4024 1e13 C1E104 sall $4,%ecx
4025 1e16 894D9C movl %ecx,-100(%ebp)
4026 1e19 8D7600 .align 4
4027 .L329:
1390:/home/tm/linpackc.c **** for (i = 0; i < n1; i++)
4029 .LM519:
4030 1e1c 31DB xorl %ebx,%ebx
4031 1e1e 3B5D08 cmpl 8(%ebp),%ebx
4032 1e21 0F8DEB00 jge .L328
4032 0000
4033 1e27 8B4DA8 movl -88(%ebp),%ecx
4034 1e2a 894DE8 movl %ecx,-24(%ebp)
4035 1e2d 8D7600 .align 4
4036 .L333:
1391:/home/tm/linpackc.c **** y[i] = ((((((((((((((( (y[i])
...
4123 .LM521:
4124 1f08 43 incl %ebx
4125 1f09 3B5D08 cmpl 8(%ebp),%ebx
4126 1f0c 0F8C1EFF jl .L333
4126 FFFF
4128 .LM522:
4129 .L328:
4130 1f12 83C240 addl $64,%edx
4131 1f15 8B4D9C movl -100(%ebp),%ecx
4132 1f18 014DE4 addl %ecx,-28(%ebp)
4133 1f1b 014DE0 addl %ecx,-32(%ebp)
4134 1f1e 014DDC addl %ecx,-36(%ebp)
4135 1f21 014DD8 addl %ecx,-40(%ebp)
4136 1f24 014DD4 addl %ecx,-44(%ebp)
4137 1f27 014DD0 addl %ecx,-48(%ebp)
4138 1f2a 014DCC addl %ecx,-52(%ebp)
4139 1f2d 014DC8 addl %ecx,-56(%ebp)
4140 1f30 014DC4 addl %ecx,-60(%ebp)
4141 1f33 014DC0 addl %ecx,-64(%ebp)
4142 1f36 014DBC addl %ecx,-68(%ebp)
4143 1f39 014DB8 addl %ecx,-72(%ebp)
4144 1f3c 014DB4 addl %ecx,-76(%ebp)
4145 1f3f 014DB0 addl %ecx,-80(%ebp)
4146 1f42 014DAC addl %ecx,-84(%ebp)
4147 1f45 014DA8 addl %ecx,-88(%ebp)
4148 1f48 83C610 addl $16,%esi
4149 1f4b 3B7510 cmpl 16(%ebp),%esi
4150 1f4e 0F8CC8FE jl .L329
The basic problem seems to be that egcs is much more effective than
gcc at finding givs (general induction variables) and hoisting them out
of the loop. Unfortunately, on machines with only a few registers, this
is a huge loss because the givs wind up in stack slots, and reading them
back from memory is slower than the pointer arithmetic they replace,
especially when the memory load latency can't be hidden.
This appears to be the same problem I reported in a few Cygnus PRs;
namely gcc/14879, gcc/14900, and gcc/14921 (still open).
Perhaps artificially limiting the number of givs to the number of
registers / 2, or something of the sort, would be beneficial on
machines where SMALL_REGISTER_CLASSES is true?
Toshi