This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

linpackc.c regression quick analysis


I think I've found the cause of the linpackc.c regression on the
i386-linux host:

The problem seems to be that egcs is *much* too aggressive about hoisting
stuff out of loops, and it runs out of registers and starts stuffing it
into stack slots. Here's the head and tail of the critical loop in
dmxpy() as compiled by gcc-2.7.2.1:

1389:linpackc.c    ****         for (j = jmin-1; j < n2; j = j + 16) {
 3608                   .LM456:
 3609 1b8e 8D500F               leal 15(%eax),%edx
 3610 1b91 395510               cmpl %edx,16(%ebp)
 3611 1b94 0F8E6701             jle .L296
 3611      0000
 3612 1b9a 8B5D14               movl 20(%ebp),%ebx
 3613 1b9d 0FAFDA               imull %edx,%ebx
 3614 1ba0 895DD4               movl %ebx,-44(%ebp)
 3615 1ba3 90                   .align 4
 3616                   .L328:
1390:linpackc.c    ****                 for (i = 0; i < n1; i++)
 3618                   .LM457:
 3619 1ba4 31C9                 xorl %ecx,%ecx
 3620 1ba6 394D08               cmpl %ecx,8(%ebp)
 3621 1ba9 0F8E3D01             jle .L327
 3621      0000
 3622 1baf 8D42F1               leal -15(%edx),%eax
 3623 1bb2 0FAF4514             imull 20(%ebp),%eax
 3624 1bb6 8945D8               movl %eax,-40(%ebp)
 3625 1bb9 8B5D0C               movl 12(%ebp),%ebx
 3626 1bbc 895DD0               movl %ebx,-48(%ebp)
 3627 1bbf 90                   .align 4
 3628                   .L332:
1391:linpackc.c    ****                         y[i] = ((((((((((((((( (y[i])
...
 3728                   .LM459:
 3729 1cdc 83C304               addl $4,%ebx
 3730 1cdf 895DD0               movl %ebx,-48(%ebp)
 3731 1ce2 41                   incl %ecx
 3732 1ce3 394D08               cmpl %ecx,8(%ebp)
 3733 1ce6 0F8FD4FE             jg .L332
 3733      FFFF
 3735                   .LM460:
 3736                   .L327:
 3737 1cec 8B4514               movl 20(%ebp),%eax
 3738 1cef C1E004               sall $4,%eax
 3739 1cf2 0145D4               addl %eax,-44(%ebp)
 3740 1cf5 83C210               addl $16,%edx
 3741 1cf8 395510               cmpl %edx,16(%ebp)
 3742 1cfb 0F8FA3FE             jg .L328

Here's the same loop head and tail as compiled by egcs-19980707 --
note that it's using a *huge* number of stack slots:

1389:/home/tm/linpackc.c ****   for (j = jmin-1; j < n2; j = j + 16) {
 3969                   .LM518:
 3970 1d60 8D700F               leal 15(%eax),%esi
 3971 1d63 3B7510               cmpl 16(%ebp),%esi
 3972 1d66 0F8DE801             jge .L297
 3972      0000
 3973 1d6c 8B4D18               movl 24(%ebp),%ecx
 3974 1d6f 8D14B1               leal (%ecx,%esi,4),%edx
 3975 1d72 89F1                 movl %esi,%ecx
 3976 1d74 0FAF4D14             imull 20(%ebp),%ecx
 3977 1d78 894DE4               movl %ecx,-28(%ebp)
 3978 1d7b 8D480E               leal 14(%eax),%ecx
 3979 1d7e 0FAF4D14             imull 20(%ebp),%ecx
 3980 1d82 894DE0               movl %ecx,-32(%ebp)
 3981 1d85 8D480D               leal 13(%eax),%ecx
 3982 1d88 0FAF4D14             imull 20(%ebp),%ecx
 3983 1d8c 894DDC               movl %ecx,-36(%ebp)
 3984 1d8f 8D480C               leal 12(%eax),%ecx
 3985 1d92 0FAF4D14             imull 20(%ebp),%ecx
 3986 1d96 894DD8               movl %ecx,-40(%ebp)
 3987 1d99 8D480B               leal 11(%eax),%ecx
 3988 1d9c 0FAF4D14             imull 20(%ebp),%ecx
 3989 1da0 894DD4               movl %ecx,-44(%ebp)
 3990 1da3 8D480A               leal 10(%eax),%ecx
 3991 1da6 0FAF4D14             imull 20(%ebp),%ecx
 3992 1daa 894DD0               movl %ecx,-48(%ebp)
 3993 1dad 8D4809               leal 9(%eax),%ecx
 3994 1db0 0FAF4D14             imull 20(%ebp),%ecx
 3995 1db4 894DCC               movl %ecx,-52(%ebp)
 3996 1db7 8D4808               leal 8(%eax),%ecx
 3997 1dba 0FAF4D14             imull 20(%ebp),%ecx
 3998 1dbe 894DC8               movl %ecx,-56(%ebp)
 3999 1dc1 8D4807               leal 7(%eax),%ecx
 4000 1dc4 0FAF4D14             imull 20(%ebp),%ecx
 4001 1dc8 894DC4               movl %ecx,-60(%ebp)
 4002 1dcb 8D4806               leal 6(%eax),%ecx
 4003 1dce 0FAF4D14             imull 20(%ebp),%ecx
 4004 1dd2 894DC0               movl %ecx,-64(%ebp)
 4005 1dd5 8D4805               leal 5(%eax),%ecx
 4006 1dd8 0FAF4D14             imull 20(%ebp),%ecx
 4007 1ddc 894DBC               movl %ecx,-68(%ebp)
 4008 1ddf 8D4804               leal 4(%eax),%ecx
 4009 1de2 0FAF4D14             imull 20(%ebp),%ecx
 4010 1de6 894DB8               movl %ecx,-72(%ebp)
 4011 1de9 8D4803               leal 3(%eax),%ecx
 4012 1dec 0FAF4D14             imull 20(%ebp),%ecx
 4013 1df0 894DB4               movl %ecx,-76(%ebp)
 4014 1df3 8D4802               leal 2(%eax),%ecx
 4015 1df6 0FAF4D14             imull 20(%ebp),%ecx
 4016 1dfa 894DB0               movl %ecx,-80(%ebp)
 4017 1dfd 8D4801               leal 1(%eax),%ecx
 4018 1e00 0FAF4D14             imull 20(%ebp),%ecx
 4019 1e04 894DAC               movl %ecx,-84(%ebp)
 4020 1e07 8B4D14               movl 20(%ebp),%ecx
 4021 1e0a 894DA8               movl %ecx,-88(%ebp)
 4022 1e0d 0FAFC1               imull %ecx,%eax
 4023 1e10 8945A8               movl %eax,-88(%ebp)
 4024 1e13 C1E104               sall $4,%ecx
 4025 1e16 894D9C               movl %ecx,-100(%ebp)
 4026 1e19 8D7600               .align 4
 4027                   .L329:
1390:/home/tm/linpackc.c ****           for (i = 0; i < n1; i++)
 4029                   .LM519:
 4030 1e1c 31DB                 xorl %ebx,%ebx
 4031 1e1e 3B5D08               cmpl 8(%ebp),%ebx
 4032 1e21 0F8DEB00             jge .L328
 4032      0000
 4033 1e27 8B4DA8               movl -88(%ebp),%ecx
 4034 1e2a 894DE8               movl %ecx,-24(%ebp)
 4035 1e2d 8D7600               .align 4
 4036                   .L333:
1391:/home/tm/linpackc.c ****                   y[i] = ((((((((((((((( (y[i])
...
 4123                   .LM521:
 4124 1f08 43                   incl %ebx
 4125 1f09 3B5D08               cmpl 8(%ebp),%ebx
 4126 1f0c 0F8C1EFF             jl .L333
 4126      FFFF
 4128                   .LM522:
 4129                   .L328:
 4130 1f12 83C240               addl $64,%edx
 4131 1f15 8B4D9C               movl -100(%ebp),%ecx
 4132 1f18 014DE4               addl %ecx,-28(%ebp)
 4133 1f1b 014DE0               addl %ecx,-32(%ebp)
 4134 1f1e 014DDC               addl %ecx,-36(%ebp)
 4135 1f21 014DD8               addl %ecx,-40(%ebp)
 4136 1f24 014DD4               addl %ecx,-44(%ebp)
 4137 1f27 014DD0               addl %ecx,-48(%ebp)
 4138 1f2a 014DCC               addl %ecx,-52(%ebp)
 4139 1f2d 014DC8               addl %ecx,-56(%ebp)
 4140 1f30 014DC4               addl %ecx,-60(%ebp)
 4141 1f33 014DC0               addl %ecx,-64(%ebp)
 4142 1f36 014DBC               addl %ecx,-68(%ebp)
 4143 1f39 014DB8               addl %ecx,-72(%ebp)
 4144 1f3c 014DB4               addl %ecx,-76(%ebp)
 4145 1f3f 014DB0               addl %ecx,-80(%ebp)
 4146 1f42 014DAC               addl %ecx,-84(%ebp)
 4147 1f45 014DA8               addl %ecx,-88(%ebp)
 4148 1f48 83C610               addl $16,%esi
 4149 1f4b 3B7510               cmpl 16(%ebp),%esi
 4150 1f4e 0F8CC8FE             jl .L329

The basic problem seems to be that egcs is much more aggressive than
gcc at finding givs (general induction variables) and hoisting their
computation out of the loop. Unfortunately, on machines with only a few
registers, this is a huge loss because the givs wind up in stack slots,
and reading them back is slower than redoing the pointer arithmetic,
especially when the memory load latency can't be hidden.

This appears to be the same problem I reported in a few Cygnus PRs;
namely gcc/14879, gcc/14900, and gcc/14921 (still open).

Perhaps artificially limiting the number of givs to the number of
registers / 2, or something of the sort, would be beneficial on
machines where SMALL_REGISTER_CLASSES is true?

Toshi



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]