This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug c/69760] New: Wrong 64-bit memory address caused by an unneeded overflowing 32-bit integer multiplication on x86_64 under -O2 and -O3 code optimization
- From: "dimitri_laikov at mail dot ru" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Wed, 10 Feb 2016 22:20:08 +0000
- Subject: [Bug c/69760] New: Wrong 64-bit memory address caused by an unneeded overflowing 32-bit integer multiplication on x86_64 under -O2 and -O3 code optimization
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69760
Bug ID: 69760
Summary: Wrong 64-bit memory address caused by an unneeded
overflowing 32-bit integer multiplication on x86_64
under -O2 and -O3 code optimization
Product: gcc
Version: 5.3.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: dimitri_laikov at mail dot ru
Target Milestone: ---
This simplest code "test_func.c":
$ cat test_func.c
void test_func (double *a, int L, int m, int n, int N)
{
int i, k;
for (i = 0; i < N; i++) {
k = i - m;
if ((k >= 0) && (k < n)) {
a[L*k] = 0.0;
}
}
return;
}
when compiled on x86_64 by GCC versions from at least 4.8.1 up to 5.3.0
is working right with -O0 or -O1 optimizations,
but fails with -O2 or -O3, as can be seen by calling it
from this main function "test_main.c":
$ cat test_main.c
#include <stdlib.h>
void test_func (double *a, int L, int m, int n, int N);
int main ()
{
double *a;
int L, m, n, N;
L = 10000000;
n = 4;
N = 100*n;
a = (double *) malloc (sizeof (double)*L*n);
for (m = 0; m < N; m += n) {
test_func (a, L, m, n, N);
}
free (a);
return 0;
}
$ for n in 0 1 2 3 ; do gcc-5.3.0 -S -O$n test_func.c -o test_func.O$n.s ; wc
test_func.O$n.s ; gcc -O0 test_func.O$n.s test_main.c -o test.x ; ./test.x ;
done
51 119 881 test_func.O0.s
35 79 551 test_func.O1.s
45 94 696 test_func.O2.s
Segmentation fault
45 94 696 test_func.O3.s
Segmentation fault
An even more _amazing_ thing is seen when printf is added:
$ cat test_func_print.c
#include <stdio.h>
void test_func (double *a, int L, int m, int n, int N)
{
int i, k;
for (i = 0; i < N; i++) {
k = i - m;
if ((k >= 0) && (k < n)) {
printf ("k=%d a=%p a+L*k=%p\n", k, a, a + L*k);
a[L*k] = 0.0;
}
}
return;
}
$ gcc-5.3.0 -O2 test_func_print.c test_main.c -o test.x ; ./test.x
k=0 a=0x7f66ddc42010 a+L*k=0x7f66ddc42010
...
k=0 a=0x7f66ddc42010 a+L*k=0x7f6eddc42010
Segmentation fault
the corrupted pointer being 0x800000000 higher than it should!
(Should anyone say that this innocent C code is wrong?)
It can be understood by looking into the assembly file
(with comments added by us):
$ cat test_func.O2.S
.file "test_func.c"
.section .text.unlikely,"ax",@progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl test_func
.type test_func, @function
test_func:
.LFB0:
.cfi_startproc
testl %r8d, %r8d
jle .L1 # if (N <= 0) return
movl %edx, %eax # %eax = m
movslq %esi, %r9 # %r9 = (long) L
subl %edx, %r8d # %r8d = N - m
negl %eax # %eax = k = -m
salq $3, %r9 # %r9 = 8*((long) L)
imull %eax, %esi # %esi = L*(-m)
movslq %esi, %rsi # %rsi = (long) (L*(-m))
leaq (%rdi,%rsi,8), %rsi # %rsi = ak = a + L*(-m)
.p2align 4,,10
.p2align 3
.L4:
testl %eax, %eax
js .L3
cmpl %eax, %ecx
jle .L3
movq $0, (%rsi) # if ((k >= 0) && (k < n)) *ak = 0.0
.L3:
addl $1, %eax # %eax = k++
addq %r9, %rsi # %rsi = ak += L
cmpl %r8d, %eax
jne .L4 # if (k != N - m) goto L4
.L1:
rep ret
.cfi_endproc
.LFE0:
.size test_func, .-test_func
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.ident "GCC: (GNU) 5.3.0"
.section .note.GNU-stack,"",@progbits
The problem is that the optimizer rewrites the code as if it were:
void test_func (double *a, int L, int m, int n, int N)
{
double *ak;
int k;
if (N <= 0) {
return;
}
ak = a + L*(-m);
for (k = -m; k < N - m; k++) {
if ((k >= 0) && (k < n)) {
*ak = 0.0;
}
ak += L;
}
return;
}
and because "L*(-m)" is computed in 32-bit precision,
in which it does not fit, the pointer "double *ak"
fails to reference "a[0]" after being m times incremented by L.
The problem would be solved by teaching the compiler
to use more bits when precomputing pointers for the optimized loops,
in this example:
...
ak = a + ((long) L)*(-m)
...
or something like this
test_func:
.LFB0:
.cfi_startproc
testl %r8d, %r8d
jle .L1 # if (N <= 0) return
movslq %edx, %rax #|%rax = (long) m
movslq %esi, %r9 # %r9 = (long) L
subl %edx, %r8d # %r8d = N - m
negq %rax #|%rax = k = -m
salq $3, %r9 # %r9 = 8*((long) L)
movslq %esi, %rsi #|%rsi = (long) L
imulq %rax, %rsi # %rsi = ((long) L)*(-m)
# movslq %esi, %rsi #-
leaq (%rdi,%rsi,8), %rsi # %rsi = ak = a + ((long) L)*(-m)
.p2align 4,,10
.p2align 3
.L4:
testl %eax, %eax
js .L3
cmpl %eax, %ecx
jle .L3
movq $0, (%rsi) # if ((k >= 0) && (k < n)) *ak = 0.0
.L3:
addl $1, %eax # %eax = k++
addq %r9, %rsi # %rsi = ak += L
cmpl %r8d, %eax
jne .L4 # if (k != N - m) goto L4
.L1:
rep ret
Many thanks for your kind attention.