This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug c/40542] New: gcc-4.3.3 vectorizes access to volatile array
- From: "strauman at slac dot stanford dot edu" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 24 Jun 2009 13:10:18 -0000
- Subject: [Bug c/40542] New: gcc-4.3.3 vectorizes access to volatile array
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
gcc-4.3.3 (4.3.2 and probably other versions, too)
seems to produce bad code when
accessing an array of small 'volatile'
objects -- it may try to access multiple
such objects in a 'parallel/vectorized' fashion.
-ftree-vectorize seems to be the option
that triggers this behavior.
E.g., instead of reading four consecutive
'volatile char's sequentially it reads
a single 32-bit longword (on x86_64 it
even uses XMM registers to read 16 volatile
chars at once). This may crash
e.g., when accessing a memory-mapped device
which allows only 8-bit accesses.
The vectorized access is preceded by
a single, redundant access of the correct
width implementing something like
for (i=0; i<16/vector_size; i++) {
d[i] = s[i];
((vector char*)d)[i] = ((vector char*)s)[i];
}
Look for the statements after '.L6' which
are executed if source and destination are
4-byte (16-byte in case of XMM) aligned
and if src/dst vectors don't overlap.
Both facts (redundant access + vectorized
access) seem to violate C99 (5.1.2.3, paragraph 2):
"At certain specified points in the execution
sequence called 'sequence points', all side
effects of previous evaluations shall be
complete and no side effects of subsequent
evaluations shall have taken place"
This rule seems violated at the ';'
sequence point which terminates the
assignment.
/*
 * Reproducer for the reported problem.
 *
 * Copies 16 elements from s to d, one element per loop iteration.
 * s points to volatile char, so the report expects each s[i] to be
 * read with its own char-sized access; d is an ordinary char pointer.
 *
 * The statement form is kept exactly as reported because the assembly
 * listings that follow were generated from this text.
 */
void
volarr_cpy(char *d, volatile char *s)
{
int i;
/* Fixed trip count of 16; each iteration performs one volatile read. */
for ( i=0; i<16; i++ )
d[i]=s[i];
}
compiled for i386 with gcc-4.3.3 -m32 -O3 -S -c
# Compiler output (i386, AT&T operand order: source first, destination last)
# for volarr_cpy(char *d, volatile char *s).  Quoted verbatim; comments only.
#
# Register roles established in the entry block:
#   %ecx = s        %edx = d
#   %ebx = s + 4    %esi = d + 4
#
# Paths:
#   .L2      copies all sixteen elements with byte-sized loads and stores.
#   .L6      copies with four 32-bit loads/stores, each preceded by a
#            byte-sized load whose result is never used.
#   .L8/.L9  address-range checks that select between .L2 and .L6.
.file "volcharr_cpy.c"
.text
.p2align 4,,15
.globl volarr_cpy
.type volarr_cpy, @function
volarr_cpy:
# Frame setup; %esi and %ebx are saved here and restored before each ret.
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %ebx
# Load the two stack arguments: second argument (s) and first argument (d).
movl 12(%ebp), %ecx
movl 8(%ebp), %edx
# Alignment check: (s | d) & 3 is zero only when both pointers are
# multiples of four.
movl %ecx, %eax
orl %edx, %eax
testb $3, %al
# The two leal instructions leave the flags from testb untouched.
leal 4(%ecx), %ebx
leal 4(%edx), %esi
# Both pointers 4-byte aligned: go on to the range checks at .L8.
je .L8
.L2:
# Byte-wise path: one byte load from s and one byte store to d per element,
# offsets 0 through 15.
movzbl (%ecx), %eax
movb %al, (%edx)
movzbl 1(%ecx), %eax
movb %al, 1(%edx)
movzbl 2(%ecx), %eax
movb %al, 2(%edx)
movzbl 3(%ecx), %eax
movb %al, 3(%edx)
movzbl 4(%ecx), %eax
movb %al, 4(%edx)
movzbl 5(%ecx), %eax
movb %al, 5(%edx)
movzbl 6(%ecx), %eax
movb %al, 6(%edx)
movzbl 7(%ecx), %eax
movb %al, 7(%edx)
movzbl 8(%ecx), %eax
movb %al, 8(%edx)
movzbl 9(%ecx), %eax
movb %al, 9(%edx)
movzbl 10(%ecx), %eax
movb %al, 10(%edx)
movzbl 11(%ecx), %eax
movb %al, 11(%edx)
movzbl 12(%ecx), %eax
movb %al, 12(%edx)
movzbl 13(%ecx), %eax
movb %al, 13(%edx)
movzbl 14(%ecx), %eax
movb %al, 14(%edx)
movzbl 15(%ecx), %eax
movb %al, 15(%edx)
# Restore saved registers in reverse push order and return.
popl %ebx
popl %esi
popl %ebp
ret
.p2align 4,,7
.p2align 3
.L8:
# Unsigned compare of d against s + 4: if d <= s + 4, try the other
# ordering at .L9; otherwise fall through into the word-wise path.
cmpl %ebx, %edx
jbe .L9
.L6:
# Word-wise path.  Each group is: a byte load from s whose value in %eax
# is overwritten by the very next instruction, then a 32-bit load from s
# and a 32-bit store to d.
# Group one: byte load at s+0, then 32-bit copy of s+0 to d+0.
movzbl (%ecx), %eax
movl (%ecx), %eax
movl %eax, (%edx)
# Group two: byte load at s+1, then 32-bit copy of s+4 to d+4.
movzbl 1(%ecx), %eax
movl 4(%ecx), %eax
movl %eax, 4(%edx)
# Group three: byte load at s+2, then 32-bit copy of s+8 to d+8.
# %edx is reused here to hold s + 8; d is addressed through %esi from now on.
movzbl 2(%ecx), %eax
leal 4(%ebx), %edx
movl 4(%ebx), %eax
movl %eax, 4(%esi)
# Group four: byte load at s+3, then 32-bit copy of s+12 to d+12.
movzbl 3(%ecx), %eax
movl 4(%edx), %eax
movl %eax, 8(%esi)
popl %ebx
popl %esi
popl %ebp
ret
.p2align 4,,7
.p2align 3
.L9:
# Unsigned compare of s against d + 4: if s <= d + 4 use the byte-wise
# path, otherwise the word-wise path.
cmpl %esi, %ecx
jbe .L2
jmp .L6
.size volarr_cpy, .-volarr_cpy
.ident "GCC: (Ubuntu 4.3.3-5ubuntu4) 4.3.3"
.section .note.GNU-stack,"",@progbits
compiled for x86_64 with gcc -S -c -O3
# Compiler output (x86_64, AT&T operand order: source first, destination last)
# for volarr_cpy(char *d, volatile char *s).  Quoted verbatim; comments only.
#
# Register roles as used below: %rdi = d (store base), %rsi = s (load base).
#
# Paths:
#   .L2      copies all sixteen elements with byte-sized loads and stores.
#   .L6      copies all sixteen bytes with one XMM load and one XMM store,
#            preceded by a byte-sized load whose result is never used.
#   .L8/.L9  address-range checks that select between .L2 and .L6.
.file "volcharr_cpy.c"
.text
.p2align 4,,15
.globl volarr_cpy
.type volarr_cpy, @function
volarr_cpy:
.LFB2:
# Only the low four bits of d are tested here; s is not checked for
# alignment (the wide load below is the unaligned form, movdqu).
testb $15, %dil
je .L8
.L2:
# Byte-wise path: one byte load from s and one byte store to d per element,
# offsets 0 through 15.
movzbl (%rsi), %eax
movb %al, (%rdi)
movzbl 1(%rsi), %eax
movb %al, 1(%rdi)
movzbl 2(%rsi), %eax
movb %al, 2(%rdi)
movzbl 3(%rsi), %eax
movb %al, 3(%rdi)
movzbl 4(%rsi), %eax
movb %al, 4(%rdi)
movzbl 5(%rsi), %eax
movb %al, 5(%rdi)
movzbl 6(%rsi), %eax
movb %al, 6(%rdi)
movzbl 7(%rsi), %eax
movb %al, 7(%rdi)
movzbl 8(%rsi), %eax
movb %al, 8(%rdi)
movzbl 9(%rsi), %eax
movb %al, 9(%rdi)
movzbl 10(%rsi), %eax
movb %al, 10(%rdi)
movzbl 11(%rsi), %eax
movb %al, 11(%rdi)
movzbl 12(%rsi), %eax
movb %al, 12(%rdi)
movzbl 13(%rsi), %eax
movb %al, 13(%rdi)
movzbl 14(%rsi), %eax
movb %al, 14(%rdi)
movzbl 15(%rsi), %eax
movb %al, 15(%rdi)
ret
.p2align 4,,10
.p2align 3
.L8:
# Unsigned compare of d against s + 16: if d <= s + 16, try the other
# ordering at .L9; otherwise fall through into the XMM path.
leaq 16(%rsi), %rax
cmpq %rax, %rdi
jbe .L9
.L6:
# XMM path.  The byte load's result in %eax is not used afterwards.
# movdqu reads sixteen bytes from s in one access; movdqa writes them to d
# (aligned store, relying on the testb check at function entry).
movzbl (%rsi), %eax
movdqu (%rsi), %xmm0
movdqa %xmm0, (%rdi)
ret
.p2align 4,,10
.p2align 3
.L9:
# Unsigned compare of s against d + 16: if s <= d + 16 use the byte-wise
# path, otherwise the XMM path.
leaq 16(%rdi), %rax
cmpq %rax, %rsi
jbe .L2
.p2align 4,,2
.p2align 3
jmp .L6
.LFE2:
.size volarr_cpy, .-volarr_cpy
# Compiler-emitted data in the .eh_frame section (presumably the unwind
# CIE/FDE records, going by the label names).  The second record refers to
# .LFB2 and to .LFE2-.LFB2, i.e. the start address and length of volarr_cpy.
.section .eh_frame,"a",@progbits
.Lframe1:
.long .LECIE1-.LSCIE1
.LSCIE1:
.long 0x0
.byte 0x1
.string "zR"
.uleb128 0x1
.sleb128 -8
.byte 0x10
.uleb128 0x1
.byte 0x3
.byte 0xc
.uleb128 0x7
.uleb128 0x8
.byte 0x90
.uleb128 0x1
.align 8
.LECIE1:
.LSFDE1:
.long .LEFDE1-.LASFDE1
.LASFDE1:
.long .LASFDE1-.Lframe1
.long .LFB2
.long .LFE2-.LFB2
.uleb128 0x0
.align 8
.LEFDE1:
.ident "GCC: (Ubuntu 4.3.3-5ubuntu4) 4.3.3"
.section .note.GNU-stack,"",@progbits
compiled for powerpc with powerpc-rtems-gcc -S -c -O3
# Compiler output (PowerPC, registers written as bare numbers) for
# volarr_cpy(char *d, volatile char *s).  Quoted verbatim; comments only.
#
# Register roles as used below:
#   r3  = d (store base)     r4  = s (load base)
#   r11 = s + 4              r10 = d + 4
#
# Paths:
#   .L2      copies all sixteen elements with byte loads (lbz) and byte
#            stores (stb).
#   .L6      copies with four word loads/stores (lwz/stw), each preceded by
#            a byte load whose result is never used.
#   .L8/.L9  address-range checks that select between .L2 and .L6.
.file "volcharr_cpy.c"
.gnu_attribute 4, 1
.gnu_attribute 8, 1
.section ".text"
.align 2
.globl volarr_cpy
.type volarr_cpy, @function
volarr_cpy:
# Alignment check: r0 = s | d, then (r0 & 3) is computed by the
# record-form andi., which sets condition field 0 for the beq- below.
or 0,4,3
addi 11,4,4
andi. 9,0,3
addi 10,3,4
# Both pointers 4-byte aligned: go on to the range checks at .L8.
beq- 0,.L8
.L2:
# Byte-wise path: one lbz from s and one stb to d per element,
# offsets 0 through 15, alternating r0 and r9 as the scratch register.
lbz 0,0(4)
stb 0,0(3)
lbz 9,1(4)
stb 9,1(3)
lbz 0,2(4)
stb 0,2(3)
lbz 9,3(4)
stb 9,3(3)
lbz 0,4(4)
stb 0,4(3)
lbz 9,5(4)
stb 9,5(3)
lbz 0,6(4)
stb 0,6(3)
lbz 9,7(4)
stb 9,7(3)
lbz 0,8(4)
stb 0,8(3)
lbz 9,9(4)
stb 9,9(3)
lbz 0,10(4)
stb 0,10(3)
lbz 9,11(4)
stb 9,11(3)
lbz 0,12(4)
stb 0,12(3)
lbz 9,13(4)
stb 9,13(3)
lbz 0,14(4)
stb 0,14(3)
lbz 9,15(4)
stb 9,15(3)
blr
.L8:
# Unsigned compare of d (r3) against s + 4 (r11) into condition field 7:
# if d <= s + 4, try the other ordering at .L9; otherwise fall through
# into the word-wise path.
cmplw 7,3,11
ble- 7,.L9
.L6:
# Word-wise path.  Each lbz into r0 is overwritten by the following lwz
# into r0 before the byte value is used.
# Group one: byte load at s+0; r9 = s + 8; word copy of s+0 to d+0.
lbz 0,0(4)
addi 9,11,4
lwz 0,0(4)
stw 0,0(3)
# Group two: byte load at s+1; word copy of s+4 to d+4.
lbz 0,1(4)
lwz 0,4(4)
stw 0,4(3)
# Group three: byte load at s+2; word copy of s+8 (via r11) to d+8 (via r10).
lbz 0,2(4)
lwz 0,4(11)
stw 0,4(10)
# Group four: byte load at s+3; word copy of s+12 (via r9) to d+12 (via r10).
lbz 0,3(4)
lwz 0,4(9)
stw 0,8(10)
blr
.L9:
# Unsigned compare of s (r4) against d + 4 (r10): if s <= d + 4 use the
# byte-wise path, otherwise the word-wise path.
cmplw 7,4,10
ble- 7,.L2
b .L6
.size volarr_cpy, .-volarr_cpy
.ident "GCC: (GNU) 4.3.2"
--
Summary: gcc-4.3.3 vectorizes access to volatile array
Product: gcc
Version: 4.3.3
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: strauman at slac dot stanford dot edu
GCC build triplet: x86_64-unknown-linux
GCC host triplet: x86_64-unknown-linux
GCC target triplet: x86_64-unknown-linux, i386-unknown-linux, i386-unknown-rtems, powerpc-rtems
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40542