This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: gcc 4.3.2 vectorizes access to volatile array


H.J. Lu wrote:
> On Mon, Jun 22, 2009 at 11:14 AM, Till
> Straumann <strauman@slac.stanford.edu> wrote:
>> Andrew Haley wrote:
>>> Till Straumann wrote:
>>>
>>>> gcc-4.3.2 seems to produce bad code when
>>>> accessing an array of small 'volatile'
>>>> objects -- it may try to access multiple
>>>> such objects in a 'parallel' fashion.
>>>> E.g., instead of reading two consecutive
>>>> 'volatile short's sequentially it reads
>>>> a single 32-bit longword. This may crash
>>>> e.g., when accessing a memory-mapped device
>>>> which allows only 16-bit accesses.
>>>>
>>>> If I compile this code fragment
>>>>
>>>> void volarrcpy(short *d, volatile short *s, int n)
>>>> {
>>>> int i;
>>>>  for (i=0; i<n; i++)
>>>>   d[i] = s[i];
>>>> }
>>>>
>>>>
>>>> with '-O3' (the critical option seems to be '-ftree-vectorize')
>>>> then gcc-4.3.2 produces quite complicated code
>>>> but the essential section is (powerpc)
>>>>
>>>> .L7:
>>>>   lhz 0,0(11)
>>>>   addi 11,11,2
>>>>   lwzx 0,4,9
>>>>   stwx 0,3,9
>>>>   addi 9,9,4
>>>>   bdnz .L7
>>>>
>>>> or i386
>>>>
>>>> .L7:
>>>>   movw    (%ecx), %ax
>>>>   movl    (%esi,%edx,4), %eax
>>>>   movl    %eax, (%ebx,%edx,4)
>>>>   incl    %edx
>>>>   addl    $2, %ecx
>>>>   cmpl    %edx, -20(%ebp)
>>>>   ja  .L7
>>>>
>>>>
>>>> Disassembled back into C-code, this reads
>>>>
>>>> uint32_t *dst_l = (uint32_t*)d;
>>>> uint32_t *src_l = (uint32_t*)s;
>>>>
>>>> for (i=0; i<n/2; i++) {
>>>>   d[i]     = s[i];
>>>>   dst_l[i] = src_l[i];
>>>> }
>>>>
>>>> This code seems neither optimal nor correct.
>>>> Besides reading half of the locations twice,
>>>> which violates the semantics of volatile
>>>> objects, accessing such objects in a 'vectorized'
>>>> way (in this case: instead of reading
>>>> two adjacent short addresses, gcc emits
>>>> a single 32-bit read) seems illegal to me.
>>>>
>>>> Similar behavior seems to be present in 4.3.3.
>>>>
>>>> Does anybody have some insight? Should I file
>>>> a bug report?
>>>>
>>> I can't reproduce this with "GCC: (GNU) 4.3.3 20081110 (prerelease)"
>>>
>>> .L8:
>>>        movzwl  (%ecx), %eax
>>>        addl    $1, %ebx
>>>        addl    $2, %ecx
>>>        movw    %ax, (%edx)
>>>        addl    $2, %edx
>>>        cmpl    %ebx, 16(%ebp)
>>>        jg      .L8
>>>
>>> I think you should upgrade.
>>>
>>> Andrew.
>>>
>> OK, try this then:
>>
>> void
>> c(char *d, volatile char *s)
>> {
>> int i;
>>   for ( i=0; i<32; i++ )
>>       d[i]=s[i];
>> }
>>
>>
>> (gcc --version: gcc (Ubuntu 4.3.3-5ubuntu4) 4.3.3)
>                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> That may be too old.  Gcc 4.3.4 revision 148680
> generates:
> 
> .L5:
> 	leaq	(%rsi,%rdx), %rax
> 	movzbl	(%rax), %eax
> 	movb	%al, (%rdi,%rdx)
> 	addq	$1, %rdx
> 	cmpq	$32, %rdx
> 	jne	.L5

4.4.0 20090307 generates truly bizarre code, though:

gcc -m32 -c -S -O3  x.c

c:
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	movl	12(%ebp), %edx
	movl	8(%ebp), %ebx
	movl	%edx, %ecx
	orl	%ebx, %ecx
	andl	$3, %ecx
	leal	4(%ebx), %eax
	je	.L10
.L2:
	xorl	%eax, %eax
	.p2align 4,,7
	.p2align 3
.L5:
	leal	(%edx,%eax), %ecx
	movzbl	(%ecx), %ecx
	movb	%cl, (%ebx,%eax)
	addl	$1, %eax
	cmpl	$32, %eax
	jne	.L5
	popl	%ebx
	popl	%ebp
	ret
	.p2align 4,,7
	.p2align 3
.L10:
	leal	4(%edx), %ecx
	cmpl	%ecx, %ebx
	jbe	.L11
.L7:
	movzbl	(%edx), %ecx
	movl	(%edx), %ecx
	movl	%ecx, (%ebx)
	movzbl	1(%edx), %ecx
	movl	4(%edx), %ecx
	movl	%ecx, 4(%ebx)
	movzbl	2(%edx), %ecx
	movl	8(%edx), %ecx
	movl	%ecx, 4(%eax)
	movzbl	3(%edx), %ecx
	movl	12(%edx), %ecx
	movl	%ecx, 8(%eax)
	movzbl	4(%edx), %ecx
	movl	16(%edx), %ecx
	movl	%ecx, 12(%eax)
	movzbl	5(%edx), %ecx
	movl	20(%edx), %ecx
	movl	%ecx, 16(%eax)
	movzbl	6(%edx), %ebx
	leal	24(%edx), %ecx
	movl	24(%edx), %ebx
	movl	%ebx, 20(%eax)
	movzbl	7(%edx), %edx
	movl	4(%ecx), %edx
	movl	%edx, 24(%eax)
	popl	%ebx
	popl	%ebp
	ret
	.p2align 4,,7
	.p2align 3
.L11:
	cmpl	%edx, %eax
	jae	.L2
	jmp	.L7


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]