[Bug tree-optimization/42216] [4.5 Regression] rev 154688 regress 464.h264ref peak 20%
rguenth at gcc dot gnu dot org
gcc-bugzilla@gcc.gnu.org
Tue Dec 1 17:03:00 GMT 2009
------- Comment #8 from rguenth at gcc dot gnu dot org 2009-12-01 17:03 -------
The hot loop is mv-search.c:SetupFastFullPelSearch
for (pos = 0; pos < max_pos; pos++)
{
abs_y = offset_y + spiral_search_y[pos];
abs_x = offset_x + spiral_search_x[pos];
if (range_partly_outside)
{
if (abs_y >= 0 && abs_y <= max_height &&
abs_x >= 0 && abs_x <= max_width )
{
PelYline_11 = FastLine16Y_11;
}
else
{
PelYline_11 = UMVLine16Y_11;
}
}
orgptr = orig_blocks;
bindex = 0;
for (blky = 0; blky < 4; blky++)
{
LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0;
for (y = 0; y < 4; y++)
{
refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width);
LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
}
block_sad[bindex++][pos] = LineSadBlk0;
block_sad[bindex++][pos] = LineSadBlk1;
block_sad[bindex++][pos] = LineSadBlk2;
block_sad[bindex++][pos] = LineSadBlk3;
}
}
good assembly of the innermost loop:
.L1422:
leal 1(%rsi), %r9d
movl 64(%rsp), %r8d
movl 68(%rsp), %ecx
movl 52(%rsp), %edx
movq 72(%rsp), %rdi
movl %r9d, 32(%rsp)
call *%rax
movzwl (%rbx), %ecx
movzwl (%rax), %edx
movzwl 2(%rbx), %r10d
movzwl 2(%rax), %r11d
movq byte_abs(%rip), %r9
movzwl 4(%rbx), %esi
subl %ecx, %edx
movzwl 4(%rax), %ecx
movslq %edx, %r8
subl %r10d, %r11d
movzwl 6(%rax), %r10d
addl (%r9,%r8,4), %r14d
movzwl 6(%rbx), %r8d
movslq %r11d, %rdi
addl (%r9,%rdi,4), %r14d
movzwl 8(%rbx), %edi
subl %esi, %ecx
movslq %ecx, %rdx
movzwl 8(%rax), %ecx
subl %r8d, %r10d
addl (%r9,%rdx,4), %r14d
movzwl 10(%rax), %r8d
movzwl 10(%rbx), %edx
movslq %r10d, %r11
addl (%r9,%r11,4), %r14d
movzwl 12(%rbx), %r11d
subl %edi, %ecx
movzwl 12(%rax), %edi
movslq %ecx, %rsi
subl %edx, %r8d
addl (%r9,%rsi,4), %r13d
movzwl 14(%rax), %edx
movzwl 14(%rbx), %esi
movslq %r8d, %r10
subl %r11d, %edi
addl (%r9,%r10,4), %r13d
movzwl 16(%rax), %r11d
movzwl 16(%rbx), %r10d
movslq %edi, %rcx
addl (%r9,%rcx,4), %r13d
movzwl 18(%rbx), %ecx
subl %esi, %edx
movslq %edx, %r8
movzwl 18(%rax), %edx
subl %r10d, %r11d
addl (%r9,%r8,4), %r13d
movzwl 20(%rax), %r10d
movzwl 20(%rbx), %r8d
movslq %r11d, %rdi
addl (%r9,%rdi,4), %ebp
movzwl 22(%rbx), %edi
subl %ecx, %edx
movzwl 22(%rax), %ecx
movslq %edx, %rsi
subl %r8d, %r10d
addl (%r9,%rsi,4), %ebp
movzwl 24(%rax), %r8d
movzwl 24(%rbx), %esi
movslq %r10d, %r11
subl %edi, %ecx
addl (%r9,%r11,4), %ebp
movzwl 26(%rax), %edi
movslq %ecx, %rdx
movzwl 26(%rbx), %r11d
addl (%r9,%rdx,4), %ebp
movzwl 28(%rbx), %edx
subl %esi, %r8d
movslq %r8d, %r10
movzwl 28(%rax), %r8d
movzwl 30(%rax), %eax
addl (%r9,%r10,4), %r12d
movzwl 30(%rbx), %r10d
subl %r11d, %edi
movslq %edi, %rcx
addl (%r9,%rcx,4), %r12d
subl %edx, %r8d
subl %r10d, %eax
movslq %r8d, %rsi
addl (%r9,%rsi,4), %r12d
cltq
addl (%r9,%rax,4), %r12d
addq $32, %rbx
addq $32, %r15
cmpq $128, %r15
je .L1600
movq PelYline_11(%rip), %rax
movl 32(%rsp), %esi
jmp .L1422
bad assembly of the innermost loop:
.L1422:
leal 1(%rsi), %edx
movl 68(%rsp), %ecx
movl 64(%rsp), %r8d
movq 72(%rsp), %rdi
movl %edx, 32(%rsp)
movl 52(%rsp), %edx
call *%rax
movzwl (%rbx), %esi
movzwl (%rax), %ecx
movq byte_abs(%rip), %rdx
subl %esi, %ecx
movzwl 2(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r14d
movzwl 2(%rax), %ecx
subl %esi, %ecx
movzwl 4(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r14d
movzwl 4(%rax), %ecx
subl %esi, %ecx
movzwl 6(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r14d
movzwl 6(%rax), %ecx
subl %esi, %ecx
movzwl 8(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r14d
movzwl 8(%rax), %ecx
subl %esi, %ecx
movzwl 10(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r13d
movzwl 10(%rax), %ecx
subl %esi, %ecx
movzwl 12(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r13d
movzwl 12(%rax), %ecx
subl %esi, %ecx
movzwl 14(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r13d
movzwl 14(%rax), %ecx
subl %esi, %ecx
movzwl 16(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r13d
movzwl 16(%rax), %ecx
subl %esi, %ecx
movzwl 18(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %ebp
movzwl 18(%rax), %ecx
subl %esi, %ecx
movzwl 20(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %ebp
movzwl 20(%rax), %ecx
subl %esi, %ecx
movzwl 22(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %ebp
movzwl 22(%rax), %ecx
subl %esi, %ecx
movzwl 24(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %ebp
movzwl 24(%rax), %ecx
subl %esi, %ecx
movzwl 26(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r12d
movzwl 26(%rax), %ecx
subl %esi, %ecx
movzwl 28(%rbx), %esi
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r12d
movzwl 28(%rax), %ecx
movzwl 30(%rax), %eax
subl %esi, %ecx
movslq %ecx, %rcx
addl (%rdx,%rcx,4), %r12d
movzwl 30(%rbx), %ecx
subl %ecx, %eax
cltq
addl (%rdx,%rax,4), %r12d
addq $32, %rbx
addq $32, %r15
cmpq $128, %r15
je .L1600
movq PelYline_11(%rip), %rax
movl 32(%rsp), %esi
jmp .L1422
The difference seems to really be only in instruction scheduling...
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42216
More information about the Gcc-bugs
mailing list