[Bug other/47167] Performance regression in numerical code

Wed Jan 5 17:38:00 GMT 2011

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47167

--- Comment #2 from Uros Bizjak <ubizjak at gmail dot com> 2011-01-05 17:31:20 UTC ---
The only difference in the hot loop is the use of two registers in the store addresses, i.e. (%rsi,%rcx) in 4.6 versus (%rcx) in 4.5.1:

4.5.1:

.L142:
    movapd    %xmm0, (%rcx)
    mulpd    %xmm6, %xmm2
    addq    $32, %rbx
    movapd    %xmm1, %xmm6
    mulpd    %xmm0, %xmm6
    movsd    (%rax), %xmm1
    movsd    8(%rax), %xmm3
    unpcklpd    %xmm1, %xmm1
    subpd    %xmm2, %xmm6
    unpcklpd    %xmm3, %xmm3
    mulpd    %xmm9, %xmm1
    mulpd    %xmm0, %xmm3
    movapd    %xmm6, 16(%rcx)
    addq    $32, %rcx
    movapd    %xmm1, %xmm0
    movsd    16(%rax), %xmm1
    mulpd    %xmm6, %xmm0
    unpcklpd    %xmm1, %xmm1
    movsd    24(%rax), %xmm2
    addq    $32, %rax
    cmpq    %rsi, %rbx
    unpcklpd    %xmm2, %xmm2
    subpd    %xmm3, %xmm0
    mulpd    %xmm9, %xmm1
    jne    .L142

4.6:

.L167:
    movapd    %xmm0, %xmm10
.L143:
    mulpd    %xmm2, %xmm6
    movapd    %xmm3, %xmm2
    movapd    %xmm10, (%rsi,%rcx)
    mulpd    %xmm10, %xmm2
    movsd    (%rdx), %xmm0
    movsd    8(%rdx), %xmm1
    subpd    %xmm6, %xmm2
    unpcklpd    %xmm0, %xmm0
    unpcklpd    %xmm1, %xmm1
    mulpd    %xmm11, %xmm0
    movapd    %xmm2, 16(%rsi,%rcx)
    mulpd    %xmm10, %xmm1
    addq    $32, %rcx
    mulpd    %xmm2, %xmm0
    movsd    16(%rdx), %xmm3
    movsd    24(%rdx), %xmm6
    addq    $32, %rdx
    cmpq    %rdi, %rcx
    unpcklpd    %xmm3, %xmm3
    unpcklpd    %xmm6, %xmm6
    subpd    %xmm1, %xmm0
    mulpd    %xmm11, %xmm3
    jne    .L167

Given the comment above ix86_address_cost:

/* Return cost of the memory address x.
   For i386, it is better to use a complex address than let gcc copy
   the address into a reg and make a new pseudo.  But not if the address
   requires to two regs - that would mean more pseudos with longer
   lifetimes.  */

this could be the reason for the slowdown.