[Bug target/65951] [AArch64] Will not vectorize 64bit integer multiplication

Thu Jul 9 11:22:00 GMT 2015

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65951

vekumar at gcc dot gnu.org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |vekumar at gcc dot gnu.org

--- Comment #6 from vekumar at gcc dot gnu.org ---
I found a similar pattern in the SPEC2006 hmmer benchmark when comparing x86_64
(-O3 + -march=bdver3) against AArch64 (-O3 + -mcpu=cortex-a57). x86_64 was able
to vectorize 5 additional loops. Of those 5 loops, two were cost-model related:
AArch64 rejects them because the computed vector cost is too high.

The remaining three loops are of this pattern. One used the constant 104.
The other two used multiplication by 4, which could be converted to
vector shifts.

I made a simple test case and wanted to open a PR, but James pointed me to this
PR, so I am posting it here as a comment instead.

unsigned long int __attribute__ ((aligned (64)))arr[100];
int i;

/* Reference case: shifts every element of the global array arr left by 2
   using an explicit shift operator.  The loop counter is the global int i
   and runs over indices 0..99, i.e. all 100 elements of arr.  In the
   assembly listed further down, this function's loop uses a vector shl.  */
void test_vector_shifts()
{
        for(i=0; i<=99;i++)
        arr[i]=arr[i]<<2;
}

/* Same loop as test_vector_shifts, but the operation is written as a
   multiplication by 4 instead of a left shift by 2.  In the assembly listed
   further down, this function's loop uses a scalar lsl on one 64-bit
   element per iteration rather than a vector shl.  */
void test_vectorshift_via_mul()
{
        for(i=0; i<=99;i++)
        arr[i]=arr[i]*4            ;

}

Assembly
------------
        .cpu cortex-a57+fp+simd+crc
        .file   "test.c"
        .text
        .align  2
        .p2align 4,,15
        .global test_vector_shifts
        .type   test_vector_shifts, %function
// Compiler output for test_vector_shifts (the arr[i] << 2 variant).
// Register use in this listing:
//   x0 = running pointer into arr, later reused for the address of i
//   x1 = arr + 800 (loop end address), later w1 = 100
//   q0 / v0 = 16 bytes of arr being shifted
test_vector_shifts:
        // x0 = address of arr
        adrp    x0, arr
        add     x0, x0, :lo12:arr
        // x1 = address of arr + 800 bytes, used as the loop end pointer
        adrp    x1, arr+800
        add     x1, x1, :lo12:arr+800
        .p2align 2
.L2:
        // Load 16 bytes from [x0], shift both 64-bit lanes left by 2,
        // store them back and post-increment x0 by 16.
        ldr     q0, [x0]
        shl     v0.2d, v0.2d, 2 <==vector shifts 
        str     q0, [x0], 16
        // Repeat until the pointer reaches arr + 800.
        cmp     x0, x1
        bne     .L2
        // Store 100 into the global i.
        adrp    x0, i
        mov     w1, 100
        str     w1, [x0, #:lo12:i]
        ret
        .size   test_vector_shifts, .-test_vector_shifts
        .align  2
       .p2align 4,,15
        .global test_vectorshift_via_mul
        .type   test_vectorshift_via_mul, %function
// Compiler output for test_vectorshift_via_mul (the arr[i] * 4 variant).
// Register use in this listing:
//   x0 = running pointer into arr, later reused for the address of i
//   x1 = one 64-bit element being shifted, later w1 = 100
//   x2 = arr + 800 (loop end address)
test_vectorshift_via_mul:
        // x0 = address of arr
        adrp    x0, arr
        add     x0, x0, :lo12:arr
        // x2 = address of arr + 800 bytes, used as the loop end pointer
        adrp    x2, arr+800
        add     x2, x2, :lo12:arr+800
        .p2align 2
.L6:
        // Load one 64-bit value from [x0], shift it left by 2 in a general
        // register, store it back and post-increment x0 by 8.
        ldr     x1, [x0]
        lsl     x1, x1, 2
        str     x1, [x0], 8 <==scalar shifts 
        // Repeat until the pointer reaches arr + 800.
        cmp     x0, x2
        bne     .L6
        // Store 100 into the global i.
        adrp    x0, i
        mov     w1, 100
        str     w1, [x0, #:lo12:i]
        ret
        .size   test_vectorshift_via_mul, .-test_vectorshift_via_mul