typedef struct { int a, b, c, d; } my_struct; my_struct *array; my_struct *ptrs[4]; void loop () { for (int i = 0; i < 4; i++) ptrs[i] = &array[i]; } Vectorizes on x86, but not on AArch64. From -fdump-tree-vect-details: test.c:13:3: note: vectorization factor = 4 ... test.c:13:3: note: not vectorized: relevant stmt not supported: _6 = _5 * 16; test.c:13:3: note: bad operation or unsupported loop bound. test.c:13:3: note: ***** Re-trying analysis with vector size 8 ... test.c:13:3: note: not vectorized: no vectype for stmt: ptrs[i_12] = _7; scalar_type: struct my_struct * test.c:13:3: note: bad data references. test.c:11:1: note: vectorized 0 loops in function.
I think the issue is really 65951. >_6 = _5 * 16; That is a size_t multiply which is an unsigned 64bit multiply.
if we use -mabi=ilp32, we can see it is vectorized: loop: adrp x0, array adrp x1, ptrs ldr q1, .LC0 add x3, x1, :lo12:ptrs ldr w2, [x0, #:lo12:array] dup v0.4s, w2 add v2.4s, v0.4s, v1.4s str q2, [x3] ret
Confirmed.
Hmmm. Yes. Well, x * 16 = x << 4, of course. Or, in theory something like VRP could let us see that # i_12 = PHI <i_9(4), 0(2)> # ivtmp_18 = PHI <ivtmp_17(4), 4(2)> _5 = (long unsigned int) i_12; _6 = _5 * 16; _7 = pretmp_11 + _6; ptrs[i_12] = _7; i_9 = i_12 + 1; could be rewritten to something like # i_12 = PHI <i_9(4), 0(2)> # ivtmp_18 = PHI <ivtmp_17(4), 4(2)> _5 = i_12 * 16; _6 = (long unsigned int) _5; _7 = pretmp_11 + _6; ptrs[i_12] = _7; i_9 = i_12 + 1; which would then be vectorizable.
So the above example tends to get fully unrolled, but even on an example with 32 ptrs rather than 4, yes the vectorizer fails because of the multiplication - but the multiplication is gone by the final tree stage, as it's strength reduced down to an add; I believe this -fdump-tree-optimized would be perfectly vectorizable: loop () { unsigned long ivtmp.12; unsigned long ivtmp.10; void * _4; struct my_struct * _7; struct my_struct * pretmp_11; unsigned long _20; <bb 2>: pretmp_11 = array; ivtmp.10_16 = (unsigned long) pretmp_11; ivtmp.12_2 = (unsigned long) &ptrs; _20 = (unsigned long) &MEM[(void *)&ptrs + 256B]; <bb 3>: # ivtmp.10_10 = PHI <ivtmp.10_1(3), ivtmp.10_16(2)> # ivtmp.12_15 = PHI <ivtmp.12_14(3), ivtmp.12_2(2)> _7 = (struct my_struct *) ivtmp.10_10; _4 = (void *) ivtmp.12_15; MEM[base: _4, offset: 0B] = _7; ivtmp.10_1 = ivtmp.10_10 + 16; ivtmp.12_14 = ivtmp.12_15 + 8; if (ivtmp.12_14 != _20) goto <bb 3>; else goto <bb 4>; <bb 4>: return; }
So aarch64 has no DImode vectors? Or just no DImode multiply (but it has a DImode vector shift?). If so this could be handled by a vectorizer pattern transforming the multiply to a shift.
(In reply to Richard Biener from comment #6) > So aarch64 has no DImode vectors? Or just no DImode multiply (but it has a > DImode vector shift?). Yes, the latter.
(In reply to alalaw01 from comment #7) > (In reply to Richard Biener from comment #6) > > So aarch64 has no DImode vectors? Or just no DImode multiply (but it has a > > DImode vector shift?). > > Yes, the latter. Sorry, aarch64 has a DImode multiply, but no V2DImode multiply; and it has V2DImode shifts.
As per Richard's suggestion, I added a pattern in vector recog. This seems to vectorize this PR. However I need some help on the following (1)How do I check the shift amount and also care about type/signedness. There could be different shift amounts allowed in the target architecture when looking for power 2 constants. (2)Should I need to check if target architecture supports vectorized shifts before converting the pattern? ---Patch--- diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index f034635..995c9b2 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -76,6 +76,10 @@ static gimple vect_recog_vector_vector_shift_pattern (vec<gimple> *, tree *, tree *); static gimple vect_recog_divmod_pattern (vec<gimple> *, tree *, tree *); + +static gimple vect_recog_multconst_pattern (vec<gimple> *, + tree *, tree *); + static gimple vect_recog_mixed_size_cond_pattern (vec<gimple> *, tree *, tree *); static gimple vect_recog_bool_pattern (vec<gimple> *, tree *, tree *); @@ -90,6 +94,7 @@ static vect_recog_func_ptr vect_vect_recog_func_ptrs[NUM_PATTERNS] = { vect_recog_rotate_pattern, vect_recog_vector_vector_shift_pattern, vect_recog_divmod_pattern, + vect_recog_multconst_pattern, vect_recog_mixed_size_cond_pattern, vect_recog_bool_pattern}; @@ -2147,6 +2152,90 @@ vect_recog_vector_vector_shift_pattern (vec<gimple> *stmts, return pattern_stmt; } +static gimple +vect_recog_multconst_pattern (vec<gimple> *stmts, + tree *type_in, tree *type_out) +{ + gimple last_stmt = stmts->pop (); + tree oprnd0, oprnd1, vectype, itype, cond; + gimple pattern_stmt, def_stmt; + enum tree_code rhs_code; + stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); + bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo); + optab optab; + tree q; + int dummy_int, prec; + stmt_vec_info def_stmt_vinfo; + + if (!is_gimple_assign (last_stmt)) + return NULL; + + rhs_code = gimple_assign_rhs_code 
(last_stmt); + switch (rhs_code) + { + case MULT_EXPR: + break; + default: + return NULL; + } + + if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)) + return NULL; + + oprnd0 = gimple_assign_rhs1 (last_stmt); + oprnd1 = gimple_assign_rhs2 (last_stmt); + itype = TREE_TYPE (oprnd0); + if (TREE_CODE (oprnd0) != SSA_NAME + || TREE_CODE (oprnd1) != INTEGER_CST + || TREE_CODE (itype) != INTEGER_TYPE + || TYPE_PRECISION (itype) != GET_MODE_PRECISION (TYPE_MODE (itype))) + return NULL; + vectype = get_vectype_for_scalar_type (itype); + if (vectype == NULL_TREE) + return NULL; + + /* If the target can handle vectorized multiplication natively, + don't attempt to optimize this. */ + optab = optab_for_tree_code (rhs_code, vectype, optab_default); + if (optab != unknown_optab) + { + machine_mode vec_mode = TYPE_MODE (vectype); + int icode = (int) optab_handler (optab, vec_mode); + if (icode != CODE_FOR_nothing) + return NULL; + } + + prec = TYPE_PRECISION (itype); + if (integer_pow2p (oprnd1)) + { + /*if (TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1) + return NULL; + */ + + /* Pattern detected. 
*/ + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vect_recog_multconst_pattern: detected:\n"); + + tree shift; + + shift = build_int_cst (itype, tree_log2 (oprnd1)); + pattern_stmt + = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), + LSHIFT_EXPR, oprnd0, shift); + if (dump_enabled_p ()) + dump_gimple_stmt_loc (MSG_NOTE, vect_location, TDF_SLIM, pattern_stmt, + 0); + + stmts->safe_push (last_stmt); + + *type_in = vectype; + *type_out = vectype; + return pattern_stmt; + } + return NULL; +} /* Detect a signed division by a constant that wouldn't be otherwise vectorized: diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 48c1f8d..833fe4b 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1131,7 +1131,7 @@ extern void vect_slp_transform_bb (basic_block); Additional pattern recognition functions can (and will) be added in the future. */ typedef gimple (* vect_recog_func_ptr) (vec<gimple> *, tree *, tree *); -#define NUM_PATTERNS 12 +#define NUM_PATTERNS 13 void vect_pattern_recog (loop_vec_info, bb_vec_info); /* In tree-vectorizer.c. */ ---Patch ---
With the patch I get loop: adrp x0, array ldr q1, .LC0 ldr q2, .LC1 adrp x1, ptrs add x1, x1, :lo12:ptrs ldr x0, [x0, #:lo12:array] dup v0.2d, x0 add v1.2d, v0.2d, v1.2d <== vectorized add v0.2d, v0.2d, v2.2d <== vectorized str q1, [x1] str q0, [x1, 16] ret .size loop, .-loop .align 4 .LC0: .xword 0 .xword 16 .align 4 .LC1: .xword 32 .xword 48
This is getting fixed after patch https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=226675 Vectorize mult expressions with power 2 constants via shift, for targets that have no vector multiplication support. 2015-08-06 Venkataramanan Kumar <Venkataramanan.kumar@amd.com> * tree-vect-patterns.c (vect_recog_mult_pattern): New function for vectorizing multiplication patterns. * tree-vectorizer.h: Adjust the number of patterns. 2015-08-06 Venkataramanan Kumar <Venkataramanan.kumar@amd.com> * gcc.dg/vect/vect-mult-pattern-1.c: New test. * gcc.dg/vect/vect-mult-pattern-2.c: New test.