Bug 63530 - GCC generates incorrect aligned store on ARM after the loop is unrolled.
Summary: GCC generates incorrect aligned store on ARM after the loop is unrolled.
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 4.9.0
: P3 normal
Target Milestone: 4.9.2
Assignee: Not yet assigned to anyone
URL:
Keywords: wrong-code
Depends on:
Blocks:
 
Reported: 2014-10-14 02:23 UTC by Cong Hou
Modified: 2014-10-28 16:30 UTC (History)
2 users (show)

See Also:
Host:
Target: arm
Build:
Known to work:
Known to fail: 4.9.0, 4.9.1, 5.0
Last reconfirmed:


Attachments
assembly (2.11 KB, text/plain)
2014-10-14 02:23 UTC, Cong Hou
Details

Note You need to log in before you can comment on or make changes to this bug.
Description Cong Hou 2014-10-14 02:23:52 UTC
Created attachment 33710 [details]
assembly

When compile the code shown below using GCC 5.0 for ARM with the following options:

-O2 -ftree-vectorize -march=armv7-a -mfpu=neon -funroll-loops --param=max-completely-peeled-insns=400


// The code:

typedef struct {
  unsigned char map[256];
  int i;
} A, *AP;

void* calloc(int, int);

AP foo(int n)
{
  AP b = calloc(1, sizeof(A));
  int i;
  for (i = n; i < 256; i++)
    b->map[i] = i;
  return b;
}


A instruction

vst1.64	{d0-d1}, [r2:64]

is generated, which is an aligned store with 8 bytes alignment requirement. However this requirement cannot be satisfied as the loop is not peeled for alignment, and the start address on the array is unknown at compile time.

I have attached the generated assembly code here.
Comment 1 Carrot 2014-10-14 22:19:09 UTC
The problem is in 116t.vect pass.

...
 67   vector(16) unsigned char * vectp_b.15;
 68      <var_decl 0x7f1361c15e10 vectp_b.15
 69         type <pointer_type 0x7f1361b56e70 type <vector_type 0x7f1361b56540>
 70             unsigned SI
 71             size <integer_cst 0x7f1361c0beb8 constant 32>
 72             unit size <integer_cst 0x7f1361c0bed0 constant 4>
 73             align 32 symtab 0 alias set -1 canonical type 0x7f1361b56e70>
 74         used unsigned ignored SI file t3.c line 8 col 8
 75         size <integer_cst 0x7f1361c0beb8 constant 32>
 76         unit size <integer_cst 0x7f1361c0bed0 constant 4>
 77         align 32 context <function_decl 0x7f1361b40300 foo>>
 78   vector(16) unsigned char * vectp_b.14;
 79      <var_decl 0x7f1361c15d80 vectp_b.14
 80         type <pointer_type 0x7f1361b56e70 type <vector_type 0x7f1361b56540>
 81             unsigned SI
 82             size <integer_cst 0x7f1361c0beb8 constant 32>
 83             unit size <integer_cst 0x7f1361c0bed0 constant 4>
 84             align 32 symtab 0 alias set -1 canonical type 0x7f1361b56e70>
 85         used unsigned ignored SI file t3.c line 8 col 8
 86         size <integer_cst 0x7f1361c0beb8 constant 32>
 87         unit size <integer_cst 0x7f1361c0bed0 constant 4>
 88         align 32 context <function_decl 0x7f1361b40300 foo>>
...
100   vector(16) unsigned char vect__7.12;
101      <var_decl 0x7f1361c15c60 vect__7.12
102         type <vector_type 0x7f1361b56540 type <integer_type
0x7f1361c0f3f0 unsigned char>
103             unsigned V16QI
104             size <integer_cst 0x7f1361c1d270 constant 128>
105             unit size <integer_cst 0x7f1361c1d288 constant 16>
106             align 64 symtab 0 alias set 0 canonical type
0x7f1361b56540 nunits 16
107             pointer_to_this <pointer_type 0x7f1361b56e70>>
108         used unsigned ignored V16QI file t3.c line 8 col 8
109         size <integer_cst 0x7f1361c1d270 constant 128>
110         unit size <integer_cst 0x7f1361c1d288 constant 16>
111         align 64 context <function_decl 0x7f1361b40300 foo>>
...
223   unsigned char _7;
224   unsigned int _11;
225   unsigned int _15;
226   unsigned int _16;
227   unsigned char _20;
228   unsigned int _23;
229   unsigned int _24;
230   int _32;
231   sizetype _48;
232   unsigned int ivtmp_56;
233   unsigned int ivtmp_57;
234
235   <bb 2>:
236   b_5 = calloc (1, 260);
237   if (n_6(D) <= 255)
238     goto <bb 4>;
239   else
240     goto <bb 3>;
241
242   <bb 3>:
243   return b_5;
244
245   <bb 4>:
246   _11 = (unsigned int) n_6(D);
247   niters.3_2 = 256 - _11;
248   _15 = niters.3_2 + 4294967280;
249   _16 = _15 >> 4;
250   bnd.4_10 = _16 + 1;
251   ratio_mult_vf.5_17 = bnd.4_10 << 4;
252   _23 = (unsigned int) n_6(D);
253   _24 = 255 - _23;
254   if (_24 <= 14)
255     goto <bb 10>;
256   else
257     goto <bb 5>;
258
259   <bb 5>:
260   stmp_var_.7_33 = n_6(D) + 1;
261   stmp_var_.7_34 = stmp_var_.7_33 + 1;
262   stmp_var_.7_35 = stmp_var_.7_34 + 1;
263   vect_cst_.8_36 = {n_6(D), stmp_var_.7_33, stmp_var_.7_34, stmp_var_.7_35};
264   vect_cst_.9_37 = { 16, 16, 16, 16 };
265   vect_cst_.11_40 = { 4, 4, 4, 4 };
266   _48 = (sizetype) n_6(D);
267   vectp_b.15_47 = b_5 + _48;
268   vect_cst_.17_8 = { 1, 1, 1, 1 };
269
270   <bb 6>:
271   # n_12 = PHI <n_6(D)(5), n_9(13)>
272   # vect_vec_iv_.10_38 = PHI <vect_cst_.8_36(5), vect_vec_iv_.10_39(13)>
273   # vectp_b.14_49 = PHI <vectp_b.15_47(5), vectp_b.14_50(13)>
274   # ivtmp_56 = PHI <0(5), ivtmp_57(13)>
275   vect_vec_iv_.10_39 = vect_vec_iv_.10_38 + vect_cst_.9_37;
276   vect_vec_iv_.10_41 = vect_vec_iv_.10_38 + vect_cst_.11_40;
277   vect_vec_iv_.10_42 = vect_vec_iv_.10_41 + vect_cst_.11_40;
278   vect_vec_iv_.10_43 = vect_vec_iv_.10_42 + vect_cst_.11_40;
279   vect__7.13_44 = VEC_PACK_TRUNC_EXPR <vect_vec_iv_.10_38,
vect_vec_iv_.10_41>;
280   vect__7.13_45 = VEC_PACK_TRUNC_EXPR <vect_vec_iv_.10_42,
vect_vec_iv_.10_43>;
281   vect__7.12_46 = VEC_PACK_TRUNC_EXPR <vect__7.13_44, vect__7.13_45>;
282   _7 = (unsigned char) n_12;
283   MEM[(unsigned char[256] *)vectp_b.14_49] = vect__7.12_46;
284   vect_n_9.16_52 = vect_vec_iv_.10_38 + vect_cst_.17_8;
285   vect_n_9.16_53 = vect_vec_iv_.10_41 + vect_cst_.17_8;
286   vect_n_9.16_54 = vect_vec_iv_.10_42 + vect_cst_.17_8;
287   vect_n_9.16_55 = vect_vec_iv_.10_43 + vect_cst_.17_8;
288   n_9 = n_12 + 1;
289   vectp_b.14_50 = vectp_b.14_49 + 16;
290   ivtmp_57 = ivtmp_56 + 1;
291   if (ivtmp_57 < bnd.4_10)
292     goto <bb 13>;
293   else
294     goto <bb 9>;
295
296   <bb 7>:
297   # n_18 = PHI <n_22(8), n_25(10)>
298   _20 = (unsigned char) n_18;
299   b_5->map[n_18] = _20;
300   n_22 = n_18 + 1;
301   if (n_22 <= 255)
302     goto <bb 8>;
303   else
304     goto <bb 11>;
305
306   <bb 8>:
307   goto <bb 7>;
308
309   <bb 9>:
310   # n_26 = PHI <n_9(6)>
311   _32 = (int) ratio_mult_vf.5_17;
312   tmp.6_31 = n_6(D) + _32;
313   if (niters.3_2 == ratio_mult_vf.5_17)
314     goto <bb 12>;
315   else
316     goto <bb 10>;
317
318   <bb 10>:
319   # n_25 = PHI <tmp.6_31(9), n_6(D)(4)>
320   goto <bb 7>;
321
322   <bb 11>:
323
324   <bb 12>:
325   goto <bb 3>;
326
327   <bb 13>:
328   goto <bb 6>;
329
330 }

vectp_b is assigned a value a line 267,

267   vectp_b.15_47 = b_5 + _48;

_48 is a passed in parameter, so the pointer value of vectp_b should be unaligned.
But the declaration of vectp_b is a pointer to type <vector_type 0x7f1361b56540>, that is 64bit aligned at line 106.
So the backend generates a 64bit aligned memory access instruction accordingly.
Comment 2 Cong Hou 2014-10-15 22:35:49 UTC
This issue can also be reproduced on x86_64. Compile the following code with options (assume the file name is t.c): -O2 -ftree-vectorize t.c -fdump-tree-all-alias


#include <stdlib.h>

typedef struct {
  unsigned char map[256];
  int i;
} A, *AP;

AP foo(int n)
{
  AP b = malloc(sizeof(A));
  int i;
  for (i = n; i < 256; i++)
    b->map[i] = i;
  return b;
}

The from t.c.116t.vect we can find such a statement:

  # ALIGN = 8, MISALIGN = 0
  vectp_b.15_47 = b_5 + _48;

Here b_5 is obtained from malloc which can be 8 bytes aligned, but _48 is from input parameter n, and the alignment of vectp_b.15_47 should be unknown instead of 8 here. I suspect the ptr_info_def object of vectp_b.15_47 is just copied from that of b_5, which is incorrect.
Comment 3 Carrot 2014-10-17 00:46:53 UTC
It turns out that when function vect_create_addr_base_for_vector_ref create a new pointer, it doesn't set the correct alignment of pointer value, so the default alignment of the point_to type is used. We should use the alignment information from DR_MISALIGNMENT (dr).

I'm testing following patch

Index: tree-vect-data-refs.c
===================================================================
--- tree-vect-data-refs.c	(revision 216341)
+++ tree-vect-data-refs.c	(working copy)
@@ -3957,8 +3957,12 @@
       && TREE_CODE (addr_base) == SSA_NAME)
     {
       duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
-      if (offset)
+      unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
+      unsigned misalign = DR_MISALIGNMENT (dr);
+      if (offset || (misalign == -1))
 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
+      else if (misalign)
+        set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign);
     }
 
   if (dump_enabled_p ())
Comment 4 carrot 2014-10-22 15:57:31 UTC
Author: carrot
Date: Wed Oct 22 15:56:59 2014
New Revision: 216562

URL: https://gcc.gnu.org/viewcvs?rev=216562&root=gcc&view=rev
Log:
	PR tree-optimization/63530
	tree-vect-data-refs.c (vect_create_addr_base_for_vector_ref): Set
	pointer alignment according to DR_MISALIGNMENT.
	gcc.dg/vect/pr63530.c: New testcase.


Added:
    trunk/gcc/testsuite/gcc.dg/vect/pr63530.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/tree-vect-data-refs.c
Comment 5 carrot 2014-10-28 00:20:45 UTC
Author: carrot
Date: Tue Oct 28 00:20:13 2014
New Revision: 216770

URL: https://gcc.gnu.org/viewcvs?rev=216770&root=gcc&view=rev
Log:
	PR tree-optimization/63530
	tree-vect-data-refs.c (vect_create_addr_base_for_vector_ref): Set
	pointer alignment according to DR_MISALIGNMENT.
	gcc.dg/vect/pr63530.c: New test.


Added:
    branches/gcc-4_9-branch/gcc/testsuite/gcc.dg/vect/pr63530.c
Modified:
    branches/gcc-4_9-branch/gcc/ChangeLog
    branches/gcc-4_9-branch/gcc/testsuite/ChangeLog
    branches/gcc-4_9-branch/gcc/tree-vect-data-refs.c
Comment 6 Ramana Radhakrishnan 2014-10-28 12:51:55 UTC
Fixed then ?
Comment 7 Carrot 2014-10-28 16:30:49 UTC
(In reply to Ramana Radhakrishnan from comment #6)
> Fixed then ?

Yes, thanks for closing it.