This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [patch, fortran] PR 37131


Hello Mikael and Dominique,

thanks for your helpful comments!

> To sum um, tests missing for the following:
> 	array(4,:,:)
> 	array(3:5,:)
> 	array(3:10:2,:)
> 	array(:,:)%comp
> with both lbound == 1 and lbound != 1.
> One test with lhs-rhs dependency would be good as well.

I have included those (and fixed the bugs that appeared).  This
is done in inline_matmul_1.f90 and in inline_matmul_5.f90.


> 
>> Index: fortran/array.c
>> ===================================================================
>> --- fortran/array.c	(Revision 222218)
>> +++ fortran/array.c	(Arbeitskopie)
>> @@ -338,6 +338,9 @@ gfc_resolve_array_spec (gfc_array_spec *as, int ch
>>    if (as == NULL)
>>      return true;
>>  
>> +  if (as->resolved)
>> +    return true;
>> +
> Why this?

Because you get regressions otherwise.  Not resolving an array spec
twice should do no harm, and resolving it twice does so - I hit the
error message in check_restricted.  I'm not sure what is wrong, maybe
PR 23466 was not fully fixed, but this works.

>
>> -static gfc_expr *create_var (gfc_expr *);
>> +static gfc_expr *create_var (gfc_expr *, const char *vname=NULL);
>> +static int optimize_matmul_assign (gfc_code **, int *, void *);
> The function doesn't really "optimize", so name it inline_matmul_assign
> instead.
> Same for the comments about "optimizing MATMUL".

Done.

> 
>> @@ -524,29 +542,11 @@ constant_string_length (gfc_expr *e)
>>  
>>  }
>>  
>> -/* Returns a new expression (a variable) to be used in place of the old one,
>> -   with an assignment statement before the current statement to set
>> -   the value of the variable. Creates a new BLOCK for the statement if
>> -   that hasn't already been done and puts the statement, plus the
>> -   newly created variables, in that block.  Special cases:  If the
>> -   expression is constant or a temporary which has already
>> -   been created, just copy it.  */
>> -
>> -static gfc_expr*
>> -create_var (gfc_expr * e)
> Keep a comment here.

Still exists, further down.

>> +static gfc_namespace*
>> +insert_block ()
>>  {
>> -  char name[GFC_MAX_SYMBOL_LEN +1];
>> -  static int num = 1;
>> -  gfc_symtree *symtree;
>> -  gfc_symbol *symbol;
>> -  gfc_expr *result;
>> -  gfc_code *n;
>>    gfc_namespace *ns;
>> -  int i;
>>  
>> -  if (e->expr_type == EXPR_CONSTANT || is_fe_temp (e))
>> -    return gfc_copy_expr (e);
>> -
>>    /* If the block hasn't already been created, do so.  */
>>    if (inserted_block == NULL)
>>      {
> 
>> @@ -1939,7 +1977,1049 @@ doloop_warn (gfc_namespace *ns)
>>    gfc_code_walker (&ns->code, doloop_code, do_function, NULL);
>>  }
>>  
>> +/* This selction deals with inlining calls to MATMUL.  */
> section
>>  
>> +/* Auxiliary function to build and simplify an array inquiry function.
>> +   dim is zero-based.  */
>> +
>> +static gfc_expr *
>> +get_array_inq_function (gfc_expr *e, int dim, gfc_isym_id id)
> It's better if the id is the first argument, so that the function id and
> its arguments come in their natural order.

Changed.

> [...]
> 
>> +/* Builds a logical expression.  */
>> +
>> +static gfc_expr*
>> +build_logical_expr (gfc_expr *e1, gfc_expr *e2, gfc_intrinsic_op op)
> Same here, op first.

Also changed.

> [...]
> 
>> +
>> +/* Return an operation of one two gfc_expr (one if e2 is NULL). This assumes
>> +   compatible typespecs.  */
>> +
>> +static gfc_expr *
>> +get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
> Here it's good already. :-)

:-)

> [...]
> 
>> +/* Insert code to issue a runtime error if the expressions are not equal.  */
>> +
>> +static gfc_code *
>> +runtime_error_ne (gfc_expr *e1, gfc_expr *e2, const char *msg)
>> +{
>> +  gfc_expr *cond;
>> +  gfc_code *if_1, *if_2;
>> +  gfc_code *c;
>> +  // const char *name;
> Any reason...
> 
>> +  gfc_actual_arglist *a1, *a2, *a3;
>> +
>> +  gcc_assert (e1->where.lb);
>> +  /* Build the call to runtime_error.  */
>> +  c = XCNEW (gfc_code);
>> +  c->op = EXEC_CALL;
>> +  c->loc = e1->where;
>> +  // name = gfc_get_string (PREFIX ("runtime_error"));
>> +  // c->resolved_sym = gfc_get_intrinsic_sub_symbol (name);
> ... to keep these?

Removed.


>> +  while (ref)
>> +    {
>> +      if (ref->type == REF_ARRAY && ref->u.ar.type != AR_ELEMENT)
>> +	break;
>> +
>> +      ref = ref->next;
>> +
>> +    }
>> +  ar = &ref->u.ar;
> You can probably use gfc_find_array_ref here.

Changed.  There are a few other places that could also benefit
from gfc_find_array_ref (now I know it exists :-)

> [...]
> 
> 
>> +
>> +/* Function to return a scalarized expression. It is assumed that indices are
>> + zero based to make generation of DO loops easier.  A zero as index will
>> + access the first element along a dimension.  Single element references will
>> + be skipped.  A NULL as an expression will be replaced by a full reference.
>> + This assumes that the index loops have gfc_index_integer_kind, and that all
>> + references have been frozen.  */
>> +
>> +static gfc_expr*
>> +scalarized_expr (gfc_expr *e_in, gfc_expr **index, int count_index)
> I suggest using a variable argument list for index (no strong opinion,
> your decision).

I wanted to keep the NULL around for signallign a reference.
It is in the code, but currently unused.



>> +	      if (ar->start[i])
>> +		{
>> +		  lbound = gfc_copy_expr (ar->start[i]);
>> +		  if (lbound->ts.kind != gfc_index_integer_kind)
>> +		    {
>> +		      gfc_typespec ts;
>> +		      gfc_clear_ts (&ts);
>> +		      ts.type = BT_INTEGER;
>> +		      ts.kind = gfc_index_integer_kind;
>> +		      gfc_convert_type (lbound, &ts, 2);
>> +
>> +		    }
>> +		}
>> +	      else
>> +		lbound = get_array_inq_function (e_in, i+1, GFC_ISYM_LBOUND);
> I think you are assuming that e_in is a full array ref without
> subreference.  What if e_in is foo(3, :, :) or bar(:,:)%comp (think
> about non-default lbound)?

It was a bug, fixed (and put into a test case).

> 
>> +
>> +	      ar->dimen_type[i] = DIMEN_ELEMENT;
>> +	      ar->start[i] = get_operand (INTRINSIC_PLUS, nindex, lbound);
> Use gfc_replace_expr here (or gfc_free_expr) ...

Done.

>> +	      ar->end[i] = NULL;
>> +	      ar->stride[i] = NULL;
> ... and gfc_free_expr here.



>> +	      gfc_simplify_expr (ar->start[i], 0);
>> +	    }
>> +	  else if (was_fullref)
>> +	    {
>> +	      ar->dimen_type[i] = DIMEN_RANGE;
>> +	      ar->start[i] = NULL;
>> +	      ar->end[i] = NULL;
>> +	      ar->stride[i] = NULL;
>> +	    }
> Is this reachable ?

Not in the current incarnation, I wanted to keep it around for
a full segment later.  I can also remove this.


> 
>> +
>> +  current_code = &ns->code;
>> +
>> +  /* Freeze the references, keeping track of how many temporary variables were
>> +     created.  */
>> +  n_vars = 0;
>> +  freeze_references (matrix_a);
>> +  freeze_references (matrix_b);
>> +  freeze_references (expr1);
>> +
>> +  if (n_vars == 0)
>> +    next_code_point = current_code;
>> +  else
>> +    {
>> +      next_code_point = &ns->code;
>> +      for (i=0; i<n_vars; i++)
>> +	next_code_point = &(*next_code_point)->next;
>> +    }
> I'm not fond of this n_vars stuff.
> Is next_code_point different from current_code->next?
> Can freeze_references take next_code_point as argument so that it can
> update it directly instead, maybe?

Neither am I, but I spent a fair amount of time getting this to work
and I don't really want to do more of this.  I would have to revisit
the logic of create_var here, which could come later, as a cleanup.

> 
> [...]
> 
>> Index: fortran/options.c
>> ===================================================================
>> --- fortran/options.c	(Revision 222218)
>> +++ fortran/options.c	(Arbeitskopie)
>> @@ -378,6 +378,11 @@ gfc_post_options (const char **pfilename)
>>    if (!flag_automatic)
>>      flag_max_stack_var_size = 0;
>>    
>> +  /* If we call BLAS directly, only inline up to the BLAS limit.  */
> This deserves a note in the documentation.
> The new flag in general deserves documentation.

I have added something, rewording suggestions welcome.

>> +
>> +  if (flag_external_blas && flag_inline_matmul_limit < 0)
>> +    flag_inline_matmul_limit = flag_blas_matmul_limit;
> Hum, shouldn't we do something for flag_inline_matmul_limit > 0 as well?


This is done automatically, by the options machinery.  That is cool :-)

>> +
>>    /* Optimization implies front end optimization, unless the user
>>       specified it directly.  */
>>  
> 
> 
>> Index: fortran/simplify.c
>> ===================================================================
>> --- fortran/simplify.c	(Revision 222218)
>> +++ fortran/simplify.c	(Arbeitskopie)

> Can you submit that part separately with a testcase?

Can do, but later.

Regarding Dominique's remark: The test for the matrix sizes to be
inlined should be correct now, I was hanging the inlinding code off
the wrong branch of the IF statement.

OK, here is the new patch.

Any more holes to poke into it?

2015-04-19  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/37131
	* gfortran.h (gfc_isym_id):  Add GFC_ISYM_FE_RUNTIME_ERROR.
	(gfc_array_spec):  Add resolved flag.
	(gfc_intrinsic_sym):  Add vararg.
	* intrinsic.h (gfc_check_fe_runtime_error):  Add prototype.
	(gfc_resolve_re_runtime_error):  Likewise.
	Add prototype for gfc_is_reallocatable_lhs.
	* array.c (gfc_resolve_array_spec):  Do not resolve if it has
	already been resolved.
	* trans-array.h (gfc_is_reallocatable_lhs):  Remove prototype.
	* check.c (gfc_check_fe_runtime_error):  New function.
	* intrinsic.c (add_sym_1p):  New function.
	(make_vararg):  New function.
	(add_subroutines):  Add fe_runtime_error.
	(gfc_intrinsic_sub_interface): Skip sorting for variable number
	of arguments.
	* iresolve.c (gfc_resolve_fe_runtime_error):  New function.
	* lang.opt (inline-matmul-limit):  New option.
	(gfc_post_options): If no inline matmul limit has been set and
	BLAS is called externally, use the BLAS limit.
	* simplify.c (simplify_bound): Get constant lower bounds from
	array spec for assumed shape arrays.
	* frontend-passes.c:  Include intrinsic.h.
	(var_num):  New global counter for naming temporary variablbles.
	(matrix_case):  Enum for differentiating the different matmul
	cases.
	(realloc_string_callback):  Add "trim" to the variable name.
	(create_var): Add optional argument vname as part of the name.
	Use var_num. Set dimension of result correctly. Split off block
	creation into
	(insert_block): New function.
	(cfe_expr_0): Use "fcn" as part of temporary variable name.
	(optimize_namesapce): Also set gfc_current_ns. Call
	inline_matmul_assign.
	(combine_array_constructor):  Use "constr" as part of
	temporary name.
	(get_array_inq_function):  New function.
	(build_logical_expr):  New function.
	(get_operand):  new function.
	(inline_limit_check):  New function.
	(runtime_error_ne):  New function.
	(matmul_lhs_realloc):  New function.
	(is_functino_or_op):  New function.
	(has_function_or_op):  New function.
	(freeze_expr):  New function.
	(freeze_references):  New function.
	(convert_to_index_kind):  New function.
	(create_do_loop):  New function.
	(get_size_m1):  New function.
	(scalarized_expr):  New function.
	(inline_matmul_assign):  New function.
	* simplify.c (simplify_bound):  Simplify the case of the
	lower bound of an assumed-shape argument.

2015-04-19  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/37131
	* gfortran.dg/dependency_26.f90: Add option to suppress inlining
	matmul.
	* gfortran.dg/function_optimize_1.f90:  Likewise.
	* gfortran.dg/function_optimize_2.f90:  Likewise.
	* gfortran.dg/function_optimize_5.f90:  Likewise.
	* gfortran.dg/function_optimize_7.f90:  Likewise.
	* gfortran.dg/inline_matmul_1.f90:  New test.
	* gfortran.dg/inline_matmul_2.f90:  New test.
	* gfortran.dg/inline_matmul_3.f90:  New test.
	* gfortran.dg/inline_matmul_4.f90:  New test.
	* gfortran.dg/inline_matmul_5.f90:  New test.

Index: array.c
===================================================================
--- array.c	(Revision 222218)
+++ array.c	(Arbeitskopie)
@@ -338,6 +338,9 @@ gfc_resolve_array_spec (gfc_array_spec *as, int ch
   if (as == NULL)
     return true;
 
+  if (as->resolved)
+    return true;
+
   for (i = 0; i < as->rank + as->corank; i++)
     {
       e = as->lower[i];
@@ -364,6 +367,8 @@ gfc_resolve_array_spec (gfc_array_spec *as, int ch
 	}
     }
 
+  as->resolved = true;
+
   return true;
 }
 
Index: check.c
===================================================================
--- check.c	(Revision 222218)
+++ check.c	(Arbeitskopie)
@@ -5527,7 +5527,37 @@ gfc_check_random_seed (gfc_expr *size, gfc_expr *p
   return true;
 }
 
+bool
+gfc_check_fe_runtime_error (gfc_actual_arglist *a)
+{
+  gfc_expr *e;
+  int len, i;
+  int num_percent, nargs;
 
+  e = a->expr;
+  if (e->expr_type != EXPR_CONSTANT)
+    return true;
+
+  len = e->value.character.length;
+  if (e->value.character.string[len-1] != '\0')
+    gfc_internal_error ("fe_runtime_error string must be null terminated");
+
+  num_percent = 0;
+  for (i=0; i<len-1; i++)
+    if (e->value.character.string[i] == '%')
+      num_percent ++;
+
+  nargs = 0;
+  for (; a; a = a->next)
+    nargs ++;
+
+  if (nargs -1 != num_percent)
+    gfc_internal_error ("fe_runtime_error: Wrong number of arguments (%d instead of %d)",
+			nargs, num_percent++);
+
+  return true;
+}
+
 bool
 gfc_check_second_sub (gfc_expr *time)
 {
Index: frontend-passes.c
===================================================================
--- frontend-passes.c	(Revision 222218)
+++ frontend-passes.c	(Arbeitskopie)
@@ -27,6 +27,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "dependency.h"
 #include "constructor.h"
 #include "opts.h"
+#include "intrinsic.h"
 
 /* Forward declarations.  */
 
@@ -43,7 +44,11 @@ static void doloop_warn (gfc_namespace *);
 static void optimize_reduction (gfc_namespace *);
 static int callback_reduction (gfc_expr **, int *, void *);
 static void realloc_strings (gfc_namespace *);
-static gfc_expr *create_var (gfc_expr *);
+static gfc_expr *create_var (gfc_expr *, const char *vname=NULL);
+static int inline_matmul_assign (gfc_code **, int *, void *);
+static gfc_code * create_do_loop (gfc_expr *, gfc_expr *, gfc_expr *,
+				  locus *, gfc_namespace *, 
+				  char *vname=NULL);
 
 /* How deep we are inside an argument list.  */
 
@@ -93,6 +98,19 @@ struct my_struct *evec;
 
 static bool in_assoc_list;
 
+/* Counter for temporary variables.  */
+
+static int var_num = 1;
+
+/* What sort of matrix we are dealing with when inlining MATMUL.  */
+
+enum matrix_case { none=0, A2B2, A2B1, A1B2 };
+
+/* Keep track of the number of expressions we have inserted so far 
+   using create_var.  */
+
+int n_vars;
+
 /* Entry point - run all passes for a namespace.  */
 
 void
@@ -157,7 +175,7 @@ realloc_string_callback (gfc_code **c, int *walk_s
     return 0;
   
   current_code = c;
-  n = create_var (expr2);
+  n = create_var (expr2, "trim");
   co->expr2 = n;
   return 0;
 }
@@ -524,29 +542,11 @@ constant_string_length (gfc_expr *e)
 
 }
 
-/* Returns a new expression (a variable) to be used in place of the old one,
-   with an assignment statement before the current statement to set
-   the value of the variable. Creates a new BLOCK for the statement if
-   that hasn't already been done and puts the statement, plus the
-   newly created variables, in that block.  Special cases:  If the
-   expression is constant or a temporary which has already
-   been created, just copy it.  */
-
-static gfc_expr*
-create_var (gfc_expr * e)
+static gfc_namespace*
+insert_block ()
 {
-  char name[GFC_MAX_SYMBOL_LEN +1];
-  static int num = 1;
-  gfc_symtree *symtree;
-  gfc_symbol *symbol;
-  gfc_expr *result;
-  gfc_code *n;
   gfc_namespace *ns;
-  int i;
 
-  if (e->expr_type == EXPR_CONSTANT || is_fe_temp (e))
-    return gfc_copy_expr (e);
-
   /* If the block hasn't already been created, do so.  */
   if (inserted_block == NULL)
     {
@@ -578,7 +578,37 @@ constant_string_length (gfc_expr *e)
   else
     ns = inserted_block->ext.block.ns;
 
-  sprintf(name, "__var_%d",num++);
+  return ns;
+}
+
+/* Returns a new expression (a variable) to be used in place of the old one,
+   with an optional assignment statement before the current statement to set
+   the value of the variable. Creates a new BLOCK for the statement if that
+   hasn't already been done and puts the statement, plus the newly created
+   variables, in that block.  Special cases: If the expression is constant or
+   a temporary which has already been created, just copy it.  */
+
+static gfc_expr*
+create_var (gfc_expr * e, const char *vname)
+{
+  char name[GFC_MAX_SYMBOL_LEN +1];
+  gfc_symtree *symtree;
+  gfc_symbol *symbol;
+  gfc_expr *result;
+  gfc_code *n;
+  gfc_namespace *ns;
+  int i;
+
+  if (e->expr_type == EXPR_CONSTANT || is_fe_temp (e))
+    return gfc_copy_expr (e);
+
+  ns = insert_block ();
+
+  if (vname)
+    snprintf (name, GFC_MAX_SYMBOL_LEN, "__var_%d_%s", var_num++, vname);
+  else
+    snprintf (name, GFC_MAX_SYMBOL_LEN, "__var_%d", var_num++);
+
   if (gfc_get_sym_tree (name, ns, &symtree, false) != 0)
     gcc_unreachable ();
 
@@ -651,6 +681,7 @@ constant_string_length (gfc_expr *e)
       result->ref->type = REF_ARRAY;
       result->ref->u.ar.type = AR_FULL;
       result->ref->u.ar.where = e->where;
+      result->ref->u.ar.dimen = e->rank;
       result->ref->u.ar.as = symbol->ts.type == BT_CLASS
 			     ? CLASS_DATA (symbol)->as : symbol->as;
       if (warn_array_temporaries)
@@ -666,6 +697,7 @@ constant_string_length (gfc_expr *e)
   n->expr1 = gfc_copy_expr (result);
   n->expr2 = e;
   *changed_statement = n;
+  n_vars ++;
 
   return result;
 }
@@ -724,7 +756,7 @@ cfe_expr_0 (gfc_expr **e, int *walk_subtrees,
 	  if (gfc_dep_compare_functions (*ei, *ej, true) == 0)
 	    {
 	      if (newvar == NULL)
-		newvar = create_var (*ei);
+		newvar = create_var (*ei, "fcn");
 
 	      if (warn_function_elimination)
 		do_warn_function_elimination (*ej);
@@ -931,13 +963,15 @@ convert_elseif (gfc_code **c, int *walk_subtrees A
   /*  Don't walk subtrees.  */
   return 0;
 }
+
 /* Optimize a namespace, including all contained namespaces.  */
 
 static void
 optimize_namespace (gfc_namespace *ns)
 {
-
+  gfc_namespace *saved_ns = gfc_current_ns;
   current_ns = ns;
+  gfc_current_ns = ns;
   forall_level = 0;
   iterator_level = 0;
   in_assoc_list = false;
@@ -947,6 +981,9 @@ optimize_namespace (gfc_namespace *ns)
   gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL);
   gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL);
   gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL);
+  if (flag_inline_matmul_limit != 0)
+    gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
+		     NULL);
 
   /* BLOCKs are handled in the expression walker below.  */
   for (ns = ns->contained; ns; ns = ns->sibling)
@@ -954,6 +991,7 @@ optimize_namespace (gfc_namespace *ns)
       if (ns->code == NULL || ns->code->op != EXEC_BLOCK)
 	optimize_namespace (ns);
     }
+  gfc_current_ns = saved_ns;
 }
 
 /* Handle dependencies for allocatable strings which potentially redefine
@@ -968,10 +1006,7 @@ realloc_strings (gfc_namespace *ns)
   for (ns = ns->contained; ns; ns = ns->sibling)
     {
       if (ns->code == NULL || ns->code->op != EXEC_BLOCK)
-	{
-	  // current_ns = ns;
-	  realloc_strings (ns);
-	}
+	realloc_strings (ns);
     }
 
 }
@@ -1222,7 +1257,7 @@ combine_array_constructor (gfc_expr *e)
   if (op2->ts.type == BT_CHARACTER)
     return false;
 
-  scalar = create_var (gfc_copy_expr (op2));
+  scalar = create_var (gfc_copy_expr (op2), "constr");
 
   oldbase = op1->value.constructor;
   newbase = NULL;
@@ -1939,7 +1974,1049 @@ doloop_warn (gfc_namespace *ns)
   gfc_code_walker (&ns->code, doloop_code, do_function, NULL);
 }
 
+/* This selction deals with inlining calls to MATMUL.  */
 
+/* Auxiliary function to build and simplify an array inquiry function.
+   dim is zero-based.  */
+
+static gfc_expr *
+get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim)
+{
+  gfc_expr *fcn;
+  gfc_expr *dim_arg, *kind;
+  const char *name;
+  gfc_expr *ec;
+
+  switch (id)
+    {
+    case GFC_ISYM_LBOUND:
+      name = "_gfortran_lbound";
+      break;
+
+    case GFC_ISYM_UBOUND:
+      name = "_gfortran_ubound";
+      break;
+
+    case GFC_ISYM_SIZE:
+      name = "_gfortran_size";
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  dim_arg =  gfc_get_int_expr (gfc_default_integer_kind, &e->where, dim);
+  kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
+			   gfc_index_integer_kind);
+
+  ec = gfc_copy_expr (e);
+  fcn = gfc_build_intrinsic_call (current_ns, id, name, e->where, 3,
+				  ec, dim_arg,  kind);
+  gfc_simplify_expr (fcn, 0);
+  return fcn;
+}
+
+/* Builds a logical expression.  */
+
+static gfc_expr*
+build_logical_expr (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
+{
+  gfc_typespec ts;
+  gfc_expr *res;
+
+  ts.type = BT_LOGICAL;
+  ts.kind = gfc_default_logical_kind;
+  res = gfc_get_expr ();
+  res->where = e1->where;
+  res->expr_type = EXPR_OP;
+  res->value.op.op = op;
+  res->value.op.op1 = e1;
+  res->value.op.op2 = e2;
+  res->ts = ts;
+
+  return res;
+}
+
+
+/* Return an operation of one two gfc_expr (one if e2 is NULL). This assumes
+   compatible typespecs.  */
+
+static gfc_expr *
+get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
+{
+  gfc_expr *res;
+
+  res = gfc_get_expr ();
+  res->ts = e1->ts;
+  res->where = e1->where;
+  res->expr_type = EXPR_OP;
+  res->value.op.op = op;
+  res->value.op.op1 = e1;
+  res->value.op.op2 = e2;
+  gfc_simplify_expr (res, 0);
+  return res;
+}
+
+/* Generate the IF statement for a runtime check if we want to do inlining or
+   not - putting in the code for both branches and putting it into the syntax
+   tree is the caller's responsibility.  For fixed array sizes, this should be
+   removed by DCE. Only called for rank-two matrices A and B.  */
+
+static gfc_code *
+inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case)
+{
+  gfc_expr *inline_limit;
+  gfc_code *if_1, *if_2, *else_2;
+  gfc_expr *b2, *a2, *a1, *m1, *m2;
+  gfc_typespec ts;
+  gfc_expr *cond;
+
+  gcc_assert (m_case == A2B2);
+
+  /* Calculation is done in real to avoid integer overflow.  */
+
+  inline_limit = gfc_get_constant_expr (BT_REAL, gfc_default_real_kind,
+					&a->where);
+  mpfr_set_si (inline_limit->value.real, flag_inline_matmul_limit,
+	       GFC_RND_MODE);
+  mpfr_pow_ui (inline_limit->value.real, inline_limit->value.real, 3,
+	       GFC_RND_MODE);
+
+  a1 = get_array_inq_function (GFC_ISYM_SIZE, a, 1);
+  a2 = get_array_inq_function (GFC_ISYM_SIZE, a, 2);
+  b2 = get_array_inq_function (GFC_ISYM_SIZE, b, 2);
+
+  gfc_clear_ts (&ts);
+  ts.type = BT_REAL;
+  ts.kind = gfc_default_real_kind;
+  gfc_convert_type_warn (a1, &ts, 2, 0);
+  gfc_convert_type_warn (a2, &ts, 2, 0);
+  gfc_convert_type_warn (b2, &ts, 2, 0);
+
+  m1 = get_operand (INTRINSIC_TIMES, a1, a2);
+  m2 = get_operand (INTRINSIC_TIMES, m1, b2);
+
+  cond = build_logical_expr (INTRINSIC_LE, m2, inline_limit);
+  gfc_simplify_expr (cond, 0);
+
+  else_2 = XCNEW (gfc_code);
+  else_2->op = EXEC_IF;
+  else_2->loc = a->where;
+
+  if_2 = XCNEW (gfc_code);
+  if_2->op = EXEC_IF;
+  if_2->expr1 = cond;
+  if_2->loc = a->where;
+  if_2->block = else_2;
+
+  if_1 = XCNEW (gfc_code);
+  if_1->op = EXEC_IF;
+  if_1->block = if_2;
+  if_1->loc = a->where;
+
+  return if_1;
+}
+
+
+/* Insert code to issue a runtime error if the expressions are not equal.  */
+
+static gfc_code *
+runtime_error_ne (gfc_expr *e1, gfc_expr *e2, const char *msg)
+{
+  gfc_expr *cond;
+  gfc_code *if_1, *if_2;
+  gfc_code *c;
+  gfc_actual_arglist *a1, *a2, *a3;
+
+  gcc_assert (e1->where.lb);
+  /* Build the call to runtime_error.  */
+  c = XCNEW (gfc_code);
+  c->op = EXEC_CALL;
+  c->loc = e1->where;
+
+  /* Get a null-terminated message string.  */
+
+  a1 = gfc_get_actual_arglist ();
+  a1->expr = gfc_get_character_expr (gfc_default_character_kind, &e1->where,
+				     msg, strlen(msg)+1);
+  c->ext.actual = a1;
+
+  /* Pass the value of the first expression.  */
+  a2 = gfc_get_actual_arglist ();
+  a2->expr = gfc_copy_expr (e1);
+  a1->next = a2;
+
+  /* Pass the value of the second expression.  */
+  a3 = gfc_get_actual_arglist ();
+  a3->expr = gfc_copy_expr (e2);
+  a2->next = a3;
+
+  gfc_check_fe_runtime_error (c->ext.actual);
+  gfc_resolve_fe_runtime_error (c);
+
+  if_2 = XCNEW (gfc_code);
+  if_2->op = EXEC_IF;
+  if_2->loc = e1->where;
+  if_2->next = c;
+
+  if_1 = XCNEW (gfc_code);
+  if_1->op = EXEC_IF;
+  if_1->block = if_2;
+  if_1->loc = e1->where;
+
+  cond = build_logical_expr (INTRINSIC_NE, e1, e2);
+  gfc_simplify_expr (cond, 0);
+  if_2->expr1 = cond;
+
+  return if_1;
+}
+
+/* Handle matrix reallocation.  Caller is responsible to insert into
+   the code tree.
+
+   For the two-dimensional case, build 
+
+  if (allocated(c)) then
+     if (size(c,1) /= size(a,1) .or. size(c,2) /= size(b,2)) then
+        deallocate(c)
+        allocate (c(size(a,1), size(b,2)))
+     end if
+  else
+     allocate (c(size(a,1),size(b,2)))
+  end if
+
+  and for the other cases correspondingly.
+*/
+
+static gfc_code *
+matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b,
+		    enum matrix_case m_case)
+{
+
+  gfc_expr *allocated, *alloc_expr;
+  gfc_code *if_alloc_1, *if_alloc_2, *if_size_1, *if_size_2;
+  gfc_code *else_alloc;
+  gfc_code *deallocate, *allocate1, *allocate_else;
+  gfc_array_ref *ar;
+  gfc_expr *cond, *ne1, *ne2;
+
+  if (warn_realloc_lhs)
+    gfc_warning (OPT_Wrealloc_lhs,
+		 "Code for reallocating the allocatable array at %L will "
+		 "be added", &c->where);
+
+  alloc_expr = gfc_copy_expr (c);
+
+  ar = gfc_find_array_ref (alloc_expr);
+  gcc_assert (ar && ar->type == AR_FULL);
+
+  /* c comes in as a full ref.  Change it into a copy and make it into an
+     element ref so it has the right form for for ALLOCATE.  In the same
+     switch statement, also generate the size comparison for the secod IF
+     statement.  */
+
+  ar->type = AR_ELEMENT;
+
+  switch (m_case)
+    {
+    case A2B2:
+      ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, a, 1);
+      ar->start[1] = get_array_inq_function (GFC_ISYM_SIZE, b, 2);
+      ne1 = build_logical_expr (INTRINSIC_NE,
+				get_array_inq_function (GFC_ISYM_SIZE, c, 1),
+				get_array_inq_function (GFC_ISYM_SIZE, a, 1));
+      ne2 = build_logical_expr (INTRINSIC_NE,
+				get_array_inq_function (GFC_ISYM_SIZE, c, 2),
+				get_array_inq_function (GFC_ISYM_SIZE, b, 2));
+      cond = build_logical_expr (INTRINSIC_OR, ne1, ne2);
+      break;
+
+    case A2B1:
+      ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, a, 1);
+      cond = build_logical_expr (INTRINSIC_NE,
+				 get_array_inq_function (GFC_ISYM_SIZE, c, 1),
+				 get_array_inq_function (GFC_ISYM_SIZE, a, 2));
+      break;
+
+    case A1B2:
+      ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, b, 1);
+      cond = build_logical_expr (INTRINSIC_NE,
+				 get_array_inq_function (GFC_ISYM_SIZE, c, 1),
+				 get_array_inq_function (GFC_ISYM_SIZE, b, 2));
+      break;
+
+    default:
+      gcc_unreachable();
+
+    }
+
+  gfc_simplify_expr (cond, 0);
+
+  /* We need two identical allocate statements in two
+     branches of the IF statement.  */
+  
+  allocate1 = XCNEW (gfc_code);
+  allocate1->op = EXEC_ALLOCATE;
+  allocate1->ext.alloc.list = gfc_get_alloc ();
+  allocate1->loc = c->where;
+  allocate1->ext.alloc.list->expr = gfc_copy_expr (alloc_expr);
+
+  allocate_else = XCNEW (gfc_code);
+  allocate_else->op = EXEC_ALLOCATE;
+  allocate_else->ext.alloc.list = gfc_get_alloc ();
+  allocate_else->loc = c->where;
+  allocate_else->ext.alloc.list->expr = alloc_expr;
+
+  allocated = gfc_build_intrinsic_call (current_ns, GFC_ISYM_ALLOCATED,
+					"_gfortran_allocated", c->where,
+					1, gfc_copy_expr (c));
+
+  deallocate = XCNEW (gfc_code);
+  deallocate->op = EXEC_DEALLOCATE;
+  deallocate->ext.alloc.list = gfc_get_alloc ();
+  deallocate->ext.alloc.list->expr = gfc_copy_expr (c);
+  deallocate->next = allocate1;
+  deallocate->loc = c->where;
+  
+  if_size_2 = XCNEW (gfc_code);
+  if_size_2->op = EXEC_IF;
+  if_size_2->expr1 = cond;
+  if_size_2->loc = c->where;
+  if_size_2->next = deallocate;
+
+  if_size_1 = XCNEW (gfc_code);
+  if_size_1->op = EXEC_IF;
+  if_size_1->block = if_size_2;
+  if_size_1->loc = c->where;
+
+  else_alloc = XCNEW (gfc_code);
+  else_alloc->op = EXEC_IF;
+  else_alloc->loc = c->where;
+  else_alloc->next = allocate_else;
+
+  if_alloc_2 = XCNEW (gfc_code);
+  if_alloc_2->op = EXEC_IF;
+  if_alloc_2->expr1 = allocated;
+  if_alloc_2->loc = c->where;
+  if_alloc_2->next = if_size_1;
+  if_alloc_2->block = else_alloc;
+
+  if_alloc_1 = XCNEW (gfc_code);
+  if_alloc_1->op = EXEC_IF;
+  if_alloc_1->block = if_alloc_2;
+  if_alloc_1->loc = c->where;
+
+  return if_alloc_1;
+}
+
+/* Callback function for has_function_or_op.  */
+
+static int
+is_function_or_op (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED,
+	     void *data ATTRIBUTE_UNUSED)
+{
+  if ((*e) == 0)
+    return 0;
+  else
+    return (*e)->expr_type == EXPR_FUNCTION
+      || (*e)->expr_type == EXPR_OP;
+}
+
+/* Returns true if the expression contains a function.  */
+
+static bool
+has_function_or_op (gfc_expr **e)
+{
+  if (e == NULL)
+    return false;
+  else
+    return gfc_expr_walker (e, is_function_or_op, NULL);
+}
+
+/* Freeze (assign to a temporary variable) a single expression.  */
+
+static void
+freeze_expr (gfc_expr **ep)
+{
+  gfc_expr *ne;
+  if (has_function_or_op (ep))
+    {
+      ne = create_var (*ep, "freeze");
+      *ep = ne;
+    }
+}
+
+/* Go through an expression's references and assign them to temporary
+   variables if they contain functions.  This is usually done prior to
+   front-end scalarization to avoid multiple invocations of functions.  */
+
+static void
+freeze_references (gfc_expr *e)
+{
+  gfc_ref *r;
+  gfc_array_ref *ar;
+  int i;
+
+  for (r=e->ref; r; r=r->next)
+    {
+      if (r->type == REF_SUBSTRING)
+	{
+	  if (r->u.ss.start != NULL)
+	    freeze_expr (&r->u.ss.start);
+
+	  if (r->u.ss.end != NULL)
+	    freeze_expr (&r->u.ss.end);
+	}
+      else if (r->type == REF_ARRAY)
+	{
+	  ar = &r->u.ar;
+	  switch (ar->type)
+	    {
+	    case AR_FULL:
+	      break;
+
+	    case AR_SECTION:
+	      for (i=0; i<ar->dimen; i++)
+		{
+		  if (ar->dimen_type[i] == DIMEN_RANGE)
+		    {
+		      freeze_expr (&ar->start[i]);
+		      freeze_expr (&ar->end[i]);
+		      freeze_expr (&ar->stride[i]);
+		    }
+		  else if (ar->dimen_type[i] == DIMEN_ELEMENT)
+		    {
+		      freeze_expr (&ar->start[i]);
+		    }
+		}
+	      break;
+
+	    case AR_ELEMENT:
+	      for (i=0; i<ar->dimen; i++)
+		freeze_expr (&ar->start[i]);
+	      break;
+
+	    default:
+	      break;
+	    }
+	}
+    }
+}
+
+/* Convert to gfc_index_integer_kind if needed, just do a copy otherwise.  */
+
+static gfc_expr *
+convert_to_index_kind (gfc_expr *e)
+{
+  gfc_expr *res;
+
+  gcc_assert (e != NULL);
+
+  res = gfc_copy_expr (e);
+
+  gcc_assert (e->ts.type == BT_INTEGER);
+
+  if (res->ts.kind != gfc_index_integer_kind)
+    {
+      gfc_typespec ts;
+      gfc_clear_ts (&ts);
+      ts.type = BT_INTEGER;
+      ts.kind = gfc_index_integer_kind;
+
+      gfc_convert_type_warn (e, &ts, 2, 0);
+    }
+
+  return res;
+}
+
+/* Function to create a DO loop including creation of the
+   iteration variable.  gfc_expr are copied.*/
+
+static gfc_code *
+create_do_loop (gfc_expr *start, gfc_expr *end, gfc_expr *step, locus *where,
+		gfc_namespace *ns, char *vname)
+{
+
+  char name[GFC_MAX_SYMBOL_LEN +1];
+  gfc_symtree *symtree;
+  gfc_symbol *symbol;
+  gfc_expr *i;
+  gfc_code *n, *n2;
+
+  /* Create an expression for the iteration variable.  */
+  if (vname)
+    sprintf (name, "__var_%d_do_%s", var_num++, vname);
+  else
+    sprintf (name, "__var_%d_do", var_num++);
+
+
+  if (gfc_get_sym_tree (name, ns, &symtree, false) != 0)
+    gcc_unreachable ();
+
+  /* Create the loop variable.  */
+
+  symbol = symtree->n.sym;
+  symbol->ts.type = BT_INTEGER;
+  symbol->ts.kind = gfc_index_integer_kind;
+  symbol->attr.flavor = FL_VARIABLE;
+  symbol->attr.referenced = 1;
+  symbol->attr.dimension = 0;
+  symbol->attr.fe_temp = 1;
+  gfc_commit_symbol (symbol);
+
+  i = gfc_get_expr ();
+  i->expr_type = EXPR_VARIABLE;
+  i->ts = symbol->ts;
+  i->rank = 0;
+  i->where = *where;
+  i->symtree = symtree;
+
+  /* ... and the nested DO statements.  */
+  n = XCNEW (gfc_code);
+  n->op = EXEC_DO;
+  n->loc = *where;
+  n->ext.iterator = gfc_get_iterator ();
+  n->ext.iterator->var = i;
+  n->ext.iterator->start = convert_to_index_kind (start);
+  n->ext.iterator->end = convert_to_index_kind (end);
+  if (step)
+    n->ext.iterator->step = convert_to_index_kind (step);
+  else
+    n->ext.iterator->step = gfc_get_int_expr (gfc_index_integer_kind,
+					      where, 1);
+
+  n2 = XCNEW (gfc_code);
+  n2->op = EXEC_DO;
+  n2->loc = *where;
+  n2->next = NULL;
+  n->block = n2;
+  return n;
+}
+
+/* Get the upper bound of the DO loops for matmul along a dimension.  This
+ is one-based.  */
+
+static gfc_expr*
+get_size_m1 (gfc_expr *e, int dimen)
+{
+  mpz_t size;
+  gfc_expr *res;
+
+  if (gfc_array_dimen_size (e, dimen - 1, &size))
+    {
+      res = gfc_get_constant_expr (BT_INTEGER,
+				   gfc_index_integer_kind, &e->where);
+      mpz_sub_ui (res->value.integer, size, 1);
+      mpz_clear (size);
+    }
+  else
+    {
+      res = get_operand (INTRINSIC_MINUS,
+			 get_array_inq_function (GFC_ISYM_SIZE, e, dimen),
+			 gfc_get_int_expr (gfc_index_integer_kind,
+					   &e->where, 1));
+      gfc_simplify_expr (res, 0);
+    }
+
+  return res;
+}
+
+/* Function to return a scalarized expression. It is assumed that indices are
+ zero based to make generation of DO loops easier.  A zero as index will
+ access the first element along a dimension.  Single element references will
+ be skipped.  A NULL as an expression will be replaced by a full reference.
+ This assumes that the index loops have gfc_index_integer_kind, and that all
+ references have been frozen.  */
+
+static gfc_expr*
+scalarized_expr (gfc_expr *e_in, gfc_expr **index, int count_index)
+{
+  gfc_array_ref *ar;
+  int i;
+  int rank;
+  gfc_expr *e;
+  int i_index;
+  bool was_fullref;
+
+  e = gfc_copy_expr(e_in);
+
+  rank = e->rank;
+
+  ar = gfc_find_array_ref (e);
+
+  /* We scalarize count_index variables, reducing the rank by count_index.  */
+
+  e->rank = rank - count_index;
+
+  was_fullref = ar->type == AR_FULL;
+
+  if (e->rank == 0)
+    ar->type = AR_ELEMENT;
+  else
+    ar->type = AR_SECTION;
+
+  /* Loop over the indices.  For each index, create the expression
+     index * stride + lbound(e, dim).  */
+  
+  i_index = 0;
+  for (i=0; i < ar->dimen; i++)
+    {
+      if (was_fullref || ar->dimen_type[i] == DIMEN_RANGE)
+	{
+	  if (index[i_index] != NULL)
+	    {
+	      gfc_expr *lbound, *nindex;
+	      gfc_expr *loopvar;
+	      
+	      loopvar = gfc_copy_expr (index[i_index]); 
+	      
+	      if (ar->stride[i])
+		{
+		  gfc_expr *tmp;
+
+		  tmp = gfc_copy_expr(ar->stride[i]);
+		  if (tmp->ts.kind != gfc_index_integer_kind)
+		    {
+		      gfc_typespec ts;
+		      gfc_clear_ts (&ts);
+		      ts.type = BT_INTEGER;
+		      ts.kind = gfc_index_integer_kind;
+		      gfc_convert_type (tmp, &ts, 2);
+		    }
+		  nindex = get_operand (INTRINSIC_TIMES, loopvar, tmp);
+		}
+	      else
+		nindex = loopvar;
+	      
+	      /* Calculate the lower bound of the expression.  */
+	      if (ar->start[i])
+		{
+		  lbound = gfc_copy_expr (ar->start[i]);
+		  if (lbound->ts.kind != gfc_index_integer_kind)
+		    {
+		      gfc_typespec ts;
+		      gfc_clear_ts (&ts);
+		      ts.type = BT_INTEGER;
+		      ts.kind = gfc_index_integer_kind;
+		      gfc_convert_type (lbound, &ts, 2);
+
+		    }
+		}
+	      else
+		{
+		  if (!was_fullref)
+		    {
+		      /* Look at full individual sections, like a(:).  The first index
+			 is the lbound of a full ref.  */
+
+		      gfc_array_ref *ar;
+
+		      ar = gfc_find_array_ref (e_in);
+		      ar->type = AR_FULL;
+		    }
+		  lbound = get_array_inq_function (GFC_ISYM_LBOUND, e_in,
+						   i_index + 1);
+		}
+	      
+	      ar->dimen_type[i] = DIMEN_ELEMENT;
+
+	      gfc_free_expr (ar->start[i]);
+	      ar->start[i] = get_operand (INTRINSIC_PLUS, nindex, lbound);
+	      
+	      gfc_free_expr (ar->end[i]);
+	      ar->end[i] = NULL;
+	      gfc_free_expr (ar->stride[i]);
+	      ar->stride[i] = NULL;
+	      gfc_simplify_expr (ar->start[i], 0);
+	    }
+	  else if (was_fullref)
+	    {
+	      ar->dimen_type[i] = DIMEN_RANGE;
+	      ar->start[i] = NULL;
+	      ar->end[i] = NULL;
+	      ar->stride[i] = NULL;
+	    }
+	  i_index ++;
+	}
+    }
+  return e;
+}
+
+
+/* Inline assignments of the form c = matmul(a,b).
+   Handle only the cases currently where b and c are rank-two arrays.
+
+   This basically translates the code to
+
+   BLOCK
+     integer i,j,k
+     c = 0
+     do j=0, size(b,2)-1
+       do k=0, size(a, 2)-1
+         do i=0, size(a, 1)-1
+            c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) =
+	    c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) +
+            a(i * stride(a,1) + lbound(a,1), k * stride(a,2) + lbound(a,2)) *
+            b(k * stride(b,1) + lbound(b,1), j * stride(b,2) + lbound(b,2))
+         end do
+       end do
+     end do
+   END BLOCK
+   
+*/
+
+static int
+inline_matmul_assign (gfc_code **c, int *walk_subtrees,
+			  void *data ATTRIBUTE_UNUSED)
+{
+  gfc_code *co = *c;
+  gfc_expr *expr1, *expr2;
+  gfc_expr *matrix_a, *matrix_b;
+  gfc_actual_arglist *a, *b;
+  gfc_code *do_1, *do_2, *do_3, *assign_zero, *assign_matmul;
+  gfc_expr *zero_e;
+  gfc_expr *u1, *u2, *u3;
+  gfc_expr *list[2];
+  gfc_expr *ascalar, *bscalar, *cscalar;
+  gfc_expr *mult;
+  gfc_expr *var_1, *var_2, *var_3;
+  gfc_expr *zero;
+  gfc_namespace *ns;
+  gfc_intrinsic_op op_times, op_plus;
+  enum matrix_case m_case;
+  int i;
+  gfc_code *if_limit = NULL;
+  gfc_code **next_code_point;
+
+  if (co->op != EXEC_ASSIGN)
+    return 0;
+
+  expr1 = co->expr1;
+  expr2 = co->expr2;
+  if (expr2->expr_type != EXPR_FUNCTION
+      || expr2->value.function.isym == NULL
+      || expr2->value.function.isym->id != GFC_ISYM_MATMUL)
+    return 0;
+
+  current_code = c;
+  inserted_block = NULL;
+  changed_statement = NULL;
+
+  a = expr2->value.function.actual;
+  matrix_a = a->expr;
+  b = a->next;
+  matrix_b = b->expr;
+
+  /* Currently only handling direct variables.  Transpose etc. will come
+     later.  */
+
+  if (matrix_a->expr_type != EXPR_VARIABLE
+      || matrix_b->expr_type != EXPR_VARIABLE)
+    return 0;
+
+  if (matrix_a->rank == 2)
+    m_case = matrix_b->rank == 1 ? A2B1 : A2B2;
+  else
+    m_case = A1B2;
+
+  /* We do not handle data dependencies yet.  */
+  if (gfc_check_dependency (expr1, matrix_a, true)
+      || gfc_check_dependency (expr1, matrix_b, true))
+    return 0;
+
+  ns = insert_block ();
+
+  /* Assign the type of the zero expression for initializing the resulting
+     array, and the expression (+ and * for real, integer and complex;
+     .and. and .or for logical.  */
+
+  switch(expr1->ts.type)
+    {
+    case BT_INTEGER:
+      zero_e = gfc_get_int_expr (expr1->ts.kind, &expr1->where, 0);
+      op_times = INTRINSIC_TIMES;
+      op_plus = INTRINSIC_PLUS;
+      break;
+
+    case BT_LOGICAL:
+      op_times = INTRINSIC_AND;
+      op_plus = INTRINSIC_OR;
+      zero_e = gfc_get_logical_expr (expr1->ts.kind, &expr1->where,
+				     0);
+      break;
+    case BT_REAL:
+      zero_e = gfc_get_constant_expr (BT_REAL, expr1->ts.kind,
+				      &expr1->where);
+      mpfr_set_si (zero_e->value.real, 0, GFC_RND_MODE);
+      op_times = INTRINSIC_TIMES;
+      op_plus = INTRINSIC_PLUS;
+      break;
+
+    case BT_COMPLEX:
+      zero_e = gfc_get_constant_expr (BT_COMPLEX, expr1->ts.kind,
+				      &expr1->where);
+      mpc_set_si_si (zero_e->value.complex, 0, 0, GFC_RND_MODE);
+      op_times = INTRINSIC_TIMES;
+      op_plus = INTRINSIC_PLUS;
+
+      break;
+
+    default:
+      gcc_unreachable();
+    }
+
+  current_code = &ns->code;
+
+  /* Freeze the references, keeping track of how many temporary variables were
+     created.  */
+  n_vars = 0;
+  freeze_references (matrix_a);
+  freeze_references (matrix_b);
+  freeze_references (expr1);
+
+  if (n_vars == 0)
+    next_code_point = current_code;
+  else
+    {
+      next_code_point = &ns->code;
+      for (i=0; i<n_vars; i++)
+	next_code_point = &(*next_code_point)->next;
+    }
+
+  /* Take care of the inline flag.  If the limit check evaluates to a
+     constant, dead code elimination will eliminate the unneeded branch.  */
+
+  if (m_case == A2B2 && flag_inline_matmul_limit > 0)
+    {
+      if_limit = inline_limit_check (matrix_a, matrix_b, m_case);
+
+      /* Insert the original statement into the else branch.  */
+      if_limit->block->block->next = co;
+      co->next = NULL;
+
+      /* ... and the new ones go into the original one.  */
+      *next_code_point = if_limit;
+      next_code_point = &if_limit->block->next;
+    }
+
+  assign_zero = XCNEW (gfc_code);
+  assign_zero->op = EXEC_ASSIGN;
+  assign_zero->loc = co->loc;
+  assign_zero->expr1 = gfc_copy_expr (expr1);
+  assign_zero->expr2 = zero_e;
+
+  /* Handle the reallocation, if needed.  */
+  if (flag_realloc_lhs && gfc_is_reallocatable_lhs (expr1))
+    {
+      gfc_code *lhs_alloc;
+
+      /* Only need to check a single dimension for the A2B2 case for
+	 bounds checking, the rest will be allocated.  */
+
+      if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS && m_case == A2B2)
+	{
+	  gfc_code *test;
+	  gfc_expr *a2, *b1;
+
+	  a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+	  b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+	  test = runtime_error_ne (b1, a2, "Dimension of array B incorrect "
+				   "in MATMUL intrinsic: Is %ld, should be %ld");
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+	}
+
+
+      lhs_alloc = matmul_lhs_realloc (expr1, matrix_a, matrix_b, m_case);
+
+      *next_code_point = lhs_alloc;
+      next_code_point = &lhs_alloc->next;
+
+    }
+  else if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS)
+    {
+      gfc_code *test;
+      gfc_expr *a2, *b1, *c1, *c2, *a1, *b2;
+
+      if (m_case == A2B2 || m_case == A2B1)
+	{
+	  a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+	  b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+	  test = runtime_error_ne (b1, a2, "Dimension of array B incorrect "
+				   "in MATMUL intrinsic: Is %ld, should be %ld");
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+
+	  c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+	  a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+
+	  if (m_case == A2B2)
+	    test = runtime_error_ne (c1, a1, "Incorrect extent in return array in "
+				     "MATMUL intrinsic for dimension 1: "
+				     "is %ld, should be %ld");
+	  else if (m_case == A2B1)
+	    test = runtime_error_ne (c1, a1, "Incorrect extent in return array in "
+				     "MATMUL intrinsic: "
+				     "is %ld, should be %ld");
+
+
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+	}
+      else if (m_case == A1B2)
+	{
+	  a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+	  b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+	  test = runtime_error_ne (b1, a1, "Dimension of array B incorrect "
+				   "in MATMUL intrinsic: Is %ld, should be %ld");
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+
+	  c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+	  b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+
+	  test = runtime_error_ne (c1, b2, "Incorrect extent in return array in "
+				   "MATMUL intrinsic: "
+				   "is %ld, should be %ld");
+
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+	}
+
+      if (m_case == A2B2)
+	{
+	  c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+	  b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+	  test = runtime_error_ne (c2, b2, "Incorrect extent in return array in "
+				   "MATMUL intrinsic for dimension 2: is %ld, should be %ld");
+
+	  *next_code_point = test;
+	  next_code_point = &test->next;
+	}
+    }
+
+  *next_code_point = assign_zero;
+
+  zero = gfc_get_int_expr (gfc_index_integer_kind, &co->loc, 0);
+
+  assign_matmul = XCNEW (gfc_code);
+  assign_matmul->op = EXEC_ASSIGN;
+  assign_matmul->loc = co->loc;
+
+  /* Get the bounds for the loops, create them and create the scalarized
+     expressions.  */
+
+  switch (m_case)
+    {
+    case A2B2:
+      inline_limit_check (matrix_a, matrix_b, m_case);
+
+      u1 = get_size_m1 (matrix_b, 2);
+      u2 = get_size_m1 (matrix_a, 2);
+      u3 = get_size_m1 (matrix_a, 1);
+
+      do_1 = create_do_loop (gfc_copy_expr (zero), u1, NULL, &co->loc, ns);
+      do_2 = create_do_loop (gfc_copy_expr (zero), u2, NULL, &co->loc, ns);
+      do_3 = create_do_loop (gfc_copy_expr (zero), u3, NULL, &co->loc, ns);
+
+      do_1->block->next = do_2;
+      do_2->block->next = do_3;
+      do_3->block->next = assign_matmul;
+
+      var_1 = do_1->ext.iterator->var;
+      var_2 = do_2->ext.iterator->var;
+      var_3 = do_3->ext.iterator->var;
+
+      list[0] = var_3;
+      list[1] = var_1;
+      cscalar = scalarized_expr (gfc_copy_expr (co->expr1), list, 2);
+
+      list[0] = var_3;
+      list[1] = var_2;
+      ascalar = scalarized_expr (gfc_copy_expr (matrix_a), list, 2);
+
+      list[0] = var_2;
+      list[1] = var_1;
+      bscalar = scalarized_expr (gfc_copy_expr (matrix_b), list, 2);
+
+      break;
+
+    case A2B1:
+      u1 = get_size_m1 (matrix_b, 1);
+      u2 = get_size_m1 (matrix_a, 1);
+
+      do_1 = create_do_loop (gfc_copy_expr (zero), u1, NULL, &co->loc, ns);
+      do_2 = create_do_loop (gfc_copy_expr (zero), u2, NULL, &co->loc, ns);
+
+      do_1->block->next = do_2;
+      do_2->block->next = assign_matmul;
+
+      var_1 = do_1->ext.iterator->var;
+      var_2 = do_2->ext.iterator->var;
+
+      list[0] = var_2;
+      cscalar = scalarized_expr (gfc_copy_expr (co->expr1), list, 1);
+
+      list[0] = var_2;
+      list[1] = var_1;
+      ascalar = scalarized_expr (gfc_copy_expr (matrix_a), list, 2);
+
+      list[0] = var_1;
+      bscalar = scalarized_expr (gfc_copy_expr (matrix_b), list, 1);
+
+      break;
+
+    case A1B2:
+      u1 = get_size_m1 (matrix_b, 2);
+      u2 = get_size_m1 (matrix_a, 1);
+
+      do_1 = create_do_loop (gfc_copy_expr (zero), u1, NULL, &co->loc, ns);
+      do_2 = create_do_loop (gfc_copy_expr (zero), u2, NULL, &co->loc, ns);
+
+      do_1->block->next = do_2;
+      do_2->block->next = assign_matmul;
+
+      var_1 = do_1->ext.iterator->var;
+      var_2 = do_2->ext.iterator->var;
+
+      list[0] = var_1;
+      cscalar = scalarized_expr (gfc_copy_expr (co->expr1), list, 1);
+
+      list[0] = var_2;
+      ascalar = scalarized_expr (gfc_copy_expr (matrix_a), list, 1);
+
+      list[0] = var_2;
+      list[1] = var_1;
+      bscalar = scalarized_expr (gfc_copy_expr (matrix_b), list, 2);
+
+      break;
+
+    default:
+      gcc_unreachable();
+    }
+
+  /* First loop comes after the zero assignment.  */
+  assign_zero->next = do_1;
+
+  /* Build the assignment expression in the loop.  */
+  assign_matmul->expr1 = gfc_copy_expr (cscalar);
+
+  mult = get_operand (op_times, ascalar, bscalar);
+  assign_matmul->expr2 = get_operand (op_plus, cscalar, mult);
+
+  /* If we don't want to keep the original statement around in
+     the else branch, we can free it.  */
+
+  if (if_limit == NULL)
+    gfc_free_statements(co);
+  else
+    co->next = NULL;
+
+  gfc_free_expr (zero);
+  *walk_subtrees = 0;
+  return 0;
+}
+
 #define WALK_SUBEXPR(NODE) \
   do							\
     {							\
Index: gfortran.h
===================================================================
--- gfortran.h	(Revision 222218)
+++ gfortran.h	(Arbeitskopie)
@@ -419,6 +419,7 @@ enum gfc_isym_id
   GFC_ISYM_EXPONENT,
   GFC_ISYM_EXTENDS_TYPE_OF,
   GFC_ISYM_FDATE,
+  GFC_ISYM_FE_RUNTIME_ERROR,
   GFC_ISYM_FGET,
   GFC_ISYM_FGETC,
   GFC_ISYM_FLOOR,
@@ -1001,6 +1002,7 @@ typedef struct
   bool cp_was_assumed; /* AS_ASSUMED_SIZE cp arrays are converted to
 			AS_EXPLICIT, but we want to remember that we
 			did this.  */
+  bool resolved;
 
 }
 gfc_array_spec;
@@ -1907,7 +1909,7 @@ typedef struct gfc_intrinsic_sym
   gfc_typespec ts;
   unsigned elemental:1, inquiry:1, transformational:1, pure:1,
     generic:1, specific:1, actual_ok:1, noreturn:1, conversion:1,
-    from_module:1;
+    from_module:1, vararg:1;
 
   int standard;
 
@@ -3226,4 +3228,8 @@ int gfc_code_walker (gfc_code **, walk_code_fn_t,
 
 void gfc_convert_mpz_to_signed (mpz_t, int);
 
+/* trans-array.c  */
+
+bool gfc_is_reallocatable_lhs (gfc_expr *);
+
 #endif /* GCC_GFORTRAN_H  */
Index: intrinsic.c
===================================================================
--- intrinsic.c	(Revision 222218)
+++ intrinsic.c	(Arbeitskopie)
@@ -520,7 +520,30 @@ add_sym_1s (const char *name, gfc_isym_id id, enum
 	   (void *) 0);
 }
 
+/* Add a symbol to the subroutine ilst where the subroutine takes one
+   printf-style character argument and a variable number of arguments
+   to follow.  */
 
+static void
+add_sym_1p (const char *name, gfc_isym_id id, enum klass cl, bt type, int kind,
+	    int standard, bool (*check) (gfc_actual_arglist *),
+	    gfc_expr *(*simplify) (gfc_expr*), void (*resolve) (gfc_code *),
+	    const char *a1, bt type1, int kind1, int optional1, sym_intent intent1)
+{
+  gfc_check_f cf;
+  gfc_simplify_f sf;
+  gfc_resolve_f rf;
+
+  cf.f1m = check;
+  sf.f1 = simplify;
+  rf.s1 = resolve;
+
+  add_sym (name, id, cl, ACTUAL_NO, type, kind, standard, cf, sf, rf,
+	   a1, type1, kind1, optional1, intent1,
+	   (void *) 0);
+}
+
+
 /* Add a symbol from the MAX/MIN family of intrinsic functions to the
    function.  MAX et al take 2 or more arguments.  */
 
@@ -1159,6 +1182,17 @@ make_from_module (void)
     next_sym[-1].from_module = 1;
 }
 
+
+/* Mark the current subroutine as having a variable number of
+   arguments.  */
+
+static void
+make_vararg (void)
+{
+  if (sizing == SZ_NOTHING)
+    next_sym[-1].vararg = 1;
+}
+
 /* Set the attr.value of the current procedure.  */
 
 static void
@@ -3292,6 +3326,17 @@ add_subroutines (void)
 	      "fptr", BT_UNKNOWN, 0, REQUIRED, INTENT_OUT);
   make_from_module();
 
+  /* Internal subroutine for emitting a runtime error.  */
+
+  add_sym_1p ("fe_runtime_error", GFC_ISYM_FE_RUNTIME_ERROR, CLASS_IMPURE,
+	      BT_UNKNOWN, 0, GFC_STD_GNU,
+	      gfc_check_fe_runtime_error, NULL, gfc_resolve_fe_runtime_error,
+	      "msg", BT_CHARACTER, dc, REQUIRED, INTENT_IN);
+
+  make_noreturn ();
+  make_vararg ();
+  make_from_module ();
+
   /* Coarray collectives.  */
   add_sym_4s ("co_broadcast", GFC_ISYM_CO_BROADCAST, CLASS_IMPURE,
 	      BT_UNKNOWN, 0, GFC_STD_F2008_TS,
@@ -4501,7 +4546,7 @@ gfc_intrinsic_sub_interface (gfc_code *c, int erro
 
   init_arglist (isym);
 
-  if (!sort_actual (name, &c->ext.actual, isym->formal, &c->loc))
+  if (!isym->vararg && !sort_actual (name, &c->ext.actual, isym->formal, &c->loc))
     goto fail;
 
   if (!do_ts29113_check (isym, c->ext.actual))
Index: intrinsic.h
===================================================================
--- intrinsic.h	(Revision 222218)
+++ intrinsic.h	(Arbeitskopie)
@@ -190,6 +190,7 @@ bool gfc_check_system_clock (gfc_expr *, gfc_expr
 bool gfc_check_date_and_time (gfc_expr *, gfc_expr *, gfc_expr *, gfc_expr *);
 bool gfc_check_exit (gfc_expr *);
 bool gfc_check_fdate_sub (gfc_expr *);
+bool gfc_check_fe_runtime_error (gfc_actual_arglist *);
 bool gfc_check_flush (gfc_expr *);
 bool gfc_check_free (gfc_expr *);
 bool gfc_check_fstat_sub (gfc_expr *, gfc_expr *, gfc_expr *);
@@ -602,6 +603,7 @@ void gfc_resolve_ctime_sub (gfc_code *);
 void gfc_resolve_execute_command_line (gfc_code *);
 void gfc_resolve_exit (gfc_code *);
 void gfc_resolve_fdate_sub (gfc_code *);
+void gfc_resolve_fe_runtime_error (gfc_code *);
 void gfc_resolve_flush (gfc_code *);
 void gfc_resolve_free (gfc_code *);
 void gfc_resolve_fseek_sub (gfc_code *);
Index: invoke.texi
===================================================================
--- invoke.texi	(Revision 222218)
+++ invoke.texi	(Arbeitskopie)
@@ -178,6 +178,7 @@ and warnings}.
 -finit-character=@var{n} -finit-integer=@var{n} -finit-local-zero @gol
 -finit-logical=@var{<true|false>}
 -finit-real=@var{<zero|inf|-inf|nan|snan>} @gol
+-finline-matmul-limit=@var{n} @gol
 -fmax-array-constructor=@var{n} -fmax-stack-var-size=@var{n}
 -fno-align-commons @gol
 -fno-automatic -fno-protect-parens -fno-underscoring @gol
@@ -1537,6 +1538,20 @@ geometric mean of the dimensions of the argument a
 
 The default value for @var{n} is 30.
 
+@item -finline-matmul-limit=@var{n}
+@opindex @code{finline-matmul-limit}
+When front-end optimiztion is active, some calls to the @code{MATMUL}
+intrinsic function will be inlined.  Setting
+@code{-finline-matmul-limit=0} will disable inlining in all cases.
+Setting this option it to a specified value will call the library
+routines for matrices with size larger than @var{n}. If the matrices
+involved are not square, the size comparison is performed using the
+geometric mean of the dimensions of the argument and result matrices.
+
+The default value for @var{n} is the value specified for
+@code{-fblas-matmul-limit} if this option is specified, or unlimitited
+otherwise.
+
 @item -frecursive
 @opindex @code{frecursive}
 Allow indirect recursion by forcing all local arrays to be allocated
@@ -1632,11 +1647,12 @@ if @option{-ffrontend-optimize} is in effect.
 @cindex Front-end optimization
 This option performs front-end optimization, based on manipulating
 parts the Fortran parse tree.  Enabled by default by any @option{-O}
-option.  Optimizations enabled by this option include elimination of
-identical function calls within expressions, removing unnecessary
-calls to @code{TRIM} in comparisons and assignments and replacing
-@code{TRIM(a)} with @code{a(1:LEN_TRIM(a))}. 
-It can be deselected by specifying @option{-fno-frontend-optimize}.
+option.  Optimizations enabled by this option include inlining calls
+to @code{MATMUL}, elimination of identical function calls within
+expressions, removing unnecessary calls to @code{TRIM} in comparisons
+and assignments and replacing @code{TRIM(a)} with
+@code{a(1:LEN_TRIM(a))}.  It can be deselected by specifying
+@option{-fno-frontend-optimize}.
 @end table
 
 @xref{Code Gen Options,,Options for Code Generation Conventions,
Index: iresolve.c
===================================================================
--- iresolve.c	(Revision 222218)
+++ iresolve.c	(Arbeitskopie)
@@ -2197,7 +2197,20 @@ gfc_resolve_rrspacing (gfc_expr *f, gfc_expr *x)
   f->value.function.name = gfc_get_string ("__rrspacing_%d", x->ts.kind);
 }
 
+void
+gfc_resolve_fe_runtime_error (gfc_code *c)
+{
+  const char *name;
+  gfc_actual_arglist *a;
 
+  name = gfc_get_string (PREFIX ("runtime_error"));
+
+  for (a = c->ext.actual->next; a; a = a->next)
+    a->name = "%VAL";
+
+  c->resolved_sym = gfc_get_intrinsic_sub_symbol (name);
+}
+
 void
 gfc_resolve_scale (gfc_expr *f, gfc_expr *x, gfc_expr *i ATTRIBUTE_UNUSED)
 {
Index: lang.opt
===================================================================
--- lang.opt	(Revision 222218)
+++ lang.opt	(Arbeitskopie)
@@ -542,6 +542,10 @@ Enum(gfc_init_local_real) String(inf) Value(GFC_IN
 EnumValue
 Enum(gfc_init_local_real) String(-inf) Value(GFC_INIT_REAL_NEG_INF)
 
+finline-matmul-limit=
+Fortran RejectNegative Joined UInteger Var(flag_inline_matmul_limit) Init(-1)
+-finline-matmul-limit=<n>	Specify the size of the largest matrix for which matmul will be inlined
+
 fmax-array-constructor=
 Fortran RejectNegative Joined UInteger Var(flag_max_array_constructor) Init(65535)
 -fmax-array-constructor=<n>	Maximum number of objects in an array constructor
Index: options.c
===================================================================
--- options.c	(Revision 222218)
+++ options.c	(Arbeitskopie)
@@ -378,6 +378,11 @@ gfc_post_options (const char **pfilename)
   if (!flag_automatic)
     flag_max_stack_var_size = 0;
   
+  /* If we call BLAS directly, only inline up to the BLAS limit.  */
+
+  if (flag_external_blas && flag_inline_matmul_limit < 0)
+    flag_inline_matmul_limit = flag_blas_matmul_limit;
+
   /* Optimization implies front end optimization, unless the user
      specified it directly.  */
 
Index: simplify.c
===================================================================
--- simplify.c	(Revision 222218)
+++ simplify.c	(Arbeitskopie)
@@ -3445,6 +3445,32 @@ simplify_bound (gfc_expr *array, gfc_expr *dim, gf
 
  done:
 
+
+  if (!upper && as && as->type == AS_ASSUMED_SHAPE && dim
+      && dim->expr_type == EXPR_CONSTANT && ref->u.ar.type != AR_SECTION)
+    {
+      if (!(array->symtree && array->symtree->n.sym
+	    && (array->symtree->n.sym->attr.allocatable
+		|| array->symtree->n.sym->attr.pointer)))
+	{
+	  unsigned long int ndim;
+	  gfc_expr *lower, *res;
+
+	  ndim = mpz_get_si (dim->value.integer) - 1;
+	  lower = as->lower[ndim];
+	  if (lower->expr_type == EXPR_CONSTANT)
+	    {
+	      res = gfc_copy_expr (lower);
+	      if (kind)
+		{
+		  int nkind = mpz_get_si (kind->value.integer);
+		  res->ts.kind = nkind;
+		}
+	      return res;
+	    }
+	}
+    }
+
   if (as && (as->type == AS_DEFERRED || as->type == AS_ASSUMED_SHAPE
 	     || as->type == AS_ASSUMED_RANK))
     return NULL;
Index: trans-array.h
===================================================================
--- trans-array.h	(Revision 222218)
+++ trans-array.h	(Arbeitskopie)
@@ -64,8 +64,6 @@ tree gfc_copy_only_alloc_comp (gfc_symbol *, tree,
 
 tree gfc_alloc_allocatable_for_assignment (gfc_loopinfo*, gfc_expr*, gfc_expr*);
 
-bool gfc_is_reallocatable_lhs (gfc_expr *);
-
 /* Add initialization for deferred arrays.  */
 void gfc_trans_deferred_array (gfc_symbol *, gfc_wrapped_block *);
 /* Generate an initializer for a static pointer or allocatable array.  */
! { dg-do  run }
! { dg-options "-ffrontend-optimize -fdump-tree-original -Wrealloc-lhs" }
! PR 37131 - check basic functionality of inlined matmul, making
! sure that the library is not called, with and without reallocation.

program main
  implicit none
  integer, parameter :: offset = -2
  real, dimension(3,2) :: a
  real, dimension(2,4) :: b
  real, dimension(3,4) :: c
  real, dimension(3,4) :: cres
  real, dimension(:,:), allocatable :: c_alloc
  integer, parameter :: a1_lower_p = 1 + offset, a1_upper_p = size(a,1) + offset
  integer, parameter :: a2_lower_p = 1 + offset, a2_upper_p = size(a,2) + offset
  integer, parameter :: b1_lower_p = 1 + offset, b1_upper_p = size(b,1) + offset
  integer, parameter :: b2_lower_p = 1 + offset, b2_upper_p = size(b,2) + offset
  integer, parameter :: c1_lower_p = 1 + offset, c1_upper_p = size(c,1) + offset
  integer, parameter :: c2_lower_p = 1 + offset, c2_upper_p = size(c,2) + offset
  real, dimension(a1_lower_p:a1_upper_p, a2_lower_p:a2_upper_p) :: ap
  real, dimension(b1_lower_p:b1_upper_p, b2_lower_p:b2_upper_p) :: bp
  real, dimension(c1_lower_p:c1_upper_p, c2_lower_p:c2_upper_p) :: cp
  real, dimension(4,8,4) :: f, fresult
  integer :: eight = 8, two = 2

  type foo
     real :: a
     integer :: i
  end type foo

  type(foo), dimension(3,2) :: afoo
  type(foo), dimension(2,4) :: bfoo
  type(foo), dimension(3,4) :: cfoo

  data a / 2.,  -3.,  5.,  -7., 11., -13./
  data b /17., -23., 29., -31., 37., -39., 41., -47./
  data cres /195., -304.,  384.,  275., -428.,  548.,  347., -540.,  692.,  411., -640.,  816./
  data fresult / &
   0.,   0., 195.,   0.,   0.,  17.,   0.,   0.,   0., -23.,-304.,   0.,   0.,   0.,   0.,   0., &
   0.,   0., 384.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., &
   2.,   0., 275.,   0.,  -3.,  29.,   0.,   0.,   5., -31.,-428.,   0.,   0.,   0.,   0.,   0., &
   0.,   0., 548.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., &
  -7.,   0., 347.,   0.,  11.,  37.,   0.,   0., -13., -39.,-540.,   0.,   0.,   0.,   0.,   0., &
   0.,   0., 692.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., &
   0.,   0., 411.,   0.,   0.,  41.,   0.,   0.,   0., -47.,-640.,   0.,   0.,   0.,   0.,   0., &
   0.,   0., 816.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0./

  integer :: a1 = size(a,1), a2 = size(a,2)
  integer :: b1 = size(b,1), b2 = size(b,2)
  integer :: c1 = size(c,1), c2 = size(c,2)

  integer :: a1_lower, a1_upper, a2_lower, a2_upper
  integer :: b1_lower, b1_upper, b2_lower, b2_upper
  integer :: c1_lower, c1_upper, c2_lower, c2_upper

  a1_lower = 1 + offset ; a1_upper = a1 + offset
  a2_lower = 1 + offset ; a2_upper = a2 + offset
  b1_lower = 1 + offset ; b1_upper = b1 + offset
  b2_lower = 1 + offset ; b2_upper = b2 + offset
  c1_lower = 1 + offset ; c1_upper = c1 + offset
  c2_lower = 1 + offset ; c2_upper = c2 + offset

  c = matmul(a,b)
  if (sum(abs(c-cres))>1e-4) call abort

  c_alloc = matmul(a,b)      ! { dg-warning "Code for reallocating the allocatable array" }
  if (sum(abs(c_alloc-cres))>1e-4) call abort
  if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
  deallocate(c_alloc)

  allocate(c_alloc(4,4))
  c_alloc = matmul(a,b)      ! { dg-warning "Code for reallocating the allocatable array" }
  if (sum(abs(c_alloc-cres))>1e-4) call abort
  if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
  deallocate(c_alloc)

  allocate(c_alloc(3,3))
  c_alloc = matmul(a,b)      ! { dg-warning "Code for reallocating the allocatable array" }
  if (sum(abs(c_alloc-cres))>1e-4) call abort
  if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort

  c_alloc = 42.
  c_alloc(:,:) = matmul(a,b)
  if (sum(abs(c_alloc-cres))>1e-4) call abort
  if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort

  deallocate(c_alloc)
  
  ap = a
  bp = b
  cp = matmul(ap, bp)
  if (sum(abs(cp-cres)) > 1e-4) call abort

  f = 0
  f(1,1:3,2:3) = a
  f(2,2:3,:) = b
  c = matmul(f(1,1:3,2:3), f(2,2:3,:))
  if (sum(abs(c-cres))>1e-4) call abort

  f(3,1:eight:2,:) = matmul(a, b)
  if (sum(abs(f(3,1:eight:2,:)-cres))>1e-4) call abort

  afoo%a = a
  bfoo%a = b
  cfoo%a = matmul(afoo%a, bfoo%a)

  if (sum(abs(cfoo%a-cres)) > 1e-4) call abort

  block
    real :: aa(a1, a2), bb(b1, b2), cc(c1, c2)
    real :: am(a1_lower:a1_upper, a2_lower:a2_upper)
    real :: bm(b1_lower:b1_upper, b2_lower:b2_upper)
    real :: cm(c1_lower:c1_upper, c2_lower:c2_upper)

    aa = a
    bb = b
    am = a
    bm = b

    cc = matmul(aa,bb)
    if (sum(cc-cres)>1e-4) call abort
    c_alloc = matmul(aa,bb)    ! { dg-warning "Code for reallocating the allocatable array" }
    if (sum(abs(c_alloc-cres))>1e-4) call abort
    if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
    c_alloc = 42.
    deallocate(c_alloc)

    allocate(c_alloc(4,4))
    c_alloc = matmul(aa,bb)   ! { dg-warning "Code for reallocating the allocatable array" }
    if (sum(abs(c_alloc-cres))>1e-4) call abort
    if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
    deallocate(c_alloc)

    allocate(c_alloc(3,3))
    c_alloc = matmul(aa,bb)  ! { dg-warning "Code for reallocating the allocatable array" }
    if (sum(abs(c_alloc-cres))>1e-4) call abort
    if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
    deallocate(c_alloc)

    cm = matmul(am, bm)
    if (sum(abs(cm-cres)) > 1e-4) call abort

    cm = 42.

    cm(:,:) = matmul(a,bm)
    if (sum(abs(cm-cres)) > 1e-4) call abort

  end block

end program main

! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "original" } }
! { dg-final { cleanup-tree-dump "original" } }
! { dg-do compile }
! { dg-options "-ffrontend-optimize -finline-matmul-limit=0 -fdump-tree-original" }
! PR 37131 - no inlining with -finline-matmul-limit=0
program main
  real, dimension(3,2) :: a
  real, dimension(2,4) :: b
  real, dimension(3,4) :: c
  real, dimension(3,4) :: cres
  real, dimension(:,:), allocatable :: calloc
  integer :: a1 = size(a,1), a2 = size(a,2)
  integer :: b1 = size(b,1), b2 = size(b,2)
  integer :: c1 = size(c,1), c2 = size(c,2)

  data a / 2.,  -3.,  5.,  -7., 11., -13./
  data b /17., -23., 29., -31., 37., -39., 41., -47./
  data cres /195., -304.,  384.,  275., -428.,  548.,  347., -540.,  692.,  411., -640.,  816./
  c = matmul(a,b)
  if (sum(c-cres)>1e-4) call abort

  calloc = matmul(a,b)
  if (sum(calloc-cres)>1e-4) call abort
  if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
  deallocate(calloc)

  allocate(calloc(4,4))
  calloc = matmul(a,b)
  if (sum(calloc-cres)>1e-4) call abort
  if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
  deallocate(calloc)

  allocate(calloc(3,3))
  calloc = matmul(a,b)
  if (sum(calloc-cres)>1e-4) call abort
  if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
  deallocate(calloc)
  
  block
    real :: aa(a1, a2), bb(b1, b2), cc(c1, c2)
    aa = a
    bb = b

    cc = matmul(aa,bb)
    if (sum(cc-cres)>1e-4) call abort
    calloc = matmul(aa,bb)
    if (sum(calloc-cres)>1e-4) call abort
    if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
    calloc = 42.
    deallocate(calloc)

    allocate(calloc(4,4))
    calloc = matmul(aa,bb)
    if (sum(calloc-cres)>1e-4) call abort
    if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
    deallocate(calloc)

    allocate(calloc(3,3))
    calloc = matmul(aa,bb)
    if (sum(calloc-cres)>1e-4) call abort
    if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
    deallocate(calloc)
  end block

end program main
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 8 "original" } }
! { dg-final { cleanup-tree-dump "original" } }
! { dg-do  run }
! { dg-options "-O3 -finline-matmul-limit=2 -fdump-tree-optimized" }
! PR 37131 - all calls to matmul should be kept
program main
  real, dimension(3,2) :: a
  real, dimension(2,4) :: b
  real, dimension(3,4) :: c
  real, dimension(3,4) :: cres
  real, dimension(:,:), allocatable :: calloc
  integer :: a1 = size(a,1), a2 = size(a,2)
  integer :: b1 = size(b,1), b2 = size(b,2)
  integer :: c1 = size(c,1), c2 = size(c,2)

  data a / 2.,  -3.,  5.,  -7., 11., -13./
  data b /17., -23., 29., -31., 37., -39., 41., -47./
  data cres /195., -304.,  384.,  275., -428.,  548.,  347., -540.,  692.,  411., -640.,  816./
  c = matmul(a,b)
  if (sum(c-cres)>1e-4) call abort

  calloc = matmul(a,b)
  if (sum(calloc-cres)>1e-4) call abort
  if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
  deallocate(calloc)

  allocate(calloc(4,4))
  calloc = matmul(a,b)
  if (sum(calloc-cres)>1e-4) call abort
  if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
  deallocate(calloc)

  allocate(calloc(3,3))
  calloc = matmul(a,b)
  if (sum(calloc-cres)>1e-4) call abort
  if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
  deallocate(calloc)
  
  block
    real :: aa(a1, a2), bb(b1, b2), cc(c1, c2)
    aa = a
    bb = b

    cc = matmul(aa,bb)
    if (sum(cc-cres)>1e-4) call abort
    calloc = matmul(aa,bb)
    if (sum(calloc-cres)>1e-4) call abort
    if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
    calloc = 42.
    deallocate(calloc)

    allocate(calloc(4,4))
    calloc = matmul(aa,bb)
    if (sum(calloc-cres)>1e-4) call abort
    if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
    deallocate(calloc)

    allocate(calloc(3,3))
    calloc = matmul(aa,bb)
    if (sum(calloc-cres)>1e-4) call abort
    if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
    deallocate(calloc)
  end block

end program main
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 8 "optimized" } }
! { dg-final { cleanup-tree-dump "optimized" } }
! { dg-do  run }
! { dg-options "-O3 -finline-matmul-limit=10 -fdump-tree-optimized -fdump-tree-original" }
! PR 37131 - all calls to matmul should be optimized away with -O3
! and the high limit.
program main
  real, dimension(3,2) :: a
  real, dimension(2,4) :: b
  real, dimension(3,4) :: c
  real, dimension(3,4) :: cres
  real, dimension(:,:), allocatable :: calloc
  integer :: a1 = size(a,1), a2 = size(a,2)
  integer :: b1 = size(b,1), b2 = size(b,2)
  integer :: c1 = size(c,1), c2 = size(c,2)

  data a / 2.,  -3.,  5.,  -7., 11., -13./
  data b /17., -23., 29., -31., 37., -39., 41., -47./
  data cres /195., -304.,  384.,  275., -428.,  548.,  347., -540.,  692.,  411., -640.,  816./
  c = matmul(a,b)
  if (sum(c-cres)>1e-4) call abort

  calloc = matmul(a,b)
  if (sum(calloc-cres)>1e-4) call abort
  if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
  deallocate(calloc)

  allocate(calloc(4,4))
  calloc = matmul(a,b)
  if (sum(calloc-cres)>1e-4) call abort
  if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
  deallocate(calloc)

  allocate(calloc(3,3))
  calloc = matmul(a,b)
  if (sum(calloc-cres)>1e-4) call abort
  if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
  deallocate(calloc)
  
  block
    real :: aa(a1, a2), bb(b1, b2), cc(c1, c2)
    aa = a
    bb = b

    cc = matmul(aa,bb)
    if (sum(cc-cres)>1e-4) call abort
    calloc = matmul(aa,bb)
    if (sum(calloc-cres)>1e-4) call abort
    if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
    calloc = 42.
    deallocate(calloc)

    allocate(calloc(4,4))
    calloc = matmul(aa,bb)
    if (sum(calloc-cres)>1e-4) call abort
    if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
    deallocate(calloc)

    allocate(calloc(3,3))
    calloc = matmul(aa,bb)
    if (sum(calloc-cres)>1e-4) call abort
    if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
    deallocate(calloc)
  end block

end program main
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 4 "original" } }
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
! { dg-final { cleanup-tree-dump "original" } }
! { dg-final { cleanup-tree-dump "optimized" } }
! { dg-do  run }
! { dg-options "-ffrontend-optimize" }
program main

  real, dimension(2,2) :: a,b,c

  data a /2., 4., 8., 16. /
  data b /3., 9., 27., 81./

  c = matmul(a,b)
  a = matmul(a,b)
  if (any(a /= c)) call abort
end program main

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]