[PATCH] RFC: Use Levenshtein spelling suggestions in Fortran FE

David Malcolm dmalcolm@redhat.com
Tue Dec 1 17:28:00 GMT 2015


On Tue, 2015-12-01 at 13:55 +0100, Bernhard Reutner-Fischer wrote:
> gcc/fortran/ChangeLog
> 
> 2015-11-29  Bernhard Reutner-Fischer  <aldot@gcc.gnu.org>
> 
> 	* gfortran.h (gfc_lookup_function_fuzzy): New declaration.
> 	* resolve.c: Include spellcheck.h.
> 	(lookup_function_fuzzy_find_candidates): New static function.
> 	(lookup_uop_fuzzy_find_candidates): Likewise.
> 	(lookup_uop_fuzzy): Likewise.
> 	(resolve_operator) <INTRINSIC_USER>: Call lookup_uop_fuzzy.
> 	(gfc_lookup_function_fuzzy): New definition.
> 	(resolve_unknown_f): Call gfc_lookup_function_fuzzy.
> 	* interface.c (check_interface0): Likewise.
> 	* symbol.c: Include spellcheck.h.
> 	(lookup_symbol_fuzzy_find_candidates): New static function.
> 	(lookup_symbol_fuzzy): Likewise.
> 	(gfc_set_default_type): Call lookup_symbol_fuzzy.
> 	(lookup_component_fuzzy_find_candidates): New static function.
> 	(lookup_component_fuzzy): Likewise.
> 	(gfc_find_component): Call lookup_component_fuzzy.
> 
> gcc/testsuite/ChangeLog
> 
> 2015-11-29  Bernhard Reutner-Fischer  <aldot@gcc.gnu.org>
> 
> 	* gfortran.dg/spellcheck-operator.f90: New testcase.
> 	* gfortran.dg/spellcheck-procedure.f90: New testcase.
> 	* gfortran.dg/spellcheck-structure.f90: New testcase.
> 
> ---
> 
> David Malcolm nice Levenshtein distance spelling check helpers
> were used in some parts of other frontends. This proposed patch adds
> some spelling corrections to the fortran frontend.
> 
> Suggestions are printed if we can find a suitable name, currently
> perusing a very simple cutoff factor:
> /* If more than half of the letters were misspelled, the suggestion is
>    likely to be meaningless.  */
> cutoff = MAX (strlen (typo), strlen (best_guess)) / 2;
> which effectively skips names with less than 4 characters.
> For e.g. structures, one could try to be much smarter in an attempt to
> also provide suggestions for single-letter members/components.
> 
> This patch covers (at least partly):
> - user-defined operators
> - structures (types and their components)
> - functions
> - symbols (variables)
> 
> I do not immediately see how to handle subroutines. Ideas?
> 
> If anybody has a testcase where a spelling-suggestion would make sense
> then please pass it along so we maybe can add support for GCC-7.
> 
> Signed-off-by: Bernhard Reutner-Fischer <rep.dot.nop@gmail.com>
> ---
>  gcc/fortran/gfortran.h                             |   1 +
>  gcc/fortran/interface.c                            |  16 ++-
>  gcc/fortran/resolve.c                              | 135 ++++++++++++++++++++-
>  gcc/fortran/symbol.c                               | 129 +++++++++++++++++++-
>  gcc/testsuite/gfortran.dg/spellcheck-operator.f90  |  30 +++++
>  gcc/testsuite/gfortran.dg/spellcheck-procedure.f90 |  41 +++++++
>  gcc/testsuite/gfortran.dg/spellcheck-structure.f90 |  35 ++++++
>  7 files changed, 376 insertions(+), 11 deletions(-)
>  create mode 100644 gcc/testsuite/gfortran.dg/spellcheck-operator.f90
>  create mode 100644 gcc/testsuite/gfortran.dg/spellcheck-procedure.f90
>  create mode 100644 gcc/testsuite/gfortran.dg/spellcheck-structure.f90
> 
> diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
> index 5487c93..cbfd592 100644
> --- a/gcc/fortran/gfortran.h
> +++ b/gcc/fortran/gfortran.h
> @@ -3060,6 +3060,7 @@ bool gfc_type_is_extensible (gfc_symbol *);
>  bool gfc_resolve_intrinsic (gfc_symbol *, locus *);
>  bool gfc_explicit_interface_required (gfc_symbol *, char *, int);
>  extern int gfc_do_concurrent_flag;
> +const char* gfc_lookup_function_fuzzy (const char *, gfc_symtree *);
>  
> 
>  /* array.c */
> diff --git a/gcc/fortran/interface.c b/gcc/fortran/interface.c
> index 30cc522..19f800f 100644
> --- a/gcc/fortran/interface.c
> +++ b/gcc/fortran/interface.c
> @@ -1590,10 +1590,18 @@ check_interface0 (gfc_interface *p, const char *interface_name)
>  	  if (p->sym->attr.external)
>  	    gfc_error ("Procedure %qs in %s at %L has no explicit interface",
>  		       p->sym->name, interface_name, &p->sym->declared_at);
> -	  else
> -	    gfc_error ("Procedure %qs in %s at %L is neither function nor "
> -		       "subroutine", p->sym->name, interface_name,
> -		      &p->sym->declared_at);
> +	  else {
> +	    const char *guessed
> +	      = gfc_lookup_function_fuzzy (p->sym->name, p->sym->ns->sym_root);
> +	    if (guessed)
> +	      gfc_error ("Procedure %qs in %s at %L is neither function nor "
> +			 "subroutine; did you mean %qs?", p->sym->name,
> +			interface_name, &p->sym->declared_at, guessed);
> +	    else
> +	      gfc_error ("Procedure %qs in %s at %L is neither function nor "
> +			 "subroutine", p->sym->name, interface_name,
> +			&p->sym->declared_at);
> +	  }
>  	  return 1;
>  	}
>  
> diff --git a/gcc/fortran/resolve.c b/gcc/fortran/resolve.c
> index 685e3f5..6e1f63c 100644
> --- a/gcc/fortran/resolve.c
> +++ b/gcc/fortran/resolve.c
> @@ -29,6 +29,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "data.h"
>  #include "target-memory.h" /* for gfc_simplify_transfer */
>  #include "constructor.h"
> +#include "spellcheck.h"
>  
>  /* Types used in equivalence statements.  */
>  
> @@ -2682,6 +2683,61 @@ resolve_specific_f (gfc_expr *expr)
>    return true;
>  }
>  
> +/* Recursively append candidate SYM to CANDIDATES.  */
> +
> +static void
> +lookup_function_fuzzy_find_candidates (gfc_symtree *sym,
> +                                       vec<const char *> *candidates)
> +{
> +  gfc_symtree *p;
> +  for (p = sym->right; p; p = p->right)
> +    {
> +      lookup_function_fuzzy_find_candidates (p, candidates);
> +      if (p->n.sym->ts.type != BT_UNKNOWN)
> +	candidates->safe_push (p->name);
> +    }
> +  for (p = sym->left; p; p = p->left)
> +    {
> +      lookup_function_fuzzy_find_candidates (p, candidates);
> +      if (p->n.sym->ts.type != BT_UNKNOWN)
> +	candidates->safe_push (p->name);
> +    }
> +}
> +
> +
> +/* Lookup function FN fuzzily, taking names in FUN into account.  */
> +
> +const char*
> +gfc_lookup_function_fuzzy (const char *fn, gfc_symtree *fun)
> +{
> +  auto_vec <const char *> candidates;
> +  lookup_function_fuzzy_find_candidates (fun, &candidates);
> +
> +  /* Determine closest match.  */
> +  int i;
> +  const char *name, *best = NULL;
> +  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +
> +  FOR_EACH_VEC_ELT (candidates, i, name)
> +    {
> +      edit_distance_t dist = levenshtein_distance (fn, name);
> +      if (dist < best_distance)
> +	{
> +	  best_distance = dist;
> +	  best = name;
> +	}
> +    }
> +  /* If more than half of the letters were misspelled, the suggestion is
> +     likely to be meaningless.  */
> +  if (best)
> +    {
> +      unsigned int cutoff = MAX (strlen (fn), strlen (best)) / 2;
> +      if (best_distance > cutoff)
> +	return NULL;
> +    }
> +  return best;
> +}


Caveat: I'm not very familiar with the Fortran FE, so take the following
with a pinch of salt.

If I'm reading things right, here, and in various other places, you're
building a vec of const char *, and then seeing which one of those
candidates is the best match for another const char *.

You could simplify things by adding a helper function to spellcheck.h,
akin to this one:

extern tree
find_closest_identifier (tree target, const auto_vec<tree> *candidates);

This would reduce the amount of duplication in the patch (and slightly
reduce the amount of C++).

[are there IDENTIFIER nodes in the Fortran FE, or is it all const char
*? this would avoid some strlen calls]
 
>  /* Resolve a procedure call not known to be generic nor specific.  */
>  
> @@ -2732,8 +2788,15 @@ set_type:
>  
>        if (ts->type == BT_UNKNOWN)
>  	{
> -	  gfc_error ("Function %qs at %L has no IMPLICIT type",
> -		     sym->name, &expr->where);
> +	  const char *guessed
> +	    = gfc_lookup_function_fuzzy (sym->name, sym->ns->sym_root);
> +	  if (guessed)
> +	    gfc_error ("Function %qs at %L has no IMPLICIT type"
> +		       "; did you mean %qs?",
> +		       sym->name, &expr->where, guessed);
> +	  else
> +	    gfc_error ("Function %qs at %L has no IMPLICIT type",
> +		       sym->name, &expr->where);
>  	  return false;
>  	}
>        else
> @@ -3504,6 +3567,63 @@ compare_shapes (gfc_expr *op1, gfc_expr *op2)
>    return t;
>  }
>  
> +/* Recursively append candidate UOP to CANDIDATES.  */
> +
> +static void
> +lookup_uop_fuzzy_find_candidates (gfc_symtree *uop,
> +				  vec<const char *> *candidates)
> +{
> +  gfc_symtree *p;
> +  /* Not sure how to properly filter here.  Use all for a start.
> +     n.uop.op is NULL for empty interface operators (is that legal?) disregard
> +     these as i suppose they don't make terribly sense.  */
> +  for (p = uop->right; p; p = p->right)
> +    {
> +      lookup_function_fuzzy_find_candidates (p, candidates);
> +      if (p->n.uop->op != NULL)
> +	candidates->safe_push (p->name);
> +    }
> +  for (p = uop->left; p; p = p->left)
> +    {
> +      lookup_function_fuzzy_find_candidates (p, candidates);
> +      if (p->n.uop->op != NULL)
> +	candidates->safe_push (p->name);
> +    }
> +}
> +
> +/* Lookup user-operator OP fuzzily, taking names in UOP into account.  */
> +
> +static const char*
> +lookup_uop_fuzzy (const char *op, gfc_symtree *uop)
> +{
> +  auto_vec <const char *> candidates;
> +  lookup_uop_fuzzy_find_candidates (uop, &candidates);
> +
> +  /* Determine closest match.  */
> +  int i;
> +  const char *name, *best = NULL;
> +  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +
> +  FOR_EACH_VEC_ELT (candidates, i, name)
> +    {
> +      edit_distance_t dist = levenshtein_distance (op, name);
> +      if (dist < best_distance)
> +	{
> +	  best_distance = dist;
> +	  best = name;
> +	}
> +    }
> +  /* If more than half of the letters were misspelled, the suggestion is
> +     likely to be meaningless.  */
> +  if (best)
> +    {
> +      unsigned int cutoff = MAX (strlen (op), strlen (best)) / 2;
> +      if (best_distance > cutoff)
> +	return NULL;
> +    }
> +  return best;
> +}

Here again, I think.


>  /* Resolve an operator expression node.  This can involve replacing the
>     operation with a user defined function call.  */
> @@ -3703,7 +3823,16 @@ resolve_operator (gfc_expr *e)
>  
>      case INTRINSIC_USER:
>        if (e->value.op.uop->op == NULL)
> -	sprintf (msg, _("Unknown operator '%s' at %%L"), e->value.op.uop->name);
> +	{
> +	  const char *name = e->value.op.uop->name;
> +	  const char *guessed;
> +	  guessed = lookup_uop_fuzzy (name, e->value.op.uop->ns->uop_root);
> +	  if (guessed)
> +	    sprintf (msg, _("Unknown operator '%s' at %%L; did you mean '%s'?"),
> +		name, guessed);
> +	  else
> +	    sprintf (msg, _("Unknown operator '%s' at %%L"), name);
> +	}
>        else if (op2 == NULL)
>  	sprintf (msg, _("Operand of user operator '%s' at %%L is %s"),
>  		 e->value.op.uop->name, gfc_typename (&op1->ts));
> diff --git a/gcc/fortran/symbol.c b/gcc/fortran/symbol.c
> index ff9aff9..212f7d8 100644
> --- a/gcc/fortran/symbol.c
> +++ b/gcc/fortran/symbol.c
> @@ -27,6 +27,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "parse.h"
>  #include "match.h"
>  #include "constructor.h"
> +#include "spellcheck.h"
>  
> 
>  /* Strings for all symbol attributes.  We use these for dumping the
> @@ -235,6 +236,62 @@ gfc_get_default_type (const char *name, gfc_namespace *ns)
>  }
>  
> 
> +/* Recursively append candidate SYM to CANDIDATES.  */
> +
> +static void
> +lookup_symbol_fuzzy_find_candidates (gfc_symtree *sym,
> +				        vec<const char *> *candidates)
> +{
> +  gfc_symtree *p;
> +  for (p = sym->right; p; p = p->right)
> +    {
> +      lookup_symbol_fuzzy_find_candidates (p, candidates);
> +      if (p->n.sym->ts.type != BT_UNKNOWN)
> +	candidates->safe_push (p->name);
> +    }
> +  for (p = sym->left; p; p = p->left)
> +    {
> +      lookup_symbol_fuzzy_find_candidates (p, candidates);
> +      if (p->n.sym->ts.type != BT_UNKNOWN)
> +	candidates->safe_push (p->name);
> +    }
> +}
> +
> +
> +/* Lookup symbol SYM fuzzily, taking names in SYMBOL into account.  */
> +
> +static const char*
> +lookup_symbol_fuzzy (const char *sym, gfc_symbol *symbol)
> +{
> +  auto_vec <const char *> candidates;
> +  lookup_symbol_fuzzy_find_candidates (symbol->ns->sym_root, &candidates);
> +
> +  /* Determine closest match.  */
> +  int i;
> +  const char *name, *best = NULL;
> +  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +
> +  FOR_EACH_VEC_ELT (candidates, i, name)
> +    {
> +      edit_distance_t dist = levenshtein_distance (sym, name);
> +      if (dist < best_distance)
> +	{
> +	  best_distance = dist;
> +	  best = name;
> +	}
> +    }
> +  /* If more than half of the letters were misspelled, the suggestion is
> +     likely to be meaningless.  */
> +  if (best)
> +    {
> +      unsigned int cutoff = MAX (strlen (sym), strlen (best)) / 2;
> +      if (best_distance > cutoff)
> +	return NULL;
> +    }
> +  return best;
> +}

Here again, I think.

> +
>  /* Given a pointer to a symbol, set its type according to the first
>     letter of its name.  Fails if the letter in question has no default
>     type.  */
> @@ -253,8 +310,15 @@ gfc_set_default_type (gfc_symbol *sym, int error_flag, gfc_namespace *ns)
>      {
>        if (error_flag && !sym->attr.untyped)
>  	{
> -	  gfc_error ("Symbol %qs at %L has no IMPLICIT type",
> -		     sym->name, &sym->declared_at);
> +	  const char *guessed
> +	    = lookup_symbol_fuzzy (sym->name, sym);
> +	  if (guessed)
> +	    gfc_error ("Symbol %qs at %L has no IMPLICIT type"
> +		       "; did you mean %qs?",
> +		       sym->name, &sym->declared_at, guessed);
> +	  else
> +	    gfc_error ("Symbol %qs at %L has no IMPLICIT type",
> +		       sym->name, &sym->declared_at);
>  	  sym->attr.untyped = 1; /* Ensure we only give an error once.  */
>  	}
>  
> @@ -2188,6 +2252,55 @@ bad:
>  }
>  
> 
> +/* Recursively append candidate COMPONENT structures to CANDIDATES.  */
> +
> +static void
> +lookup_component_fuzzy_find_candidates (gfc_component *component,
> +				        vec<const char *> *candidates)
> +{
> +  for (gfc_component *p = component; p; p = p->next)
> +    {
> +      if (00 && p->ts.type == BT_DERIVED)
> +	/* ??? There's no (suitable) DERIVED_TYPE which would come in
> +	   handy throughout the frontend; Use CLASS_DATA here for brevity.  */
> +	lookup_component_fuzzy_find_candidates (CLASS_DATA (p), candidates);
> +      candidates->safe_push (p->name);
> +    }
> +}
> +
> +/* Lookup component MEMBER fuzzily, taking names in COMPONENT into account.  */
> +
> +static const char*
> +lookup_component_fuzzy (const char *member, gfc_component *component)
> +{
> +  auto_vec <const char *> candidates;
> +  lookup_component_fuzzy_find_candidates (component, &candidates);
> +
> +  /* Determine closest match.  */
> +  int i;
> +  const char *name, *best = NULL;
> +  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +
> +  FOR_EACH_VEC_ELT (candidates, i, name)
> +    {
> +      edit_distance_t dist = levenshtein_distance (member, name);
> +      if (dist < best_distance)
> +	{
> +	  best_distance = dist;
> +	  best = name;
> +	}
> +    }
> +  /* If more than half of the letters were misspelled, the suggestion is
> +     likely to be meaningless.  */
> +  if (best)
> +    {
> +      unsigned int cutoff = MAX (strlen (member), strlen (best)) / 2;
> +      if (best_distance > cutoff)
> +	return NULL;
> +    }
> +  return best;
> +}

...and here again.

>  /* Given a derived type node and a component name, try to locate the
>     component structure.  Returns the NULL pointer if the component is
>     not found or the components are private.  If noaccess is set, no access
> @@ -2238,8 +2351,16 @@ gfc_find_component (gfc_symbol *sym, const char *name,
>      }
>  
>    if (p == NULL && !silent)
> -    gfc_error ("%qs at %C is not a member of the %qs structure",
> -	       name, sym->name);
> +    {
> +      const char *guessed = lookup_component_fuzzy (name, sym->components);
> +      if (guessed)
> +	gfc_error ("%qs at %C is not a member of the %qs structure"
> +		   "; did you mean %qs?",
> +		   name, sym->name, guessed);
> +      else
> +	gfc_error ("%qs at %C is not a member of the %qs structure",
> +		   name, sym->name);
> +    }
>  
>    return p;
>  }
> diff --git a/gcc/testsuite/gfortran.dg/spellcheck-operator.f90 b/gcc/testsuite/gfortran.dg/spellcheck-operator.f90
> new file mode 100644
> index 0000000..810a770
> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/spellcheck-operator.f90
> @@ -0,0 +1,30 @@
> +! { dg-do compile }
> +! test levenshtein based spelling suggestions
> +
> +module mymod1
> +  implicit none
> +  contains
> +    function something_good (iarg1)
> +      integer :: something_good
> +      integer, intent(in) :: iarg1
> +      something_good = iarg1 + 42
> +    end function something_good
> +end module mymod1
> +
> +program spellchekc
> +  use mymod1
> +  implicit none
> +
> +  interface operator (.mywrong.)
> +    module procedure something_wring ! { dg-error "Procedure .something_wring. in operator interface .mywrong. at .1. is neither function nor subroutine; did you mean .something_good.\\?|User operator procedure .something_wring. at .1. must be a FUNCTION" }
> +  end interface
> +
> +  interface operator (.mygood.)
> +    module procedure something_good
> +  end interface
> +
> +  integer :: i, j, added
> +  i = 0
> +  j = 0
> +  added = .mygoof. j ! { dg-error "Unknown operator .mygoof. at .1.; did you mean .mygood.\\?" }
> +end program spellchekc
> diff --git a/gcc/testsuite/gfortran.dg/spellcheck-procedure.f90 b/gcc/testsuite/gfortran.dg/spellcheck-procedure.f90
> new file mode 100644
> index 0000000..7923081
> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/spellcheck-procedure.f90
> @@ -0,0 +1,41 @@
> +! { dg-do compile }
> +! test levenshtein based spelling suggestions
> +
> +module mymod1
> +  implicit none
> +  contains
> +    function something_good (iarg1)
> +      integer :: something_good
> +      integer, intent(in) :: iarg1
> +      something_good = iarg1 + 42
> +    end function something_good
> +end module mymod1
> +
> +subroutine bark_unless_zero(iarg)
> +  implicit none
> +  integer, intent(in) :: iarg
> +  if (iarg /= 0) call abort
> +end subroutine bark_unless_zero
> +
> +function myadd(iarg1, iarg2)
> +  implicit none
> +  integer :: myadd
> +  integer, intent(in) :: iarg1, iarg2
> +  myadd = iarg1 + iarg2
> +end function myadd
> +
> +program spellchekc
> +  use mymod1
> +  implicit none
> +
> +  integer :: i, j, myadd
> +  i = 0
> +  j = 0
> +! I suppose this cannot be made to work, no\\?
> +!  call barf_unless_zero(i) ! { -dg-error "; did you mean .bark_unless_zero.\\?" }
> +  j = something_goof(j) ! { dg-error "no IMPLICIT type; did you mean .something_good.\\?" }
> +  j = myaddd(i, j) ! { dg-error "no IMPLICIT type; did you mean .myadd.\\?" }
> +  j = mya(i, j) ! { dg-error "no IMPLICIT type; did you mean .myadd.\\?" }
> +  if (j /= 42) call abort
> +
> +end program spellchekc
> diff --git a/gcc/testsuite/gfortran.dg/spellcheck-structure.f90 b/gcc/testsuite/gfortran.dg/spellcheck-structure.f90
> new file mode 100644
> index 0000000..929e05f
> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/spellcheck-structure.f90
> @@ -0,0 +1,35 @@
> +! { dg-do compile }
> +! test levenshtein based spelling suggestions
> +implicit none
> +
> +!!!!!!!!!!!!!! structure tests !!!!!!!!!!!!!!
> +type type1
> +   real :: radius
> +   integer :: i
> +end type type1
> +
> +type type2
> +  integer :: myint
> +  type(type1) :: mytype
> +end type type2
> +
> +type type3
> +  type(type2) :: type_2
> +end type type3
> +type type4
> +  type(type3) :: type_3
> +end type type4
> +
> +type(type1) :: t1
> +t1%radiuz = .0 ! { dg-error ".radiuz. at .1. is not a member of the .type1. structure; did you mean .radius.\\?" }
> +t1%x = .0 ! { dg-error ".x. at .1. is not a member of the .type1. structure" }
> +type(type2) :: t2
> +t2%mytape%radius = .0 ! { dg-error ".mytape. at .1. is not a member of the .type2. structure; did you mean .mytype.\\?" }
> +t2%mytype%radious = .0 ! { dg-error ".radious. at .1. is not a member of the .type1. structure; did you mean .radius.\\?" }
> +type(type4) :: t4
> +t4%type_3%type_2%mytype%radium = 88.0 ! { dg-error ".radium. at .1. is not a member of the .type1. structure; did you mean .radius.\\?" }
> +
> +!!!!!!!!!!!!!! symbol tests !!!!!!!!!!!!!!
> +integer :: iarg1
> +iarg2 = 1 ! { dg-error "Symbol .iarg2. at .1. has no IMPLICIT type; did you mean .iarg1.\\?" }
> +end




More information about the Gcc-patches mailing list