[PATCH] RFC: Use Levenshtein spelling suggestions in Fortran FE
David Malcolm
dmalcolm@redhat.com
Tue Dec 1 17:28:00 GMT 2015
On Tue, 2015-12-01 at 13:55 +0100, Bernhard Reutner-Fischer wrote:
> gcc/fortran/ChangeLog
>
> 2015-11-29 Bernhard Reutner-Fischer <aldot@gcc.gnu.org>
>
> * gfortran.h (gfc_lookup_function_fuzzy): New declaration.
> * resolve.c: Include spellcheck.h.
> (lookup_function_fuzzy_find_candidates): New static function.
> (lookup_uop_fuzzy_find_candidates): Likewise.
> (lookup_uop_fuzzy): Likewise.
> (resolve_operator) <INTRINSIC_USER>: Call lookup_uop_fuzzy.
> (gfc_lookup_function_fuzzy): New definition.
> (resolve_unknown_f): Call gfc_lookup_function_fuzzy.
> * interface.c (check_interface0): Likewise.
> * symbol.c: Include spellcheck.h.
> (lookup_symbol_fuzzy_find_candidates): New static function.
> (lookup_symbol_fuzzy): Likewise.
> (gfc_set_default_type): Call lookup_symbol_fuzzy.
> (lookup_component_fuzzy_find_candidates): New static function.
> (lookup_component_fuzzy): Likewise.
> (gfc_find_component): Call lookup_component_fuzzy.
>
> gcc/testsuite/ChangeLog
>
> 2015-11-29 Bernhard Reutner-Fischer <aldot@gcc.gnu.org>
>
> * gfortran.dg/spellcheck-operator.f90: New testcase.
> * gfortran.dg/spellcheck-procedure.f90: New testcase.
> * gfortran.dg/spellcheck-structure.f90: New testcase.
>
> ---
>
> David Malcolm nice Levenshtein distance spelling check helpers
> were used in some parts of other frontends. This proposed patch adds
> some spelling corrections to the fortran frontend.
>
> Suggestions are printed if we can find a suitable name, currently
> perusing a very simple cutoff factor:
> /* If more than half of the letters were misspelled, the suggestion is
> likely to be meaningless. */
> cutoff = MAX (strlen (typo), strlen (best_guess)) / 2;
> which effectively skips names with less than 4 characters.
> For e.g. structures, one could try to be much smarter in an attempt to
> also provide suggestions for single-letter members/components.
>
> This patch covers (at least partly):
> - user-defined operators
> - structures (types and their components)
> - functions
> - symbols (variables)
>
> I do not immediately see how to handle subroutines. Ideas?
>
> If anybody has a testcase where a spelling-suggestion would make sense
> then please pass it along so we maybe can add support for GCC-7.
>
> Signed-off-by: Bernhard Reutner-Fischer <rep.dot.nop@gmail.com>
> ---
> gcc/fortran/gfortran.h | 1 +
> gcc/fortran/interface.c | 16 ++-
> gcc/fortran/resolve.c | 135 ++++++++++++++++++++-
> gcc/fortran/symbol.c | 129 +++++++++++++++++++-
> gcc/testsuite/gfortran.dg/spellcheck-operator.f90 | 30 +++++
> gcc/testsuite/gfortran.dg/spellcheck-procedure.f90 | 41 +++++++
> gcc/testsuite/gfortran.dg/spellcheck-structure.f90 | 35 ++++++
> 7 files changed, 376 insertions(+), 11 deletions(-)
> create mode 100644 gcc/testsuite/gfortran.dg/spellcheck-operator.f90
> create mode 100644 gcc/testsuite/gfortran.dg/spellcheck-procedure.f90
> create mode 100644 gcc/testsuite/gfortran.dg/spellcheck-structure.f90
>
> diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
> index 5487c93..cbfd592 100644
> --- a/gcc/fortran/gfortran.h
> +++ b/gcc/fortran/gfortran.h
> @@ -3060,6 +3060,7 @@ bool gfc_type_is_extensible (gfc_symbol *);
> bool gfc_resolve_intrinsic (gfc_symbol *, locus *);
> bool gfc_explicit_interface_required (gfc_symbol *, char *, int);
> extern int gfc_do_concurrent_flag;
> +const char* gfc_lookup_function_fuzzy (const char *, gfc_symtree *);
>
>
> /* array.c */
> diff --git a/gcc/fortran/interface.c b/gcc/fortran/interface.c
> index 30cc522..19f800f 100644
> --- a/gcc/fortran/interface.c
> +++ b/gcc/fortran/interface.c
> @@ -1590,10 +1590,18 @@ check_interface0 (gfc_interface *p, const char *interface_name)
> if (p->sym->attr.external)
> gfc_error ("Procedure %qs in %s at %L has no explicit interface",
> p->sym->name, interface_name, &p->sym->declared_at);
> - else
> - gfc_error ("Procedure %qs in %s at %L is neither function nor "
> - "subroutine", p->sym->name, interface_name,
> - &p->sym->declared_at);
> + else {
> + const char *guessed
> + = gfc_lookup_function_fuzzy (p->sym->name, p->sym->ns->sym_root);
> + if (guessed)
> + gfc_error ("Procedure %qs in %s at %L is neither function nor "
> + "subroutine; did you mean %qs?", p->sym->name,
> + interface_name, &p->sym->declared_at, guessed);
> + else
> + gfc_error ("Procedure %qs in %s at %L is neither function nor "
> + "subroutine", p->sym->name, interface_name,
> + &p->sym->declared_at);
> + }
> return 1;
> }
>
> diff --git a/gcc/fortran/resolve.c b/gcc/fortran/resolve.c
> index 685e3f5..6e1f63c 100644
> --- a/gcc/fortran/resolve.c
> +++ b/gcc/fortran/resolve.c
> @@ -29,6 +29,7 @@ along with GCC; see the file COPYING3. If not see
> #include "data.h"
> #include "target-memory.h" /* for gfc_simplify_transfer */
> #include "constructor.h"
> +#include "spellcheck.h"
>
> /* Types used in equivalence statements. */
>
> @@ -2682,6 +2683,61 @@ resolve_specific_f (gfc_expr *expr)
> return true;
> }
>
> +/* Recursively append candidate SYM to CANDIDATES. */
> +
> +static void
> +lookup_function_fuzzy_find_candidates (gfc_symtree *sym,
> + vec<const char *> *candidates)
> +{
> + gfc_symtree *p;
> + for (p = sym->right; p; p = p->right)
> + {
> + lookup_function_fuzzy_find_candidates (p, candidates);
> + if (p->n.sym->ts.type != BT_UNKNOWN)
> + candidates->safe_push (p->name);
> + }
> + for (p = sym->left; p; p = p->left)
> + {
> + lookup_function_fuzzy_find_candidates (p, candidates);
> + if (p->n.sym->ts.type != BT_UNKNOWN)
> + candidates->safe_push (p->name);
> + }
> +}
> +
> +
> +/* Lookup function FN fuzzily, taking names in FUN into account. */
> +
> +const char*
> +gfc_lookup_function_fuzzy (const char *fn, gfc_symtree *fun)
> +{
> + auto_vec <const char *> candidates;
> + lookup_function_fuzzy_find_candidates (fun, &candidates);
> +
> + /* Determine closest match. */
> + int i;
> + const char *name, *best = NULL;
> + edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +
> + FOR_EACH_VEC_ELT (candidates, i, name)
> + {
> + edit_distance_t dist = levenshtein_distance (fn, name);
> + if (dist < best_distance)
> + {
> + best_distance = dist;
> + best = name;
> + }
> + }
> + /* If more than half of the letters were misspelled, the suggestion is
> + likely to be meaningless. */
> + if (best)
> + {
> + unsigned int cutoff = MAX (strlen (fn), strlen (best)) / 2;
> + if (best_distance > cutoff)
> + return NULL;
> + }
> + return best;
> +}
Caveat: I'm not very familiar with the Fortran FE, so take the following
with a pinch of salt.
If I'm reading things right, here, and in various other places, you're
building a vec of const char *, and then seeing which one of those
candidates is the best match for another const char *.
You could simplify things by adding a helper function to spellcheck.h,
akin to this one:
extern tree
find_closest_identifier (tree target, const auto_vec<tree> *candidates);
This would reduce the amount of duplication in the patch (and slightly
reduce the amount of C++).
[are there IDENTIFIER nodes in the Fortran FE, or is it all const char
*? this would avoid some strlen calls]
> /* Resolve a procedure call not known to be generic nor specific. */
>
> @@ -2732,8 +2788,15 @@ set_type:
>
> if (ts->type == BT_UNKNOWN)
> {
> - gfc_error ("Function %qs at %L has no IMPLICIT type",
> - sym->name, &expr->where);
> + const char *guessed
> + = gfc_lookup_function_fuzzy (sym->name, sym->ns->sym_root);
> + if (guessed)
> + gfc_error ("Function %qs at %L has no IMPLICIT type"
> + "; did you mean %qs?",
> + sym->name, &expr->where, guessed);
> + else
> + gfc_error ("Function %qs at %L has no IMPLICIT type",
> + sym->name, &expr->where);
> return false;
> }
> else
> @@ -3504,6 +3567,63 @@ compare_shapes (gfc_expr *op1, gfc_expr *op2)
> return t;
> }
>
> +/* Recursively append candidate UOP to CANDIDATES. */
> +
> +static void
> +lookup_uop_fuzzy_find_candidates (gfc_symtree *uop,
> + vec<const char *> *candidates)
> +{
> + gfc_symtree *p;
> + /* Not sure how to properly filter here. Use all for a start.
> + n.uop.op is NULL for empty interface operators (is that legal?) disregard
> + these as i suppose they don't make terribly sense. */
> + for (p = uop->right; p; p = p->right)
> + {
> + lookup_function_fuzzy_find_candidates (p, candidates);
> + if (p->n.uop->op != NULL)
> + candidates->safe_push (p->name);
> + }
> + for (p = uop->left; p; p = p->left)
> + {
> + lookup_function_fuzzy_find_candidates (p, candidates);
> + if (p->n.uop->op != NULL)
> + candidates->safe_push (p->name);
> + }
> +}
> +
> +/* Lookup user-operator OP fuzzily, taking names in UOP into account. */
> +
> +static const char*
> +lookup_uop_fuzzy (const char *op, gfc_symtree *uop)
> +{
> + auto_vec <const char *> candidates;
> + lookup_uop_fuzzy_find_candidates (uop, &candidates);
> +
> + /* Determine closest match. */
> + int i;
> + const char *name, *best = NULL;
> + edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +
> + FOR_EACH_VEC_ELT (candidates, i, name)
> + {
> + edit_distance_t dist = levenshtein_distance (op, name);
> + if (dist < best_distance)
> + {
> + best_distance = dist;
> + best = name;
> + }
> + }
> + /* If more than half of the letters were misspelled, the suggestion is
> + likely to be meaningless. */
> + if (best)
> + {
> + unsigned int cutoff = MAX (strlen (op), strlen (best)) / 2;
> + if (best_distance > cutoff)
> + return NULL;
> + }
> + return best;
> +}
Here again, I think.
> /* Resolve an operator expression node. This can involve replacing the
> operation with a user defined function call. */
> @@ -3703,7 +3823,16 @@ resolve_operator (gfc_expr *e)
>
> case INTRINSIC_USER:
> if (e->value.op.uop->op == NULL)
> - sprintf (msg, _("Unknown operator '%s' at %%L"), e->value.op.uop->name);
> + {
> + const char *name = e->value.op.uop->name;
> + const char *guessed;
> + guessed = lookup_uop_fuzzy (name, e->value.op.uop->ns->uop_root);
> + if (guessed)
> + sprintf (msg, _("Unknown operator '%s' at %%L; did you mean '%s'?"),
> + name, guessed);
> + else
> + sprintf (msg, _("Unknown operator '%s' at %%L"), name);
> + }
> else if (op2 == NULL)
> sprintf (msg, _("Operand of user operator '%s' at %%L is %s"),
> e->value.op.uop->name, gfc_typename (&op1->ts));
> diff --git a/gcc/fortran/symbol.c b/gcc/fortran/symbol.c
> index ff9aff9..212f7d8 100644
> --- a/gcc/fortran/symbol.c
> +++ b/gcc/fortran/symbol.c
> @@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. If not see
> #include "parse.h"
> #include "match.h"
> #include "constructor.h"
> +#include "spellcheck.h"
>
>
> /* Strings for all symbol attributes. We use these for dumping the
> @@ -235,6 +236,62 @@ gfc_get_default_type (const char *name, gfc_namespace *ns)
> }
>
>
> +/* Recursively append candidate SYM to CANDIDATES. */
> +
> +static void
> +lookup_symbol_fuzzy_find_candidates (gfc_symtree *sym,
> + vec<const char *> *candidates)
> +{
> + gfc_symtree *p;
> + for (p = sym->right; p; p = p->right)
> + {
> + lookup_symbol_fuzzy_find_candidates (p, candidates);
> + if (p->n.sym->ts.type != BT_UNKNOWN)
> + candidates->safe_push (p->name);
> + }
> + for (p = sym->left; p; p = p->left)
> + {
> + lookup_symbol_fuzzy_find_candidates (p, candidates);
> + if (p->n.sym->ts.type != BT_UNKNOWN)
> + candidates->safe_push (p->name);
> + }
> +}
> +
> +
> +/* Lookup symbol SYM fuzzily, taking names in SYMBOL into account. */
> +
> +static const char*
> +lookup_symbol_fuzzy (const char *sym, gfc_symbol *symbol)
> +{
> + auto_vec <const char *> candidates;
> + lookup_symbol_fuzzy_find_candidates (symbol->ns->sym_root, &candidates);
> +
> + /* Determine closest match. */
> + int i;
> + const char *name, *best = NULL;
> + edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +
> + FOR_EACH_VEC_ELT (candidates, i, name)
> + {
> + edit_distance_t dist = levenshtein_distance (sym, name);
> + if (dist < best_distance)
> + {
> + best_distance = dist;
> + best = name;
> + }
> + }
> + /* If more than half of the letters were misspelled, the suggestion is
> + likely to be meaningless. */
> + if (best)
> + {
> + unsigned int cutoff = MAX (strlen (sym), strlen (best)) / 2;
> + if (best_distance > cutoff)
> + return NULL;
> + }
> + return best;
> +}
Here again, I think.
> +
> /* Given a pointer to a symbol, set its type according to the first
> letter of its name. Fails if the letter in question has no default
> type. */
> @@ -253,8 +310,15 @@ gfc_set_default_type (gfc_symbol *sym, int error_flag, gfc_namespace *ns)
> {
> if (error_flag && !sym->attr.untyped)
> {
> - gfc_error ("Symbol %qs at %L has no IMPLICIT type",
> - sym->name, &sym->declared_at);
> + const char *guessed
> + = lookup_symbol_fuzzy (sym->name, sym);
> + if (guessed)
> + gfc_error ("Symbol %qs at %L has no IMPLICIT type"
> + "; did you mean %qs?",
> + sym->name, &sym->declared_at, guessed);
> + else
> + gfc_error ("Symbol %qs at %L has no IMPLICIT type",
> + sym->name, &sym->declared_at);
> sym->attr.untyped = 1; /* Ensure we only give an error once. */
> }
>
> @@ -2188,6 +2252,55 @@ bad:
> }
>
>
> +/* Recursively append candidate COMPONENT structures to CANDIDATES. */
> +
> +static void
> +lookup_component_fuzzy_find_candidates (gfc_component *component,
> + vec<const char *> *candidates)
> +{
> + for (gfc_component *p = component; p; p = p->next)
> + {
> + if (00 && p->ts.type == BT_DERIVED)
> + /* ??? There's no (suitable) DERIVED_TYPE which would come in
> + handy throughout the frontend; Use CLASS_DATA here for brevity. */
> + lookup_component_fuzzy_find_candidates (CLASS_DATA (p), candidates);
> + candidates->safe_push (p->name);
> + }
> +}
> +
> +/* Lookup component MEMBER fuzzily, taking names in COMPONENT into account. */
> +
> +static const char*
> +lookup_component_fuzzy (const char *member, gfc_component *component)
> +{
> + auto_vec <const char *> candidates;
> + lookup_component_fuzzy_find_candidates (component, &candidates);
> +
> + /* Determine closest match. */
> + int i;
> + const char *name, *best = NULL;
> + edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +
> + FOR_EACH_VEC_ELT (candidates, i, name)
> + {
> + edit_distance_t dist = levenshtein_distance (member, name);
> + if (dist < best_distance)
> + {
> + best_distance = dist;
> + best = name;
> + }
> + }
> + /* If more than half of the letters were misspelled, the suggestion is
> + likely to be meaningless. */
> + if (best)
> + {
> + unsigned int cutoff = MAX (strlen (member), strlen (best)) / 2;
> + if (best_distance > cutoff)
> + return NULL;
> + }
> + return best;
> +}
...and here again.
> /* Given a derived type node and a component name, try to locate the
> component structure. Returns the NULL pointer if the component is
> not found or the components are private. If noaccess is set, no access
> @@ -2238,8 +2351,16 @@ gfc_find_component (gfc_symbol *sym, const char *name,
> }
>
> if (p == NULL && !silent)
> - gfc_error ("%qs at %C is not a member of the %qs structure",
> - name, sym->name);
> + {
> + const char *guessed = lookup_component_fuzzy (name, sym->components);
> + if (guessed)
> + gfc_error ("%qs at %C is not a member of the %qs structure"
> + "; did you mean %qs?",
> + name, sym->name, guessed);
> + else
> + gfc_error ("%qs at %C is not a member of the %qs structure",
> + name, sym->name);
> + }
>
> return p;
> }
> diff --git a/gcc/testsuite/gfortran.dg/spellcheck-operator.f90 b/gcc/testsuite/gfortran.dg/spellcheck-operator.f90
> new file mode 100644
> index 0000000..810a770
> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/spellcheck-operator.f90
> @@ -0,0 +1,30 @@
> +! { dg-do compile }
> +! test levenshtein based spelling suggestions
> +
> +module mymod1
> + implicit none
> + contains
> + function something_good (iarg1)
> + integer :: something_good
> + integer, intent(in) :: iarg1
> + something_good = iarg1 + 42
> + end function something_good
> +end module mymod1
> +
> +program spellchekc
> + use mymod1
> + implicit none
> +
> + interface operator (.mywrong.)
> + module procedure something_wring ! { dg-error "Procedure .something_wring. in operator interface .mywrong. at .1. is neither function nor subroutine; did you mean .something_good.\\?|User operator procedure .something_wring. at .1. must be a FUNCTION" }
> + end interface
> +
> + interface operator (.mygood.)
> + module procedure something_good
> + end interface
> +
> + integer :: i, j, added
> + i = 0
> + j = 0
> + added = .mygoof. j ! { dg-error "Unknown operator .mygoof. at .1.; did you mean .mygood.\\?" }
> +end program spellchekc
> diff --git a/gcc/testsuite/gfortran.dg/spellcheck-procedure.f90 b/gcc/testsuite/gfortran.dg/spellcheck-procedure.f90
> new file mode 100644
> index 0000000..7923081
> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/spellcheck-procedure.f90
> @@ -0,0 +1,41 @@
> +! { dg-do compile }
> +! test levenshtein based spelling suggestions
> +
> +module mymod1
> + implicit none
> + contains
> + function something_good (iarg1)
> + integer :: something_good
> + integer, intent(in) :: iarg1
> + something_good = iarg1 + 42
> + end function something_good
> +end module mymod1
> +
> +subroutine bark_unless_zero(iarg)
> + implicit none
> + integer, intent(in) :: iarg
> + if (iarg /= 0) call abort
> +end subroutine bark_unless_zero
> +
> +function myadd(iarg1, iarg2)
> + implicit none
> + integer :: myadd
> + integer, intent(in) :: iarg1, iarg2
> + myadd = iarg1 + iarg2
> +end function myadd
> +
> +program spellchekc
> + use mymod1
> + implicit none
> +
> + integer :: i, j, myadd
> + i = 0
> + j = 0
> +! I suppose this cannot be made to work, no\\?
> +! call barf_unless_zero(i) ! { -dg-error "; did you mean .bark_unless_zero.\\?" }
> + j = something_goof(j) ! { dg-error "no IMPLICIT type; did you mean .something_good.\\?" }
> + j = myaddd(i, j) ! { dg-error "no IMPLICIT type; did you mean .myadd.\\?" }
> + j = mya(i, j) ! { dg-error "no IMPLICIT type; did you mean .myadd.\\?" }
> + if (j /= 42) call abort
> +
> +end program spellchekc
> diff --git a/gcc/testsuite/gfortran.dg/spellcheck-structure.f90 b/gcc/testsuite/gfortran.dg/spellcheck-structure.f90
> new file mode 100644
> index 0000000..929e05f
> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/spellcheck-structure.f90
> @@ -0,0 +1,35 @@
> +! { dg-do compile }
> +! test levenshtein based spelling suggestions
> +implicit none
> +
> +!!!!!!!!!!!!!! structure tests !!!!!!!!!!!!!!
> +type type1
> + real :: radius
> + integer :: i
> +end type type1
> +
> +type type2
> + integer :: myint
> + type(type1) :: mytype
> +end type type2
> +
> +type type3
> + type(type2) :: type_2
> +end type type3
> +type type4
> + type(type3) :: type_3
> +end type type4
> +
> +type(type1) :: t1
> +t1%radiuz = .0 ! { dg-error ".radiuz. at .1. is not a member of the .type1. structure; did you mean .radius.\\?" }
> +t1%x = .0 ! { dg-error ".x. at .1. is not a member of the .type1. structure" }
> +type(type2) :: t2
> +t2%mytape%radius = .0 ! { dg-error ".mytape. at .1. is not a member of the .type2. structure; did you mean .mytype.\\?" }
> +t2%mytype%radious = .0 ! { dg-error ".radious. at .1. is not a member of the .type1. structure; did you mean .radius.\\?" }
> +type(type4) :: t4
> +t4%type_3%type_2%mytype%radium = 88.0 ! { dg-error ".radium. at .1. is not a member of the .type1. structure; did you mean .radius.\\?" }
> +
> +!!!!!!!!!!!!!! symbol tests !!!!!!!!!!!!!!
> +integer :: iarg1
> +iarg2 = 1 ! { dg-error "Symbol .iarg2. at .1. has no IMPLICIT type; did you mean .iarg1.\\?" }
> +end
More information about the Gcc-patches
mailing list