This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 0/2] Levenshtein-based suggestions (v3)


On Fri, 2015-11-13 at 07:57 +0100, Marek Polacek wrote:
> Probably coming too late, sorry.

> On Thu, Nov 12, 2015 at 09:08:36PM -0500, David Malcolm wrote:
> > index 4335a87..eb4e1fc 100644
> > --- a/gcc/c/c-typeck.c
> > +++ b/gcc/c/c-typeck.c
> > @@ -47,6 +47,7 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "c-family/c-ubsan.h"
> >  #include "cilk.h"
> >  #include "gomp-constants.h"
> > +#include "spellcheck.h"
> >  
> >  /* Possible cases of implicit bad conversions.  Used to select
> >     diagnostic messages in convert_for_assignment.  */
> > @@ -2242,6 +2243,72 @@ lookup_field (tree type, tree component)
> >    return tree_cons (NULL_TREE, field, NULL_TREE);
> >  }
> >  
> > +/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES.  */
> > +
> > +static void
> > +lookup_field_fuzzy_find_candidates (tree type, tree component,
> > +				    vec<tree> *candidates)
> > +{
> > +  tree field;
> > +  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
> 
> I'd prefer declaring field in the for loop, so
>   for (tree field = TYPE_FIELDS...
> 
> > +	  && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
> > +	      || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
> 
> This is RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)).

I based this code on the code in lookup_field right above it;
I copied-and-pasted that conditional, so presumably it should also be
changed in lookup_field (which has the condition twice)?

FWIW I notice RECORD_OR_UNION_TYPE_P also covers QUAL_UNION_TYPE.

/* Nonzero if TYPE is a record or union type.  */
#define RECORD_OR_UNION_TYPE_P(TYPE)		\
  (TREE_CODE (TYPE) == RECORD_TYPE		\
   || TREE_CODE (TYPE) == UNION_TYPE		\
   || TREE_CODE (TYPE) == QUAL_UNION_TYPE)

FWIW I've made the change in the attached patch (both to the new
function, and to lookup_field).

> > +	{
> > +	  lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
> > +					      component,
> > +					      candidates);
> > +	}
> 
> Lose the brackets around a single statement.

Done.

> > +      if (DECL_NAME (field))
> > +	candidates->safe_push (DECL_NAME (field));
> > +    }
> > +}
> > +
> > +/* Like "lookup_field", but find the closest matching IDENTIFIER_NODE,
> > +   rather than returning a TREE_LIST for an exact match.  */
> > +
> > +static tree
> > +lookup_field_fuzzy (tree type, tree component)
> > +{
> > +  gcc_assert (TREE_CODE (component) == IDENTIFIER_NODE);
> > +
> > +  /* First, gather a list of candidates.  */
> > +  auto_vec <tree> candidates;
> > +
> > +  lookup_field_fuzzy_find_candidates (type, component,
> > +				      &candidates);
> > +
> > +  /* Now determine which is closest.  */
> > +  int i;
> > +  tree identifier;
> > +  tree best_identifier = NULL;
> 
> NULL_TREE

Fixed.

> > +  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> > +  FOR_EACH_VEC_ELT (candidates, i, identifier)
> > +    {
> > +      gcc_assert (TREE_CODE (identifier) == IDENTIFIER_NODE);
> > +      edit_distance_t dist = levenshtein_distance (component, identifier);
> > +      if (dist < best_distance)
> > +	{
> > +	  best_distance = dist;
> > +	  best_identifier = identifier;
> > +	}
> > +    }
> > +
> > +  /* If more than half of the letters were misspelled, the suggestion is
> > +     likely to be meaningless.  */
> > +  if (best_identifier)
> > +    {
> > +      unsigned int cutoff = MAX (IDENTIFIER_LENGTH (component),
> > +				 IDENTIFIER_LENGTH (best_identifier)) / 2;
> > +      if (best_distance > cutoff)
> > +	return NULL;
> 
> NULL_TREE

Fixed.

> > +/* The Levenshtein distance is an "edit-distance": the minimal
> > +   number of one-character insertions, removals or substitutions
> > +   that are needed to change one string into another.
> > +
> > +   This implementation uses the Wagner-Fischer algorithm.  */
> > +
> > +static edit_distance_t
> > +levenshtein_distance (const char *s, int len_s,
> > +		      const char *t, int len_t)
> > +{
> > +  const bool debug = false;
> > +
> > +  if (debug)
> > +    {
> > +      printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
> > +      printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
> > +    }
> 
> Did you leave this debug stuff here intentionally?

I find it useful, but I believe it's against our policy, so I've deleted
it in the attached patch.

> > +      /* Build the rest of the row by considering neighbours to
> > +	 the north, west and northwest.  */
> > +      for (int j = 0; j < len_s; j++)
> > +	{
> > +	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
> > +	  edit_distance_t deletion     = v1[j] + 1;
> > +	  edit_distance_t insertion    = v0[j + 1] + 1;
> 
> The formatting doesn't look right here.

It's correct; it's "diff" inserting two spaces before a tab combined
with our mixed spaces+tab convention: the "for" is at column 6 (6
spaces), whereas the other lines are at column 8 (1 tab), which looks
weird in a diff.

Patch attached; only tested lightly so far (compiles, and passes
spellcheck subset of tests).

OK for trunk if it passes bootstrap&regrtest?


>From b8ed3cbe9cc000416941e0108036f24f4483cdb0 Mon Sep 17 00:00:00 2001
From: David Malcolm <dmalcolm@redhat.com>
Date: Fri, 13 Nov 2015 07:22:16 -0500
Subject: [PATCH] Cleanups of spellchecking code

gcc/c/ChangeLog:
	* c-typeck.c (lookup_field): Use RECORD_OR_UNION_TYPE_P
	in two places.
	(lookup_field_fuzzy_find_candidates): Use RECORD_OR_UNION_TYPE_P;
	formatting cleanups.
	(lookup_field_fuzzy): Use NULL_TREE rather than NULL.

gcc/ChangeLog:
	* spellcheck.c (levenshtein_distance): Remove debug code.
---
 gcc/c/c-typeck.c | 24 +++++++++---------------
 gcc/spellcheck.c | 24 ------------------------
 2 files changed, 9 insertions(+), 39 deletions(-)

diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index eb4e1fc..b084ca5 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -2166,8 +2166,7 @@ lookup_field (tree type, tree component)
 	      while (DECL_NAME (field_array[bot]) == NULL_TREE)
 		{
 		  field = field_array[bot++];
-		  if (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
-		      || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE)
+		  if (RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)))
 		    {
 		      tree anon = lookup_field (TREE_TYPE (field), component);
 
@@ -2213,8 +2212,7 @@ lookup_field (tree type, tree component)
       for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
 	{
 	  if (DECL_NAME (field) == NULL_TREE
-	      && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
-		  || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
+	      && RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)))
 	    {
 	      tree anon = lookup_field (TREE_TYPE (field), component);
 
@@ -2249,17 +2247,13 @@ static void
 lookup_field_fuzzy_find_candidates (tree type, tree component,
 				    vec<tree> *candidates)
 {
-  tree field;
-  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
+  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
     {
       if (DECL_NAME (field) == NULL_TREE
-	  && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
-	      || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
-	{
-	  lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
-					      component,
-					      candidates);
-	}
+	  && RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)))
+	lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
+					    component,
+					    candidates);
 
       if (DECL_NAME (field))
 	candidates->safe_push (DECL_NAME (field));
@@ -2283,7 +2277,7 @@ lookup_field_fuzzy (tree type, tree component)
   /* Now determine which is closest.  */
   int i;
   tree identifier;
-  tree best_identifier = NULL;
+  tree best_identifier = NULL_TREE;
   edit_distance_t best_distance = MAX_EDIT_DISTANCE;
   FOR_EACH_VEC_ELT (candidates, i, identifier)
     {
@@ -2303,7 +2297,7 @@ lookup_field_fuzzy (tree type, tree component)
       unsigned int cutoff = MAX (IDENTIFIER_LENGTH (component),
 				 IDENTIFIER_LENGTH (best_identifier)) / 2;
       if (best_distance > cutoff)
-	return NULL;
+	return NULL_TREE;
     }
 
   return best_identifier;
diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
index 32854cf..6432e385 100644
--- a/gcc/spellcheck.c
+++ b/gcc/spellcheck.c
@@ -34,14 +34,6 @@ edit_distance_t
 levenshtein_distance (const char *s, int len_s,
 		      const char *t, int len_t)
 {
-  const bool debug = false;
-
-  if (debug)
-    {
-      printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
-      printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
-    }
-
   if (len_s == 0)
     return len_t;
   if (len_t == 0)
@@ -67,14 +59,6 @@ levenshtein_distance (const char *s, int len_s,
   /* Build successive rows.  */
   for (int i = 0; i < len_t; i++)
     {
-      if (debug)
-	{
-	  printf ("i:%i v0 = ", i);
-	  for (int j = 0; j < len_s + 1; j++)
-	    printf ("%i ", v0[j]);
-	  printf ("\n");
-	}
-
       /* The initial column is for the case of an empty source string; we
 	 can reach prefixes of the target string of length i
 	 by inserting i characters.  */
@@ -98,14 +82,6 @@ levenshtein_distance (const char *s, int len_s,
 	v0[j] = v1[j];
     }
 
-  if (debug)
-    {
-      printf ("final v1 = ");
-      for (int j = 0; j < len_s + 1; j++)
-	printf ("%i ", v1[j]);
-      printf ("\n");
-    }
-
   edit_distance_t result = v1[len_s];
   delete[] v0;
   delete[] v1;
-- 
1.8.5.3


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]