This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH 0/2] Levenshtein-based suggestions (v3)
- From: Marek Polacek <polacek at redhat dot com>
- To: David Malcolm <dmalcolm at redhat dot com>
- Cc: Jeff Law <law at redhat dot com>, Richard Biener <richard dot guenther at gmail dot com>, Manuel López-Ibáñez <lopezibanez at gmail dot com>, GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Fri, 13 Nov 2015 07:57:38 +0100
- Subject: Re: [PATCH 0/2] Levenshtein-based suggestions (v3)
- Authentication-results: sourceware.org; auth=none
- References: <55FB150C dot 5090105 at redhat dot com> <1446209267-49800-1-git-send-email-dmalcolm at redhat dot com> <56370667 dot 8000005 at redhat dot com> <1447380516 dot 7830 dot 34 dot camel at surprise>
Probably coming too late, sorry.
On Thu, Nov 12, 2015 at 09:08:36PM -0500, David Malcolm wrote:
> index 4335a87..eb4e1fc 100644
> --- a/gcc/c/c-typeck.c
> +++ b/gcc/c/c-typeck.c
> @@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. If not see
> #include "c-family/c-ubsan.h"
> #include "cilk.h"
> #include "gomp-constants.h"
> +#include "spellcheck.h"
>
> /* Possible cases of implicit bad conversions. Used to select
> diagnostic messages in convert_for_assignment. */
> @@ -2242,6 +2243,72 @@ lookup_field (tree type, tree component)
> return tree_cons (NULL_TREE, field, NULL_TREE);
> }
>
> +/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES. */
> +
> +static void
> +lookup_field_fuzzy_find_candidates (tree type, tree component,
> + vec<tree> *candidates)
> +{
> + tree field;
> + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
I'd prefer declaring field in the for loop, so
for (tree field = TYPE_FIELDS...
> + && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
> + || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
This is RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)).
> + {
> + lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
> + component,
> + candidates);
> + }
Lose the brackets around a single statement.
> + if (DECL_NAME (field))
> + candidates->safe_push (DECL_NAME (field));
> + }
> +}
> +
> +/* Like "lookup_field", but find the closest matching IDENTIFIER_NODE,
> + rather than returning a TREE_LIST for an exact match. */
> +
> +static tree
> +lookup_field_fuzzy (tree type, tree component)
> +{
> + gcc_assert (TREE_CODE (component) == IDENTIFIER_NODE);
> +
> + /* First, gather a list of candidates. */
> + auto_vec <tree> candidates;
> +
> + lookup_field_fuzzy_find_candidates (type, component,
> + &candidates);
> +
> + /* Now determine which is closest. */
> + int i;
> + tree identifier;
> + tree best_identifier = NULL;
NULL_TREE
> + edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> + FOR_EACH_VEC_ELT (candidates, i, identifier)
> + {
> + gcc_assert (TREE_CODE (identifier) == IDENTIFIER_NODE);
> + edit_distance_t dist = levenshtein_distance (component, identifier);
> + if (dist < best_distance)
> + {
> + best_distance = dist;
> + best_identifier = identifier;
> + }
> + }
> +
> + /* If more than half of the letters were misspelled, the suggestion is
> + likely to be meaningless. */
> + if (best_identifier)
> + {
> + unsigned int cutoff = MAX (IDENTIFIER_LENGTH (component),
> + IDENTIFIER_LENGTH (best_identifier)) / 2;
> + if (best_distance > cutoff)
> + return NULL;
NULL_TREE
> +/* The Levenshtein distance is an "edit-distance": the minimal
> + number of one-character insertions, removals or substitutions
> + that are needed to change one string into another.
> +
> + This implementation uses the Wagner-Fischer algorithm. */
> +
> +static edit_distance_t
> +levenshtein_distance (const char *s, int len_s,
> + const char *t, int len_t)
> +{
> + const bool debug = false;
> +
> + if (debug)
> + {
> + printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
> + printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
> + }
Did you leave this debug stuff here intentionally?
> + /* Build the rest of the row by considering neighbours to
> + the north, west and northwest. */
> + for (int j = 0; j < len_s; j++)
> + {
> + edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
> + edit_distance_t deletion = v1[j] + 1;
> + edit_distance_t insertion = v0[j + 1] + 1;
The formatting doesn't look right here.
Marek