RFC: Provide diagnostic hints for missing inttypes.h string constants.

Fri May 22 13:42:28 GMT 2020

On Fri, 2020-05-22 at 01:30 +0200, Mark Wielaard wrote:
Hi Mark

> This is on top of the stdbool.h and stdint.h patches.

Sorry, I didn't see those patches; I've replied to them now.

> This adds a flag to c_parser so we know when we were trying to
> constract a string literal. If there is a parse error and we were
> constructing a string literal, and the next token is an unknown
> identifier name, and we know there is a standard header that defines
> that name as a string literal, then add a missing header hint to
> the error messsage.

By comparison, what's the existing behavior for the example?
Is it just what you posted below, but without the "note" diagnostics?

> I haven't checked yet if we can do something similar for the C++
> parser. And the testcase needs to be extended a bit. But I hope the
> direction is OK.

I think it's a worthy goal; as noted below I'd want a C frontend
maintainer to approve those parts of it.

> For the following source:
> 
> const char *hex64_fmt =  PRIx64;
> const char *msg_fmt = "Provide %" PRIx64 "\n";
> 
> void foo (uint32_t v)
> {
>   printf ("%" PRIu32, v);
> }
> 
> We will get the following:
> 
> $ /opt/local/gcc/bin/gcc -c t.c
> t.c:4:26: error: ‘PRIx64’ undeclared here (not in a function)
>     4 | const char *hex64_fmt =  PRIx64;
>       |                          ^~~~~~
> t.c:3:1: note: ‘PRIx64’ is defined in header ‘<inttypes.h>’; did you forget to ‘#include <inttypes.h>’?
>     2 | #include <stdint.h>
>   +++ |+#include <inttypes.h>
>     3 |
> t.c:5:35: error: expected ‘,’ or ‘;’ before ‘PRIx64’
>     5 | const char *msg_fmt = "Provide %" PRIx64 "\n";
>       |                                   ^~~~~~
> t.c:5:35: note: ‘PRIx64’ is defined in header ‘<inttypes.h>’; did you forget to ‘#include <inttypes.h>’?

It's a big improvement, but it's still a little clunky.  The
  expected ‘,’ or ‘;’ before ‘PRIx64’
is the sort of thing we've gotten used to with years of GCC messages,
but might make little sense to a neophyte.
I wonder if it's possible to improve this further, but I fear that that
might overcomplicate things (if I understand things correctly, the
string concatenation fails in the tokenizer, and then the PRIx64 fails
in the parser, so we have a bad interaction involving two different
levels of abstraction within the frontend - I think).

> t.c: In function ‘foo’:
> t.c:9:14: error: expected ‘)’ before ‘PRIu32’
>     9 |   printf ("%" PRIu32, v);
>       |              ^~~~~~~
>       |              )
> t.c:9:15: note: ‘PRIu32’ is defined in header ‘<inttypes.h>’; did you forget to ‘#include <inttypes.h>’?
>     9 |   printf ("%" PRIu32, v);
>       |               ^~~~~~
> ---

[...snip...]

> +/* These can be used as string macros or as identifiers. Must all be
> +   string-literals.  Used in get_stdlib_header_for_name and
> +   get_c_stdlib_header_for_string_macro_name.  */
> +static const stdlib_hint c99_cxx11_macro_hints[] = {
> +    /* <inttypes.h> and <cinttypes>.  */
> +    {"PRId8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRId16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRId32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRId64", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIi8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIi16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIi32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIi64", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIo8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIo16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIo32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIo64", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIu8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIu16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIu32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIu64", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIx8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIx16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIx32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIx64", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIX8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIX16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIX32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIX64", {"<inttypes.h>", "<cinttypes>"} },
> +
> +    {"PRIdPTR", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIiPTR", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIoPTR", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIuPTR", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIxPTR", {"<inttypes.h>", "<cinttypes>"} },
> +    {"PRIXPTR", {"<inttypes.h>", "<cinttypes>"} },
> +
> +    {"SCNd8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNd16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNd32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNd64", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNi8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNi16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNi32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNi64", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNo8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNo16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNo32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNo64", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNu8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNu16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNu32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNu64", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNx8", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNx16", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNx32", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNx64", {"<inttypes.h>", "<cinttypes>"} },
> +
> +    {"SCNdPTR", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNiPTR", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNoPTR", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNuPTR", {"<inttypes.h>", "<cinttypes>"} },
> +    {"SCNxPTR", {"<inttypes.h>", "<cinttypes>"} }
> +};

I found myself squinting at the array trying to decide if every entry
had
   {"<inttypes.h>", "<cinttypes>"}
as its second element.  I *think* that's the case, right? 

I know there's a lot of pre-existing duplication in this file, but
maybe this would be better as simply an array of const char *?
It could probably be reformatted to take up far fewer lines by grouping
them logically.

Maybe have a

  static const char *
  get_c99_cxx11_macro_hint (const char *, enum stdlib lib);

to do the predicate, and return one of "<inttypes.h>", "<cinttypes>", or NULL?

[...snip...]

> diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
> index 5d11e7e73c16..6b2ae5688a72 100644
> --- a/gcc/c/c-parser.c
> +++ b/gcc/c/c-parser.c
> @@ -69,6 +69,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "c-family/name-hint.h"
>  #include "tree-iterator.h"
>  #include "memmodel.h"
> +#include "c-family/known-headers.h"
>  
>  /* We need to walk over decls with incomplete struct/union/enum types
>     after parsing the whole translation unit.
> @@ -223,6 +224,13 @@ struct GTY(()) c_parser {
>       keywords are valid.  */
>    BOOL_BITFIELD objc_property_attr_context : 1;
>  
> +  /* Whether we have just seen/constructed a string-literal.  Set when
> +     returning a string-literal from c_parser_string_literal.  Reset
> +     in consume_token.  Useful when we get a parse error and see an
> +     unknown token, which could have been a string-literal constant
> +     macro.  */
> +  BOOL_BITFIELD seen_string_literal : 1;
> +
>    /* Location of the last consumed token.  */
>    location_t last_token_location;
>  };
> @@ -853,6 +861,7 @@ c_parser_consume_token (c_parser *parser)
>          }
>      }
>    parser->tokens_avail--;
> +  parser->seen_string_literal = false;
>  }
>

These hunks need review from a C frontend maintainer, as they're adding
a little extra work to every token.

>  /* Expect the current token to be a #pragma.  Consume it and remember
> @@ -966,6 +975,25 @@ c_parser_error_richloc (c_parser *parser, const char *gmsgid,
>         }
>      }
>  
> +  /* If we were parsing a string-literal and there is an unknown name
> +     token right after, then check to see if that could also have been
> +     a literal string by checking the name against a list of known
> +     standard string literal constants defined in header files. If
> +     there is one, then add that as an hint to the error message. */
> +  auto_diagnostic_group d;
> +  name_hint h;
> +  if (parser->seen_string_literal && token->type == CPP_NAME)
> +    {
> +      tree name = token->value;
> +      const char *header_hint
> +       = get_c_stdlib_header_for_name (IDENTIFIER_POINTER (name));
> +      if (header_hint != NULL)
> +       h = name_hint (NULL,
> +                      new suggest_missing_header (token->location,
> +                                                  IDENTIFIER_POINTER (name),
> +                                                  header_hint));
> +    }
> +
>    c_parse_error (gmsgid,
>                  /* Because c_parse_error does not understand
>                     CPP_KEYWORD, keywords are treated like
> @@ -7539,6 +7567,7 @@ c_parser_string_literal (c_parser *parser, bool translate, bool wide_ok)
>    ret.original_code = STRING_CST;
>    ret.original_type = NULL_TREE;
>    set_c_expr_source_range (&ret, get_range_from_loc (line_table, loc));
> +  parser->seen_string_literal = true;
>    return ret;
>  }

[...snip...]

Hope this is constructive
Dave