This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

cpplib: Charconsts and escape sequences


This patch is further preparation for cpplib having a correct
implementation of arithmetic to the correct target-precision,
so that, in particular, it has no dependence on HOST_*_INT
macros.

cpp_parse_escape and cpp_interpret_charconst return (possibly
wide) character values, and thus should really calculate in,
and have a return value of, type cppchar_t.  This patch does
that.  Since they form part of the library interface, it
seems wrong to me that cpp_interpret_charconst leaves the
caller to determine the signedness of the result, and that
cpp_parse_escape expects the caller to divine a bitmask,
particularly as these things are target- and command-line
dependent in general, so I've fixed that too.

This leaves dequote_string() in a hole, but I think it was
in a hole anyway since cpp_parse_escape is implemented for
the target, so I've not worried about that unduly.

The patch readies cpplib for the capability of specifying
calculation precision at runtime; all that remains is for
cppexp.c to have an implementation of two-integer arithmetic,
which I hope to supply soon.  It includes some enable-checking
sanity checks that cpplib's implicit assumptions about host
vs. target precision are satisfied (in the post_options
section, since target precision is going to be a run-time thing
in general).

Zack, I think this obsoletes your recent patch to lex_charconst
but I'd appreciate your checking.  In fact, I'll wait for any
musings you may have about the whole patch before applying it.

Thanks,

Neil.

	* c-lex.c (lex_string): Let cpp_parse_escape handle truncation
	and sign-extension.
	(lex_charconst): Update for change in prototype of
	cpp_interpret_charconst.  Extend from cppchar_t to HOST_WIDE_INT
	appropriately.
	* cpphash.h (BITS_PER_CPPCHAR_T): New.
	* cppinit.c (cpp_create_reader): Initialize the new precision
	members of cpp_options for no change in semantics.
	(cpp_post_options): Add sanity checks.
	* cpplex.c (cpp_parse_escape): Handle precision, sign-extension
	and truncation issues.  Calculate in type cppchar_t.
	(MAX_CHAR_TYPE_SIZE, MAX_WCHAR_TYPE_SIZE): Remove.
	(cpp_interpret_charconst): Calculate in type cppchar_t.  Handle
	run-time dependent precision correctly.  Return whether the
	result is signed or not.
	* cpplib.c (dequote_string): Use cppchar_t; update.
	* cpplib.h (cppchar_signed_t): New.
	(struct cpp_options): New precision members.
	(cpp_interpret_charconst, cpp_parse_escape): Update prototypes.

============================================================
Index: gcc/c-lex.c
--- gcc/c-lex.c	27 Apr 2002 06:53:05 -0000	1.173
+++ gcc/c-lex.c	3 May 2002 20:12:41 -0000
@@ -1238,9 +1238,7 @@ lex_string (str, len, wide)
   char *buf = alloca ((len + 1) * (wide ? WCHAR_BYTES : 1));
   char *q = buf;
   const unsigned char *p = str, *limit = str + len;
-  unsigned int c;
-  unsigned width = wide ? WCHAR_TYPE_SIZE
-			: TYPE_PRECISION (char_type_node);
+  cppchar_t c;
 
 #ifdef MULTIBYTE_CHARS
   /* Reset multibyte conversion state.  */
@@ -1270,15 +1268,7 @@ lex_string (str, len, wide)
 #endif
 
       if (c == '\\' && !ignore_escape_flag)
-	{
-	  unsigned int mask;
-
-	  if (width < HOST_BITS_PER_INT)
-	    mask = ((unsigned int) 1 << width) - 1;
-	  else
-	    mask = ~0;
-	  c = cpp_parse_escape (parse_in, &p, limit, mask);
-	}
+	c = cpp_parse_escape (parse_in, &p, limit, wide);
 	
       /* Add this single character into the buffer either as a wchar_t,
 	 a multibyte sequence, or as a single byte.  */
@@ -1345,45 +1335,31 @@ static tree
 lex_charconst (token)
      const cpp_token *token;
 {
-  HOST_WIDE_INT result;
+  cppchar_t result;
   tree type, value;
   unsigned int chars_seen;
+  int unsignedp;
  
   result = cpp_interpret_charconst (parse_in, token, warn_multichar,
- 				    &chars_seen);
-  if (token->type == CPP_WCHAR)
-    {
-      value = build_int_2 (result, 0);
-      type = wchar_type_node;
-    }
-  else
-    {
-      if (result < 0)
- 	value = build_int_2 (result, -1);
-      else
- 	value = build_int_2 (result, 0);
- 
-      /* In C, a character constant has type 'int'.
- 	 In C++ 'char', but multi-char charconsts have type 'int'.  */
-      if (c_language == clk_cplusplus && chars_seen <= 1)
-	type = char_type_node;
-      else
-	type = integer_type_node;
-    }
-
-  /* cpp_interpret_charconst issues a warning if the constant
-     overflows, but if the number fits in HOST_WIDE_INT anyway, it
-     will return it un-truncated, which may cause problems down the
-     line.  So set the type to widest_integer_literal_type, call
-     convert to truncate it to the proper type, then clear
-     TREE_OVERFLOW so we don't get a second warning.
+ 				    &chars_seen, &unsignedp);
 
-     FIXME: cpplib's assessment of overflow may not be accurate on a
-     platform where the final type can change at (compiler's) runtime.  */
+  /* Cast to cppchar_signed_t to get correct sign-extension of RESULT
+     before possibly widening to HOST_WIDE_INT for build_int_2.  */
+  if (unsignedp || (cppchar_signed_t) result >= 0)
+    value = build_int_2 (result, 0);
+  else
+    value = build_int_2 ((cppchar_signed_t) result, -1);
 
-  TREE_TYPE (value) = widest_integer_literal_type_node;
-  value = convert (type, value);
-  TREE_OVERFLOW (value) = 0;
+  if (token->type == CPP_WCHAR)
+    type = wchar_type_node;
+  /* In C, a character constant has type 'int'.
+     In C++ 'char', but multi-char charconsts have type 'int'.  */
+  else if ((c_language == clk_c || c_language == clk_objective_c)
+	   || chars_seen > 1)
+    type = integer_type_node;
+  else
+    type = char_type_node;
 
+  TREE_TYPE (value) = type;
   return value;
 }
============================================================
Index: gcc/cpphash.h
--- gcc/cpphash.h	28 Apr 2002 23:14:52 -0000	1.147
+++ gcc/cpphash.h	3 May 2002 20:12:47 -0000
@@ -29,6 +29,8 @@ struct directive;		/* Deliberately incom
 struct pending_option;
 struct op;
 
+#define BITS_PER_CPPCHAR_T (CHAR_BIT * sizeof (cppchar_t))
+
 /* Test if a sign is valid within a preprocessing number.  */
 #define VALID_SIGN(c, prevc) \
   (((c) == '+' || (c) == '-') && \
============================================================
Index: gcc/cppinit.c
--- gcc/cppinit.c	1 May 2002 20:07:36 -0000	1.215
+++ gcc/cppinit.c	3 May 2002 20:12:54 -0000
@@ -502,6 +502,18 @@ cpp_create_reader (lang)
   CPP_OPTION (pfile, pending) =
     (struct cpp_pending *) xcalloc (1, sizeof (struct cpp_pending));
 
+  /* CPP arithmetic done to existing rules for now.  */
+#define BITS_PER_HOST_WIDEST_INT (CHAR_BIT * sizeof (HOST_WIDEST_INT))
+  CPP_OPTION (pfile, precision) = BITS_PER_HOST_WIDEST_INT;
+#ifndef MAX_CHAR_TYPE_SIZE
+#define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
+#endif
+  CPP_OPTION (pfile, char_precision) = MAX_CHAR_TYPE_SIZE;
+#ifndef MAX_WCHAR_TYPE_SIZE
+#define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
+#endif
+  CPP_OPTION (pfile, wchar_precision) = MAX_WCHAR_TYPE_SIZE;
+
   /* It's simplest to just create this struct whether or not it will
      be needed.  */
   pfile->deps = deps_init ();
@@ -1795,6 +1807,27 @@ cpp_post_options (pfile)
 #endif
       fputc ('\n', stderr);
     }
+
+#if ENABLE_CHECKING
+  /* Sanity checks for CPP arithmetic.  */
+  if (CPP_OPTION (pfile, precision) > BITS_PER_HOST_WIDEST_INT)
+    cpp_error (pfile, DL_FATAL,
+	       "preprocessor arithmetic has maximum precision of %u bits; target requires %u bits",
+	       BITS_PER_HOST_WIDEST_INT, CPP_OPTION (pfile, precision));
+
+  if (CPP_OPTION (pfile, char_precision) > BITS_PER_CPPCHAR_T
+      || CPP_OPTION (pfile, wchar_precision) > BITS_PER_CPPCHAR_T)
+    cpp_error (pfile, DL_FATAL,
+	       "CPP cannot handle (wide) character constants over %u bits",
+	       BITS_PER_CPPCHAR_T);
+
+  {
+    cppchar_t test = 0;
+    test--;
+    if (test < 1)
+      cpp_error (pfile, DL_FATAL, "cppchar_t must be an unsigned type");
+  }
+#endif
 
   /* Canonicalize in_fname and out_fname.  We guarantee they are not
      NULL, and that the empty string represents stdin / stdout.  */
============================================================
Index: gcc/cpplex.c
--- gcc/cpplex.c	30 Apr 2002 20:48:50 -0000	1.198
+++ gcc/cpplex.c	3 May 2002 20:12:59 -0000
@@ -1710,23 +1710,33 @@ maybe_read_ucs (pfile, pstr, limit, pc)
   return 0;
 }
 
-/* Interpret an escape sequence, and return its value.  PSTR points to
-   the input pointer, which is just after the backslash.  LIMIT is how
-   much text we have.  MASK is a bitmask for the precision for the
-   destination type (char or wchar_t).
-
-   Handles all relevant diagnostics.  */
-unsigned int
-cpp_parse_escape (pfile, pstr, limit, mask)
+/* Returns the value of an escape sequence, truncated to the correct
+   target precision.  PSTR points to the input pointer, which is just
+   after the backslash.  LIMIT is how much text we have.  WIDE is true
+   if the escape sequence is part of a wide character constant or
+   string literal.  Handles all relevant diagnostics.  */
+cppchar_t
+cpp_parse_escape (pfile, pstr, limit, wide)
      cpp_reader *pfile;
      const unsigned char **pstr;
      const unsigned char *limit;
-     unsigned HOST_WIDE_INT mask;
+     int wide;
 {
   int unknown = 0;
   const unsigned char *str = *pstr;
-  unsigned int c = *str++;
+  cppchar_t c, mask;
+  unsigned int width;
 
+  if (wide)
+    width = CPP_OPTION (pfile, wchar_precision);
+  else
+    width = CPP_OPTION (pfile, char_precision);
+  if (width < BITS_PER_CPPCHAR_T)
+    mask = ((cppchar_t) 1 << width) - 1;
+  else
+    mask = ~0;
+
+  c = *str++;
   switch (c)
     {
     case '\\': case '\'': case '"': case '?': break;
@@ -1767,7 +1777,7 @@ cpp_parse_escape (pfile, pstr, limit, ma
 		   "the meaning of '\\x' is different in traditional C");
 
 	{
-	  unsigned int i = 0, overflow = 0;
+	  cppchar_t i = 0, overflow = 0;
 	  int digits_found = 0;
 
 	  while (str < limit)
@@ -1798,8 +1808,8 @@ cpp_parse_escape (pfile, pstr, limit, ma
     case '0':  case '1':  case '2':  case '3':
     case '4':  case '5':  case '6':  case '7':
       {
-	unsigned int i = c - '0';
-	int count = 0;
+	size_t count = 0;
+	cppchar_t i = c - '0';
 
 	while (str < limit && ++count < 3)
 	  {
@@ -1834,36 +1844,33 @@ cpp_parse_escape (pfile, pstr, limit, ma
     }
 
   if (c > mask)
-    cpp_error (pfile, DL_PEDWARN, "escape sequence out of range for type");
+    {
+      cpp_error (pfile, DL_PEDWARN, "escape sequence out of range for type");
+      c &= mask;
+    }
 
   *pstr = str;
   return c;
 }
 
-#ifndef MAX_CHAR_TYPE_SIZE
-#define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
-#endif
-
-#ifndef MAX_WCHAR_TYPE_SIZE
-#define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
-#endif
-
 /* Interpret a (possibly wide) character constant in TOKEN.
-   WARN_MULTI warns about multi-character charconsts.  PCHARS_SEEN points
-   to a variable that is filled in with the number of characters seen.  */
-HOST_WIDE_INT
-cpp_interpret_charconst (pfile, token, warn_multi, pchars_seen)
+   WARN_MULTI warns about multi-character charconsts.  PCHARS_SEEN
+   points to a variable that is filled in with the number of
+   characters seen, and UNSIGNEDP to a variable that indicates whether
+   the result has unsigned type.  */
+cppchar_t
+cpp_interpret_charconst (pfile, token, warn_multi, pchars_seen, unsignedp)
      cpp_reader *pfile;
      const cpp_token *token;
      int warn_multi;
      unsigned int *pchars_seen;
+     int *unsignedp;
 {
   const unsigned char *str = token->val.str.text;
   const unsigned char *limit = str + token->val.str.len;
   unsigned int chars_seen = 0;
-  unsigned int width, max_chars, c;
-  unsigned HOST_WIDE_INT mask;
-  HOST_WIDE_INT result = 0;
+  unsigned int width, max_chars;
+  cppchar_t c, mask, result = 0;
   bool unsigned_p;
 
 #ifdef MULTIBYTE_CHARS
@@ -1873,20 +1880,20 @@ cpp_interpret_charconst (pfile, token, w
   /* Width in bits.  */
   if (token->type == CPP_CHAR)
     {
-      width = MAX_CHAR_TYPE_SIZE;
+      width = CPP_OPTION (pfile, char_precision);
       unsigned_p = CPP_OPTION (pfile, signed_char) == 0;
     }
   else
     {
-      width = MAX_WCHAR_TYPE_SIZE;
+      width = CPP_OPTION (pfile, wchar_precision);
       unsigned_p = WCHAR_UNSIGNED;
     }
 
-  if (width < HOST_BITS_PER_WIDE_INT)
-    mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
+  if (width < BITS_PER_CPPCHAR_T)
+    mask = ((cppchar_t) 1 << width) - 1;
   else
     mask = ~0;
-  max_chars = HOST_BITS_PER_WIDE_INT / width;
+  max_chars = CPP_OPTION (pfile, precision) / width;
 
   while (str < limit)
     {
@@ -1911,7 +1918,7 @@ cpp_interpret_charconst (pfile, token, w
 #endif
 
       if (c == '\\')
-	c = cpp_parse_escape (pfile, &str, limit, mask);
+	c = cpp_parse_escape (pfile, &str, limit, token->type == CPP_WCHAR);
 
 #ifdef MAP_CHARACTER
       if (ISPRINT (c))
@@ -1921,7 +1928,7 @@ cpp_interpret_charconst (pfile, token, w
       /* Merge character into result; ignore excess chars.  */
       if (++chars_seen <= max_chars)
 	{
-	  if (width < HOST_BITS_PER_WIDE_INT)
+	  if (width < BITS_PER_CPPCHAR_T)
 	    result = (result << width) | (c & mask);
 	  else
 	    result = c;
@@ -1943,7 +1950,7 @@ cpp_interpret_charconst (pfile, token, w
     {
       unsigned int nbits = chars_seen * width;
 
-      mask = (unsigned HOST_WIDE_INT) ~0 >> (HOST_BITS_PER_WIDE_INT - nbits);
+      mask = (cppchar_t) ~0 >> (BITS_PER_CPPCHAR_T - nbits);
       if (unsigned_p || ((result >> (nbits - 1)) & 1) == 0)
 	result &= mask;
       else
@@ -1951,6 +1958,7 @@ cpp_interpret_charconst (pfile, token, w
     }
 
   *pchars_seen = chars_seen;
+  *unsignedp = unsigned_p;
   return result;
 }
 
============================================================
Index: gcc/cpplib.c
--- gcc/cpplib.c	28 Apr 2002 23:14:52 -0000	1.300
+++ gcc/cpplib.c	3 May 2002 20:13:02 -0000
@@ -726,23 +726,15 @@ dequote_string (pfile, str, len)
   uchar *result = _cpp_unaligned_alloc (pfile, len + 1);
   uchar *dst = result;
   const uchar *limit = str + len;
-  unsigned int c;
-  unsigned HOST_WIDE_INT mask;
+  cppchar_t c;
 
-  /* We need the mask to match the host's 'unsigned char', not the
-     target's.  */
-  if (CHAR_BIT < HOST_BITS_PER_WIDE_INT)
-    mask = ((unsigned HOST_WIDE_INT) 1 << CHAR_BIT) - 1;
-  else
-    mask = ~(unsigned HOST_WIDE_INT)0;
-  
   while (str < limit)
     {
       c = *str++;
       if (c != '\\')
 	*dst++ = c;
       else
-	*dst++ = cpp_parse_escape (pfile, (const uchar **)&str, limit, mask);
+	*dst++ = cpp_parse_escape (pfile, &str, limit, 0);
     }
   *dst++ = '\0';
   return result;
============================================================
Index: gcc/cpplib.h
--- gcc/cpplib.h	28 Apr 2002 19:42:31 -0000	1.210
+++ gcc/cpplib.h	3 May 2002 20:13:04 -0000
@@ -190,9 +190,12 @@ struct cpp_token
   } val;
 };
 
-/* A standalone character.  It is unsigned for the same reason we use
-   unsigned char - to avoid signedness issues.  */
+/* A type wide enough to hold any multibyte source character.
+   cpplib's character constant interpreter uses shifts, and so
+   requires an unsigned type.  */
 typedef unsigned int cppchar_t;
+/* Its signed equivalent.  */
+typedef int cppchar_signed_t;
 
 /* Values for opts.dump_macros.
   dump_only means inhibit output of the preprocessed text
@@ -237,6 +240,10 @@ struct cpp_options
   /* -fleading_underscore sets this to "_".  */
   const char *user_label_prefix;
 
+  /* Precision for target CPP arithmetic, target characters and target
+     wide characters, respectively.  */
+  size_t precision, char_precision, wchar_precision;
+
   /* The language we're preprocessing.  */
   enum c_lang lang;
 
@@ -535,9 +542,9 @@ extern const unsigned char *cpp_macro_de
 extern void _cpp_backup_tokens PARAMS ((cpp_reader *, unsigned int));
 
 /* Evaluate a CPP_CHAR or CPP_WCHAR token.  */
-extern HOST_WIDE_INT
+extern cppchar_t
 cpp_interpret_charconst PARAMS ((cpp_reader *, const cpp_token *,
-				 int, unsigned int *));
+				 int, unsigned int *, int *));
 
 extern void cpp_define PARAMS ((cpp_reader *, const char *));
 extern void cpp_assert PARAMS ((cpp_reader *, const char *));
@@ -600,10 +607,15 @@ extern int cpp_ideq			PARAMS ((const cpp
 extern void cpp_output_line		PARAMS ((cpp_reader *, FILE *));
 extern void cpp_output_token		PARAMS ((const cpp_token *, FILE *));
 extern const char *cpp_type2name	PARAMS ((enum cpp_ttype));
-extern unsigned int cpp_parse_escape	PARAMS ((cpp_reader *,
-						 const unsigned char **,
-						 const unsigned char *,
-						 unsigned HOST_WIDE_INT));
+/* Returns the value of an escape sequence, truncated to the correct
+   target precision.  PSTR points to the input pointer, which is just
+   after the backslash.  LIMIT is how much text we have.  WIDE is true
+   if the escape sequence is part of a wide character constant or
+   string literal.  Handles all relevant diagnostics.  */
+extern cppchar_t cpp_parse_escape	PARAMS ((cpp_reader *,
+						 const unsigned char ** pstr,
+						 const unsigned char *limit,
+						 int wide));
 
 /* In cpphash.c */
 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]