[gcc r12-6256] libstdc++: Improve std::regex_error::what() strings

Jonathan Wakely redi@gcc.gnu.org
Wed Jan 5 13:47:28 GMT 2022


https://gcc.gnu.org/g:260a5334ee963f66745d0cb98316ee831737b22d

commit r12-6256-g260a5334ee963f66745d0cb98316ee831737b22d
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Thu Dec 16 13:39:09 2021 +0000

    libstdc++: Improve std::regex_error::what() strings
    
    This replaces the vague "regex_error" for std::regex_error::what() with
    a string that corresponds to the error_type enum passed to the
    constructor. This allows us to remove many of the strings passed to
    __throw_regex_error, because the default string is at least as good.
    When a string argument to __throw_regex_error is kept it should add some
    context-specific detail absent from the default string.
    
    Also remove full stops (periods) from the end of those strings, to make
    it easier to include them in logs and other output. I've left them
    starting with an upper-case letter, which is consistent with strerror
    output for (at least) Glibc, Solaris and BSD. I'm ambivalent about
    whether that's the right choice.
    
    This also adds the missing noreturn attribute to __throw_regex_error.
    
    libstdc++-v3/ChangeLog:
    
            * include/bits/regex_compiler.tcc: Adjust all calls to
            __throw_regex_error.
            * include/bits/regex_error.h (__throw_regex_error): Add noreturn
            attribute.
            * include/bits/regex_scanner.tcc: Adjust all calls to
            __throw_regex_error.
            * src/c++11/regex.cc (desc): New helper function.
            (regex_error::regex_error(error_type)): Use desc to get a string
            corresponding to the error code.

Diff:
---
 libstdc++-v3/include/bits/regex_compiler.tcc | 37 ++++++--------
 libstdc++-v3/include/bits/regex_error.h      | 27 ++++++----
 libstdc++-v3/include/bits/regex_scanner.tcc  | 76 ++++++++++++----------------
 libstdc++-v3/src/c++11/regex.cc              | 47 ++++++++++++++++-
 4 files changed, 111 insertions(+), 76 deletions(-)

diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc
index ce834b12255..c12f7502538 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -157,8 +157,7 @@ namespace __detail
 	  auto __neg = _M_value[0] == 'n';
 	  this->_M_disjunction();
 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
-	    __throw_regex_error(regex_constants::error_paren,
-				"Parenthesis is not closed.");
+	    __throw_regex_error(regex_constants::error_paren);
 	  auto __tmp = _M_pop();
 	  __tmp._M_append(_M_nfa->_M_insert_accept());
 	  _M_stack.push(
@@ -180,8 +179,7 @@ namespace __detail
       auto __init = [this, &__neg]()
 	{
 	  if (_M_stack.empty())
-	    __throw_regex_error(regex_constants::error_badrepeat,
-				"Nothing to repeat before a quantifier.");
+	    __throw_regex_error(regex_constants::error_badrepeat);
 	  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
 	};
       if (_M_match_token(_ScannerT::_S_token_closure0))
@@ -217,11 +215,9 @@ namespace __detail
       else if (_M_match_token(_ScannerT::_S_token_interval_begin))
 	{
 	  if (_M_stack.empty())
-	    __throw_regex_error(regex_constants::error_badrepeat,
-				"Nothing to repeat before a quantifier.");
+	    __throw_regex_error(regex_constants::error_badrepeat);
 	  if (!_M_match_token(_ScannerT::_S_token_dup_count))
-	    __throw_regex_error(regex_constants::error_badbrace,
-				"Unexpected token in brace expression.");
+	    __throw_regex_error(regex_constants::error_badbrace);
 	  _StateSeqT __r(_M_pop());
 	  _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
 	  long __min_rep = _M_cur_int_value(10);
@@ -237,8 +233,7 @@ namespace __detail
 		__infi = true;
 	    }
 	  if (!_M_match_token(_ScannerT::_S_token_interval_end))
-	    __throw_regex_error(regex_constants::error_brace,
-				"Unexpected end of brace expression.");
+	    __throw_regex_error(regex_constants::error_brace);
 
 	  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
 
@@ -257,8 +252,7 @@ namespace __detail
 	  else
 	    {
 	      if (__n < 0)
-		__throw_regex_error(regex_constants::error_badbrace,
-				    "Invalid range in brace expression.");
+		__throw_regex_error(regex_constants::error_badbrace);
 	      auto __end = _M_nfa->_M_insert_dummy();
 	      // _M_alt is the "match more" branch, and _M_next is the
 	      // "match less" one. Switch _M_alt and _M_next of all created
@@ -325,8 +319,7 @@ namespace __detail
 	  _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
 	  this->_M_disjunction();
 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
-	    __throw_regex_error(regex_constants::error_paren,
-				"Parenthesis is not closed.");
+	    __throw_regex_error(regex_constants::error_paren);
 	  __r._M_append(_M_pop());
 	  _M_stack.push(__r);
 	}
@@ -335,8 +328,7 @@ namespace __detail
 	  _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
 	  this->_M_disjunction();
 	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
-	    __throw_regex_error(regex_constants::error_paren,
-				"Parenthesis is not closed.");
+	    __throw_regex_error(regex_constants::error_paren);
 	  __r._M_append(_M_pop());
 	  __r._M_append(_M_nfa->_M_insert_subexpr_end());
 	  _M_stack.push(__r);
@@ -503,7 +495,8 @@ namespace __detail
 	    {
 	      // "\\w-" is invalid, start of range must be a single char.
 	      __throw_regex_error(regex_constants::error_range,
-		    "Invalid start of range in bracket expression.");
+				  "Invalid start of '[x-x]' range in "
+				  "regular expression");
 	    }
 	  else if (__last_char._M_is_char())
 	    {
@@ -521,7 +514,8 @@ namespace __detail
 		}
 	      else
 		__throw_regex_error(regex_constants::error_range,
-		      "Invalid end of range in bracket expression.");
+				    "Invalid end of '[x-x]' range in "
+				    "regular expression");
 	    }
 	  else if (_M_flags & regex_constants::ECMAScript)
 	    {
@@ -532,7 +526,8 @@ namespace __detail
 	    }
 	  else
 	    __throw_regex_error(regex_constants::error_range,
-				"Invalid dash in bracket expression.");
+				"Invalid location of '-' within '[...]' in "
+				"POSIX regular expression");
 	}
       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
 	{
@@ -543,8 +538,8 @@ namespace __detail
 	}
       else
 	__throw_regex_error(regex_constants::error_brack,
-			    "Unexpected character in bracket expression.");
-
+			    "Unexpected character within '[...]' in "
+			    "regular expression");
       return true;
     }
 
diff --git a/libstdc++-v3/include/bits/regex_error.h b/libstdc++-v3/include/bits/regex_error.h
index 767600ccdab..77d4925921b 100644
--- a/libstdc++-v3/include/bits/regex_error.h
+++ b/libstdc++-v3/include/bits/regex_error.h
@@ -133,7 +133,9 @@ namespace regex_constants
    */
   class regex_error : public std::runtime_error
   {
-    regex_constants::error_type _M_code;
+    using error_type = regex_constants::error_type;
+
+    error_type _M_code;
 
   public:
     /**
@@ -142,7 +144,7 @@ namespace regex_constants
      * @param __ecode the regex error code.
      */
     explicit
-    regex_error(regex_constants::error_type __ecode);
+    regex_error(error_type __ecode);
 
     virtual ~regex_error() throw();
 
@@ -156,23 +158,30 @@ namespace regex_constants
     { return _M_code; }
 
   private:
-    regex_error(regex_constants::error_type __ecode, const char* __what)
+    regex_error(error_type __ecode, const char* __what)
     : std::runtime_error(__what), _M_code(__ecode)
     { }
 
-    friend void __throw_regex_error(regex_constants::error_type, const char*);
+    [[__noreturn__]]
+    friend void
+    __throw_regex_error(error_type __ecode __attribute__((__unused__)),
+			const char* __what __attribute__((__unused__)))
+    { _GLIBCXX_THROW_OR_ABORT(regex_error(__ecode, __what)); }
   };
 
-  ///@} // group regex
+  /// @cond undocumented
 
+  [[__noreturn__]]
   void
   __throw_regex_error(regex_constants::error_type __ecode);
 
+  [[__noreturn__]]
   inline void
-  __throw_regex_error(regex_constants::error_type __ecode
-			__attribute__((__unused__)),
-		      const char* __what __attribute__((__unused__)))
-  { _GLIBCXX_THROW_OR_ABORT(regex_error(__ecode, __what)); }
+  __throw_regex_error(regex_constants::error_type __ecode, const char* __what);
+
+  /// @endcond
+
+  ///@} // group regex
 
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc
index 1431cfc37e6..2a1745b42f6 100644
--- a/libstdc++-v3/include/bits/regex_scanner.tcc
+++ b/libstdc++-v3/include/bits/regex_scanner.tcc
@@ -108,7 +108,7 @@ namespace __detail
 	  if (_M_current == _M_end)
 	    __throw_regex_error(
 	      regex_constants::error_escape,
-	      "Unexpected end of regex when escaping.");
+	      "Invalid escape at end of regular expression");
 
 	  if (!_M_is_basic()
 	      || (*_M_current != '('
@@ -125,9 +125,7 @@ namespace __detail
 	  if (_M_is_ecma() && *_M_current == '?')
 	    {
 	      if (++_M_current == _M_end)
-		__throw_regex_error(
-		  regex_constants::error_paren,
-		  "Unexpected end of regex when in an open parenthesis.");
+		__throw_regex_error(regex_constants::error_paren);
 
 	      if (*_M_current == ':')
 		{
@@ -147,9 +145,9 @@ namespace __detail
 		  _M_value.assign(1, 'n');
 		}
 	      else
-		__throw_regex_error(
-		  regex_constants::error_paren,
-		  "Invalid special open parenthesis.");
+		__throw_regex_error(regex_constants::error_paren,
+				    "Invalid '(?...)' zero-width assertion "
+				    "in regular expression");
 	    }
 	  else if (_M_flags & regex_constants::nosubs)
 	    _M_token = _S_token_subexpr_no_group_begin;
@@ -178,10 +176,7 @@ namespace __detail
       else if (__builtin_expect(__c == _CharT(0), false))
 	{
 	  if (!_M_is_ecma())
-	    {
-	      __throw_regex_error(regex_constants::_S_null,
-		  "Unexpected null character in regular expression");
-	    }
+	    __throw_regex_error(regex_constants::_S_null);
 	  _M_token = _S_token_ord_char;
 	  _M_value.assign(1, __c);
 	}
@@ -213,9 +208,7 @@ namespace __detail
     _M_scan_in_bracket()
     {
       if (_M_current == _M_end)
-	__throw_regex_error(
-	  regex_constants::error_brack,
-	  "Unexpected end of regex when in bracket expression.");
+	__throw_regex_error(regex_constants::error_brack);
 
       auto __c = *_M_current++;
 
@@ -225,7 +218,8 @@ namespace __detail
 	{
 	  if (_M_current == _M_end)
 	    __throw_regex_error(regex_constants::error_brack,
-				"Unexpected character class open bracket.");
+				"Incomplete '[[' character class in "
+				"regular expression");
 
 	  if (*_M_current == '.')
 	    {
@@ -250,7 +244,7 @@ namespace __detail
 	}
       // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
       // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
-      // `*/empty_range.cc`.
+      // `.../empty_range.cc`.
       else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
 	{
 	  _M_token = _S_token_bracket_end;
@@ -275,9 +269,7 @@ namespace __detail
     _M_scan_in_brace()
     {
       if (_M_current == _M_end)
-	__throw_regex_error(
-	  regex_constants::error_brace,
-	  "Unexpected end of regex when in brace expression.");
+	__throw_regex_error(regex_constants::error_brace);
 
       auto __c = *_M_current++;
 
@@ -301,8 +293,7 @@ namespace __detail
 	      ++_M_current;
 	    }
 	  else
-	    __throw_regex_error(regex_constants::error_badbrace,
-				"Unexpected character in brace expression.");
+	    __throw_regex_error(regex_constants::error_badbrace);
 	}
       else if (__c == '}')
 	{
@@ -310,8 +301,7 @@ namespace __detail
 	  _M_token = _S_token_interval_end;
 	}
       else
-	__throw_regex_error(regex_constants::error_badbrace,
-			    "Unexpected character in brace expression.");
+	__throw_regex_error(regex_constants::error_badbrace);
     }
 
   template<typename _CharT>
@@ -320,8 +310,7 @@ namespace __detail
     _M_eat_escape_ecma()
     {
       if (_M_current == _M_end)
-	__throw_regex_error(regex_constants::error_escape,
-			    "Unexpected end of regex when escaping.");
+	__throw_regex_error(regex_constants::error_escape);
 
       auto __c = *_M_current++;
       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
@@ -355,22 +344,26 @@ namespace __detail
       else if (__c == 'c')
 	{
 	  if (_M_current == _M_end)
-	    __throw_regex_error(
-	      regex_constants::error_escape,
-	      "Unexpected end of regex when reading control code.");
+	    __throw_regex_error(regex_constants::error_escape,
+				"invalid '\\cX' control character in "
+				"regular expression");
 	  _M_token = _S_token_ord_char;
 	  _M_value.assign(1, *_M_current++);
 	}
       else if (__c == 'x' || __c == 'u')
 	{
-	  _M_value.erase();
-	  for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
+	  _M_value.clear();
+	  const int __n = __c == 'x' ? 2 : 4;
+	  for (int __i = 0; __i < __n; __i++)
 	    {
 	      if (_M_current == _M_end
 		  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
-		__throw_regex_error(
-		  regex_constants::error_escape,
-		  "Unexpected end of regex when ascii character.");
+		__throw_regex_error(regex_constants::error_escape,
+				    __n == 2
+				    ? "Invalid '\\xNN' control character in "
+				      "regular expression"
+				    : "Invalid '\\uNNNN' control character in "
+				      "regular expression");
 	      _M_value += *_M_current++;
 	    }
 	  _M_token = _S_token_hex_num;
@@ -399,8 +392,7 @@ namespace __detail
     _M_eat_escape_posix()
     {
       if (_M_current == _M_end)
-	__throw_regex_error(regex_constants::error_escape,
-			    "Unexpected end of regex when escaping.");
+	__throw_regex_error(regex_constants::error_escape);
 
       auto __c = *_M_current;
       auto __pos = __builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
@@ -425,8 +417,7 @@ namespace __detail
 	{
 #ifdef __STRICT_ANSI__
 	  // POSIX says it is undefined to escape ordinary characters
-	  __throw_regex_error(regex_constants::error_escape,
-			      "Unexpected escape character.");
+	  __throw_regex_error(regex_constants::error_escape);
 #else
 	  _M_token = _S_token_ord_char;
 	  _M_value.assign(1, __c);
@@ -466,8 +457,7 @@ namespace __detail
 	  return;
 	}
       else
-	__throw_regex_error(regex_constants::error_escape,
-			    "Unexpected escape character.");
+	__throw_regex_error(regex_constants::error_escape);
     }
 
   // Eats a character class or throws an exception.
@@ -485,12 +475,8 @@ namespace __detail
 	  || _M_current == _M_end // skip __ch
 	  || *_M_current++ != ']') // skip ']'
 	{
-	  if (__ch == ':')
-	    __throw_regex_error(regex_constants::error_ctype,
-				"Unexpected end of character class.");
-	  else
-	    __throw_regex_error(regex_constants::error_collate,
-				"Unexpected end of character class.");
+	  __throw_regex_error(__ch == ':' ? regex_constants::error_ctype
+					  : regex_constants::error_collate);
 	}
     }
 
diff --git a/libstdc++-v3/src/c++11/regex.cc b/libstdc++-v3/src/c++11/regex.cc
index 0a4a5524b22..d5e1cc7612a 100644
--- a/libstdc++-v3/src/c++11/regex.cc
+++ b/libstdc++-v3/src/c++11/regex.cc
@@ -35,8 +35,53 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 		      __attribute__((unused)))
   { _GLIBCXX_THROW_OR_ABORT(regex_error(__ecode)); }
 
+namespace
+{
+  const char*
+  desc(regex_constants::error_type e)
+  {
+    using namespace regex_constants;
+    switch (e)
+    {
+    case error_collate:
+      return "Invalid collating element in regular expression";
+    case error_ctype:
+      return "Invalid character class in regular expression";
+    case error_escape:
+      return "Invalid escape in regular expression";
+    case error_backref:
+      return "Invalid back reference in regular expression";
+    case error_brack:
+      return "Mismatched '[' and ']' in regular expression";
+    case error_paren:
+      return "Mismatched '(' and ')' in regular expression";
+    case error_brace:
+      return "Mismatched '{' and '}' in regular expression";
+    case error_badbrace:
+      return "Invalid range in '{}' in regular expression";
+    case error_range:
+      return "Invalid character range in regular expression";
+    case error_space:
+      return "Insufficient memory to compile regular expression";
+    case error_badrepeat:
+      return "Invalid '?', '*', or '+' in regular expression";
+    case error_complexity:
+      return "Complexity of regex match exceeded implementation limits";
+    case error_stack:
+      return "Insufficient memory to determine regex match";
+    case _S_null:
+      return "Unexpected null character in regular expression";
+    case _S_grammar:
+      return "Conflicting regex grammar options";
+    default:
+      return "regex error";
+    };
+
+  }
+}
+
   regex_error::regex_error(regex_constants::error_type __ecode)
-  : std::runtime_error("regex_error"), _M_code(__ecode)
+  : std::runtime_error(desc(__ecode)), _M_code(__ecode)
   { }
 
   regex_error::~regex_error() throw() { }


More information about the Libstdc++-cvs mailing list