[PATCH] libstdc++: Handle errors from strxfrm in std::collate::transform [PR85824]

Jonathan Wakely jwakely@redhat.com
Wed Dec 18 21:15:36 GMT 2024


std::regex builds a cache of equivalence classes by calling
std::regex_traits<char>::transform_primary(c) for every char, which then
calls std::collate<char>::transform which calls strxfrm. On several
targets strxfrm fails for non-ASCII characters. Because strxfrm has no
return value reserved to indicate an error, some implementations return
INT_MAX or SIZE_MAX. This causes std::collate::transform to try to
allocate a huge buffer, which is either very slow or throws
std::bad_alloc. We should check errno after calling strxfrm to detect
errors and then throw a more appropriate exception instead of trying to
allocate a huge buffer.

Unfortunately the std::collate<C>::_M_transform function has a
non-throwing exception specifier, so we can't do the error handling
there.

As well as checking errno, this patch changes std::collate::do_transform
to use __builtin_alloca for small inputs, and to use RAII to deallocate
the buffers used for large inputs.

This change isn't sufficient to fix the three std::regex bugs caused by
the lack of error handling in std::collate::do_transform; we also need
to make std::regex_traits::transform_primary handle exceptions. This
change also attempts to make transform_primary closer to the effects
described in the standard, by not even attempting to use std::collate
if the locale's std::collate facet has been replaced (see PR 118105).

Arguably, we should not even try to call transform_primary for any char
values over 127, since they're never valid in locales that use UTF-8 or
7-bit ASCII, and probably for other charsets too. Handling 128
exceptions for every std::regex compilation is very inefficient, but at
least it now works instead of failing with std::bad_alloc, and no longer
allocates 128 x 2GB. Maybe for C++26 we could check the locale's
std::text_encoding and use that to decide whether to cache equivalence
classes for char values over 127.

I'm unsure if std::regex_traits<C>::transform_primary is supposed to
convert the string to lower case or not.  The general regex traits
requirements ([re.req] p20) do say "when character case is not
considered" but the specification for the std::regex_traits<char> and
std::regex_traits<wchar_t> specializations ([re.traits] p7) doesn't say
anything about that.

libstdc++-v3/ChangeLog:

	PR libstdc++/85824
	PR libstdc++/94409
	PR libstdc++/98723
	PR libstdc++/118105
	* include/bits/locale_classes.tcc (collate::do_transform): Check
	errno after calling _M_transform. Use RAII type to manage the
	buffer and to restore errno.
	* include/bits/regex.h (regex_traits::transform_primary): Handle
	exceptions from std::collate::transform and do not try to use
	std::collate for user-defined facets.
---

Tested x86_64-linux.

 libstdc++-v3/include/bits/locale_classes.tcc | 94 ++++++++++++++------
 libstdc++-v3/include/bits/regex.h            | 43 ++++++---
 2 files changed, 96 insertions(+), 41 deletions(-)

diff --git a/libstdc++-v3/include/bits/locale_classes.tcc b/libstdc++-v3/include/bits/locale_classes.tcc
index 2b78008e9ae..6e8f27bf0d9 100644
--- a/libstdc++-v3/include/bits/locale_classes.tcc
+++ b/libstdc++-v3/include/bits/locale_classes.tcc
@@ -37,6 +37,9 @@
 #ifdef _GLIBCXX_SYSHDR
 #pragma GCC system_header
 #endif
+
+#include <cerrno>
+
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wc++11-extensions" // extern template
 #pragma GCC diagnostic ignored "-Wvariadic-macros"
@@ -295,43 +298,76 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
       size_t __len = (__hi - __lo) * 2;
 
-      _CharT* __c = new _CharT[__len];
+      struct _Buf
+      {
+	_Buf(size_t __n, void* __buf, int __e)
+	: _M_c(__buf ? (_CharT*)__buf : new _CharT[__n]),
+	  _M_stackbuf(__buf),
+	  _M_errno(__e)
+	{ }
 
-      __try
+	~_Buf()
 	{
-	  // strxfrm stops when it sees a nul character so we break
-	  // the string into zero-terminated substrings and pass those
-	  // to strxfrm.
-	  for (;;)
+	  if (_M_c != _M_stackbuf)
+	    delete[] _M_c;
+	  if (errno == 0)
+	    errno = _M_errno;
+	}
+
+	void _M_realloc(size_t __len)
+	{
+	  _CharT* __p = new _CharT[__len];
+	  if (_M_c != _M_stackbuf)
+	    delete[] _M_c;
+	  _M_c = __p;
+	}
+
+	_CharT* _M_c;
+	void* const _M_stackbuf;
+	int _M_errno;
+      };
+
+      const size_t __bytes = __len * sizeof(_CharT);
+      _Buf __buf(__len, __bytes <= 256 ? __builtin_alloca(__bytes) : 0, errno);
+      errno = 0;
+
+      // strxfrm stops when it sees a nul character so we break
+      // the string into zero-terminated substrings and pass those
+      // to strxfrm.
+      for (;;)
+	{
+	  // First try a buffer perhaps big enough.
+	  size_t __res = _M_transform(__buf._M_c, __p, __len);
+	  // If the buffer was not large enough, try again with the
+	  // correct size.
+	  if (__res >= __len)
 	    {
-	      // First try a buffer perhaps big enough.
-	      size_t __res = _M_transform(__c, __p, __len);
-	      // If the buffer was not large enough, try again with the
-	      // correct size.
-	      if (__res >= __len)
+	      if (__builtin_expect(errno, 0))
 		{
-		  __len = __res + 1;
-		  delete [] __c, __c = 0;
-		  __c = new _CharT[__len];
-		  __res = _M_transform(__c, __p, __len);
+#if __cpp_exceptions
+		  __throw_system_error(errno);
+#else
+		  // std::regex can call this function internally with
+		  // char values that always fail, so we don't want to
+		  // use _GLIBCXX_THROW_OR_ABORT here.
+		  __ret.clear();
+		  break;
+#endif
 		}
 
-	      __ret.append(__c, __res);
-	      __p += char_traits<_CharT>::length(__p);
-	      if (__p == __pend)
-		break;
-
-	      __p++;
-	      __ret.push_back(_CharT());
+	      __len = __res + 1;
+	      __buf._M_realloc(__len);
+	      __res = _M_transform(__buf._M_c, __p, __len);
 	    }
-	}
-      __catch(...)
-	{
-	  delete [] __c;
-	  __throw_exception_again;
-	}
 
-      delete [] __c;
+	  __ret.append(__buf._M_c, __res);
+	  __p += char_traits<_CharT>::length(__p);
+	  if (__p == __pend)
+	    break;
+
+	  __p++;
+	  __ret.push_back(_CharT());
+	}
 
       return __ret;
     }
diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h
index 68ff479c905..57ea68e7ee9 100644
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -253,9 +253,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
        * @param __first beginning of the character sequence.
        * @param __last  one-past-the-end of the character sequence.
        *
-       * Effects: if typeid(use_facet<collate<_Ch_type> >) ==
-       * typeid(collate_byname<_Ch_type>) and the form of the sort key
-       * returned by collate_byname<_Ch_type>::transform(__first, __last)
+       * Effects: if `typeid(use_facet<collate<_Ch_type>>(getloc())) ==
+       * typeid(collate_byname<_Ch_type>)` and the form of the sort key
+       * returned by `collate_byname<_Ch_type>::transform(__first, __last)`
        * is known and can be converted into a primary sort key
        * then returns that key, otherwise returns an empty string.
        *
@@ -265,17 +265,36 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
 	string_type
 	transform_primary(_Fwd_iter __first, _Fwd_iter __last) const
 	{
+	  string_type __ret;
+#if __cpp_rtti
+	  const auto& __fclt = use_facet<collate<char_type>>(_M_locale);
+	  if (typeid(__fclt) != typeid(collate<char_type>)) // FIXME: PR 118110
+	    return __ret;
+
 	  // TODO : this is not entirely correct.
 	  // This function requires extra support from the platform.
-	  //
-	  // Read http://gcc.gnu.org/ml/libstdc++/2013-09/msg00117.html and
-	  // http://www.open-std.org/Jtc1/sc22/wg21/docs/papers/2003/n1429.htm
-	  // for details.
-	  typedef std::ctype<char_type> __ctype_type;
-	  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
-	  _GLIBCXX_STD_C::vector<char_type> __s(__first, __last);
-	  __fctyp.tolower(__s.data(), __s.data() + __s.size());
-	  return this->transform(__s.data(), __s.data() + __s.size());
+	  // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118105
+
+	  const auto& __fctyp(use_facet<ctype<char_type>>(_M_locale));
+	  basic_string<char_type> __s(__first, __last);
+	  const auto __p = const_cast<char_type*>(__s.c_str());
+	  const auto __pend = __p + __s.size();
+	  // XXX: should we use tolower here? The regex traits requirements
+	  // say that transform_primary ignores case, but the specification
+	  // for the std::regex_traits<char> and std::regex_traits<wchar_t>
+	  // specializations don't, they seem to suggest just using the
+	  // collate::transform function to get a primary sort key.
+	  __fctyp.tolower(__p, __pend);
+
+	  __try
+	    {
+	      __ret = __fclt.transform(__p, __pend);
+	    }
+	  __catch (const exception&)
+	    {
+	    }
+#endif
+	  return __ret;
 	}
 
       /**
-- 
2.47.1



More information about the Libstdc++ mailing list